/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/range_tree.h>
#include <sys/zfeature.h>

static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	int txgoff = tx->tx_txg & TXG_MASK;
	int nblkptr = dn->dn_phys->dn_nblkptr;
	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
	int new_level = dn->dn_next_nlevels[txgoff];
	int i;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);

	/* this dnode can't be paged out because it's dirty */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);

	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
	ASSERT(db != NULL);

	dn->dn_phys->dn_nlevels = new_level;
	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
	    dn->dn_object, dn->dn_phys->dn_nlevels);

	/*
	 * Lock ordering requires that we hold the children's db_mutexes (by
	 * calling dbuf_find()) before holding the parent's db_rwlock. The lock
	 * order is imposed by dbuf_read's steps of "grab the lock to protect
	 * db_parent, get db_parent, hold db_parent's db_rwlock".
	 */
	dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
	ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
	for (i = 0; i < nblkptr; i++) {
		children[i] =
		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
	}

	/* transfer dnode's block pointers to new indirect block */
	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
	if (dn->dn_dbuf != NULL)
		rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
	rw_enter(&db->db_rwlock, RW_WRITER);
	ASSERT(db->db.db_data);
	ASSERT(arc_released(db->db_buf));
	ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
	bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
	    sizeof (blkptr_t) * nblkptr);
	arc_buf_freeze(db->db_buf);

	/* set dbuf's parent pointers to new indirect buf */
	for (i = 0; i < nblkptr; i++) {
		dmu_buf_impl_t *child = children[i];

		if (child == NULL)
			continue;
#ifdef ZFS_DEBUG
		DB_DNODE_ENTER(child);
		ASSERT3P(DB_DNODE(child), ==, dn);
		DB_DNODE_EXIT(child);
#endif /* ZFS_DEBUG */
		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
			ASSERT(child->db_parent->db_level == db->db_level);
			ASSERT(child->db_blkptr !=
			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
			mutex_exit(&child->db_mtx);
			continue;
		}
		ASSERT(child->db_parent == NULL ||
		    child->db_parent == dn->dn_dbuf);

		child->db_parent = db;
		dbuf_add_ref(db, child);
		if (db->db.db_data)
			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
		else
			child->db_blkptr = NULL;
		dprintf_dbuf_bp(child, child->db_blkptr,
		    "changed db_blkptr to new indirect %s", "");

		mutex_exit(&child->db_mtx);
	}

	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);

	rw_exit(&db->db_rwlock);
	if (dn->dn_dbuf != NULL)
		rw_exit(&dn->dn_dbuf->db_rwlock);

	dbuf_rele(db, FTAG);

	rw_exit(&dn->dn_struct_rwlock);
}

static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	uint64_t bytesfreed = 0;
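	/*
	 * Release each allocated block in bp[0..num-1] via
	 * dsl_dataset_block_kill(), then charge the reclaimed space
	 * against this dnode's space accounting below.
	 */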

	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

	for (int i = 0; i < num; i++, bp++) {
		if (BP_IS_HOLE(bp))
			continue;

		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

		/*
		 * Save some useful information on the holes being
		 * punched, including logical size, type, and indirection
		 * level. Retaining birth time enables detection of when
		 * holes are punched for reducing the number of free
		 * records transmitted during a zfs send.
		 */

		uint64_t lsize = BP_GET_LSIZE(bp);
		dmu_object_type_t type = BP_GET_TYPE(bp);
		uint64_t lvl = BP_GET_LEVEL(bp);

		bzero(bp, sizeof (blkptr_t));

		if (spa_feature_is_active(dn->dn_objset->os_spa,
		    SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, type);
			BP_SET_LEVEL(bp, lvl);
			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
		}
	}
	dnode_diduse_space(dn, -bytesfreed);
}

#ifdef ZFS_DEBUG
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	int off, num;
	int i, err, epbs;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	off = start - (db->db_blkid * 1<<epbs);
	num = end - start + 1;

	ASSERT3U(off, >=, 0);
	ASSERT3U(num, >=, 0);
	ASSERT3U(db->db_level, >, 0);
	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
	ASSERT(db->db_blkptr != NULL);

	for (i = off; i < off+num; i++) {
		uint64_t *buf;
		dmu_buf_impl_t *child;
		dbuf_dirty_record_t *dr;
		int j;

		ASSERT(db->db_level == 1);

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		err = dbuf_hold_impl(dn, db->db_level - 1,
		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
		rw_exit(&dn->dn_struct_rwlock);
		if (err == ENOENT)
			continue;
		ASSERT(err == 0);
		ASSERT(child->db_level == 0);
		dr = dbuf_find_dirty_eq(child, txg);

		/* data_old better be zeroed */
		if (dr) {
			buf = dr->dt.dl.dr_data->b_data;
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%d off=%d num=%d\n",
					    (void *)child, i, off, num);
				}
			}
		}

		/*
		 * db_data better be zeroed unless it's dirty in a
		 * future txg.
		 */
		mutex_enter(&child->db_mtx);
		buf = child->db.db_data;
		if (buf != NULL && child->db_state != DB_FILL &&
		    list_is_empty(&child->db_dirty_records)) {
			for (j = 0; j < child->db.db_size >> 3; j++) {
				if (buf[j] != 0) {
					panic("freed data not zero: "
					    "child=%p i=%d off=%d num=%d\n",
					    (void *)child, i, off, num);
				}
			}
		}
		mutex_exit(&child->db_mtx);

		dbuf_rele(child, FTAG);
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * We don't usually free the indirect blocks here. If in one txg we have a
 * free_range and a write to the same indirect block, it's important that we
 * preserve the hole's birth times. Therefore, we don't free any indirect
 * blocks in free_children(). If an indirect block happens to turn into all
 * holes, it will be freed by dbuf_write_children_ready, which happens at a
 * point in the syncing process where we know for certain the contents of the
 * indirect block.
 *
 * However, if we're freeing a dnode, its space accounting must go to zero
 * before we actually try to free the dnode, or we will trip an assertion. In
 * addition, we know the case described above cannot occur, because the dnode
 * is being freed. Therefore, we free the indirect blocks immediately in that
 * case.
 */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
	dnode_t *dn;
	blkptr_t *bp;
	dmu_buf_impl_t *subdb;
	uint64_t start, end, dbstart, dbend;
	unsigned int epbs, shift, i;

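	/*
	 * db is an indirect block (db_level >= 1); free the level-0
	 * blocks in the range [blkid, blkid + nblks) that lie beneath
	 * it, recursing through any lower-level indirect blocks.
	 */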
	/*
	 * There is a small possibility that this block will not be cached:
	 *   1 - if level > 1 and there are no children with level <= 1
	 *   2 - if this block was evicted since we read it from
	 *	 dmu_tx_hold_free().
	 */
	if (db->db_state != DB_CACHED)
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

	/*
	 * If we modify this indirect block, and we are not freeing the
	 * dnode (!free_indirects), then this indirect block needs to get
	 * written to disk by dbuf_write(). If it is dirty, we know it will
	 * be written (otherwise, we would have incorrect on-disk state
	 * because the space would be freed but still referenced by the BP
	 * in this indirect block). Therefore we VERIFY that it is
	 * dirty.
	 *
	 * Our VERIFY covers some cases that do not actually have to be
	 * dirty, but the open-context code happens to dirty. E.g. if the
	 * blocks we are freeing are all holes, because in that case, we
	 * are only freeing part of this indirect block, so it is an
	 * ancestor of the first or last block to be freed. The first and
	 * last L1 indirect blocks are always dirtied by dnode_free_range().
	 */
	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
	dmu_buf_unlock_parent(db, dblt, FTAG);

	dbuf_release_bp(db);
	bp = db->db.db_data;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	ASSERT3U(epbs, <, 31);
	shift = (db->db_level - 1) * epbs;
	dbstart = db->db_blkid << epbs;
	start = blkid >> shift;
	if (dbstart < start) {
		bp += start - dbstart;
	} else {
		start = dbstart;
	}
	dbend = ((db->db_blkid + 1) << epbs) - 1;
	end = (blkid + nblks - 1) >> shift;
	if (dbend <= end)
		end = dbend;

	ASSERT3U(start, <=, end);

	if (db->db_level == 1) {
		FREE_VERIFY(db, start, end, tx);
		rw_enter(&db->db_rwlock, RW_WRITER);
		free_blocks(dn, bp, end - start + 1, tx);
		rw_exit(&db->db_rwlock);
	} else {
		for (uint64_t id = start; id <= end; id++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
			    id, TRUE, FALSE, FTAG, &subdb));
			rw_exit(&dn->dn_struct_rwlock);
			ASSERT3P(bp, ==, subdb->db_blkptr);

			free_children(subdb, blkid, nblks, free_indirects, tx);
			dbuf_rele(subdb, FTAG);
		}
	}

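	/*
	 * When the entire dnode is being freed, every BP in this
	 * indirect block should now be a hole (verified below), so the
	 * block's own BP can be freed immediately rather than waiting
	 * for dbuf_write_children_ready(); see the comment above
	 * free_children().
	 */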
	if (free_indirects) {
		rw_enter(&db->db_rwlock, RW_WRITER);
		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
			ASSERT(BP_IS_HOLE(bp));
		bzero(db->db.db_data, db->db.db_size);
		free_blocks(dn, db->db_blkptr, 1, tx);
		rw_exit(&db->db_rwlock);
	}

	DB_DNODE_EXIT(db);
	arc_buf_freeze(db->db_buf);
}

/*
 * Traverse the indicated range of the provided file
 * and "free" all the blocks contained there.
 */
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
    boolean_t free_indirects, dmu_tx_t *tx)
{
	blkptr_t *bp = dn->dn_phys->dn_blkptr;
	int dnlevel = dn->dn_phys->dn_nlevels;
	boolean_t trunc = B_FALSE;

	if (blkid > dn->dn_phys->dn_maxblkid)
		return;

	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
		trunc = B_TRUE;
	}

	/* There are no indirect blocks in the object */
	if (dnlevel == 1) {
		if (blkid >= dn->dn_phys->dn_nblkptr) {
			/* this range was never made persistent */
			return;
		}
		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
		free_blocks(dn, bp + blkid, nblks, tx);
	} else {
		int shift = (dnlevel - 1) *
		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
		int start = blkid >> shift;
		int end = (blkid + nblks - 1) >> shift;
		dmu_buf_impl_t *db;

		ASSERT(start < dn->dn_phys->dn_nblkptr);
		bp += start;
		for (int i = start; i <= end; i++, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
			    TRUE, FALSE, FTAG, &db));
			rw_exit(&dn->dn_struct_rwlock);
			free_children(db, blkid, nblks, free_indirects, tx);
			dbuf_rele(db, FTAG);
		}
	}

	/*
	 * Do not truncate the maxblkid if we are performing a raw
	 * receive. The raw receive sets the maxblkid manually and
	 * must not be overridden. Usually, the last DRR_FREE record
	 * will be at the maxblkid, because the source system sets
	 * the maxblkid when truncating. However, if the last block
	 * was freed by overwriting with zeros and being compressed
	 * away to a hole, the source system will generate a DRR_FREE
	 * record while leaving the maxblkid after the end of that
	 * record. In this case we need to leave the maxblkid as
	 * indicated in the DRR_OBJECT record, so that it matches the
	 * source system, ensuring that the cryptographic hashes will
	 * match.
	 */
	if (trunc && !dn->dn_objset->os_raw_receive) {
		uint64_t off __maybe_unused;
		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;

		off = (dn->dn_phys->dn_maxblkid + 1) *
		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT(off < dn->dn_phys->dn_maxblkid ||
		    dn->dn_phys->dn_maxblkid == 0 ||
		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
	}
}

typedef struct dnode_sync_free_range_arg {
	dnode_t *dsfra_dnode;
	dmu_tx_t *dsfra_tx;
	boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;

static void
dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
{
	dnode_sync_free_range_arg_t *dsfra = arg;
	dnode_t *dn = dsfra->dsfra_dnode;

	mutex_exit(&dn->dn_mtx);
	dnode_sync_free_range_impl(dn, blkid, nblks,
	    dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
	mutex_enter(&dn->dn_mtx);
}

/*
 * Try to kick all the dnode's dbufs out of the cache...
 */
void
dnode_evict_dbufs(dnode_t *dn)
{
	dmu_buf_impl_t *db_marker;
	dmu_buf_impl_t *db, *db_next;

	db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);

	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {

#ifdef ZFS_DEBUG
		DB_DNODE_ENTER(db);
		ASSERT3P(DB_DNODE(db), ==, dn);
		DB_DNODE_EXIT(db);
#endif /* ZFS_DEBUG */

		mutex_enter(&db->db_mtx);
		if (db->db_state != DB_EVICTING &&
		    zfs_refcount_is_zero(&db->db_holds)) {
			db_marker->db_level = db->db_level;
			db_marker->db_blkid = db->db_blkid;
			db_marker->db_state = DB_SEARCH;
			avl_insert_here(&dn->dn_dbufs, db_marker, db,
			    AVL_BEFORE);

			/*
			 * We need to use the "marker" dbuf rather than
			 * simply getting the next dbuf, because
			 * dbuf_destroy() may actually remove multiple dbufs.
			 * It can call itself recursively on the parent dbuf,
			 * which may also be removed from dn_dbufs. The code
			 * flow would look like:
			 *
			 * dbuf_destroy():
			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
			 *	if (!cacheable || pending_evict)
			 *	  dbuf_destroy()
			 */
			dbuf_destroy(db);

			db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
			avl_remove(&dn->dn_dbufs, db_marker);
		} else {
			db->db_pending_evict = TRUE;
			mutex_exit(&db->db_mtx);
			db_next = AVL_NEXT(&dn->dn_dbufs, db);
		}
	}
	mutex_exit(&dn->dn_dbufs_mtx);

	kmem_free(db_marker, sizeof (dmu_buf_impl_t));

	dnode_evict_bonus(dn);
}

void
dnode_evict_bonus(dnode_t *dn)
{
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_bonus != NULL) {
		if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
			mutex_enter(&dn->dn_bonus->db_mtx);
			dbuf_destroy(dn->dn_bonus);
			dn->dn_bonus = NULL;
		} else {
			dn->dn_bonus->db_pending_evict = TRUE;
		}
	}
	rw_exit(&dn->dn_struct_rwlock);
}

static void
dnode_undirty_dbufs(list_t *list)
{
	dbuf_dirty_record_t *dr;

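	/*
	 * Discard every dirty record on the list without writing it
	 * out, recursing into the children of indirect dbufs first.
	 * Undirtying a dbuf releases the hold that dirtying placed on
	 * it for this txg.
	 */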
	while ((dr = list_head(list))) {
		dmu_buf_impl_t *db = dr->dr_dbuf;
		uint64_t txg = dr->dr_txg;

		if (db->db_level != 0)
			dnode_undirty_dbufs(&dr->dt.di.dr_children);

		mutex_enter(&db->db_mtx);
		/* XXX - use dbuf_undirty()? */
		list_remove(list, dr);
		ASSERT(list_head(&db->db_dirty_records) == dr);
		list_remove_head(&db->db_dirty_records);
		ASSERT(list_is_empty(&db->db_dirty_records));
		db->db_dirtycnt -= 1;
		if (db->db_level == 0) {
			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
			    dr->dt.dl.dr_data == db->db_buf);
			dbuf_unoverride(dr);
		} else {
			mutex_destroy(&dr->dt.di.dr_mtx);
			list_destroy(&dr->dt.di.dr_children);
		}
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
	}
}

static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(dmu_tx_is_syncing(tx));

	/*
	 * Our contents should have been freed in dnode_sync() by the
	 * free range record inserted by the caller of dnode_free().
	 */
	ASSERT0(DN_USED_BYTES(dn->dn_phys));
	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));

	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
	dnode_evict_dbufs(dn);

	/*
	 * XXX - It would be nice to assert this, but we may still
	 * have residual holds from async evictions from the arc...
	 *
	 * zfs_obj_to_path() also depends on this being
	 * commented out.
	 *
	 * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
	 */

	/* Undirty next bits */
	dn->dn_next_nlevels[txgoff] = 0;
	dn->dn_next_indblkshift[txgoff] = 0;
	dn->dn_next_blksz[txgoff] = 0;
	dn->dn_next_maxblkid[txgoff] = 0;

	/* ASSERT(blkptrs are zero); */
	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
	ASSERT(dn->dn_type != DMU_OT_NONE);

	ASSERT(dn->dn_free_txg > 0);
	if (dn->dn_allocated_txg != dn->dn_free_txg)
		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
	bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
	dnode_free_interior_slots(dn);

	mutex_enter(&dn->dn_mtx);
	dn->dn_type = DMU_OT_NONE;
	dn->dn_maxblkid = 0;
	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_have_spill = B_FALSE;
	dn->dn_num_slots = 1;
	mutex_exit(&dn->dn_mtx);

	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);

	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	/*
	 * Now that we've released our hold, the dnode may
	 * be evicted, so we mustn't access it.
	 */
}

/*
 * Write out the dnode's dirty buffers.
 */
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	dnode_phys_t *dnp = dn->dn_phys;
	int txgoff = tx->tx_txg & TXG_MASK;
	list_t *list = &dn->dn_dirty_records[txgoff];
	static const dnode_phys_t zerodn __maybe_unused = { 0 };
	boolean_t kill_spill = B_FALSE;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
	ASSERT(dnp->dn_type != DMU_OT_NONE ||
	    bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
	DNODE_VERIFY(dn);

	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

	/*
	 * Do user accounting if it is enabled and this is not
	 * an encrypted receive.
	 */
	if (dmu_objset_userused_enabled(os) &&
	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
		dn->dn_oldflags = dn->dn_phys->dn_flags;
		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
		if (dmu_objset_userobjused_enabled(dn->dn_objset))
			dn->dn_phys->dn_flags |=
			    DNODE_FLAG_USEROBJUSED_ACCOUNTED;
		mutex_exit(&dn->dn_mtx);
		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
	} else {
		/* Once we account for it, we should always account for it */
		ASSERT(!(dn->dn_phys->dn_flags &
		    DNODE_FLAG_USERUSED_ACCOUNTED));
		ASSERT(!(dn->dn_phys->dn_flags &
		    DNODE_FLAG_USEROBJUSED_ACCOUNTED));
	}

	mutex_enter(&dn->dn_mtx);
	if (dn->dn_allocated_txg == tx->tx_txg) {
		/* The dnode is newly allocated or reallocated */
		if (dnp->dn_type == DMU_OT_NONE) {
			/* this is a first alloc, not a realloc */
			dnp->dn_nlevels = 1;
			dnp->dn_nblkptr = dn->dn_nblkptr;
		}

		dnp->dn_type = dn->dn_type;
		dnp->dn_bonustype = dn->dn_bonustype;
		dnp->dn_bonuslen = dn->dn_bonuslen;
	}

	dnp->dn_extra_slots = dn->dn_num_slots - 1;

	ASSERT(dnp->dn_nlevels > 1 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	ASSERT(dnp->dn_nlevels < 2 ||
	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);

	if (dn->dn_next_type[txgoff] != 0) {
		dnp->dn_type = dn->dn_type;
		dn->dn_next_type[txgoff] = 0;
	}

	if (dn->dn_next_blksz[txgoff] != 0) {
		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
		    SPA_MINBLOCKSIZE) == 0);
		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
		    dnp->dn_datablkszsec ||
		    !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
		dnp->dn_datablkszsec =
		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
		dn->dn_next_blksz[txgoff] = 0;
	}

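	/*
	 * A pending bonus length of 0 means "no change"; when the
	 * bonus length was actually set to zero in open context, the
	 * sentinel DN_ZERO_BONUSLEN is recorded instead.
	 */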
	if (dn->dn_next_bonuslen[txgoff] != 0) {
		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
			dnp->dn_bonuslen = 0;
		else
			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
		ASSERT(dnp->dn_bonuslen <=
		    DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
		dn->dn_next_bonuslen[txgoff] = 0;
	}

	if (dn->dn_next_bonustype[txgoff] != 0) {
		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
		dn->dn_next_bonustype[txgoff] = 0;
	}

	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
	    dn->dn_free_txg <= tx->tx_txg;

	/*
	 * Remove the spill block if we have been explicitly asked to
	 * remove it, or if the object is being removed.
	 */
	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
			kill_spill = B_TRUE;
		dn->dn_rm_spillblk[txgoff] = 0;
	}

	if (dn->dn_next_indblkshift[txgoff] != 0) {
		ASSERT(dnp->dn_nlevels == 1);
		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
		dn->dn_next_indblkshift[txgoff] = 0;
	}

	/*
	 * Just take the live (open-context) values for checksum and compress.
	 * Strictly speaking it's a future leak, but nothing bad happens if we
	 * start using the new checksum or compress algorithm a little early.
	 */
	dnp->dn_checksum = dn->dn_checksum;
	dnp->dn_compress = dn->dn_compress;

	mutex_exit(&dn->dn_mtx);

	if (kill_spill) {
		free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
		mutex_enter(&dn->dn_mtx);
		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/* process all the "freed" ranges in the file */
	if (dn->dn_free_ranges[txgoff] != NULL) {
		dnode_sync_free_range_arg_t dsfra;
		dsfra.dsfra_dnode = dn;
		dsfra.dsfra_tx = tx;
		dsfra.dsfra_free_indirects = freeing_dnode;
		if (freeing_dnode) {
			ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
			    0, dn->dn_maxblkid + 1));
		}
		mutex_enter(&dn->dn_mtx);
		range_tree_vacate(dn->dn_free_ranges[txgoff],
		    dnode_sync_free_range, &dsfra);
		range_tree_destroy(dn->dn_free_ranges[txgoff]);
		dn->dn_free_ranges[txgoff] = NULL;
		mutex_exit(&dn->dn_mtx);
	}

	if (freeing_dnode) {
		dn->dn_objset->os_freed_dnodes++;
		dnode_sync_free(dn, tx);
		return;
	}

	if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
		mutex_enter(&ds->ds_lock);
		ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] =
		    (void *)B_TRUE;
		mutex_exit(&ds->ds_lock);
	}

	if (dn->dn_next_nlevels[txgoff]) {
		dnode_increase_indirection(dn, tx);
		dn->dn_next_nlevels[txgoff] = 0;
	}

	/*
	 * This must be done after dnode_sync_free_range()
	 * and dnode_increase_indirection(). See dnode_new_blkid()
	 * for an explanation of the high bit being set.
	 */
	if (dn->dn_next_maxblkid[txgoff]) {
		mutex_enter(&dn->dn_mtx);
		dnp->dn_maxblkid =
		    dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
		dn->dn_next_maxblkid[txgoff] = 0;
		mutex_exit(&dn->dn_mtx);
	}

	if (dn->dn_next_nblkptr[txgoff]) {
		/* this should only happen on a realloc */
		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
		if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
			/* zero the new blkptrs we are gaining */
			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
			    sizeof (blkptr_t) *
			    (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
		} else {
			int i;
			ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
			/* the blkptrs we are losing better be unallocated */
			for (i = 0; i < dnp->dn_nblkptr; i++) {
				if (i >= dn->dn_next_nblkptr[txgoff])
					ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
			}
#endif
		}
		mutex_enter(&dn->dn_mtx);
		dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
		dn->dn_next_nblkptr[txgoff] = 0;
		mutex_exit(&dn->dn_mtx);
	}

	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);

	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		ASSERT3P(list_head(list), ==, NULL);
		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
	}

	/*
	 * Although we have dropped our reference to the dnode, it
	 * can't be evicted until it's written, and we haven't yet
	 * initiated the I/O for the dnode's dbuf.
	 */
}