/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
 * grab 128 slots, which is 4 blocks worth. This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
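 *
 * For example, with the default shift of 7 each allocator claims 2^7 = 128
 * slots per trip to the global allocator; at DNODES_PER_BLOCK = 32 slots
 * per 16 KiB metadnode block, that is 4 blocks.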
 */
int dmu_object_alloc_chunk_shift = 7;

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf. It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it. Since the caller is responsible
	 * for releasing the hold they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
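		 * The first P2PHASE() test below catches the case where
		 * "object" sits exactly on a chunk boundary (the previous
		 * chunk is used up); the second catches the case where an
		 * allocation of dn_slots slots starting at "object" would
		 * straddle a chunk boundary.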
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects. Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
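			 * The P2ALIGN() below rounds such an object down to
			 * its chunk, so os_obj_next_chunk still advances to
			 * the next aligned chunk boundary past it.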
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us. The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller. Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error. This
		 * is the start of the next block of dnodes.
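		 * (dnode_hold_impl() fails here when some of the requested
		 * slots are already in use, e.g. taken by a neighboring
		 * large dnode, so we probe forward for the next hole and
		 * fall back to the next block boundary if that also fails.)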
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument. The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
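 *
 * A minimal illustrative calling sketch (argument values are examples only;
 * assumes the caller already has an assigned transaction "tx"):
 *
 *	dnode_t *dn;
 *	uint64_t obj = dmu_object_alloc_hold(os, DMU_OT_PLAIN_FILE_CONTENTS,
 *	    0, 0, DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
 *	... use obj and the held dnode dn ...
 *	dnode_rele(dn, FTAG);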
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

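	/*
	 * Hold the currently allocated dnode, then reallocate it in place
	 * with the requested type, bonus layout, and slot count.
	 */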
	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
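 *
 * Returns 0 and updates *objectp on a match; typically ESRCH once no
 * further match exists. A minimal iteration sketch (illustrative only):
 *
 *	uint64_t obj;
 *	for (obj = 0; dmu_object_next(os, &obj, B_FALSE, 0) == 0; )
 *		... visit allocated object "obj" ...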
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block. The contents
		 * of each slot in the block are known so it can be quickly
		 * checked. If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */