/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;

	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize, 0,
				    bonustype, bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);
				dnode_rele(dn, FTAG);
				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

/*
 * Allocate a new object and return its object number.  The _ibs and _dnsize
 * variants additionally let the caller specify the indirect block shift and
 * the dnode size; this variant uses the defaults for both.
 */
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, tx));
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, tx));
}

/*
 * Allocate a specific, caller-chosen object number.  The requested dnode
 * slots must currently be free.
 */
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);
	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Reallocate an existing object in place, giving it a new type, block size,
 * and bonus buffer while keeping its object number.
 */
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

/*
 * Free an allocated object, releasing all of its storage.
 */
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *objectp, taking into account only objects that may have been
 * modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(mos, object, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

/*
 * Free an object that may have been zapified by dmu_object_zapify(),
 * dropping the SPA_FEATURE_EXTENSIBLE_DATASET refcount if it was.
 */
void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}
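
/*
 * Illustrative sketch: how a DMU consumer in open (non-syncing) context
 * typically pairs dmu_object_alloc() with a transaction.  The function name
 * example_create_object() is hypothetical, and the choice of
 * DMU_OT_UINT64_OTHER with no bonus buffer is arbitrary; real callers pick
 * the object and bonus types they need.  Guarded out so it has no effect on
 * the build.
 */
#if 0
static uint64_t
example_create_object(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	uint64_t object;

	/* Reserve room in the transaction for a to-be-created dnode. */
	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);
		return (0);
	}

	/*
	 * blocksize 0 selects the default data block size; DMU_OT_NONE
	 * with bonuslen 0 means no bonus buffer.
	 */
	object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
	    DMU_OT_NONE, 0, tx);

	dmu_tx_commit(tx);
	return (object);
}
#endif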
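
/*
 * Illustrative sketch of the dmu_object_next() contract: starting from
 * object 0, each successful call advances *objectp to the next allocated
 * object, and the walk ends once the call returns nonzero.  The function
 * name example_count_objects() is hypothetical, and the block is guarded
 * out of the build.
 */
#if 0
static uint64_t
example_count_objects(objset_t *os)
{
	uint64_t object = 0;
	uint64_t count = 0;

	/* hole == B_FALSE asks for allocated objects rather than holes. */
	while (dmu_object_next(os, &object, B_FALSE, 0) == 0)
		count++;

	return (count);
}
#endif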