// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
uint_t dmu_object_alloc_chunk_shift = 7;
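
/*
 * Common implementation for the dmu_object_alloc*() entry points below.
 * It claims dn_slots worth of free dnode slots from this objset's
 * per-CPU chunk, refilling the chunk from the global allocator under
 * os_obj_lock when the chunk is exhausted, and returns the new object
 * number.  If allocated_dnode is non-NULL, the newly allocated dnode is
 * also returned there, held under the caller-supplied tag.
 */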
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
	    os->os_obj_next_percpu_len];

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}
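
/*
 * Thin wrappers around dmu_object_alloc_impl().  dmu_object_alloc() uses
 * the default dnode size and indirect block shift, dmu_object_alloc_ibs()
 * additionally lets the caller choose the indirect block shift, and
 * dmu_object_alloc_dnsize() lets the caller choose the dnode size (for
 * datasets using the large_dnode feature).
 */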
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx);
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx);
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}
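
/*
 * Illustrative sketch (not compiled): a consumer typically allocates a new
 * object inside an assigned transaction, for example:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
 *	    DMU_OT_NONE, 0, tx);
 *	dmu_tx_commit(tx);
 *
 * The exact hold pattern and wait-flag spelling vary between callers and
 * releases; consult existing in-tree callers for authoritative usage.
 */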
/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}
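
/*
 * dmu_object_claim*() allocate a specific, caller-chosen object number
 * rather than the next free one; the requested dnode slots must currently
 * be free.  This is used, for example, by the receive code path to recreate
 * objects with the same object numbers they had on the sending side.
 */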
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			if (i == 0)
				return (SET_ERROR(ESRCH));
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}
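
/*
 * Illustrative sketch (not compiled): dmu_object_next() is typically used
 * to walk every allocated object in an objset, for example:
 *
 *	uint64_t obj;
 *	int err;
 *	for (obj = 0, err = dmu_object_next(os, &obj, B_FALSE, 0);
 *	    err == 0; err = dmu_object_next(os, &obj, B_FALSE, 0)) {
 *		... process object "obj" ...
 *	}
 *
 * An ESRCH return indicates that no further objects were found.
 */
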
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
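
/*
 * Note: this tunable is typically exposed on Linux builds as
 * /sys/module/zfs/parameters/dmu_object_alloc_chunk_shift and on FreeBSD
 * as the vfs.zfs.dmu_object_alloc_chunk_shift sysctl; the exact path
 * depends on how the platform registers ZFS_MODULE_PARAM entries.
 */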