/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
uint_t dmu_object_alloc_chunk_shift = 7;

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
	    os->os_obj_next_percpu_len];

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold, they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
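		 * (The second P2PHASE() check below also forces a refill
		 * when an allocation of dn_slots slots would straddle the
		 * end of the current chunk.)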
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
200 */ 201 if (allocated_dnode != NULL) 202 *allocated_dnode = dn; 203 else 204 dnode_rele(dn, tag); 205 206 return (object); 207 } 208 rw_exit(&dn->dn_struct_rwlock); 209 dnode_rele(dn, tag); 210 DNODE_STAT_BUMP(dnode_alloc_race); 211 } 212 213 /* 214 * Skip to next known valid starting point on error. This 215 * is the start of the next block of dnodes. 216 */ 217 if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { 218 object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); 219 DNODE_STAT_BUMP(dnode_alloc_next_block); 220 } 221 (void) atomic_swap_64(cpuobj, object); 222 } 223 } 224 225 uint64_t 226 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, 227 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 228 { 229 return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, 230 bonuslen, 0, NULL, NULL, tx); 231 } 232 233 uint64_t 234 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, 235 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, 236 dmu_tx_t *tx) 237 { 238 return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, 239 bonustype, bonuslen, 0, NULL, NULL, tx); 240 } 241 242 uint64_t 243 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, 244 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) 245 { 246 return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, 247 bonuslen, dnodesize, NULL, NULL, tx)); 248 } 249 250 /* 251 * Allocate a new object and return a pointer to the newly allocated dnode 252 * via the allocated_dnode argument. The returned dnode will be held and 253 * the caller is responsible for releasing the hold by calling dnode_rele(). 254 */ 255 uint64_t 256 dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, 257 int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, 258 int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) 259 { 260 return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, 261 bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); 262 } 263 264 int 265 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, 266 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 267 { 268 return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, 269 bonuslen, 0, tx)); 270 } 271 272 int 273 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, 274 int blocksize, dmu_object_type_t bonustype, int bonuslen, 275 int dnodesize, dmu_tx_t *tx) 276 { 277 dnode_t *dn; 278 int dn_slots = dnodesize >> DNODE_SHIFT; 279 int err; 280 281 if (dn_slots == 0) 282 dn_slots = DNODE_MIN_SLOTS; 283 ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); 284 ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); 285 286 if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) 287 return (SET_ERROR(EBADF)); 288 289 err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, 290 FTAG, &dn); 291 if (err) 292 return (err); 293 294 dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); 295 dmu_tx_add_new_object(tx, dn); 296 297 dnode_rele(dn, FTAG); 298 299 return (0); 300 } 301 302 int 303 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, 304 int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) 305 { 306 return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, 307 bonuslen, DNODE_MIN_SIZE, B_FALSE, tx)); 308 } 309 310 int 311 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, 
int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole,
 * if 'hole' is set) after *object, taking into account only objects
 * that may have been modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			if (i == 0)
				return (SET_ERROR(ESRCH));
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

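	/*
	 * Hand the remainder of the search to dnode_next_offset(), which
	 * only considers blocks modified after the specified txg.
	 */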
	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
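
/*
 * Usage sketch: a new object must be allocated inside an assigned
 * transaction.  The object type and sizes below are arbitrary examples,
 * and error handling (dmu_tx_abort() on a failed assignment) is omitted.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		uint64_t obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER,
 *		    0, DMU_OT_NONE, 0, tx);
 *		dmu_tx_commit(tx);
 *	}
 */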