/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>

/*
 * Each of the concurrent object allocators will grab
 * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
 * grab 128 slots, which is 4 blocks worth.  This was experimentally
 * determined to be the lowest value that eliminates the measurable effect
 * of lock contention from this code path.
 */
int dmu_object_alloc_chunk_shift = 7;

static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
	int error;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off an L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	/*
	 * The caller requested the dnode be returned as a performance
	 * optimization in order to avoid releasing the hold only to
	 * immediately reacquire it.  Since the caller is responsible
	 * for releasing the hold they must provide the tag.
	 */
	if (allocated_dnode != NULL) {
		ASSERT3P(tag, !=, NULL);
	} else {
		ASSERT3P(tag, ==, NULL);
		tag = FTAG;
	}

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
		    dn_slots)) {
			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off an L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full).  Look
			 * from the beginning at most once per txg.  If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty.  This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts.  In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_rescan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find an L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * The value of (*cpuobj) before adding dn_slots is the object
		 * ID assigned to us.  The value afterwards is the object ID
		 * assigned to whoever wants to do an allocation next.
		 */
		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, tag, &dn);
		if (error == 0) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
			/*
			 * Another thread could have allocated it; check
			 * again now that we have the struct lock.
			 */
			if (dn->dn_type == DMU_OT_NONE) {
				dnode_allocate(dn, ot, blocksize,
				    indirect_blockshift, bonustype,
				    bonuslen, dn_slots, tx);
				rw_exit(&dn->dn_struct_rwlock);
				dmu_tx_add_new_object(tx, dn);

				/*
				 * Caller requested the allocated dnode be
				 * returned and is responsible for the hold.
				 */
				if (allocated_dnode != NULL)
					*allocated_dnode = dn;
				else
					dnode_rele(dn, tag);

				return (object);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, tag);
			DNODE_STAT_BUMP(dnode_alloc_race);
		}

		/*
		 * Skip to next known valid starting point on error.  This
		 * is the start of the next block of dnodes.
		 */
		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
			DNODE_STAT_BUMP(dnode_alloc_next_block);
		}
		(void) atomic_swap_64(cpuobj, object);
	}
}

uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, 0, NULL, NULL, tx));
}

uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
	    bonuslen, dnodesize, NULL, NULL, tx));
}

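/*
 * Illustrative sketch, not part of the original file: the allocation entry
 * points above assume the caller already holds an assigned transaction that
 * covers the new object.  A hypothetical caller (variable names are
 * placeholders) might follow the usual pattern:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	object = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
 *	    DMU_OT_NONE, 0, tx);
 *	dmu_tx_commit(tx);
 */
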
/*
 * Allocate a new object and return a pointer to the newly allocated dnode
 * via the allocated_dnode argument.  The returned dnode will be held and
 * the caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
    int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
	    bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
}

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, 0, tx));
}

int
dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;
	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);

	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
	dmu_tx_add_new_object(tx, dn);

	dnode_rele(dn, FTAG);

	return (0);
}

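/*
 * Illustrative note, not part of the original file: dmu_object_claim*() is
 * for callers that must materialize a specific object number rather than
 * letting the allocator choose one, e.g. when recreating a sender's objects
 * from a receive stream.  With 'object' and the dnode geometry dictated by
 * the peer (placeholder names), a caller might do:
 *
 *	error = dmu_object_claim_dnsize(os, object, ot, blocksize,
 *	    bonustype, bonuslen, dnodesize, tx);
 *	if (error != 0)
 *		return (error);
 */
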
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
	    bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	dnode_t *dn;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	int err;

	if (dn_slots == 0)
		dn_slots = DNODE_MIN_SLOTS;

	if (object == DMU_META_DNODE_OBJECT)
		return (SET_ERROR(EBADF));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
	    keep_spill, tx);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
	    FTAG, &dn);
	if (err)
		return (err);

	ASSERT(dn->dn_type != DMU_OT_NONE);
	/*
	 * If we don't create this free range, we'll leak indirect blocks when
	 * we get to freeing the dnode in syncing context.
	 */
	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
	dnode_free(dn, tx);
	dnode_rele(dn, FTAG);

	return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *objectp, taking into account only objects that may have been
 * modified after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
	uint64_t offset;
	uint64_t start_obj;
	struct dsl_dataset *ds = os->os_dsl_dataset;
	int error;

	if (*objectp == 0) {
		start_obj = 1;
	} else if (ds && dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_LARGE_DNODE)) {
		uint64_t i = *objectp + 1;
		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
		dmu_object_info_t doi;

		/*
		 * Scan through the remaining meta dnode block.  The contents
		 * of each slot in the block are known so it can be quickly
		 * checked.  If the block is exhausted without a match then
		 * hand off to dnode_next_offset() for further scanning.
		 */
		while (i <= last_obj) {
			error = dmu_object_info(os, i, &doi);
			if (error == ENOENT) {
				if (hole) {
					*objectp = i;
					return (0);
				} else {
					i++;
				}
			} else if (error == EEXIST) {
				i++;
			} else if (error == 0) {
				if (hole) {
					i += doi.doi_dnodesize >> DNODE_SHIFT;
				} else {
					*objectp = i;
					return (0);
				}
			} else {
				return (error);
			}
		}

		start_obj = i;
	} else {
		start_obj = *objectp + 1;
	}

	offset = start_obj << DNODE_SHIFT;

	error = dnode_next_offset(DMU_META_DNODE(os),
	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

	*objectp = offset >> DNODE_SHIFT;

	return (error);
}

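/*
 * Illustrative sketch, not part of the original file: dmu_object_next() is
 * typically used to walk every allocated object in an objset.  Starting the
 * cursor at 0 with hole == B_FALSE and txg == 0 visits all objects:
 *
 *	uint64_t obj;
 *	int error;
 *
 *	for (obj = 0; (error = dmu_object_next(os, &obj, B_FALSE, 0)) == 0; )
 *		visit_object(os, obj);
 *
 * where visit_object() is a placeholder for per-object work and an ESRCH
 * return simply means the walk ran past the last allocated object.
 */
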
/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
		dnode_rele(dn, FTAG);
		return;
	}
	ASSERT3U(dn->dn_type, ==, old_type);
	ASSERT0(dn->dn_maxblkid);

	/*
	 * We must initialize the ZAP data before changing the type,
	 * so that concurrent calls to *_is_zapified() can determine if
	 * the object has been completely zapified by checking the type.
	 */
	mzap_create_impl(dn, 0, 0, tx);

	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
	    DMU_OTN_ZAP_METADATA;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);

	spa_feature_incr(dmu_objset_spa(mos),
	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	dmu_object_type_t t;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
	t = dn->dn_type;
	dnode_rele(dn, FTAG);

	if (t == DMU_OTN_ZAP_METADATA) {
		spa_feature_decr(dmu_objset_spa(mos),
		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
	}
	VERIFY0(dmu_object_free(mos, object, tx));
}

EXPORT_SYMBOL(dmu_object_alloc);
EXPORT_SYMBOL(dmu_object_alloc_ibs);
EXPORT_SYMBOL(dmu_object_alloc_dnsize);
EXPORT_SYMBOL(dmu_object_alloc_hold);
EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
	"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
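
/*
 * Illustrative note, not part of the original file: since the tunable above
 * is declared ZMOD_RW, on Linux it is expected to appear as
 * /sys/module/zfs/parameters/dmu_object_alloc_chunk_shift, so the per-CPU
 * chunk size can be adjusted at runtime, e.g. (shell, as root):
 *
 *	echo 10 > /sys/module/zfs/parameters/dmu_object_alloc_chunk_shift
 *
 * The value is still clamped at allocation time to between one L0 block's
 * and one L1 block's worth of dnode slots, as described in
 * dmu_object_alloc_impl() above.
 */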