// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
#include <sys/trace_zfs.h>
#include <sys/zfs_project.h>

dnode_stats_t dnode_stats = {
	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
};

dnode_sums_t dnode_sums;

static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;

static dnode_phys_t dnode_phys_zero __maybe_unused;

int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

#ifdef _KERNEL
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */

static char *
rt_name(dnode_t *dn, const char *name)
{
	struct objset *os = dn->dn_objset;

	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
	    spa_name(os->os_spa),
	    (u_longlong_t)(os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
	    (u_longlong_t)dn->dn_object,
	    name));
}

static int
dbuf_compare(const void *x1, const void *x2)
{
	const dmu_buf_impl_t *d1 = x1;
	const dmu_buf_impl_t *d2 = x2;

	int cmp = TREE_CMP(d1->db_level, d2->db_level);
	if (likely(cmp))
		return (cmp);

	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
	if (likely(cmp))
		return (cmp);

	if (d1->db_state == DB_MARKER) {
		ASSERT3S(d2->db_state, !=, DB_MARKER);
		return (TREE_PCMP(d1->db_parent, d2));
	} else if (d2->db_state == DB_MARKER) {
		ASSERT3S(d1->db_state, !=, DB_MARKER);
		return (TREE_PCMP(d1, d2->db_parent));
	}

	if (d1->db_state == DB_SEARCH) {
		ASSERT3S(d2->db_state, !=, DB_SEARCH);
		return (-1);
	} else if (d2->db_state == DB_SEARCH) {
		ASSERT3S(d1->db_state, !=, DB_SEARCH);
		return (1);
	}

	return (TREE_PCMP(d1, d2));
}

static int
dnode_cons(void *arg, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	dnode_t *dn = arg;

	rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);

	/*
	 * Every dbuf has a reference, and dropping a tracked reference is
	 * O(number of references), so don't track dn_holds.
	 */
	zfs_refcount_create_untracked(&dn->dn_holds);
	zfs_refcount_create(&dn->dn_tx_holds);
	list_link_init(&dn->dn_link);

	memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
	memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
	memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
	memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
	memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
	memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
	memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
	memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
	memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));

	for (int i = 0; i < TXG_SIZE; i++) {
		multilist_link_init(&dn->dn_dirty_link[i]);
		dn->dn_free_ranges[i] = NULL;
		list_create(&dn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;
	dn->dn_dirtyctx = 0;
	dn->dn_dirtyctx_firstset = NULL;
	dn->dn_bonus = NULL;
	dn->dn_have_spill = B_FALSE;
	dn->dn_zio = NULL;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_dbufs_count = 0;
	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	dn->dn_moved = 0;
	return (0);
}

static void
dnode_dest(void *arg, void *unused)
{
	(void) unused;
	dnode_t *dn = arg;

	rw_destroy(&dn->dn_struct_rwlock);
	mutex_destroy(&dn->dn_mtx);
	mutex_destroy(&dn->dn_dbufs_mtx);
	cv_destroy(&dn->dn_notxholds);
	cv_destroy(&dn->dn_nodnholds);
	zfs_refcount_destroy(&dn->dn_holds);
	zfs_refcount_destroy(&dn->dn_tx_holds);
	ASSERT(!list_link_active(&dn->dn_link));

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
		list_destroy(&dn->dn_dirty_records[i]);
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
	}

	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_free_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT0(dn->dn_dirty_txg);
	ASSERT0(dn->dn_dirtyctx);
	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
	ASSERT3P(dn->dn_bonus, ==, NULL);
	ASSERT(!dn->dn_have_spill);
	ASSERT3P(dn->dn_zio, ==, NULL);
	ASSERT0(dn->dn_oldused);
	ASSERT0(dn->dn_oldflags);
	ASSERT0(dn->dn_olduid);
	ASSERT0(dn->dn_oldgid);
	ASSERT0(dn->dn_oldprojid);
	ASSERT0(dn->dn_newuid);
	ASSERT0(dn->dn_newgid);
	ASSERT0(dn->dn_newprojid);
	ASSERT0(dn->dn_id_flags);

	ASSERT0(dn->dn_dbufs_count);
	avl_destroy(&dn->dn_dbufs);
}

static int
dnode_kstats_update(kstat_t *ksp, int rw)
{
	dnode_stats_t *ds = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	ds->dnode_hold_dbuf_hold.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
	ds->dnode_hold_dbuf_read.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
	ds->dnode_hold_alloc_hits.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
	ds->dnode_hold_alloc_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
	ds->dnode_hold_alloc_interior.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
	ds->dnode_hold_alloc_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
	ds->dnode_hold_alloc_lock_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
	ds->dnode_hold_alloc_type_none.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
	ds->dnode_hold_free_hits.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_hits);
	ds->dnode_hold_free_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_misses);
	ds->dnode_hold_free_lock_misses.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
	ds->dnode_hold_free_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
	ds->dnode_hold_free_refcount.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_refcount);
	ds->dnode_hold_free_overflow.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_hold_free_overflow);
	ds->dnode_free_interior_lock_retry.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
	ds->dnode_allocate.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_allocate);
	ds->dnode_reallocate.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_reallocate);
	ds->dnode_buf_evict.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_buf_evict);
	ds->dnode_alloc_next_chunk.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
	ds->dnode_alloc_race.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_race);
	ds->dnode_alloc_next_block.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_alloc_next_block);
	ds->dnode_move_invalid.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_invalid);
	ds->dnode_move_recheck1.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_recheck1);
	ds->dnode_move_recheck2.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_recheck2);
	ds->dnode_move_special.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_special);
	ds->dnode_move_handle.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_handle);
	ds->dnode_move_rwlock.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_rwlock);
	ds->dnode_move_active.value.ui64 =
	    wmsum_value(&dnode_sums.dnode_move_active);
	return (0);
}

void
dnode_init(void)
{
	ASSERT(dnode_cache == NULL);
	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
	kmem_cache_set_move(dnode_cache, dnode_move);

	wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
	wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
	wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
	wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
	wmsum_init(&dnode_sums.dnode_allocate, 0);
	wmsum_init(&dnode_sums.dnode_reallocate, 0);
	wmsum_init(&dnode_sums.dnode_buf_evict, 0);
	wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
	wmsum_init(&dnode_sums.dnode_alloc_race, 0);
	wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
	wmsum_init(&dnode_sums.dnode_move_invalid, 0);
	wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
	wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
	wmsum_init(&dnode_sums.dnode_move_special, 0);
	wmsum_init(&dnode_sums.dnode_move_handle, 0);
	wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
	wmsum_init(&dnode_sums.dnode_move_active, 0);

	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dnode_ksp != NULL) {
		dnode_ksp->ks_data = &dnode_stats;
		dnode_ksp->ks_update = dnode_kstats_update;
		kstat_install(dnode_ksp);
	}
}

void
dnode_fini(void)
{
	if (dnode_ksp != NULL) {
		kstat_delete(dnode_ksp);
		dnode_ksp = NULL;
	}

	wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
	wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
	wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
	wmsum_fini(&dnode_sums.dnode_hold_free_hits);
	wmsum_fini(&dnode_sums.dnode_hold_free_misses);
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
	wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
	wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
	wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
	wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
	wmsum_fini(&dnode_sums.dnode_allocate);
	wmsum_fini(&dnode_sums.dnode_reallocate);
	wmsum_fini(&dnode_sums.dnode_buf_evict);
	wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
	wmsum_fini(&dnode_sums.dnode_alloc_race);
	wmsum_fini(&dnode_sums.dnode_alloc_next_block);
	wmsum_fini(&dnode_sums.dnode_move_invalid);
	wmsum_fini(&dnode_sums.dnode_move_recheck1);
	wmsum_fini(&dnode_sums.dnode_move_recheck2);
	wmsum_fini(&dnode_sums.dnode_move_special);
	wmsum_fini(&dnode_sums.dnode_move_handle);
	wmsum_fini(&dnode_sums.dnode_move_rwlock);
	wmsum_fini(&dnode_sums.dnode_move_active);

	kmem_cache_destroy(dnode_cache);
	dnode_cache = NULL;
}


#ifdef ZFS_DEBUG
void
dnode_verify(dnode_t *dn)
{
	int drop_struct_lock = FALSE;

	ASSERT(dn->dn_phys);
	ASSERT(dn->dn_objset);
	ASSERT(dn->dn_handle->dnh_dnode == dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));

	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
		return;

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
		int i;
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
		if (dn->dn_datablkshift) {
			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
		}
		ASSERT3U(dn->dn_nlevels, <=, 30);
		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
		ASSERT3U(dn->dn_nblkptr, >=, 1);
		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
		ASSERT3U(dn->dn_datablksz, ==,
		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
		    dn->dn_bonuslen, <=, max_bonuslen);
		for (i = 0; i < TXG_SIZE; i++) {
			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
		}
	}
	if (dn->dn_phys->dn_type != DMU_OT_NONE)
		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
	if (dn->dn_dbuf != NULL) {
		ASSERT3P(dn->dn_phys, ==,
		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
	}
	if (drop_struct_lock)
		rw_exit(&dn->dn_struct_rwlock);
}
#endif

void
dnode_byteswap(dnode_phys_t *dnp)
{
	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
	int i;

	if (dnp->dn_type == DMU_OT_NONE) {
		memset(dnp, 0, sizeof (dnode_phys_t));
		return;
	}

	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
	dnp->dn_used = BSWAP_64(dnp->dn_used);

	/*
	 * dn_nblkptr is only one byte, so it's OK to read it in either
	 * byte order. We can't read dn_bonuslen.
	 */
	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
		buf64[i] = BSWAP_64(buf64[i]);

	/*
	 * OK to check dn_bonuslen for zero, because it won't matter if
	 * we have the wrong byte order. This is necessary because the
	 * dnode dnode is smaller than a regular dnode.
	 */
	if (dnp->dn_bonuslen != 0) {
		dmu_object_byteswap_t byteswap;
		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
		    DN_MAX_BONUS_LEN(dnp));
	}

	/* Swap SPILL block if we have one */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}

void
dnode_buf_byteswap(void *vbuf, size_t size)
{
	int i = 0;

	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);

	while (i < size) {
		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
		dnode_byteswap(dnp);

		i += DNODE_MIN_SIZE;
		if (dnp->dn_type != DMU_OT_NONE)
			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
	}
}

void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);

	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));

	if (newsize < dn->dn_bonuslen) {
		/* clear any data after the end of the new size */
		size_t diff = dn->dn_bonuslen - newsize;
		char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
		memset(data_end, 0, diff);
	}

	dn->dn_bonuslen = newsize;
	if (newsize == 0)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
	else
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	rw_exit(&dn->dn_struct_rwlock);
}

void
dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dn->dn_bonustype = newtype;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	rw_exit(&dn->dn_struct_rwlock);
}

void
dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype)
{
	/*
	 * This is not in the dnode_phys, but it should be, and perhaps one day
	 * will. For now we require it be set after taking a hold.
	 */
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	dn->dn_storage_type = newtype;
}

void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}

static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}

static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object, dnode_handle_t *dnh)
{
	dnode_t *dn;

	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
	dn->dn_moved = 0;

	/*
	 * Defer setting dn_objset until the dnode is ready to be a candidate
	 * for the dnode_move() callback.
	 */
	dn->dn_object = object;
	dn->dn_dbuf = db;
	dn->dn_handle = dnh;
	dn->dn_phys = dnp;

	if (dnp->dn_datablkszsec) {
		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
	} else {
		dn->dn_datablksz = 0;
		dn->dn_datablkszsec = 0;
		dn->dn_datablkshift = 0;
	}
	dn->dn_indblkshift = dnp->dn_indblkshift;
	dn->dn_nlevels = dnp->dn_nlevels;
	dn->dn_type = dnp->dn_type;
	dn->dn_nblkptr = dnp->dn_nblkptr;
	dn->dn_checksum = dnp->dn_checksum;
	dn->dn_compress = dnp->dn_compress;
	dn->dn_bonustype = dnp->dn_bonustype;
	dn->dn_bonuslen = dnp->dn_bonuslen;
	dn->dn_num_slots = dnp->dn_extra_slots + 1;
	dn->dn_maxblkid = dnp->dn_maxblkid;
	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
	dn->dn_id_flags = 0;

	dn->dn_storage_type = DMU_OT_NONE;

	dmu_zfetch_init(&dn->dn_zfetch, dn);

	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));

	mutex_enter(&os->os_lock);

	/*
	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
	 * signifies that the special dnodes have no references from
	 * their children (the entries in os_dnodes). This allows
	 * dnode_destroy() to easily determine if the last child has
	 * been removed and then complete eviction of the objset.
	 */
	if (!DMU_OBJECT_IS_SPECIAL(object))
		list_insert_head(&os->os_dnodes, dn);
	membar_producer();

	/*
	 * Everything else must be valid before assigning dn_objset
	 * makes the dnode eligible for dnode_move().
	 */
	dn->dn_objset = os;

	dnh->dnh_dnode = dn;
	mutex_exit(&os->os_lock);

	arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);

	return (dn);
}

/*
 * Caller must be holding the dnode handle, which is released upon return.
 */
static void
dnode_destroy(dnode_t *dn)
{
	objset_t *os = dn->dn_objset;
	boolean_t complete_os_eviction = B_FALSE;

	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

	mutex_enter(&os->os_lock);
	POINTER_INVALIDATE(&dn->dn_objset);
	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
		list_remove(&os->os_dnodes, dn);
		complete_os_eviction =
		    list_is_empty(&os->os_dnodes) &&
		    list_link_active(&os->os_evicting_node);
	}
	mutex_exit(&os->os_lock);

	/* the dnode can no longer move, so we can release the handle */
	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
		zrl_remove(&dn->dn_handle->dnh_zrlock);

	dn->dn_allocated_txg = 0;
	dn->dn_free_txg = 0;
	dn->dn_assigned_txg = 0;
	dn->dn_dirty_txg = 0;

	dn->dn_dirtyctx = 0;
	dn->dn_dirtyctx_firstset = NULL;
	if (dn->dn_bonus != NULL) {
		mutex_enter(&dn->dn_bonus->db_mtx);
		dbuf_destroy(dn->dn_bonus);
		dn->dn_bonus = NULL;
	}
	dn->dn_zio = NULL;

	dn->dn_have_spill = B_FALSE;
	dn->dn_oldused = 0;
	dn->dn_oldflags = 0;
	dn->dn_olduid = 0;
	dn->dn_oldgid = 0;
	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	dn->dn_newuid = 0;
	dn->dn_newgid = 0;
	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
	dn->dn_id_flags = 0;

	dn->dn_storage_type = DMU_OT_NONE;

	dmu_zfetch_fini(&dn->dn_zfetch);
	kmem_cache_free(dnode_cache, dn);
	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);

	if (complete_os_eviction)
		dmu_objset_evict_done(os);
}

void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
	int i;

	ASSERT3U(dn_slots, >, 0);
	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	if (blocksize == 0)
		blocksize = 1 << zfs_default_bs;
	else
		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);

	if (ibs == 0)
		ibs = zfs_default_ibs;

	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);

	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
	    dn->dn_objset, (u_longlong_t)dn->dn_object,
	    (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
	DNODE_STAT_BUMP(dnode_allocate);

	ASSERT(dn->dn_type == DMU_OT_NONE);
	ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
	ASSERT(ot != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(ot));
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
	    (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
	ASSERT(dn->dn_type == DMU_OT_NONE);
	ASSERT0(dn->dn_maxblkid);
	ASSERT0(dn->dn_allocated_txg);
	ASSERT0(dn->dn_assigned_txg);
	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
	ASSERT(avl_is_empty(&dn->dn_dbufs));

	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT0(dn->dn_next_nblkptr[i]);
		ASSERT0(dn->dn_next_nlevels[i]);
		ASSERT0(dn->dn_next_indblkshift[i]);
		ASSERT0(dn->dn_next_bonuslen[i]);
		ASSERT0(dn->dn_next_bonustype[i]);
		ASSERT0(dn->dn_rm_spillblk[i]);
		ASSERT0(dn->dn_next_blksz[i]);
		ASSERT0(dn->dn_next_maxblkid[i]);
		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
	}

	dn->dn_type = ot;
	dnode_setdblksz(dn, blocksize);
	dn->dn_indblkshift = ibs;
	dn->dn_nlevels = 1;
	dn->dn_num_slots = dn_slots;
	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		dn->dn_nblkptr = 1;
	else {
		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	}

	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
	dn->dn_dirtyctx = 0;

	dn->dn_free_txg = 0;
	dn->dn_dirtyctx_firstset = NULL;
	dn->dn_dirty_txg = 0;

	dn->dn_allocated_txg = tx->tx_txg;
	dn->dn_id_flags = 0;

	dnode_setdirty(dn, tx);
	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}

void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
    boolean_t keep_spill, dmu_tx_t *tx)
{
	int nblkptr;

	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(blocksize, <=,
	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
	ASSERT(tx->tx_txg != 0);
	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
	    (bonustype == DMU_OT_SA && bonuslen == 0));
	ASSERT(DMU_OT_IS_VALID(bonustype));
	ASSERT3U(bonuslen, <=,
	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));

	dnode_free_interior_slots(dn);
	DNODE_STAT_BUMP(dnode_reallocate);

	/* clean up any unreferenced dbufs */
	dnode_evict_dbufs(dn);

	dn->dn_id_flags = 0;

	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_setdirty(dn, tx);
	if (dn->dn_datablksz != blocksize) {
		/* change blocksize */
		ASSERT0(dn->dn_maxblkid);
		ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
		    dnode_block_freed(dn, 0));

		dnode_setdblksz(dn, blocksize);
		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
	}
	if (dn->dn_bonuslen != bonuslen)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;

	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
		nblkptr = 1;
	else
		nblkptr = MIN(DN_MAX_NBLKPTR,
		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
		    SPA_BLKPTRSHIFT));
	if (dn->dn_bonustype != bonustype)
		dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
	if (dn->dn_nblkptr != nblkptr)
		dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
		dbuf_rm_spill(dn, tx);
		dnode_rm_spill(dn, tx);
	}

	rw_exit(&dn->dn_struct_rwlock);

	/* change type */
	dn->dn_type = ot;

	/* change bonus size and type */
	mutex_enter(&dn->dn_mtx);
	dn->dn_bonustype = bonustype;
	dn->dn_bonuslen = bonuslen;
	dn->dn_num_slots = dn_slots;
	dn->dn_nblkptr = nblkptr;
	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
	dn->dn_compress = ZIO_COMPRESS_INHERIT;
	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);

	/* fix up the bonus db_size */
	if (dn->dn_bonus) {
		dn->dn_bonus->db.db_size =
		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
	}

	dn->dn_allocated_txg = tx->tx_txg;
	mutex_exit(&dn->dn_mtx);
}

#ifdef _KERNEL
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));

	/* Copy fields. */
	ndn->dn_objset = odn->dn_objset;
	ndn->dn_object = odn->dn_object;
	ndn->dn_dbuf = odn->dn_dbuf;
	ndn->dn_handle = odn->dn_handle;
	ndn->dn_phys = odn->dn_phys;
	ndn->dn_type = odn->dn_type;
	ndn->dn_bonuslen = odn->dn_bonuslen;
	ndn->dn_bonustype = odn->dn_bonustype;
	ndn->dn_nblkptr = odn->dn_nblkptr;
	ndn->dn_checksum = odn->dn_checksum;
	ndn->dn_compress = odn->dn_compress;
	ndn->dn_nlevels = odn->dn_nlevels;
	ndn->dn_indblkshift = odn->dn_indblkshift;
	ndn->dn_datablkshift = odn->dn_datablkshift;
	ndn->dn_datablkszsec = odn->dn_datablkszsec;
	ndn->dn_datablksz = odn->dn_datablksz;
	ndn->dn_maxblkid = odn->dn_maxblkid;
	ndn->dn_num_slots = odn->dn_num_slots;
	memcpy(ndn->dn_next_type, odn->dn_next_type,
	    sizeof (odn->dn_next_type));
	memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
	    sizeof (odn->dn_next_nblkptr));
	memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
	    sizeof (odn->dn_next_nlevels));
	memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
	    sizeof (odn->dn_next_indblkshift));
	memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
	    sizeof (odn->dn_next_bonustype));
	memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
	    sizeof (odn->dn_rm_spillblk));
	memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
	    sizeof (odn->dn_next_bonuslen));
	memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
	    sizeof (odn->dn_next_blksz));
	memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
	    sizeof (odn->dn_next_maxblkid));
	for (int i = 0; i < TXG_SIZE; i++) {
		list_move_tail(&ndn->dn_dirty_records[i],
		    &odn->dn_dirty_records[i]);
	}
	memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
	    sizeof (odn->dn_free_ranges));
	ndn->dn_allocated_txg = odn->dn_allocated_txg;
	ndn->dn_free_txg = odn->dn_free_txg;
	ndn->dn_assigned_txg = odn->dn_assigned_txg;
	ndn->dn_dirty_txg = odn->dn_dirty_txg;
	ndn->dn_dirtyctx = odn->dn_dirtyctx;
	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
	ASSERT(avl_is_empty(&ndn->dn_dbufs));
	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
	ndn->dn_dbufs_count = odn->dn_dbufs_count;
	ndn->dn_bonus = odn->dn_bonus;
	ndn->dn_have_spill = odn->dn_have_spill;
	ndn->dn_zio = odn->dn_zio;
	ndn->dn_oldused = odn->dn_oldused;
	ndn->dn_oldflags = odn->dn_oldflags;
	ndn->dn_olduid = odn->dn_olduid;
	ndn->dn_oldgid = odn->dn_oldgid;
	ndn->dn_oldprojid = odn->dn_oldprojid;
	ndn->dn_newuid = odn->dn_newuid;
	ndn->dn_newgid = odn->dn_newgid;
	ndn->dn_newprojid = odn->dn_newprojid;
	ndn->dn_id_flags = odn->dn_id_flags;
	ndn->dn_storage_type = odn->dn_storage_type;
	dmu_zfetch_init(&ndn->dn_zfetch, ndn);

	/*
	 * Update back pointers. Updating the handle fixes the back pointer of
	 * every descendant dbuf as well as the bonus dbuf.
	 */
	ASSERT(ndn->dn_handle->dnh_dnode == odn);
	ndn->dn_handle->dnh_dnode = ndn;

	/*
	 * Invalidate the original dnode by clearing all of its back pointers.
	 */
	odn->dn_dbuf = NULL;
	odn->dn_handle = NULL;
	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));
	odn->dn_dbufs_count = 0;
	odn->dn_bonus = NULL;
	dmu_zfetch_fini(&odn->dn_zfetch);

	/*
	 * Set the low bit of the objset pointer to ensure that dnode_move()
	 * recognizes the dnode as invalid in any subsequent callback.
	 */
	POINTER_INVALIDATE(&odn->dn_objset);

	/*
	 * Satisfy the destructor.
	 */
	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&odn->dn_dirty_records[i],
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
		odn->dn_free_ranges[i] = NULL;
		odn->dn_next_nlevels[i] = 0;
		odn->dn_next_indblkshift[i] = 0;
		odn->dn_next_bonustype[i] = 0;
		odn->dn_rm_spillblk[i] = 0;
		odn->dn_next_bonuslen[i] = 0;
		odn->dn_next_blksz[i] = 0;
	}
	odn->dn_allocated_txg = 0;
	odn->dn_free_txg = 0;
	odn->dn_assigned_txg = 0;
	odn->dn_dirty_txg = 0;
	odn->dn_dirtyctx = 0;
	odn->dn_dirtyctx_firstset = NULL;
	odn->dn_have_spill = B_FALSE;
	odn->dn_zio = NULL;
	odn->dn_oldused = 0;
	odn->dn_oldflags = 0;
	odn->dn_olduid = 0;
	odn->dn_oldgid = 0;
	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
	odn->dn_newuid = 0;
	odn->dn_newgid = 0;
	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
	odn->dn_id_flags = 0;
	odn->dn_storage_type = DMU_OT_NONE;

	/*
	 * Mark the dnode.
	 */
	ndn->dn_moved = 1;
	odn->dn_moved = (uint8_t)-1;
}

static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	dnode_t *odn = buf, *ndn = newbuf;
	objset_t *os;
	int64_t refcount;
	uint32_t dbufs;

#ifndef USE_DNODE_HANDLE
	/*
	 * We can't move dnodes if dbufs reference them directly without
	 * using handles and respective locking. Unless USE_DNODE_HANDLE
	 * is defined the code below is only to make sure it still builds,
	 * but it should never be used, since it is unsafe.
	 */
#ifdef ZFS_DEBUG
	PANIC("dnode_move() called without USE_DNODE_HANDLE");
#endif
	return (KMEM_CBRC_NO);
#endif

	/*
	 * The dnode is on the objset's list of known dnodes if the objset
	 * pointer is valid. We set the low bit of the objset pointer when
	 * freeing the dnode to invalidate it, and the memory patterns written
	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
	 * A newly created dnode sets the objset pointer last of all to indicate
	 * that the dnode is known and in a valid state to be moved by this
	 * function.
	 */
	os = odn->dn_objset;
	if (!POINTER_IS_VALID(os)) {
		DNODE_STAT_BUMP(dnode_move_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the objset does not go away during the move.
	 */
	rw_enter(&os_lock, RW_WRITER);
	if (os != odn->dn_objset) {
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the dnode is still valid, then so is the objset. We know that no
	 * valid objset can be freed while we hold os_lock, so we can safely
	 * ensure that the objset remains in use.
	 */
	mutex_enter(&os->os_lock);

	/*
	 * Recheck the objset pointer in case the dnode was removed just before
	 * acquiring the lock.
	 */
	if (os != odn->dn_objset) {
		mutex_exit(&os->os_lock);
		rw_exit(&os_lock);
		DNODE_STAT_BUMP(dnode_move_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold os->os_lock, the dnode
	 * cannot be freed and fields within the dnode can be safely accessed.
	 * The objset listing this dnode cannot go away as long as this dnode is
	 * on its list.
	 */
	rw_exit(&os_lock);
	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_special);
		return (KMEM_CBRC_NO);
	}
	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */

	/*
	 * Lock the dnode handle to prevent the dnode from obtaining any new
	 * holds. This also prevents the descendant dbufs and the bonus dbuf
	 * from accessing the dnode, so that we can discount their holds. The
	 * handle is safe to access because we know that while the dnode cannot
	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
	 * safely move any dnode referenced only by dbufs.
	 */
	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_handle);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
	 * We need to guarantee that there is a hold for every dbuf in order to
	 * determine whether the dnode is actively referenced. Falsely matching
	 * a dbuf to an active hold would lead to an unsafe move. It's possible
	 * that a thread already having an active dnode hold is about to add a
	 * dbuf, and we can't compare hold and dbuf counts while the add is in
	 * progress.
	 */
	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_rwlock);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * A dbuf may be removed (evicted) without an active dnode hold. In that
	 * case, the dbuf count is decremented under the handle lock before the
	 * dbuf's hold is released. This order ensures that if we count the hold
	 * after the dbuf is removed but before its hold is released, we will
	 * treat the unmatched hold as active and exit safely. If we count the
	 * hold before the dbuf is removed, the hold is discounted, and the
	 * removal is blocked until the move completes.
	 */
	refcount = zfs_refcount_count(&odn->dn_holds);
	ASSERT(refcount >= 0);
	dbufs = DN_DBUFS_COUNT(odn);

	/* We can't have more dbufs than dnode holds. */
	ASSERT3U(dbufs, <=, refcount);
	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
	    uint32_t, dbufs);

	if (refcount > dbufs) {
		rw_exit(&odn->dn_struct_rwlock);
		zrl_exit(&odn->dn_handle->dnh_zrlock);
		mutex_exit(&os->os_lock);
		DNODE_STAT_BUMP(dnode_move_active);
		return (KMEM_CBRC_LATER);
	}

	rw_exit(&odn->dn_struct_rwlock);

	/*
	 * At this point we know that anyone with a hold on the dnode is not
	 * actively referencing it. The dnode is known and in a valid state to
	 * move. We're holding the locks needed to execute the critical section.
	 */
	dnode_move_impl(odn, ndn);

	list_link_replace(&odn->dn_link, &ndn->dn_link);
	/* If the dnode was safe to move, the refcount cannot have changed. */
	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
	ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
	zrl_exit(&ndn->dn_handle->dnh_zrlock);	/* handle has moved */
	mutex_exit(&os->os_lock);

	return (KMEM_CBRC_YES);
}
#endif	/* _KERNEL */

static void
dnode_slots_hold(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		zrl_add(&dnh->dnh_zrlock);
	}
}

static void
dnode_slots_rele(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		if (zrl_is_locked(&dnh->dnh_zrlock))
			zrl_exit(&dnh->dnh_zrlock);
		else
			zrl_remove(&dnh->dnh_zrlock);
	}
}

static int
dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
			for (int j = idx; j < i; j++) {
				dnh = &children->dnc_children[j];
				zrl_exit(&dnh->dnh_zrlock);
			}

			return (0);
		}
	}

	return (1);
}

static void
dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		dnh->dnh_dnode = ptr;
	}
}
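
/*
 * Illustration (editor's sketch, not in the original source): the slot
 * helpers above are used in a check-then-lock pattern by
 * dnode_hold_impl() and dnode_free_interior_slots() below, roughly:
 *
 *	dnode_slots_hold(dnc, idx, slots);
 *	... read-only checks under the shared hold ...
 *	dnode_slots_rele(dnc, idx, slots);
 *	while (!dnode_slots_tryenter(dnc, idx, slots))
 *		kpreempt(KPREEMPT_SYNC);
 *	... re-check and modify the slots exclusively ...
 *	dnode_slots_rele(dnc, idx, slots);
 */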

static boolean_t
dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	/*
	 * If all dnode slots are either already free or
	 * evictable return B_TRUE.
	 */
	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];
		dnode_t *dn = dnh->dnh_dnode;

		if (dn == DN_SLOT_FREE) {
			continue;
		} else if (DN_SLOT_IS_PTR(dn)) {
			mutex_enter(&dn->dn_mtx);
			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
			    zfs_refcount_is_zero(&dn->dn_holds) &&
			    !DNODE_IS_DIRTY(dn));
			mutex_exit(&dn->dn_mtx);

			if (!can_free)
				return (B_FALSE);
			else
				continue;
		} else {
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
	uint_t reclaimed = 0;

	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	for (int i = idx; i < idx + slots; i++) {
		dnode_handle_t *dnh = &children->dnc_children[i];

		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));

		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
			dnode_destroy(dnh->dnh_dnode);
			dnh->dnh_dnode = DN_SLOT_FREE;
			reclaimed++;
		}
	}

	return (reclaimed);
}

void
dnode_free_interior_slots(dnode_t *dn)
{
	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
	int idx = (dn->dn_object & (epb - 1)) + 1;
	int slots = dn->dn_num_slots - 1;

	if (slots == 0)
		return;

	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

	while (!dnode_slots_tryenter(children, idx, slots)) {
		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
		kpreempt(KPREEMPT_SYNC);
	}

	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
	dnode_slots_rele(children, idx, slots);
}

void
dnode_special_close(dnode_handle_t *dnh)
{
	dnode_t *dn = dnh->dnh_dnode;

	/*
	 * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
	 * zfs_refcount_remove()
	 */
	mutex_enter(&dn->dn_mtx);
	if (zfs_refcount_count(&dn->dn_holds) > 0)
		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
	mutex_exit(&dn->dn_mtx);
	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);

	ASSERT(dn->dn_dbuf == NULL ||
	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
	zrl_add(&dnh->dnh_zrlock);
	dnode_destroy(dn);	/* implicit zrl_remove() */
	zrl_destroy(&dnh->dnh_zrlock);
	dnh->dnh_dnode = NULL;
}

void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
    dnode_handle_t *dnh)
{
	dnode_t *dn;

	zrl_init(&dnh->dnh_zrlock);
	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));

	dn = dnode_create(os, dnp, NULL, object, dnh);
	DNODE_VERIFY(dn);

	zrl_exit(&dnh->dnh_zrlock);
}

static void
dnode_buf_evict_async(void *dbu)
{
	dnode_children_t *dnc = dbu;

	DNODE_STAT_BUMP(dnode_buf_evict);

	for (int i = 0; i < dnc->dnc_count; i++) {
		dnode_handle_t *dnh = &dnc->dnc_children[i];
		dnode_t *dn;

		/*
		 * The dnode handle lock guards against the dnode moving to
		 * another valid address, so there is no need here to guard
		 * against changes to or from NULL.
		 */
		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			zrl_destroy(&dnh->dnh_zrlock);
			dnh->dnh_dnode = DN_SLOT_UNINIT;
			continue;
		}

		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		/*
		 * If there are holds on this dnode, then there should
		 * be holds on the dnode's containing dbuf as well; thus
		 * it wouldn't be eligible for eviction and this function
		 * would not have been called.
		 */
		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));

		dnode_destroy(dn);	/* implicit zrl_remove() for first slot */
		zrl_destroy(&dnh->dnh_zrlock);
		dnh->dnh_dnode = DN_SLOT_UNINIT;
	}
	kmem_free(dnc, sizeof (dnode_children_t) +
	    dnc->dnc_count * sizeof (dnode_handle_t));
}

/*
 * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
 * to ensure the hole at the specified object offset is large enough to
 * hold the dnode being created. The slots parameter is also used to ensure
 * a dnode does not span multiple dnode blocks. In both of these cases, if
 * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
 * are only possible when using DNODE_MUST_BE_FREE.
 *
 * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
 * dnode_hold_impl() will check if the requested dnode is already consumed
 * as an extra dnode slot by a large dnode, in which case it returns
 * ENOENT.
 *
 * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
 * return whether the hold would succeed or not. tag and dnp should be set
 * to NULL in this case.
 *
 * errors:
 * EINVAL - Invalid object number or flags.
 * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
 * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
 *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
 *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
 * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
 *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
 * EIO    - I/O error when reading the meta dnode dbuf.
 *
 * succeeds even for free dnodes.
 */
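/*
 * Illustration (editor's sketch, not part of the original comment): an
 * ordinary lookup passes DNODE_MUST_BE_ALLOCATED with slots == 0, while
 * an allocator reserving a multi-slot dnode passes DNODE_MUST_BE_FREE
 * with the desired slot count. A typical caller looks like:
 *
 *	dnode_t *dn;
 *	int err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
 *	    FTAG, &dn);
 *	if (err == 0) {
 *		... use the held dnode ...
 *		dnode_rele(dn, FTAG);
 *	}
 */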
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
    const void *tag, dnode_t **dnp)
{
	int epb, idx, err;
	int drop_struct_lock = FALSE;
	int type;
	uint64_t blk;
	dnode_t *mdn, *dn;
	dmu_buf_impl_t *db;
	dnode_children_t *dnc;
	dnode_phys_t *dn_block;
	dnode_handle_t *dnh;

	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));

	/*
	 * If you are holding the spa config lock as writer, you shouldn't
	 * be asking the DMU to do *anything* unless it's the root pool
	 * which may require us to read from the root filesystem while
	 * holding some (not all) of the locks as writer.
	 */
	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
	    (spa_is_root(os->os_spa) &&
	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));

	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));

	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
	    object == DMU_PROJECTUSED_OBJECT) {
		if (object == DMU_USERUSED_OBJECT)
			dn = DMU_USERUSED_DNODE(os);
		else if (object == DMU_GROUPUSED_OBJECT)
			dn = DMU_GROUPUSED_DNODE(os);
		else
			dn = DMU_PROJECTUSED_DNODE(os);
		if (dn == NULL)
			return (SET_ERROR(ENOENT));
		type = dn->dn_type;
		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
			return (SET_ERROR(ENOENT));
		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
			return (SET_ERROR(EEXIST));
		DNODE_VERIFY(dn);
		/* Don't actually hold if dry run, just return 0 */
		if (!(flag & DNODE_DRY_RUN)) {
			(void) zfs_refcount_add(&dn->dn_holds, tag);
			*dnp = dn;
		}
		return (0);
	}

	if (object == 0 || object >= DN_MAX_OBJECT)
		return (SET_ERROR(EINVAL));

	mdn = DMU_META_DNODE(os);
	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);

	DNODE_VERIFY(mdn);

	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
	db = dbuf_hold(mdn, blk, FTAG);
	if (drop_struct_lock)
		rw_exit(&mdn->dn_struct_rwlock);
	if (db == NULL) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
		return (SET_ERROR(EIO));
	}

	/*
	 * We do not need to decrypt to read the dnode so it doesn't matter
	 * if we get the encrypted or decrypted version.
	 */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL |
	    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
	if (err) {
		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
		dbuf_rele(db, FTAG);
		return (err);
	}

	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
	epb = db->db.db_size >> DNODE_SHIFT;

	idx = object & (epb - 1);
	dn_block = (dnode_phys_t *)db->db.db_data;

	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
	dnc = dmu_buf_get_user(&db->db);
	dnh = NULL;
	if (dnc == NULL) {
		dnode_children_t *winner;
		int skip = 0;

		dnc = kmem_zalloc(sizeof (dnode_children_t) +
		    epb * sizeof (dnode_handle_t), KM_SLEEP);
		dnc->dnc_count = epb;
		dnh = &dnc->dnc_children[0];

		/* Initialize dnode slot status from dnode_phys_t */
		for (int i = 0; i < epb; i++) {
			zrl_init(&dnh[i].dnh_zrlock);

			if (skip) {
				skip--;
				continue;
			}

			if (dn_block[i].dn_type != DMU_OT_NONE) {
				int interior = dn_block[i].dn_extra_slots;

				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
				dnode_set_slots(dnc, i + 1, interior,
				    DN_SLOT_INTERIOR);
				skip = interior;
			} else {
				dnh[i].dnh_dnode = DN_SLOT_FREE;
				skip = 0;
			}
		}

		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
		    dnode_buf_evict_async, NULL);
		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
		if (winner != NULL) {

			for (int i = 0; i < epb; i++)
				zrl_destroy(&dnh[i].dnh_zrlock);

			kmem_free(dnc, sizeof (dnode_children_t) +
			    epb * sizeof (dnode_handle_t));
			dnc = winner;
		}
	}

	ASSERT(dnc->dnc_count == epb);

	if (flag & DNODE_MUST_BE_ALLOCATED) {
		slots = 1;

		dnode_slots_hold(dnc, idx, slots);
		dnh = &dnc->dnc_children[idx];

		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			dn = dnh->dnh_dnode;
		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		} else {
			dnode_slots_rele(dnc, idx, slots);
			while (!dnode_slots_tryenter(dnc, idx, slots)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
				kpreempt(KPREEMPT_SYNC);
			}

			/*
			 * Someone else won the race and called dnode_create()
			 * after we checked DN_SLOT_IS_PTR() above but before
			 * we acquired the lock.
			 */
			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
				dn = dnh->dnh_dnode;
			} else {
				dn = dnode_create(os, dn_block + idx, db,
				    object, dnh);
				dmu_buf_add_user_size(&db->db,
				    sizeof (dnode_t));
			}
		}

		mutex_enter(&dn->dn_mtx);
		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOENT));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
	} else if (flag & DNODE_MUST_BE_FREE) {

		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
			DNODE_STAT_BUMP(dnode_hold_free_overflow);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dnode_slots_hold(dnc, idx, slots);

		if (!dnode_check_slots_free(dnc, idx, slots)) {
			DNODE_STAT_BUMP(dnode_hold_free_misses);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		dnode_slots_rele(dnc, idx, slots);
		while (!dnode_slots_tryenter(dnc, idx, slots)) {
			DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
			kpreempt(KPREEMPT_SYNC);
		}

		if (!dnode_check_slots_free(dnc, idx, slots)) {
			DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(ENOSPC));
		}

		/*
		 * Allocated but otherwise free dnodes which would
		 * be in the interior of a multi-slot dnode need
		 * to be freed. Single slot dnodes can be safely
		 * re-purposed as a performance optimization.
		 */
		if (slots > 1) {
			uint_t reclaimed =
			    dnode_reclaim_slots(dnc, idx + 1, slots - 1);
			if (reclaimed > 0)
				dmu_buf_sub_user_size(&db->db,
				    reclaimed * sizeof (dnode_t));
		}

		dnh = &dnc->dnc_children[idx];
		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
			dn = dnh->dnh_dnode;
		} else {
			dn = dnode_create(os, dn_block + idx, db,
			    object, dnh);
			dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
		}

		mutex_enter(&dn->dn_mtx);
		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
			DNODE_STAT_BUMP(dnode_hold_free_refcount);
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (SET_ERROR(EEXIST));
		}

		/* Don't actually hold if dry run, just return 0 */
		if (flag & DNODE_DRY_RUN) {
			mutex_exit(&dn->dn_mtx);
			dnode_slots_rele(dnc, idx, slots);
			dbuf_rele(db, FTAG);
			return (0);
		}

		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
		DNODE_STAT_BUMP(dnode_hold_free_hits);
	} else {
		dbuf_rele(db, FTAG);
		return (SET_ERROR(EINVAL));
	}

	ASSERT0(dn->dn_free_txg);

	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
		dbuf_add_ref(db, dnh);

	mutex_exit(&dn->dn_mtx);

	/* Now we can rely on the hold to prevent the dnode from moving. */
*/ 1733 dnode_slots_rele(dnc, idx, slots); 1734 1735 DNODE_VERIFY(dn); 1736 ASSERT3P(dnp, !=, NULL); 1737 ASSERT3P(dn->dn_dbuf, ==, db); 1738 ASSERT3U(dn->dn_object, ==, object); 1739 dbuf_rele(db, FTAG); 1740 1741 *dnp = dn; 1742 return (0); 1743 } 1744 1745 /* 1746 * Return held dnode if the object is allocated, NULL if not. 1747 */ 1748 int 1749 dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) 1750 { 1751 return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, 1752 dnp)); 1753 } 1754 1755 /* 1756 * Can only add a reference if there is already at least one 1757 * reference on the dnode. Returns FALSE if unable to add a 1758 * new reference. 1759 */ 1760 boolean_t 1761 dnode_add_ref(dnode_t *dn, const void *tag) 1762 { 1763 mutex_enter(&dn->dn_mtx); 1764 if (zfs_refcount_is_zero(&dn->dn_holds)) { 1765 mutex_exit(&dn->dn_mtx); 1766 return (FALSE); 1767 } 1768 VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); 1769 mutex_exit(&dn->dn_mtx); 1770 return (TRUE); 1771 } 1772 1773 void 1774 dnode_rele(dnode_t *dn, const void *tag) 1775 { 1776 mutex_enter(&dn->dn_mtx); 1777 dnode_rele_and_unlock(dn, tag, B_FALSE); 1778 } 1779 1780 void 1781 dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) 1782 { 1783 uint64_t refs; 1784 /* Get while the hold prevents the dnode from moving. */ 1785 dmu_buf_impl_t *db = dn->dn_dbuf; 1786 dnode_handle_t *dnh = dn->dn_handle; 1787 1788 refs = zfs_refcount_remove(&dn->dn_holds, tag); 1789 if (refs == 0) 1790 cv_broadcast(&dn->dn_nodnholds); 1791 mutex_exit(&dn->dn_mtx); 1792 /* dnode could get destroyed at this point, so don't use it anymore */ 1793 1794 /* 1795 * It's unsafe to release the last hold on a dnode by dnode_rele() or 1796 * indirectly by dbuf_rele() while relying on the dnode handle to 1797 * prevent the dnode from moving, since releasing the last hold could 1798 * result in the dnode's parent dbuf evicting its dnode handles. For 1799 * that reason anyone calling dnode_rele() or dbuf_rele() without some 1800 * other direct or indirect hold on the dnode must first drop the dnode 1801 * handle. 1802 */ 1803 #ifdef ZFS_DEBUG 1804 ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread); 1805 #endif 1806 1807 /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ 1808 if (refs == 0 && db != NULL) { 1809 /* 1810 * Another thread could add a hold to the dnode handle in 1811 * dnode_hold_impl() while holding the parent dbuf. Since the 1812 * hold on the parent dbuf prevents the handle from being 1813 * destroyed, the hold on the handle is OK. We can't yet assert 1814 * that the handle has zero references, but that will be 1815 * asserted anyway when the handle gets destroyed. 1816 */ 1817 mutex_enter(&db->db_mtx); 1818 dbuf_rele_and_unlock(db, dnh, evicting); 1819 } 1820 } 1821 1822 /* 1823 * Test whether we can create a dnode at the specified location. 1824 */ 1825 int 1826 dnode_try_claim(objset_t *os, uint64_t object, int slots) 1827 { 1828 return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, 1829 slots, NULL, NULL)); 1830 } 1831 1832 /* 1833 * Checks if the dnode itself is dirty, or is carrying any uncommitted records. 1834 * It is important to check both conditions, as some operations (eg appending 1835 * to a file) can dirty both as a single logical unit, but they are not synced 1836 * out atomically, so checking one and not the other can result in an object 1837 * appearing to be clean mid-way through a commit. 1838 * 1839 * Do not change this lightly! 
If you get it wrong, dmu_offset_next() can 1840 * detect a hole where there is really data, leading to silent corruption. 1841 */ 1842 boolean_t 1843 dnode_is_dirty(dnode_t *dn) 1844 { 1845 mutex_enter(&dn->dn_mtx); 1846 1847 for (int i = 0; i < TXG_SIZE; i++) { 1848 if (multilist_link_active(&dn->dn_dirty_link[i]) || 1849 !list_is_empty(&dn->dn_dirty_records[i])) { 1850 mutex_exit(&dn->dn_mtx); 1851 return (B_TRUE); 1852 } 1853 } 1854 1855 mutex_exit(&dn->dn_mtx); 1856 1857 return (B_FALSE); 1858 } 1859 1860 void 1861 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) 1862 { 1863 objset_t *os = dn->dn_objset; 1864 uint64_t txg = tx->tx_txg; 1865 1866 if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { 1867 dsl_dataset_dirty(os->os_dsl_dataset, tx); 1868 return; 1869 } 1870 1871 DNODE_VERIFY(dn); 1872 1873 #ifdef ZFS_DEBUG 1874 mutex_enter(&dn->dn_mtx); 1875 ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); 1876 ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); 1877 mutex_exit(&dn->dn_mtx); 1878 #endif 1879 1880 /* 1881 * Determine old uid/gid when necessary 1882 */ 1883 dmu_objset_userquota_get_ids(dn, B_TRUE, tx); 1884 1885 multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK]; 1886 multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); 1887 1888 /* 1889 * If we are already marked dirty, we're done. 1890 */ 1891 if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { 1892 multilist_sublist_unlock(mls); 1893 return; 1894 } 1895 1896 ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || 1897 !avl_is_empty(&dn->dn_dbufs)); 1898 ASSERT(dn->dn_datablksz != 0); 1899 ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]); 1900 ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]); 1901 ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]); 1902 1903 dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", 1904 (u_longlong_t)dn->dn_object, (u_longlong_t)txg); 1905 1906 multilist_sublist_insert_head(mls, dn); 1907 1908 multilist_sublist_unlock(mls); 1909 1910 /* 1911 * The dnode maintains a hold on its containing dbuf as 1912 * long as there are holds on it. Each instantiated child 1913 * dbuf maintains a hold on the dnode. When the last child 1914 * drops its hold, the dnode will drop its hold on the 1915 * containing dbuf. We add a "dirty hold" here so that the 1916 * dnode will hang around after we finish processing its 1917 * children. 1918 */ 1919 VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); 1920 1921 (void) dbuf_dirty(dn->dn_dbuf, tx); 1922 1923 dsl_dataset_dirty(os->os_dsl_dataset, tx); 1924 } 1925 1926 void 1927 dnode_free(dnode_t *dn, dmu_tx_t *tx) 1928 { 1929 mutex_enter(&dn->dn_mtx); 1930 if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { 1931 mutex_exit(&dn->dn_mtx); 1932 return; 1933 } 1934 dn->dn_free_txg = tx->tx_txg; 1935 mutex_exit(&dn->dn_mtx); 1936 1937 dnode_setdirty(dn, tx); 1938 } 1939 1940 /* 1941 * Try to change the block size for the indicated dnode. 
This can only 1942 * succeed if there are no blocks allocated or dirty beyond first block 1943 */ 1944 int 1945 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) 1946 { 1947 dmu_buf_impl_t *db; 1948 int err; 1949 1950 ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); 1951 if (size == 0) 1952 size = SPA_MINBLOCKSIZE; 1953 else 1954 size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); 1955 1956 if (ibs == dn->dn_indblkshift) 1957 ibs = 0; 1958 1959 if (size == dn->dn_datablksz && ibs == 0) 1960 return (0); 1961 1962 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 1963 1964 /* Check for any allocated blocks beyond the first */ 1965 if (dn->dn_maxblkid != 0) 1966 goto fail; 1967 1968 mutex_enter(&dn->dn_dbufs_mtx); 1969 for (db = avl_first(&dn->dn_dbufs); db != NULL; 1970 db = AVL_NEXT(&dn->dn_dbufs, db)) { 1971 if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && 1972 db->db_blkid != DMU_SPILL_BLKID) { 1973 mutex_exit(&dn->dn_dbufs_mtx); 1974 goto fail; 1975 } 1976 } 1977 mutex_exit(&dn->dn_dbufs_mtx); 1978 1979 if (ibs && dn->dn_nlevels != 1) 1980 goto fail; 1981 1982 dnode_setdirty(dn, tx); 1983 if (size != dn->dn_datablksz) { 1984 /* resize the old block */ 1985 err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); 1986 if (err == 0) { 1987 dbuf_new_size(db, size, tx); 1988 } else if (err != ENOENT) { 1989 goto fail; 1990 } 1991 1992 dnode_setdblksz(dn, size); 1993 dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; 1994 if (db) 1995 dbuf_rele(db, FTAG); 1996 } 1997 if (ibs) { 1998 dn->dn_indblkshift = ibs; 1999 dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; 2000 } 2001 2002 rw_exit(&dn->dn_struct_rwlock); 2003 return (0); 2004 2005 fail: 2006 rw_exit(&dn->dn_struct_rwlock); 2007 return (SET_ERROR(ENOTSUP)); 2008 } 2009 2010 static void 2011 dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) 2012 { 2013 uint64_t txgoff = tx->tx_txg & TXG_MASK; 2014 int old_nlevels = dn->dn_nlevels; 2015 dmu_buf_impl_t *db; 2016 list_t *list; 2017 dbuf_dirty_record_t *new, *dr, *dr_next; 2018 2019 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2020 2021 ASSERT3U(new_nlevels, >, dn->dn_nlevels); 2022 dn->dn_nlevels = new_nlevels; 2023 2024 ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); 2025 dn->dn_next_nlevels[txgoff] = new_nlevels; 2026 2027 /* dirty the left indirects */ 2028 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); 2029 ASSERT(db != NULL); 2030 new = dbuf_dirty(db, tx); 2031 dbuf_rele(db, FTAG); 2032 2033 /* transfer the dirty records to the new indirect */ 2034 mutex_enter(&dn->dn_mtx); 2035 mutex_enter(&new->dt.di.dr_mtx); 2036 list = &dn->dn_dirty_records[txgoff]; 2037 for (dr = list_head(list); dr; dr = dr_next) { 2038 dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); 2039 2040 IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); 2041 if (dr->dr_dbuf == NULL || 2042 (dr->dr_dbuf->db_level == old_nlevels - 1 && 2043 dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 2044 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { 2045 list_remove(&dn->dn_dirty_records[txgoff], dr); 2046 list_insert_tail(&new->dt.di.dr_children, dr); 2047 dr->dr_parent = new; 2048 } 2049 } 2050 mutex_exit(&new->dt.di.dr_mtx); 2051 mutex_exit(&dn->dn_mtx); 2052 } 2053 2054 int 2055 dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) 2056 { 2057 int ret = 0; 2058 2059 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2060 2061 if (dn->dn_nlevels == nlevels) { 2062 ret = 0; 2063 goto out; 2064 } else if (nlevels < dn->dn_nlevels) { 2065 ret = SET_ERROR(EINVAL); 2066 goto out; 2067 } 
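	/*
	 * Growing: dnode_set_nlevels_impl() raises dn_nlevels and migrates
	 * any existing dirty records under the new top-level indirect block.
	 */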
2068 2069 dnode_set_nlevels_impl(dn, nlevels, tx); 2070 2071 out: 2072 rw_exit(&dn->dn_struct_rwlock); 2073 return (ret); 2074 } 2075 2076 /* read-holding callers must not rely on the lock being continuously held */ 2077 void 2078 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, 2079 boolean_t force) 2080 { 2081 int epbs, new_nlevels; 2082 uint64_t sz; 2083 2084 ASSERT(blkid != DMU_BONUS_BLKID); 2085 2086 ASSERT(have_read ? 2087 RW_READ_HELD(&dn->dn_struct_rwlock) : 2088 RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2089 2090 /* 2091 * if we have a read-lock, check to see if we need to do any work 2092 * before upgrading to a write-lock. 2093 */ 2094 if (have_read) { 2095 if (blkid <= dn->dn_maxblkid) 2096 return; 2097 2098 if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { 2099 rw_exit(&dn->dn_struct_rwlock); 2100 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2101 } 2102 } 2103 2104 /* 2105 * Raw sends (indicated by the force flag) require that we take the 2106 * given blkid even if the value is lower than the current value. 2107 */ 2108 if (!force && blkid <= dn->dn_maxblkid) 2109 goto out; 2110 2111 /* 2112 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] 2113 * to indicate that this field is set. This allows us to set the 2114 * maxblkid to 0 on an existing object in dnode_sync(). 2115 */ 2116 dn->dn_maxblkid = blkid; 2117 dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = 2118 blkid | DMU_NEXT_MAXBLKID_SET; 2119 2120 /* 2121 * Compute the number of levels necessary to support the new maxblkid. 2122 * Raw sends will ensure nlevels is set correctly for us. 2123 */ 2124 new_nlevels = 1; 2125 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2126 for (sz = dn->dn_nblkptr; 2127 sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) 2128 new_nlevels++; 2129 2130 ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS); 2131 2132 if (!force) { 2133 if (new_nlevels > dn->dn_nlevels) 2134 dnode_set_nlevels_impl(dn, new_nlevels, tx); 2135 } else { 2136 ASSERT3U(dn->dn_nlevels, >=, new_nlevels); 2137 } 2138 2139 out: 2140 if (have_read) 2141 rw_downgrade(&dn->dn_struct_rwlock); 2142 } 2143 2144 static void 2145 dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) 2146 { 2147 dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); 2148 if (db != NULL) { 2149 dmu_buf_will_dirty(&db->db, tx); 2150 dbuf_rele(db, FTAG); 2151 } 2152 } 2153 2154 /* 2155 * Dirty all the in-core level-1 dbufs in the range specified by start_blkid 2156 * and end_blkid. 2157 */ 2158 static void 2159 dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 2160 dmu_tx_t *tx) 2161 { 2162 dmu_buf_impl_t *db_search; 2163 dmu_buf_impl_t *db; 2164 avl_index_t where; 2165 2166 db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP); 2167 2168 mutex_enter(&dn->dn_dbufs_mtx); 2169 2170 db_search->db_level = 1; 2171 db_search->db_blkid = start_blkid + 1; 2172 db_search->db_state = DB_SEARCH; 2173 for (;;) { 2174 2175 db = avl_find(&dn->dn_dbufs, db_search, &where); 2176 if (db == NULL) 2177 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 2178 2179 if (db == NULL || db->db_level != 1 || 2180 db->db_blkid >= end_blkid) { 2181 break; 2182 } 2183 2184 /* 2185 * Setup the next blkid we want to search for. 2186 */ 2187 db_search->db_blkid = db->db_blkid + 1; 2188 ASSERT3U(db->db_blkid, >=, start_blkid); 2189 2190 /* 2191 * If the dbuf transitions to DB_EVICTING while we're trying 2192 * to dirty it, then we will be unable to discover it in 2193 * the dbuf hash table. 
This will result in a call to 2194 * dbuf_create() which needs to acquire the dn_dbufs_mtx 2195 * lock. To avoid a deadlock, we drop the lock before 2196 * dirtying the level-1 dbuf. 2197 */ 2198 mutex_exit(&dn->dn_dbufs_mtx); 2199 dnode_dirty_l1(dn, db->db_blkid, tx); 2200 mutex_enter(&dn->dn_dbufs_mtx); 2201 } 2202 2203 #ifdef ZFS_DEBUG 2204 /* 2205 * Walk all the in-core level-1 dbufs and verify they have been dirtied. 2206 */ 2207 db_search->db_level = 1; 2208 db_search->db_blkid = start_blkid + 1; 2209 db_search->db_state = DB_SEARCH; 2210 db = avl_find(&dn->dn_dbufs, db_search, &where); 2211 if (db == NULL) 2212 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 2213 for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { 2214 if (db->db_level != 1 || db->db_blkid >= end_blkid) 2215 break; 2216 if (db->db_state != DB_EVICTING) 2217 ASSERT(db->db_dirtycnt > 0); 2218 } 2219 #endif 2220 kmem_free(db_search, sizeof (dmu_buf_impl_t)); 2221 mutex_exit(&dn->dn_dbufs_mtx); 2222 } 2223 2224 void 2225 dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) 2226 { 2227 /* 2228 * Don't set dirtyctx to SYNC if we're just modifying this as we 2229 * initialize the objset. 2230 */ 2231 if (dn->dn_dirtyctx == DN_UNDIRTIED) { 2232 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 2233 2234 if (ds != NULL) { 2235 rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); 2236 } 2237 if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 2238 if (dmu_tx_is_syncing(tx)) 2239 dn->dn_dirtyctx = DN_DIRTY_SYNC; 2240 else 2241 dn->dn_dirtyctx = DN_DIRTY_OPEN; 2242 dn->dn_dirtyctx_firstset = tag; 2243 } 2244 if (ds != NULL) { 2245 rrw_exit(&ds->ds_bp_rwlock, tag); 2246 } 2247 } 2248 } 2249 2250 static void 2251 dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, 2252 dmu_tx_t *tx) 2253 { 2254 dmu_buf_impl_t *db; 2255 int res; 2256 2257 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2258 res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, 2259 FTAG, &db); 2260 rw_exit(&dn->dn_struct_rwlock); 2261 if (res == 0) { 2262 db_lock_type_t dblt; 2263 boolean_t dirty; 2264 2265 dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); 2266 /* don't dirty if not on disk and not dirty */ 2267 dirty = !list_is_empty(&db->db_dirty_records) || 2268 (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); 2269 dmu_buf_unlock_parent(db, dblt, FTAG); 2270 if (dirty) { 2271 caddr_t data; 2272 2273 dmu_buf_will_dirty(&db->db, tx); 2274 data = db->db.db_data; 2275 memset(data + blkoff, 0, len); 2276 } 2277 dbuf_rele(db, FTAG); 2278 } 2279 } 2280 2281 void 2282 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) 2283 { 2284 uint64_t blkoff, blkid, nblks; 2285 int blksz, blkshift, head, tail; 2286 int trunc = FALSE; 2287 int epbs; 2288 2289 blksz = dn->dn_datablksz; 2290 blkshift = dn->dn_datablkshift; 2291 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2292 2293 if (len == DMU_OBJECT_END) { 2294 len = UINT64_MAX - off; 2295 trunc = TRUE; 2296 } 2297 2298 /* 2299 * First, block align the region to free: 2300 */ 2301 if (ISP2(blksz)) { 2302 head = P2NPHASE(off, blksz); 2303 blkoff = P2PHASE(off, blksz); 2304 if ((off >> blkshift) > dn->dn_maxblkid) 2305 return; 2306 } else { 2307 ASSERT(dn->dn_maxblkid == 0); 2308 if (off == 0 && len >= blksz) { 2309 /* 2310 * Freeing the whole block; fast-track this request. 
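			 * A non-power-of-two block size implies the object
			 * has a single data block (dn_maxblkid is 0), so
			 * block 0 covers the entire range being freed.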
2311 */ 2312 blkid = 0; 2313 nblks = 1; 2314 if (dn->dn_nlevels > 1) { 2315 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2316 dnode_dirty_l1(dn, 0, tx); 2317 rw_exit(&dn->dn_struct_rwlock); 2318 } 2319 goto done; 2320 } else if (off >= blksz) { 2321 /* Freeing past end-of-data */ 2322 return; 2323 } else { 2324 /* Freeing part of the block. */ 2325 head = blksz - off; 2326 ASSERT3U(head, >, 0); 2327 } 2328 blkoff = off; 2329 } 2330 /* zero out any partial block data at the start of the range */ 2331 if (head) { 2332 ASSERT3U(blkoff + head, ==, blksz); 2333 if (len < head) 2334 head = len; 2335 dnode_partial_zero(dn, off, blkoff, head, tx); 2336 off += head; 2337 len -= head; 2338 } 2339 2340 /* If the range was less than one block, we're done */ 2341 if (len == 0) 2342 return; 2343 2344 /* If the remaining range is past end of file, we're done */ 2345 if ((off >> blkshift) > dn->dn_maxblkid) 2346 return; 2347 2348 ASSERT(ISP2(blksz)); 2349 if (trunc) 2350 tail = 0; 2351 else 2352 tail = P2PHASE(len, blksz); 2353 2354 ASSERT0(P2PHASE(off, blksz)); 2355 /* zero out any partial block data at the end of the range */ 2356 if (tail) { 2357 if (len < tail) 2358 tail = len; 2359 dnode_partial_zero(dn, off + len, 0, tail, tx); 2360 len -= tail; 2361 } 2362 2363 /* If the range did not include a full block, we are done */ 2364 if (len == 0) 2365 return; 2366 2367 ASSERT(IS_P2ALIGNED(off, blksz)); 2368 ASSERT(trunc || IS_P2ALIGNED(len, blksz)); 2369 blkid = off >> blkshift; 2370 nblks = len >> blkshift; 2371 if (trunc) 2372 nblks += 1; 2373 2374 /* 2375 * Dirty all the indirect blocks in this range. Note that only 2376 * the first and last indirect blocks can actually be written 2377 * (if they were partially freed) -- they must be dirtied, even if 2378 * they do not exist on disk yet. The interior blocks will 2379 * be freed by free_children(), so they will not actually be written. 2380 * Even though these interior blocks will not be written, we 2381 * dirty them for two reasons: 2382 * 2383 * - It ensures that the indirect blocks remain in memory until 2384 * syncing context. (They have already been prefetched by 2385 * dmu_tx_hold_free(), so we don't have to worry about reading 2386 * them serially here.) 2387 * 2388 * - The dirty space accounting will put pressure on the txg sync 2389 * mechanism to begin syncing, and to delay transactions if there 2390 * is a large amount of freeing. Even though these indirect 2391 * blocks will not be written, we could need to write the same 2392 * amount of space if we copy the freed BPs into deadlists. 2393 */ 2394 if (dn->dn_nlevels > 1) { 2395 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2396 uint64_t first, last; 2397 2398 first = blkid >> epbs; 2399 dnode_dirty_l1(dn, first, tx); 2400 if (trunc) 2401 last = dn->dn_maxblkid >> epbs; 2402 else 2403 last = (blkid + nblks - 1) >> epbs; 2404 if (last != first) 2405 dnode_dirty_l1(dn, last, tx); 2406 2407 dnode_dirty_l1range(dn, first, last, tx); 2408 2409 int shift = dn->dn_datablkshift + dn->dn_indblkshift - 2410 SPA_BLKPTRSHIFT; 2411 for (uint64_t i = first + 1; i < last; i++) { 2412 /* 2413 * Set i to the blockid of the next non-hole 2414 * level-1 indirect block at or after i. Note 2415 * that dnode_next_offset() operates in terms of 2416 * level-0-equivalent bytes. 
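			 * For example, with 128K data blocks and 128K
			 * indirect blocks (1024 block pointers each), shift
			 * is 17 + 17 - 7 = 27, so each level-1 indirect
			 * block spans 128M of level-0 file offset.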
2417 */ 2418 uint64_t ibyte = i << shift; 2419 int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, 2420 &ibyte, 2, 1, 0); 2421 i = ibyte >> shift; 2422 if (i >= last) 2423 break; 2424 2425 /* 2426 * Normally we should not see an error, either 2427 * from dnode_next_offset() or dbuf_hold_level() 2428 * (except for ESRCH from dnode_next_offset). 2429 * If there is an i/o error, then when we read 2430 * this block in syncing context, it will use 2431 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according 2432 * to the "failmode" property. dnode_next_offset() 2433 * doesn't have a flag to indicate MUSTSUCCEED. 2434 */ 2435 if (err != 0) 2436 break; 2437 2438 dnode_dirty_l1(dn, i, tx); 2439 } 2440 rw_exit(&dn->dn_struct_rwlock); 2441 } 2442 2443 done: 2444 /* 2445 * Add this range to the dnode range list. 2446 * We will finish up this free operation in the syncing phase. 2447 */ 2448 mutex_enter(&dn->dn_mtx); 2449 { 2450 int txgoff = tx->tx_txg & TXG_MASK; 2451 if (dn->dn_free_ranges[txgoff] == NULL) { 2452 dn->dn_free_ranges[txgoff] = 2453 zfs_range_tree_create_flags( 2454 NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 2455 ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges")); 2456 } 2457 zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); 2458 zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); 2459 } 2460 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", 2461 (u_longlong_t)blkid, (u_longlong_t)nblks, 2462 (u_longlong_t)tx->tx_txg); 2463 mutex_exit(&dn->dn_mtx); 2464 2465 dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); 2466 dnode_setdirty(dn, tx); 2467 } 2468 2469 static boolean_t 2470 dnode_spill_freed(dnode_t *dn) 2471 { 2472 int i; 2473 2474 mutex_enter(&dn->dn_mtx); 2475 for (i = 0; i < TXG_SIZE; i++) { 2476 if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) 2477 break; 2478 } 2479 mutex_exit(&dn->dn_mtx); 2480 return (i < TXG_SIZE); 2481 } 2482 2483 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ 2484 uint64_t 2485 dnode_block_freed(dnode_t *dn, uint64_t blkid) 2486 { 2487 int i; 2488 2489 if (blkid == DMU_BONUS_BLKID) 2490 return (FALSE); 2491 2492 if (dn->dn_free_txg) 2493 return (TRUE); 2494 2495 if (blkid == DMU_SPILL_BLKID) 2496 return (dnode_spill_freed(dn)); 2497 2498 mutex_enter(&dn->dn_mtx); 2499 for (i = 0; i < TXG_SIZE; i++) { 2500 if (dn->dn_free_ranges[i] != NULL && 2501 zfs_range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) 2502 break; 2503 } 2504 mutex_exit(&dn->dn_mtx); 2505 return (i < TXG_SIZE); 2506 } 2507 2508 /* call from syncing context when we actually write/free space for this dnode */ 2509 void 2510 dnode_diduse_space(dnode_t *dn, int64_t delta) 2511 { 2512 uint64_t space; 2513 dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", 2514 dn, dn->dn_phys, 2515 (u_longlong_t)dn->dn_phys->dn_used, 2516 (longlong_t)delta); 2517 2518 mutex_enter(&dn->dn_mtx); 2519 space = DN_USED_BYTES(dn->dn_phys); 2520 if (delta > 0) { 2521 ASSERT3U(space + delta, >=, space); /* no overflow */ 2522 } else { 2523 ASSERT3U(space, >=, -delta); /* no underflow */ 2524 } 2525 space += delta; 2526 if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { 2527 ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); 2528 ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT)); 2529 dn->dn_phys->dn_used = space >> DEV_BSHIFT; 2530 } else { 2531 dn->dn_phys->dn_used = space; 2532 dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; 2533 } 2534 mutex_exit(&dn->dn_mtx); 2535 } 2536 2537 /* 2538 * Scans a block at the indicated "level" looking for a hole 
or data, depending on 'flags'.
 *
 * If level > 0, then we are scanning an indirect block looking at its
 * pointers.  If level == 0, then we are looking at a block of dnodes.
 *
 * If we don't find what we are looking for in the block, we return ESRCH.
 * Otherwise, return with *offset pointing to the beginning (if searching
 * forwards) or end (if searching backwards) of the range covered by the
 * block pointer we matched on (or dnode).
 *
 * The basic search algorithm used below by dnode_next_offset() is to
 * use this function to search up the block tree (widen the search) until
 * we find something (i.e., we don't return ESRCH) and then search back
 * down the tree (narrow the search) until we reach our original search
 * level.
 */
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
    int lvl, uint64_t blkfill, uint64_t txg)
{
	dmu_buf_impl_t *db = NULL;
	void *data = NULL;
	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t epb = 1ULL << epbs;
	uint64_t minfill, maxfill;
	boolean_t hole;
	int i, inc, error, span;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	hole = ((flags & DNODE_FIND_HOLE) != 0);
	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
	ASSERT(txg == 0 || !hole);

	if (lvl == dn->dn_phys->dn_nlevels) {
		error = 0;
		epb = dn->dn_phys->dn_nblkptr;
		data = dn->dn_phys->dn_blkptr;
		if (dn->dn_dbuf != NULL)
			rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
			rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
			    RW_READER, FTAG);
	} else {
		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
		if (error) {
			if (error != ENOENT)
				return (error);
			if (hole)
				return (0);
			/*
			 * This can only happen when we are searching up
			 * the block tree for data.  We don't really need to
			 * adjust the offset, as we will just end up looking
			 * at the pointer to this block in its parent, and it's
			 * going to be unallocated, so we will skip over it.
			 */
			return (SET_ERROR(ESRCH));
		}
		error = dbuf_read(db, NULL,
		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
		    DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
		if (error) {
			dbuf_rele(db, FTAG);
			return (error);
		}
		data = db->db.db_data;
		rw_enter(&db->db_rwlock, RW_READER);
	}

	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
	    BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
	    BP_IS_HOLE(db->db_blkptr))) {
		/*
		 * This can only happen when we are searching up the tree
		 * and these conditions mean that we need to keep climbing.
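		 * Nothing under this block can have been born after 'txg',
		 * so report ESRCH and let dnode_next_offset() widen the
		 * search one level up.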
		 */
		error = SET_ERROR(ESRCH);
	} else if (lvl == 0) {
		dnode_phys_t *dnp = data;

		ASSERT(dn->dn_type == DMU_OT_DNODE);
		ASSERT(!(flags & DNODE_FIND_BACKWARDS));

		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
				break;
		}

		if (i == blkfill)
			error = SET_ERROR(ESRCH);

		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
		    (i << DNODE_SHIFT);
	} else {
		blkptr_t *bp = data;
		uint64_t start = *offset;
		span = (lvl - 1) * epbs + dn->dn_datablkshift;
		minfill = 0;
		maxfill = blkfill << ((lvl - 1) * epbs);

		if (hole)
			maxfill--;
		else
			minfill++;

		if (span >= 8 * sizeof (*offset)) {
			/* This only happens on the highest indirection level */
			ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
			*offset = 0;
		} else {
			*offset = *offset >> span;
		}

		for (i = BF64_GET(*offset, 0, epbs);
		    i >= 0 && i < epb; i += inc) {
			if (BP_GET_FILL(&bp[i]) >= minfill &&
			    BP_GET_FILL(&bp[i]) <= maxfill &&
			    (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
				break;
			if (inc > 0 || *offset > 0)
				*offset += inc;
		}

		if (span >= 8 * sizeof (*offset)) {
			*offset = start;
		} else {
			*offset = *offset << span;
		}

		if (inc < 0) {
			/* traversing backwards; position offset at the end */
			if (span < 8 * sizeof (*offset))
				*offset = MIN(*offset + (1ULL << span) - 1,
				    start);
		} else if (*offset < start) {
			*offset = start;
		}
		if (i < 0 || i >= epb)
			error = SET_ERROR(ESRCH);
	}

	if (db != NULL) {
		rw_exit(&db->db_rwlock);
		dbuf_rele(db, FTAG);
	} else {
		if (dn->dn_dbuf != NULL)
			rw_exit(&dn->dn_dbuf->db_rwlock);
		else if (dmu_objset_ds(dn->dn_objset) != NULL)
			rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
			    FTAG);
	}

	return (error);
}

/*
 * Find the next hole, data, or sparse region at or after *offset.
 * The value 'blkfill' tells us how many items we expect to find
 * in an L0 data block; this value is 1 for normal objects,
 * DNODES_PER_BLOCK for the meta dnode, and some fraction of
 * DNODES_PER_BLOCK when searching for sparse regions thereof.
 *
 * Examples:
 *
 * dnode_next_offset(dn, flags, offset, 1, 1, 0);
 *	Finds the next/previous hole/data in a file.
 *	Used in dmu_offset_next().
 *
 * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
 *	Finds the next free/allocated dnode in an objset's meta-dnode.
 *	Only finds objects that have new contents since txg (i.e.,
 *	bonus buffer changes and content removal are ignored).
 *	Used in dmu_object_next().
 *
 * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
 *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
 *	Used in dmu_object_alloc().
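 *
 * In all of the above, a return value of 0 means a match was found and
 * *offset has been updated to it; ESRCH means no match exists in the
 * requested direction, and any other error indicates a block could not
 * be read.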
2719 */ 2720 int 2721 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, 2722 int minlvl, uint64_t blkfill, uint64_t txg) 2723 { 2724 uint64_t initial_offset = *offset; 2725 int lvl, maxlvl; 2726 int error = 0; 2727 2728 if (!(flags & DNODE_FIND_HAVELOCK)) 2729 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2730 2731 if (dn->dn_phys->dn_nlevels == 0) { 2732 error = SET_ERROR(ESRCH); 2733 goto out; 2734 } 2735 2736 if (dn->dn_datablkshift == 0) { 2737 if (*offset < dn->dn_datablksz) { 2738 if (flags & DNODE_FIND_HOLE) 2739 *offset = dn->dn_datablksz; 2740 } else { 2741 error = SET_ERROR(ESRCH); 2742 } 2743 goto out; 2744 } 2745 2746 maxlvl = dn->dn_phys->dn_nlevels; 2747 2748 for (lvl = minlvl; lvl <= maxlvl; lvl++) { 2749 error = dnode_next_offset_level(dn, 2750 flags, offset, lvl, blkfill, txg); 2751 if (error != ESRCH) 2752 break; 2753 } 2754 2755 while (error == 0 && --lvl >= minlvl) { 2756 error = dnode_next_offset_level(dn, 2757 flags, offset, lvl, blkfill, txg); 2758 } 2759 2760 /* 2761 * There's always a "virtual hole" at the end of the object, even 2762 * if all BP's which physically exist are non-holes. 2763 */ 2764 if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && 2765 minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { 2766 error = 0; 2767 } 2768 2769 if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 2770 initial_offset < *offset : initial_offset > *offset)) 2771 error = SET_ERROR(ESRCH); 2772 out: 2773 if (!(flags & DNODE_FIND_HAVELOCK)) 2774 rw_exit(&dn->dn_struct_rwlock); 2775 2776 return (error); 2777 } 2778 2779 #if defined(_KERNEL) 2780 EXPORT_SYMBOL(dnode_hold); 2781 EXPORT_SYMBOL(dnode_rele); 2782 EXPORT_SYMBOL(dnode_set_nlevels); 2783 EXPORT_SYMBOL(dnode_set_blksz); 2784 EXPORT_SYMBOL(dnode_free_range); 2785 EXPORT_SYMBOL(dnode_evict_dbufs); 2786 EXPORT_SYMBOL(dnode_evict_bonus); 2787 #endif 2788 2789 ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW, 2790 "Default dnode block shift"); 2791 ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW, 2792 "Default dnode indirect block shift"); 2793
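
/*
 * Typical caller pattern for the hold interface above (an illustrative
 * sketch only, not code from this file):
 *
 *	dnode_t *dn;
 *	int err = dnode_hold(os, object, FTAG, &dn);
 *	if (err != 0)
 *		return (err);
 *	... operate on the object through dn ...
 *	dnode_rele(dn, FTAG);
 */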