xref: /illumos-gate/usr/src/uts/common/fs/zfs/dnode.c (revision ddb365bfc9e868ad24ccdcb0dc91af18b10df082)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  * Copyright 2017 RackTop Systems.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/dbuf.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_impl.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dmu_objset.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/spa.h>
39 #include <sys/zio.h>
40 #include <sys/dmu_zfetch.h>
41 #include <sys/range_tree.h>
42 #include <sys/zfs_project.h>
43 
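/*
 * Kernel statistics describing dnode hold, allocation, free, and move
 * activity.  These counters are bumped via DNODE_STAT_BUMP() throughout
 * this file and exported as the "dnodestats" kstat (see dnode_init()).
 */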
44 dnode_stats_t dnode_stats = {
45 	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
46 	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
47 	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
48 	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
49 	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
50 	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
51 	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
52 	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
53 	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
54 	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
55 	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
56 	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
57 	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
58 	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
59 	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
60 	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
61 	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
62 	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
63 	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
64 	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
65 	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
66 	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
67 	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
68 	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
69 	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
70 	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
71 	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
72 	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
73 };
74 
75 static kstat_t *dnode_ksp;
76 static kmem_cache_t *dnode_cache;
77 
78 static dnode_phys_t dnode_phys_zero;
79 
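/*
 * Default data block shift and indirect block shift, used by
 * dnode_allocate() when the caller does not supply a block size or
 * indirect block shift of its own.
 */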
80 int zfs_default_bs = SPA_MINBLOCKSHIFT;
81 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
82 
83 #ifdef	_KERNEL
84 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
85 #endif	/* _KERNEL */
86 
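/*
 * Comparator for the per-dnode AVL tree of dbufs (dn_dbufs): dbufs are
 * ordered by level, then by block id.  A dbuf in DB_SEARCH state is a
 * search sentinel and sorts before any real dbuf with the same level
 * and block id.
 */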
87 static int
88 dbuf_compare(const void *x1, const void *x2)
89 {
90 	const dmu_buf_impl_t *d1 = x1;
91 	const dmu_buf_impl_t *d2 = x2;
92 
93 	int cmp = TREE_CMP(d1->db_level, d2->db_level);
94 	if (likely(cmp))
95 		return (cmp);
96 
97 	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
98 	if (likely(cmp))
99 		return (cmp);
100 
101 	if (d1->db_state == DB_SEARCH) {
102 		ASSERT3S(d2->db_state, !=, DB_SEARCH);
103 		return (-1);
104 	} else if (d2->db_state == DB_SEARCH) {
105 		ASSERT3S(d1->db_state, !=, DB_SEARCH);
106 		return (1);
107 	}
108 
109 	return (TREE_PCMP(d1, d2));
110 }
111 
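/*
 * kmem cache constructor and destructor for dnode_t.  The constructor
 * initializes locks, lists, and per-txg state; the destructor asserts
 * that the dnode has been fully quiesced before its memory is returned
 * to the cache.
 */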
112 /* ARGSUSED */
113 static int
114 dnode_cons(void *arg, void *unused, int kmflag)
115 {
116 	dnode_t *dn = arg;
117 	int i;
118 
119 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
120 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
121 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
122 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
123 	cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
124 
125 	/*
126 	 * Every dbuf has a reference, and dropping a tracked reference is
127 	 * O(number of references), so don't track dn_holds.
128 	 */
129 	zfs_refcount_create_untracked(&dn->dn_holds);
130 	zfs_refcount_create(&dn->dn_tx_holds);
131 	list_link_init(&dn->dn_link);
132 
133 	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
134 	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
135 	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
136 	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
137 	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
138 	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
139 	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
140 	bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
141 
142 	for (i = 0; i < TXG_SIZE; i++) {
143 		multilist_link_init(&dn->dn_dirty_link[i]);
144 		dn->dn_free_ranges[i] = NULL;
145 		list_create(&dn->dn_dirty_records[i],
146 		    sizeof (dbuf_dirty_record_t),
147 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
148 	}
149 
150 	dn->dn_allocated_txg = 0;
151 	dn->dn_free_txg = 0;
152 	dn->dn_assigned_txg = 0;
153 	dn->dn_dirty_txg = 0;
154 	dn->dn_dirtyctx = 0;
155 	dn->dn_dirtyctx_firstset = NULL;
156 	dn->dn_bonus = NULL;
157 	dn->dn_have_spill = B_FALSE;
158 	dn->dn_zio = NULL;
159 	dn->dn_oldused = 0;
160 	dn->dn_oldflags = 0;
161 	dn->dn_olduid = 0;
162 	dn->dn_oldgid = 0;
163 	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
164 	dn->dn_newuid = 0;
165 	dn->dn_newgid = 0;
166 	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
167 	dn->dn_id_flags = 0;
168 
169 	dn->dn_dbufs_count = 0;
170 	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
171 	    offsetof(dmu_buf_impl_t, db_link));
172 
173 	dn->dn_moved = 0;
174 	return (0);
175 }
176 
177 /* ARGSUSED */
178 static void
179 dnode_dest(void *arg, void *unused)
180 {
181 	int i;
182 	dnode_t *dn = arg;
183 
184 	rw_destroy(&dn->dn_struct_rwlock);
185 	mutex_destroy(&dn->dn_mtx);
186 	mutex_destroy(&dn->dn_dbufs_mtx);
187 	cv_destroy(&dn->dn_notxholds);
188 	cv_destroy(&dn->dn_nodnholds);
189 	zfs_refcount_destroy(&dn->dn_holds);
190 	zfs_refcount_destroy(&dn->dn_tx_holds);
191 	ASSERT(!list_link_active(&dn->dn_link));
192 
193 	for (i = 0; i < TXG_SIZE; i++) {
194 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
195 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
196 		list_destroy(&dn->dn_dirty_records[i]);
197 		ASSERT0(dn->dn_next_nblkptr[i]);
198 		ASSERT0(dn->dn_next_nlevels[i]);
199 		ASSERT0(dn->dn_next_indblkshift[i]);
200 		ASSERT0(dn->dn_next_bonustype[i]);
201 		ASSERT0(dn->dn_rm_spillblk[i]);
202 		ASSERT0(dn->dn_next_bonuslen[i]);
203 		ASSERT0(dn->dn_next_blksz[i]);
204 		ASSERT0(dn->dn_next_maxblkid[i]);
205 	}
206 
207 	ASSERT0(dn->dn_allocated_txg);
208 	ASSERT0(dn->dn_free_txg);
209 	ASSERT0(dn->dn_assigned_txg);
210 	ASSERT0(dn->dn_dirty_txg);
211 	ASSERT0(dn->dn_dirtyctx);
212 	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
213 	ASSERT3P(dn->dn_bonus, ==, NULL);
214 	ASSERT(!dn->dn_have_spill);
215 	ASSERT3P(dn->dn_zio, ==, NULL);
216 	ASSERT0(dn->dn_oldused);
217 	ASSERT0(dn->dn_oldflags);
218 	ASSERT0(dn->dn_olduid);
219 	ASSERT0(dn->dn_oldgid);
220 	ASSERT0(dn->dn_oldprojid);
221 	ASSERT0(dn->dn_newuid);
222 	ASSERT0(dn->dn_newgid);
223 	ASSERT0(dn->dn_newprojid);
224 	ASSERT0(dn->dn_id_flags);
225 
226 	ASSERT0(dn->dn_dbufs_count);
227 	avl_destroy(&dn->dn_dbufs);
228 }
229 
230 void
231 dnode_init(void)
232 {
233 	ASSERT(dnode_cache == NULL);
234 	dnode_cache = kmem_cache_create("dnode_t",
235 	    sizeof (dnode_t),
236 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
237 #ifdef	_KERNEL
238 	kmem_cache_set_move(dnode_cache, dnode_move);
239 
240 	dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
241 	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
242 	    KSTAT_FLAG_VIRTUAL);
243 	if (dnode_ksp != NULL) {
244 		dnode_ksp->ks_data = &dnode_stats;
245 		kstat_install(dnode_ksp);
246 	}
247 #endif	/* _KERNEL */
248 }
249 
250 void
251 dnode_fini(void)
252 {
253 	if (dnode_ksp != NULL) {
254 		kstat_delete(dnode_ksp);
255 		dnode_ksp = NULL;
256 	}
257 
258 	kmem_cache_destroy(dnode_cache);
259 	dnode_cache = NULL;
260 }
261 
262 
263 #ifdef ZFS_DEBUG
264 void
265 dnode_verify(dnode_t *dn)
266 {
267 	int drop_struct_lock = FALSE;
268 
269 	ASSERT(dn->dn_phys);
270 	ASSERT(dn->dn_objset);
271 	ASSERT(dn->dn_handle->dnh_dnode == dn);
272 
273 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
274 
275 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
276 		return;
277 
278 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
279 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
280 		drop_struct_lock = TRUE;
281 	}
282 	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
283 		int i;
284 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
285 		ASSERT3U(dn->dn_indblkshift, >=, 0);
286 		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
287 		if (dn->dn_datablkshift) {
288 			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
289 			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
290 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
291 		}
292 		ASSERT3U(dn->dn_nlevels, <=, 30);
293 		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
294 		ASSERT3U(dn->dn_nblkptr, >=, 1);
295 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
296 		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
297 		ASSERT3U(dn->dn_datablksz, ==,
298 		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
299 		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
300 		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
301 		    dn->dn_bonuslen, <=, max_bonuslen);
302 		for (i = 0; i < TXG_SIZE; i++) {
303 			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
304 		}
305 	}
306 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
307 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
308 	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
309 	if (dn->dn_dbuf != NULL) {
310 		ASSERT3P(dn->dn_phys, ==,
311 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
312 		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
313 	}
314 	if (drop_struct_lock)
315 		rw_exit(&dn->dn_struct_rwlock);
316 }
317 #endif
318 
319 void
320 dnode_byteswap(dnode_phys_t *dnp)
321 {
322 	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
323 	int i;
324 
325 	if (dnp->dn_type == DMU_OT_NONE) {
326 		bzero(dnp, sizeof (dnode_phys_t));
327 		return;
328 	}
329 
330 	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
331 	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
332 	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
333 	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
334 	dnp->dn_used = BSWAP_64(dnp->dn_used);
335 
336 	/*
337 	 * dn_nblkptr is only one byte, so it's OK to read it in either
338 	 * byte order.  We can't read dn_bouslen.
339 	 */
340 	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
341 	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
342 	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
343 		buf64[i] = BSWAP_64(buf64[i]);
344 
345 	/*
346 	 * OK to check dn_bonuslen for zero, because it won't matter if
347 	 * we have the wrong byte order.  This is necessary because the
348 	 * dnode dnode is smaller than a regular dnode.
349 	 */
350 	if (dnp->dn_bonuslen != 0) {
351 		dmu_object_byteswap_t byteswap;
352 		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
353 		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
354 		dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
355 		    DN_MAX_BONUS_LEN(dnp));
356 	}
357 
358 	/* Swap SPILL block if we have one */
359 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
360 		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
361 
362 }
363 
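/*
 * Byteswap a buffer of on-disk dnodes, stepping by DNODE_MIN_SIZE and
 * skipping the extra slots consumed by large dnodes (dn_extra_slots).
 */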
364 void
365 dnode_buf_byteswap(void *vbuf, size_t size)
366 {
367 	int i = 0;
368 
369 	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
370 	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
371 
372 	while (i < size) {
373 		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
374 		dnode_byteswap(dnp);
375 
376 		i += DNODE_MIN_SIZE;
377 		if (dnp->dn_type != DMU_OT_NONE)
378 			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
379 	}
380 }
381 
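/*
 * Change the size of this dnode's bonus buffer.  The new length is
 * recorded in dn_next_bonuslen[] so it is synced out with the dnode in
 * this txg; DN_ZERO_BONUSLEN stands in for a length of zero.
 */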
382 void
383 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
384 {
385 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
386 
387 	dnode_setdirty(dn, tx);
388 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
389 	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
390 	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
391 	dn->dn_bonuslen = newsize;
392 	if (newsize == 0)
393 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
394 	else
395 		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
396 	rw_exit(&dn->dn_struct_rwlock);
397 }
398 
399 void
400 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
401 {
402 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
403 	dnode_setdirty(dn, tx);
404 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
405 	dn->dn_bonustype = newtype;
406 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
407 	rw_exit(&dn->dn_struct_rwlock);
408 }
409 
410 void
411 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
412 {
413 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
414 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
415 	dnode_setdirty(dn, tx);
416 	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
417 	dn->dn_have_spill = B_FALSE;
418 }
419 
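/*
 * Set the dnode's data block size, keeping dn_datablkszsec (the size in
 * SPA_MINBLOCKSIZE sectors) and dn_datablkshift (zero when the size is
 * not a power of two) consistent with it.
 */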
420 static void
421 dnode_setdblksz(dnode_t *dn, int size)
422 {
423 	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
424 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
425 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
426 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
427 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
428 	dn->dn_datablksz = size;
429 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
430 	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
431 }
432 
433 static dnode_t *
434 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
435     uint64_t object, dnode_handle_t *dnh)
436 {
437 	dnode_t *dn;
438 
439 	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
440 #ifdef _KERNEL
441 	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
442 #endif /* _KERNEL */
443 	dn->dn_moved = 0;
444 
445 	/*
446 	 * Defer setting dn_objset until the dnode is ready to be a candidate
447 	 * for the dnode_move() callback.
448 	 */
449 	dn->dn_object = object;
450 	dn->dn_dbuf = db;
451 	dn->dn_handle = dnh;
452 	dn->dn_phys = dnp;
453 
454 	if (dnp->dn_datablkszsec) {
455 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
456 	} else {
457 		dn->dn_datablksz = 0;
458 		dn->dn_datablkszsec = 0;
459 		dn->dn_datablkshift = 0;
460 	}
461 	dn->dn_indblkshift = dnp->dn_indblkshift;
462 	dn->dn_nlevels = dnp->dn_nlevels;
463 	dn->dn_type = dnp->dn_type;
464 	dn->dn_nblkptr = dnp->dn_nblkptr;
465 	dn->dn_checksum = dnp->dn_checksum;
466 	dn->dn_compress = dnp->dn_compress;
467 	dn->dn_bonustype = dnp->dn_bonustype;
468 	dn->dn_bonuslen = dnp->dn_bonuslen;
469 	dn->dn_num_slots = dnp->dn_extra_slots + 1;
470 	dn->dn_maxblkid = dnp->dn_maxblkid;
471 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
472 	dn->dn_id_flags = 0;
473 
474 	dmu_zfetch_init(&dn->dn_zfetch, dn);
475 
476 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
477 	ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
478 	ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
479 
480 	mutex_enter(&os->os_lock);
481 
482 	/*
483 	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
484 	 * signifies that the special dnodes have no references from
485 	 * their children (the entries in os_dnodes).  This allows
486 	 * dnode_destroy() to easily determine if the last child has
487 	 * been removed and then complete eviction of the objset.
488 	 */
489 	if (!DMU_OBJECT_IS_SPECIAL(object))
490 		list_insert_head(&os->os_dnodes, dn);
491 	membar_producer();
492 
493 	/*
494 	 * Everything else must be valid before assigning dn_objset
495 	 * makes the dnode eligible for dnode_move().
496 	 */
497 	dn->dn_objset = os;
498 
499 	dnh->dnh_dnode = dn;
500 	mutex_exit(&os->os_lock);
501 
502 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
503 
504 	return (dn);
505 }
506 
507 /*
508  * Caller must be holding the dnode handle, which is released upon return.
509  */
510 static void
511 dnode_destroy(dnode_t *dn)
512 {
513 	objset_t *os = dn->dn_objset;
514 	boolean_t complete_os_eviction = B_FALSE;
515 
516 	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
517 
518 	mutex_enter(&os->os_lock);
519 	POINTER_INVALIDATE(&dn->dn_objset);
520 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
521 		list_remove(&os->os_dnodes, dn);
522 		complete_os_eviction =
523 		    list_is_empty(&os->os_dnodes) &&
524 		    list_link_active(&os->os_evicting_node);
525 	}
526 	mutex_exit(&os->os_lock);
527 
528 	/* the dnode can no longer move, so we can release the handle */
529 	if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
530 		zrl_remove(&dn->dn_handle->dnh_zrlock);
531 
532 	dn->dn_allocated_txg = 0;
533 	dn->dn_free_txg = 0;
534 	dn->dn_assigned_txg = 0;
535 	dn->dn_dirty_txg = 0;
536 
537 	dn->dn_dirtyctx = 0;
538 	if (dn->dn_dirtyctx_firstset != NULL) {
539 		kmem_free(dn->dn_dirtyctx_firstset, 1);
540 		dn->dn_dirtyctx_firstset = NULL;
541 	}
542 	if (dn->dn_bonus != NULL) {
543 		mutex_enter(&dn->dn_bonus->db_mtx);
544 		dbuf_destroy(dn->dn_bonus);
545 		dn->dn_bonus = NULL;
546 	}
547 	dn->dn_zio = NULL;
548 
549 	dn->dn_have_spill = B_FALSE;
550 	dn->dn_oldused = 0;
551 	dn->dn_oldflags = 0;
552 	dn->dn_olduid = 0;
553 	dn->dn_oldgid = 0;
554 	dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
555 	dn->dn_newuid = 0;
556 	dn->dn_newgid = 0;
557 	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
558 	dn->dn_id_flags = 0;
559 
560 	dmu_zfetch_fini(&dn->dn_zfetch);
561 	kmem_cache_free(dnode_cache, dn);
562 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
563 
564 	if (complete_os_eviction)
565 		dmu_objset_evict_done(os);
566 }
567 
568 void
569 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
570     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
571 {
572 	int i;
573 
574 	ASSERT3U(dn_slots, >, 0);
575 	ASSERT3U(dn_slots << DNODE_SHIFT, <=,
576 	    spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
577 	ASSERT3U(blocksize, <=,
578 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
579 	if (blocksize == 0)
580 		blocksize = 1 << zfs_default_bs;
581 	else
582 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
583 
584 	if (ibs == 0)
585 		ibs = zfs_default_ibs;
586 
587 	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
588 
589 	dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
590 	    " blocksize=%d ibs=%d dn_slots=%d\n",
591 	    dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
592 	DNODE_STAT_BUMP(dnode_allocate);
593 
594 	ASSERT(dn->dn_type == DMU_OT_NONE);
595 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
596 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
597 	ASSERT(ot != DMU_OT_NONE);
598 	ASSERT(DMU_OT_IS_VALID(ot));
599 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
600 	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
601 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
602 	ASSERT(DMU_OT_IS_VALID(bonustype));
603 	ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
604 	ASSERT(dn->dn_type == DMU_OT_NONE);
605 	ASSERT0(dn->dn_maxblkid);
606 	ASSERT0(dn->dn_allocated_txg);
607 	ASSERT0(dn->dn_dirty_txg);
608 	ASSERT0(dn->dn_assigned_txg);
609 	ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
610 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
611 	ASSERT(avl_is_empty(&dn->dn_dbufs));
612 
613 	for (i = 0; i < TXG_SIZE; i++) {
614 		ASSERT0(dn->dn_next_nblkptr[i]);
615 		ASSERT0(dn->dn_next_nlevels[i]);
616 		ASSERT0(dn->dn_next_indblkshift[i]);
617 		ASSERT0(dn->dn_next_bonuslen[i]);
618 		ASSERT0(dn->dn_next_bonustype[i]);
619 		ASSERT0(dn->dn_rm_spillblk[i]);
620 		ASSERT0(dn->dn_next_blksz[i]);
621 		ASSERT0(dn->dn_next_maxblkid[i]);
622 		ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
623 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
624 		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
625 	}
626 
627 	dn->dn_type = ot;
628 	dnode_setdblksz(dn, blocksize);
629 	dn->dn_indblkshift = ibs;
630 	dn->dn_nlevels = 1;
631 	dn->dn_num_slots = dn_slots;
632 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
633 		dn->dn_nblkptr = 1;
634 	else {
635 		dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
636 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
637 		    SPA_BLKPTRSHIFT));
638 	}
639 
640 	dn->dn_bonustype = bonustype;
641 	dn->dn_bonuslen = bonuslen;
642 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
643 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
644 	dn->dn_dirtyctx = 0;
645 
646 	dn->dn_free_txg = 0;
647 	if (dn->dn_dirtyctx_firstset) {
648 		kmem_free(dn->dn_dirtyctx_firstset, 1);
649 		dn->dn_dirtyctx_firstset = NULL;
650 	}
651 
652 	dn->dn_allocated_txg = tx->tx_txg;
653 	dn->dn_id_flags = 0;
654 
655 	dnode_setdirty(dn, tx);
656 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
657 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
658 	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
659 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
660 }
661 
662 void
663 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
664     dmu_object_type_t bonustype, int bonuslen, int dn_slots,
665     boolean_t keep_spill, dmu_tx_t *tx)
666 {
667 	int nblkptr;
668 
669 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
670 	ASSERT3U(blocksize, <=,
671 	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
672 	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
673 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
674 	ASSERT(tx->tx_txg != 0);
675 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
676 	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
677 	    (bonustype == DMU_OT_SA && bonuslen == 0));
678 	ASSERT(DMU_OT_IS_VALID(bonustype));
679 	ASSERT3U(bonuslen, <=,
680 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
681 	ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
682 
683 	dnode_free_interior_slots(dn);
684 	DNODE_STAT_BUMP(dnode_reallocate);
685 
686 	/* clean up any unreferenced dbufs */
687 	dnode_evict_dbufs(dn);
688 
689 	dn->dn_id_flags = 0;
690 
691 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
692 	dnode_setdirty(dn, tx);
693 	if (dn->dn_datablksz != blocksize) {
694 		/* change blocksize */
695 		ASSERT(dn->dn_maxblkid == 0 &&
696 		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
697 		    dnode_block_freed(dn, 0)));
698 		dnode_setdblksz(dn, blocksize);
699 		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
700 	}
701 	if (dn->dn_bonuslen != bonuslen)
702 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
703 
704 	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
705 		nblkptr = 1;
706 	else
707 		nblkptr = MIN(DN_MAX_NBLKPTR,
708 		    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
709 		    SPA_BLKPTRSHIFT));
710 	if (dn->dn_bonustype != bonustype)
711 		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
712 	if (dn->dn_nblkptr != nblkptr)
713 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
714 	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
715 		dbuf_rm_spill(dn, tx);
716 		dnode_rm_spill(dn, tx);
717 	}
718 	rw_exit(&dn->dn_struct_rwlock);
719 
720 	/* change type */
721 	dn->dn_type = ot;
722 
723 	/* change bonus size and type */
724 	mutex_enter(&dn->dn_mtx);
725 	dn->dn_bonustype = bonustype;
726 	dn->dn_bonuslen = bonuslen;
727 	dn->dn_num_slots = dn_slots;
728 	dn->dn_nblkptr = nblkptr;
729 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
730 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
731 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
732 
733 	/* fix up the bonus db_size */
734 	if (dn->dn_bonus) {
735 		dn->dn_bonus->db.db_size =
736 		    DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
737 		    (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
738 		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
739 	}
740 
741 	dn->dn_allocated_txg = tx->tx_txg;
742 	mutex_exit(&dn->dn_mtx);
743 }
744 
745 #ifdef	_KERNEL
746 static void
747 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
748 {
749 	int i;
750 
751 	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
752 	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
753 	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
754 	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
755 
756 	/* Copy fields. */
757 	ndn->dn_objset = odn->dn_objset;
758 	ndn->dn_object = odn->dn_object;
759 	ndn->dn_dbuf = odn->dn_dbuf;
760 	ndn->dn_handle = odn->dn_handle;
761 	ndn->dn_phys = odn->dn_phys;
762 	ndn->dn_type = odn->dn_type;
763 	ndn->dn_bonuslen = odn->dn_bonuslen;
764 	ndn->dn_bonustype = odn->dn_bonustype;
765 	ndn->dn_nblkptr = odn->dn_nblkptr;
766 	ndn->dn_checksum = odn->dn_checksum;
767 	ndn->dn_compress = odn->dn_compress;
768 	ndn->dn_nlevels = odn->dn_nlevels;
769 	ndn->dn_indblkshift = odn->dn_indblkshift;
770 	ndn->dn_datablkshift = odn->dn_datablkshift;
771 	ndn->dn_datablkszsec = odn->dn_datablkszsec;
772 	ndn->dn_datablksz = odn->dn_datablksz;
773 	ndn->dn_maxblkid = odn->dn_maxblkid;
774 	ndn->dn_num_slots = odn->dn_num_slots;
775 	bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
776 	    sizeof (odn->dn_next_type));
777 	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
778 	    sizeof (odn->dn_next_nblkptr));
779 	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
780 	    sizeof (odn->dn_next_nlevels));
781 	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
782 	    sizeof (odn->dn_next_indblkshift));
783 	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
784 	    sizeof (odn->dn_next_bonustype));
785 	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
786 	    sizeof (odn->dn_rm_spillblk));
787 	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
788 	    sizeof (odn->dn_next_bonuslen));
789 	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
790 	    sizeof (odn->dn_next_blksz));
791 	bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
792 	    sizeof (odn->dn_next_maxblkid));
793 	for (i = 0; i < TXG_SIZE; i++) {
794 		list_move_tail(&ndn->dn_dirty_records[i],
795 		    &odn->dn_dirty_records[i]);
796 	}
797 	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
798 	    sizeof (odn->dn_free_ranges));
799 	ndn->dn_allocated_txg = odn->dn_allocated_txg;
800 	ndn->dn_free_txg = odn->dn_free_txg;
801 	ndn->dn_assigned_txg = odn->dn_assigned_txg;
802 	ndn->dn_dirty_txg = odn->dn_dirty_txg;
803 	ndn->dn_dirtyctx = odn->dn_dirtyctx;
804 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
805 	ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
806 	zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
807 	ASSERT(avl_is_empty(&ndn->dn_dbufs));
808 	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
809 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
810 	ndn->dn_bonus = odn->dn_bonus;
811 	ndn->dn_have_spill = odn->dn_have_spill;
812 	ndn->dn_zio = odn->dn_zio;
813 	ndn->dn_oldused = odn->dn_oldused;
814 	ndn->dn_oldflags = odn->dn_oldflags;
815 	ndn->dn_olduid = odn->dn_olduid;
816 	ndn->dn_oldgid = odn->dn_oldgid;
817 	ndn->dn_oldprojid = odn->dn_oldprojid;
818 	ndn->dn_newuid = odn->dn_newuid;
819 	ndn->dn_newgid = odn->dn_newgid;
820 	ndn->dn_newprojid = odn->dn_newprojid;
821 	ndn->dn_id_flags = odn->dn_id_flags;
822 	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
823 	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
824 	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
825 
826 	/*
827 	 * Update back pointers. Updating the handle fixes the back pointer of
828 	 * every descendant dbuf as well as the bonus dbuf.
829 	 */
830 	ASSERT(ndn->dn_handle->dnh_dnode == odn);
831 	ndn->dn_handle->dnh_dnode = ndn;
832 	if (ndn->dn_zfetch.zf_dnode == odn) {
833 		ndn->dn_zfetch.zf_dnode = ndn;
834 	}
835 
836 	/*
837 	 * Invalidate the original dnode by clearing all of its back pointers.
838 	 */
839 	odn->dn_dbuf = NULL;
840 	odn->dn_handle = NULL;
841 	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
842 	    offsetof(dmu_buf_impl_t, db_link));
843 	odn->dn_dbufs_count = 0;
844 	odn->dn_bonus = NULL;
845 	odn->dn_zfetch.zf_dnode = NULL;
846 
847 	/*
848 	 * Set the low bit of the objset pointer to ensure that dnode_move()
849 	 * recognizes the dnode as invalid in any subsequent callback.
850 	 */
851 	POINTER_INVALIDATE(&odn->dn_objset);
852 
853 	/*
854 	 * Satisfy the destructor.
855 	 */
856 	for (i = 0; i < TXG_SIZE; i++) {
857 		list_create(&odn->dn_dirty_records[i],
858 		    sizeof (dbuf_dirty_record_t),
859 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
860 		odn->dn_free_ranges[i] = NULL;
861 		odn->dn_next_nlevels[i] = 0;
862 		odn->dn_next_indblkshift[i] = 0;
863 		odn->dn_next_bonustype[i] = 0;
864 		odn->dn_rm_spillblk[i] = 0;
865 		odn->dn_next_bonuslen[i] = 0;
866 		odn->dn_next_blksz[i] = 0;
867 	}
868 	odn->dn_allocated_txg = 0;
869 	odn->dn_free_txg = 0;
870 	odn->dn_assigned_txg = 0;
871 	odn->dn_dirty_txg = 0;
872 	odn->dn_dirtyctx = 0;
873 	odn->dn_dirtyctx_firstset = NULL;
874 	odn->dn_have_spill = B_FALSE;
875 	odn->dn_zio = NULL;
876 	odn->dn_oldused = 0;
877 	odn->dn_oldflags = 0;
878 	odn->dn_olduid = 0;
879 	odn->dn_oldgid = 0;
880 	odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
881 	odn->dn_newuid = 0;
882 	odn->dn_newgid = 0;
883 	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
884 	odn->dn_id_flags = 0;
885 
886 	/*
887 	 * Mark the dnode.
888 	 */
889 	ndn->dn_moved = 1;
890 	odn->dn_moved = (uint8_t)-1;
891 }
892 
893 /*ARGSUSED*/
894 static kmem_cbrc_t
895 dnode_move(void *buf, void *newbuf, size_t size, void *arg)
896 {
897 	dnode_t *odn = buf, *ndn = newbuf;
898 	objset_t *os;
899 	int64_t refcount;
900 	uint32_t dbufs;
901 
902 	/*
903 	 * The dnode is on the objset's list of known dnodes if the objset
904 	 * pointer is valid. We set the low bit of the objset pointer when
905 	 * freeing the dnode to invalidate it, and the memory patterns written
906 	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
907 	 * A newly created dnode sets the objset pointer last of all to indicate
908 	 * that the dnode is known and in a valid state to be moved by this
909 	 * function.
910 	 */
911 	os = odn->dn_objset;
912 	if (!POINTER_IS_VALID(os)) {
913 		DNODE_STAT_BUMP(dnode_move_invalid);
914 		return (KMEM_CBRC_DONT_KNOW);
915 	}
916 
917 	/*
918 	 * Ensure that the objset does not go away during the move.
919 	 */
920 	rw_enter(&os_lock, RW_WRITER);
921 	if (os != odn->dn_objset) {
922 		rw_exit(&os_lock);
923 		DNODE_STAT_BUMP(dnode_move_recheck1);
924 		return (KMEM_CBRC_DONT_KNOW);
925 	}
926 
927 	/*
928 	 * If the dnode is still valid, then so is the objset. We know that no
929 	 * valid objset can be freed while we hold os_lock, so we can safely
930 	 * ensure that the objset remains in use.
931 	 */
932 	mutex_enter(&os->os_lock);
933 
934 	/*
935 	 * Recheck the objset pointer in case the dnode was removed just before
936 	 * acquiring the lock.
937 	 */
938 	if (os != odn->dn_objset) {
939 		mutex_exit(&os->os_lock);
940 		rw_exit(&os_lock);
941 		DNODE_STAT_BUMP(dnode_move_recheck2);
942 		return (KMEM_CBRC_DONT_KNOW);
943 	}
944 
945 	/*
946 	 * At this point we know that as long as we hold os->os_lock, the dnode
947 	 * cannot be freed and fields within the dnode can be safely accessed.
948 	 * The objset listing this dnode cannot go away as long as this dnode is
949 	 * on its list.
950 	 */
951 	rw_exit(&os_lock);
952 	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
953 		mutex_exit(&os->os_lock);
954 		DNODE_STAT_BUMP(dnode_move_special);
955 		return (KMEM_CBRC_NO);
956 	}
957 	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
958 
959 	/*
960 	 * Lock the dnode handle to prevent the dnode from obtaining any new
961 	 * holds. This also prevents the descendant dbufs and the bonus dbuf
962 	 * from accessing the dnode, so that we can discount their holds. The
963 	 * handle is safe to access because we know that while the dnode cannot
964 	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
965 	 * safely move any dnode referenced only by dbufs.
966 	 */
967 	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
968 		mutex_exit(&os->os_lock);
969 		DNODE_STAT_BUMP(dnode_move_handle);
970 		return (KMEM_CBRC_LATER);
971 	}
972 
973 	/*
974 	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
975 	 * We need to guarantee that there is a hold for every dbuf in order to
976 	 * determine whether the dnode is actively referenced. Falsely matching
977 	 * a dbuf to an active hold would lead to an unsafe move. It's possible
978 	 * that a thread already having an active dnode hold is about to add a
979 	 * dbuf, and we can't compare hold and dbuf counts while the add is in
980 	 * progress.
981 	 */
982 	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
983 		zrl_exit(&odn->dn_handle->dnh_zrlock);
984 		mutex_exit(&os->os_lock);
985 		DNODE_STAT_BUMP(dnode_move_rwlock);
986 		return (KMEM_CBRC_LATER);
987 	}
988 
989 	/*
990 	 * A dbuf may be removed (evicted) without an active dnode hold. In that
991 	 * case, the dbuf count is decremented under the handle lock before the
992 	 * dbuf's hold is released. This order ensures that if we count the hold
993 	 * after the dbuf is removed but before its hold is released, we will
994 	 * treat the unmatched hold as active and exit safely. If we count the
995 	 * hold before the dbuf is removed, the hold is discounted, and the
996 	 * removal is blocked until the move completes.
997 	 */
998 	refcount = zfs_refcount_count(&odn->dn_holds);
999 	ASSERT(refcount >= 0);
1000 	dbufs = odn->dn_dbufs_count;
1001 
1002 	/* We can't have more dbufs than dnode holds. */
1003 	ASSERT3U(dbufs, <=, refcount);
1004 	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
1005 	    uint32_t, dbufs);
1006 
1007 	if (refcount > dbufs) {
1008 		rw_exit(&odn->dn_struct_rwlock);
1009 		zrl_exit(&odn->dn_handle->dnh_zrlock);
1010 		mutex_exit(&os->os_lock);
1011 		DNODE_STAT_BUMP(dnode_move_active);
1012 		return (KMEM_CBRC_LATER);
1013 	}
1014 
1015 	rw_exit(&odn->dn_struct_rwlock);
1016 
1017 	/*
1018 	 * At this point we know that anyone with a hold on the dnode is not
1019 	 * actively referencing it. The dnode is known and in a valid state to
1020 	 * move. We're holding the locks needed to execute the critical section.
1021 	 */
1022 	dnode_move_impl(odn, ndn);
1023 
1024 	list_link_replace(&odn->dn_link, &ndn->dn_link);
1025 	/* If the dnode was safe to move, the refcount cannot have changed. */
1026 	ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
1027 	ASSERT(dbufs == ndn->dn_dbufs_count);
1028 	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
1029 	mutex_exit(&os->os_lock);
1030 
1031 	return (KMEM_CBRC_YES);
1032 }
1033 #endif	/* _KERNEL */
1034 
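/*
 * Helpers for manipulating a run of dnode slot handles within a single
 * dnode block: dnode_slots_hold()/dnode_slots_rele() add and drop
 * nonexclusive holds on each handle, dnode_slots_tryenter() attempts to
 * lock every handle exclusively (backing out if any attempt fails), and
 * dnode_set_slots() points each handle at the given value (typically
 * one of the DN_SLOT_* sentinels).
 */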
1035 static void
1036 dnode_slots_hold(dnode_children_t *children, int idx, int slots)
1037 {
1038 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1039 
1040 	for (int i = idx; i < idx + slots; i++) {
1041 		dnode_handle_t *dnh = &children->dnc_children[i];
1042 		zrl_add(&dnh->dnh_zrlock);
1043 	}
1044 }
1045 
1046 static void
1047 dnode_slots_rele(dnode_children_t *children, int idx, int slots)
1048 {
1049 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1050 
1051 	for (int i = idx; i < idx + slots; i++) {
1052 		dnode_handle_t *dnh = &children->dnc_children[i];
1053 
1054 		if (zrl_is_locked(&dnh->dnh_zrlock))
1055 			zrl_exit(&dnh->dnh_zrlock);
1056 		else
1057 			zrl_remove(&dnh->dnh_zrlock);
1058 	}
1059 }
1060 
1061 static int
1062 dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
1063 {
1064 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1065 
1066 	for (int i = idx; i < idx + slots; i++) {
1067 		dnode_handle_t *dnh = &children->dnc_children[i];
1068 
1069 		if (!zrl_tryenter(&dnh->dnh_zrlock)) {
1070 			for (int j = idx; j < i; j++) {
1071 				dnh = &children->dnc_children[j];
1072 				zrl_exit(&dnh->dnh_zrlock);
1073 			}
1074 
1075 			return (0);
1076 		}
1077 	}
1078 
1079 	return (1);
1080 }
1081 
1082 static void
1083 dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
1084 {
1085 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1086 
1087 	for (int i = idx; i < idx + slots; i++) {
1088 		dnode_handle_t *dnh = &children->dnc_children[i];
1089 		dnh->dnh_dnode = ptr;
1090 	}
1091 }
1092 
1093 static boolean_t
1094 dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
1095 {
1096 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1097 
1098 	/*
1099 	 * If all dnode slots are either already free or
1100 	 * evictable, return B_TRUE.
1101 	 */
1102 	for (int i = idx; i < idx + slots; i++) {
1103 		dnode_handle_t *dnh = &children->dnc_children[i];
1104 		dnode_t *dn = dnh->dnh_dnode;
1105 
1106 		if (dn == DN_SLOT_FREE) {
1107 			continue;
1108 		} else if (DN_SLOT_IS_PTR(dn)) {
1109 			mutex_enter(&dn->dn_mtx);
1110 			boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
1111 			    zfs_refcount_is_zero(&dn->dn_holds) &&
1112 			    !DNODE_IS_DIRTY(dn));
1113 			mutex_exit(&dn->dn_mtx);
1114 
1115 			if (!can_free)
1116 				return (B_FALSE);
1117 			else
1118 				continue;
1119 		} else {
1120 			return (B_FALSE);
1121 		}
1122 	}
1123 
1124 	return (B_TRUE);
1125 }
1126 
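/*
 * Destroy any instantiated but otherwise unused dnode_t's in the given
 * slot range and mark those slots DN_SLOT_FREE so they can be reused.
 * The caller must already hold the slot locks via dnode_slots_tryenter().
 */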
1127 static void
1128 dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
1129 {
1130 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1131 
1132 	for (int i = idx; i < idx + slots; i++) {
1133 		dnode_handle_t *dnh = &children->dnc_children[i];
1134 
1135 		ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
1136 
1137 		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1138 			ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
1139 			dnode_destroy(dnh->dnh_dnode);
1140 			dnh->dnh_dnode = DN_SLOT_FREE;
1141 		}
1142 	}
1143 }
1144 
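/*
 * Release the interior slots consumed by a multi-slot dnode, marking
 * them DN_SLOT_FREE in the parent dnode block's handle array.
 */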
1145 void
1146 dnode_free_interior_slots(dnode_t *dn)
1147 {
1148 	dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
1149 	int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
1150 	int idx = (dn->dn_object & (epb - 1)) + 1;
1151 	int slots = dn->dn_num_slots - 1;
1152 
1153 	if (slots == 0)
1154 		return;
1155 
1156 	ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
1157 
1158 	while (!dnode_slots_tryenter(children, idx, slots))
1159 		DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
1160 
1161 	dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
1162 	dnode_slots_rele(children, idx, slots);
1163 }
1164 
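/*
 * The "special" dnodes (the meta dnode and the user/group/project used
 * accounting dnodes) are embedded in the objset rather than stored in a
 * dnode block, so their handles are opened and closed directly by
 * dnode_special_open() and dnode_special_close().
 */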
1165 void
1166 dnode_special_close(dnode_handle_t *dnh)
1167 {
1168 	dnode_t *dn = dnh->dnh_dnode;
1169 
1170 	/*
1171 	 * Ensure dnode_rele_and_unlock() has released dn_mtx after the final
1172 	 * zfs_refcount_remove().
1173 	 */
1174 	mutex_enter(&dn->dn_mtx);
1175 	if (zfs_refcount_count(&dn->dn_holds) > 0)
1176 		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
1177 	mutex_exit(&dn->dn_mtx);
1178 	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
1179 
1180 	ASSERT(dn->dn_dbuf == NULL ||
1181 	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1182 	zrl_add(&dnh->dnh_zrlock);
1183 	dnode_destroy(dn); /* implicit zrl_remove() */
1184 	zrl_destroy(&dnh->dnh_zrlock);
1185 	dnh->dnh_dnode = NULL;
1186 }
1187 
1188 void
1189 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1190     dnode_handle_t *dnh)
1191 {
1192 	dnode_t *dn;
1193 
1194 	zrl_init(&dnh->dnh_zrlock);
1195 	VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
1196 
1197 	dn = dnode_create(os, dnp, NULL, object, dnh);
1198 	DNODE_VERIFY(dn);
1199 
1200 	zrl_exit(&dnh->dnh_zrlock);
1201 }
1202 
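/*
 * Asynchronous dbuf-user eviction callback for a block of dnodes,
 * registered via dmu_buf_init_user() in dnode_hold_impl().  Destroys any
 * instantiated child dnodes and frees the dnode_children_t itself.
 */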
1203 static void
1204 dnode_buf_evict_async(void *dbu)
1205 {
1206 	dnode_children_t *dnc = dbu;
1207 
1208 	DNODE_STAT_BUMP(dnode_buf_evict);
1209 
1210 	for (int i = 0; i < dnc->dnc_count; i++) {
1211 		dnode_handle_t *dnh = &dnc->dnc_children[i];
1212 		dnode_t *dn;
1213 
1214 		/*
1215 		 * The dnode handle lock guards against the dnode moving to
1216 		 * another valid address, so there is no need here to guard
1217 		 * against changes to or from NULL.
1218 		 */
1219 		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1220 			zrl_destroy(&dnh->dnh_zrlock);
1221 			dnh->dnh_dnode = DN_SLOT_UNINIT;
1222 			continue;
1223 		}
1224 
1225 		zrl_add(&dnh->dnh_zrlock);
1226 		dn = dnh->dnh_dnode;
1227 		/*
1228 		 * If there are holds on this dnode, then there should
1229 		 * be holds on the dnode's containing dbuf as well; thus
1230 		 * it wouldn't be eligible for eviction and this function
1231 		 * would not have been called.
1232 		 */
1233 		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
1234 		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
1235 
1236 		dnode_destroy(dn); /* implicit zrl_remove() for first slot */
1237 		zrl_destroy(&dnh->dnh_zrlock);
1238 		dnh->dnh_dnode = DN_SLOT_UNINIT;
1239 	}
1240 	kmem_free(dnc, sizeof (dnode_children_t) +
1241 	    dnc->dnc_count * sizeof (dnode_handle_t));
1242 }
1243 
1244 /*
1245  * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
1246  * to ensure the hole at the specified object offset is large enough to
1247  * hold the dnode being created. The slots parameter is also used to ensure
1248  * a dnode does not span multiple dnode blocks. In both of these cases, if
1249  * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
1250  * are only possible when using DNODE_MUST_BE_FREE.
1251  *
1252  * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
1253  * dnode_hold_impl() will check if the requested dnode is already consumed
1254  * as an extra dnode slot by a large dnode, in which case it returns
1255  * EEXIST.
1256  *
1257  * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
1258  * return whether the hold would succeed or not.  tag and dnp should be set to
1259  * NULL in this case.
1260  *
1261  * errors:
1262  * EINVAL - invalid object number or flags.
1263  * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
1264  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
1265  *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
1266  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
1267  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
1268  *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
1269  * EIO    - I/O error when reading the meta dnode dbuf.
1270  * The hold succeeds even for free dnodes (with DNODE_MUST_BE_FREE).
1271  */
1272 int
1273 dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
1274     void *tag, dnode_t **dnp)
1275 {
1276 	int epb, idx, err;
1277 	int drop_struct_lock = FALSE;
1278 	int type;
1279 	uint64_t blk;
1280 	dnode_t *mdn, *dn;
1281 	dmu_buf_impl_t *db;
1282 	dnode_children_t *dnc;
1283 	dnode_phys_t *dn_block;
1284 	dnode_handle_t *dnh;
1285 
1286 	ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
1287 	ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
1288 	IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
1289 
1290 	/*
1291 	 * If you are holding the spa config lock as writer, you shouldn't
1292 	 * be asking the DMU to do *anything* unless it's the root pool,
1293 	 * which may require us to read from the root filesystem while
1294 	 * holding some (not all) of the locks as writer.
1295 	 */
1296 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
1297 	    (spa_is_root(os->os_spa) &&
1298 	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
1299 
1300 	ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
1301 
1302 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
1303 	    object == DMU_PROJECTUSED_OBJECT) {
1304 		if (object == DMU_USERUSED_OBJECT)
1305 			dn = DMU_USERUSED_DNODE(os);
1306 		else if (object == DMU_GROUPUSED_OBJECT)
1307 			dn = DMU_GROUPUSED_DNODE(os);
1308 		else
1309 			dn = DMU_PROJECTUSED_DNODE(os);
1310 		if (dn == NULL)
1311 			return (SET_ERROR(ENOENT));
1312 		type = dn->dn_type;
1313 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
1314 			return (SET_ERROR(ENOENT));
1315 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
1316 			return (SET_ERROR(EEXIST));
1317 		DNODE_VERIFY(dn);
1318 		/* Don't actually hold if dry run, just return 0 */
1319 		if (!(flag & DNODE_DRY_RUN)) {
1320 			(void) zfs_refcount_add(&dn->dn_holds, tag);
1321 			*dnp = dn;
1322 		}
1323 		return (0);
1324 	}
1325 
1326 	if (object == 0 || object >= DN_MAX_OBJECT)
1327 		return (SET_ERROR(EINVAL));
1328 
1329 	mdn = DMU_META_DNODE(os);
1330 	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
1331 
1332 	DNODE_VERIFY(mdn);
1333 
1334 	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
1335 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
1336 		drop_struct_lock = TRUE;
1337 	}
1338 
1339 	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
1340 	db = dbuf_hold(mdn, blk, FTAG);
1341 	if (drop_struct_lock)
1342 		rw_exit(&mdn->dn_struct_rwlock);
1343 	if (db == NULL) {
1344 		DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
1345 		return (SET_ERROR(EIO));
1346 	}
1347 	/*
1348 	 * We do not need to decrypt to read the dnode so it doesn't matter
1349 	 * if we get the encrypted or decrypted version.
1350 	 */
1351 	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
1352 	if (err) {
1353 		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
1354 		dbuf_rele(db, FTAG);
1355 		return (err);
1356 	}
1357 
1358 	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1359 	epb = db->db.db_size >> DNODE_SHIFT;
1360 
1361 	idx = object & (epb - 1);
1362 	dn_block = (dnode_phys_t *)db->db.db_data;
1363 
1364 	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1365 	dnc = dmu_buf_get_user(&db->db);
1366 	dnh = NULL;
1367 	if (dnc == NULL) {
1368 		dnode_children_t *winner;
1369 		int skip = 0;
1370 
1371 		dnc = kmem_zalloc(sizeof (dnode_children_t) +
1372 		    epb * sizeof (dnode_handle_t), KM_SLEEP);
1373 		dnc->dnc_count = epb;
1374 		dnh = &dnc->dnc_children[0];
1375 
1376 		/* Initialize dnode slot status from dnode_phys_t */
1377 		for (int i = 0; i < epb; i++) {
1378 			zrl_init(&dnh[i].dnh_zrlock);
1379 
1380 			if (skip) {
1381 				skip--;
1382 				continue;
1383 			}
1384 
1385 			if (dn_block[i].dn_type != DMU_OT_NONE) {
1386 				int interior = dn_block[i].dn_extra_slots;
1387 
1388 				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
1389 				dnode_set_slots(dnc, i + 1, interior,
1390 				    DN_SLOT_INTERIOR);
1391 				skip = interior;
1392 			} else {
1393 				dnh[i].dnh_dnode = DN_SLOT_FREE;
1394 				skip = 0;
1395 			}
1396 		}
1397 
1398 		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
1399 		    dnode_buf_evict_async, NULL);
1400 		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
1401 		if (winner != NULL) {
1402 
1403 			for (int i = 0; i < epb; i++)
1404 				zrl_destroy(&dnh[i].dnh_zrlock);
1405 
1406 			kmem_free(dnc, sizeof (dnode_children_t) +
1407 			    epb * sizeof (dnode_handle_t));
1408 			dnc = winner;
1409 		}
1410 	}
1411 
1412 	ASSERT(dnc->dnc_count == epb);
1413 	dn = DN_SLOT_UNINIT;
1414 
1415 	if (flag & DNODE_MUST_BE_ALLOCATED) {
1416 		slots = 1;
1417 
1418 		while (dn == DN_SLOT_UNINIT) {
1419 			dnode_slots_hold(dnc, idx, slots);
1420 			dnh = &dnc->dnc_children[idx];
1421 
1422 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1423 				dn = dnh->dnh_dnode;
1424 				break;
1425 			} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
1426 				DNODE_STAT_BUMP(dnode_hold_alloc_interior);
1427 				dnode_slots_rele(dnc, idx, slots);
1428 				dbuf_rele(db, FTAG);
1429 				return (SET_ERROR(EEXIST));
1430 			} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
1431 				DNODE_STAT_BUMP(dnode_hold_alloc_misses);
1432 				dnode_slots_rele(dnc, idx, slots);
1433 				dbuf_rele(db, FTAG);
1434 				return (SET_ERROR(ENOENT));
1435 			}
1436 
1437 			dnode_slots_rele(dnc, idx, slots);
1438 			if (!dnode_slots_tryenter(dnc, idx, slots)) {
1439 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
1440 				continue;
1441 			}
1442 
1443 			/*
1444 			 * Someone else won the race and called dnode_create()
1445 			 * after we checked DN_SLOT_IS_PTR() above but before
1446 			 * we acquired the lock.
1447 			 */
1448 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1449 				DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
1450 				dn = dnh->dnh_dnode;
1451 			} else {
1452 				dn = dnode_create(os, dn_block + idx, db,
1453 				    object, dnh);
1454 			}
1455 		}
1456 
1457 		mutex_enter(&dn->dn_mtx);
1458 		if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
1459 			DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
1460 			mutex_exit(&dn->dn_mtx);
1461 			dnode_slots_rele(dnc, idx, slots);
1462 			dbuf_rele(db, FTAG);
1463 			return (SET_ERROR(ENOENT));
1464 		}
1465 
1466 		/* Don't actually hold if dry run, just return 0 */
1467 		if (flag & DNODE_DRY_RUN) {
1468 			mutex_exit(&dn->dn_mtx);
1469 			dnode_slots_rele(dnc, idx, slots);
1470 			dbuf_rele(db, FTAG);
1471 			return (0);
1472 		}
1473 
1474 		DNODE_STAT_BUMP(dnode_hold_alloc_hits);
1475 	} else if (flag & DNODE_MUST_BE_FREE) {
1476 
1477 		if (idx + slots - 1 >= DNODES_PER_BLOCK) {
1478 			DNODE_STAT_BUMP(dnode_hold_free_overflow);
1479 			dbuf_rele(db, FTAG);
1480 			return (SET_ERROR(ENOSPC));
1481 		}
1482 
1483 		while (dn == DN_SLOT_UNINIT) {
1484 			dnode_slots_hold(dnc, idx, slots);
1485 
1486 			if (!dnode_check_slots_free(dnc, idx, slots)) {
1487 				DNODE_STAT_BUMP(dnode_hold_free_misses);
1488 				dnode_slots_rele(dnc, idx, slots);
1489 				dbuf_rele(db, FTAG);
1490 				return (SET_ERROR(ENOSPC));
1491 			}
1492 
1493 			dnode_slots_rele(dnc, idx, slots);
1494 			if (!dnode_slots_tryenter(dnc, idx, slots)) {
1495 				DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
1496 				continue;
1497 			}
1498 
1499 			if (!dnode_check_slots_free(dnc, idx, slots)) {
1500 				DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
1501 				dnode_slots_rele(dnc, idx, slots);
1502 				dbuf_rele(db, FTAG);
1503 				return (SET_ERROR(ENOSPC));
1504 			}
1505 
1506 			/*
1507 			 * Allocated but otherwise free dnodes which would
1508 			 * be in the interior of a multi-slot dnode need
1509 			 * to be freed.  Single slot dnodes can be safely
1510 			 * re-purposed as a performance optimization.
1511 			 */
1512 			if (slots > 1)
1513 				dnode_reclaim_slots(dnc, idx + 1, slots - 1);
1514 
1515 			dnh = &dnc->dnc_children[idx];
1516 			if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
1517 				dn = dnh->dnh_dnode;
1518 			} else {
1519 				dn = dnode_create(os, dn_block + idx, db,
1520 				    object, dnh);
1521 			}
1522 		}
1523 
1524 		mutex_enter(&dn->dn_mtx);
1525 		if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
1526 			DNODE_STAT_BUMP(dnode_hold_free_refcount);
1527 			mutex_exit(&dn->dn_mtx);
1528 			dnode_slots_rele(dnc, idx, slots);
1529 			dbuf_rele(db, FTAG);
1530 			return (SET_ERROR(EEXIST));
1531 		}
1532 
1533 		/* Don't actually hold if dry run, just return 0 */
1534 		if (flag & DNODE_DRY_RUN) {
1535 			mutex_exit(&dn->dn_mtx);
1536 			dnode_slots_rele(dnc, idx, slots);
1537 			dbuf_rele(db, FTAG);
1538 			return (0);
1539 		}
1540 
1541 		dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
1542 		DNODE_STAT_BUMP(dnode_hold_free_hits);
1543 	} else {
1544 		dbuf_rele(db, FTAG);
1545 		return (SET_ERROR(EINVAL));
1546 	}
1547 
1548 	ASSERT0(dn->dn_free_txg);
1549 
1550 	if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
1551 		dbuf_add_ref(db, dnh);
1552 
1553 	mutex_exit(&dn->dn_mtx);
1554 
1555 	/* Now we can rely on the hold to prevent the dnode from moving. */
1556 	dnode_slots_rele(dnc, idx, slots);
1557 
1558 	DNODE_VERIFY(dn);
1559 	ASSERT3P(dn->dn_dbuf, ==, db);
1560 	ASSERT3U(dn->dn_object, ==, object);
1561 	dbuf_rele(db, FTAG);
1562 
1563 	*dnp = dn;
1564 	return (0);
1565 }
1566 
1567 /*
1568  * Return held dnode if the object is allocated, NULL if not.
1569  */
1570 int
1571 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1572 {
1573 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
1574 	    dnp));
1575 }
1576 
1577 /*
1578  * Can only add a reference if there is already at least one
1579  * reference on the dnode.  Returns FALSE if unable to add a
1580  * new reference.
1581  */
1582 boolean_t
1583 dnode_add_ref(dnode_t *dn, void *tag)
1584 {
1585 	mutex_enter(&dn->dn_mtx);
1586 	if (zfs_refcount_is_zero(&dn->dn_holds)) {
1587 		mutex_exit(&dn->dn_mtx);
1588 		return (FALSE);
1589 	}
1590 	VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
1591 	mutex_exit(&dn->dn_mtx);
1592 	return (TRUE);
1593 }
1594 
1595 void
1596 dnode_rele(dnode_t *dn, void *tag)
1597 {
1598 	mutex_enter(&dn->dn_mtx);
1599 	dnode_rele_and_unlock(dn, tag, B_FALSE);
1600 }
1601 
1602 void
1603 dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
1604 {
1605 	uint64_t refs;
1606 	/* Get while the hold prevents the dnode from moving. */
1607 	dmu_buf_impl_t *db = dn->dn_dbuf;
1608 	dnode_handle_t *dnh = dn->dn_handle;
1609 
1610 	refs = zfs_refcount_remove(&dn->dn_holds, tag);
1611 	if (refs == 0)
1612 		cv_broadcast(&dn->dn_nodnholds);
1613 	mutex_exit(&dn->dn_mtx);
1614 	/* dnode could get destroyed at this point, so don't use it anymore */
1615 
1616 	/*
1617 	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
1618 	 * indirectly by dbuf_rele() while relying on the dnode handle to
1619 	 * prevent the dnode from moving, since releasing the last hold could
1620 	 * result in the dnode's parent dbuf evicting its dnode handles. For
1621 	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
1622 	 * other direct or indirect hold on the dnode must first drop the dnode
1623 	 * handle.
1624 	 */
1625 	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1626 
1627 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1628 	if (refs == 0 && db != NULL) {
1629 		/*
1630 		 * Another thread could add a hold to the dnode handle in
1631 		 * dnode_hold_impl() while holding the parent dbuf. Since the
1632 		 * hold on the parent dbuf prevents the handle from being
1633 		 * destroyed, the hold on the handle is OK. We can't yet assert
1634 		 * that the handle has zero references, but that will be
1635 		 * asserted anyway when the handle gets destroyed.
1636 		 */
1637 		mutex_enter(&db->db_mtx);
1638 		dbuf_rele_and_unlock(db, dnh, evicting);
1639 	}
1640 }
1641 
1642 /*
1643  * Test whether we can create a dnode at the specified location.
1644  */
1645 int
1646 dnode_try_claim(objset_t *os, uint64_t object, int slots)
1647 {
1648 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
1649 	    slots, NULL, NULL));
1650 }
1651 
1652 void
1653 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1654 {
1655 	objset_t *os = dn->dn_objset;
1656 	uint64_t txg = tx->tx_txg;
1657 
1658 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1659 		dsl_dataset_dirty(os->os_dsl_dataset, tx);
1660 		return;
1661 	}
1662 
1663 	DNODE_VERIFY(dn);
1664 
1665 #ifdef ZFS_DEBUG
1666 	mutex_enter(&dn->dn_mtx);
1667 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1668 	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1669 	mutex_exit(&dn->dn_mtx);
1670 #endif
1671 
1672 	/*
1673 	 * Determine old uid/gid when necessary
1674 	 */
1675 	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
1676 
1677 	multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
1678 	multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
1679 
1680 	/*
1681 	 * If we are already marked dirty, we're done.
1682 	 */
1683 	if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
1684 		multilist_sublist_unlock(mls);
1685 		return;
1686 	}
1687 
1688 	ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
1689 	    !avl_is_empty(&dn->dn_dbufs));
1690 	ASSERT(dn->dn_datablksz != 0);
1691 	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
1692 	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
1693 	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1694 
1695 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1696 	    dn->dn_object, txg);
1697 
1698 	multilist_sublist_insert_head(mls, dn);
1699 
1700 	multilist_sublist_unlock(mls);
1701 
1702 	/*
1703 	 * The dnode maintains a hold on its containing dbuf as
1704 	 * long as there are holds on it.  Each instantiated child
1705 	 * dbuf maintains a hold on the dnode.  When the last child
1706 	 * drops its hold, the dnode will drop its hold on the
1707 	 * containing dbuf. We add a "dirty hold" here so that the
1708 	 * dnode will hang around after we finish processing its
1709 	 * children.
1710 	 */
1711 	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1712 
1713 	(void) dbuf_dirty(dn->dn_dbuf, tx);
1714 
1715 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
1716 }
1717 
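/*
 * Schedule the dnode for freeing: record the freeing txg and mark the
 * dnode dirty so that the free is carried out in syncing context.
 */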
1718 void
1719 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1720 {
1721 	mutex_enter(&dn->dn_mtx);
1722 	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1723 		mutex_exit(&dn->dn_mtx);
1724 		return;
1725 	}
1726 	dn->dn_free_txg = tx->tx_txg;
1727 	mutex_exit(&dn->dn_mtx);
1728 
1729 	dnode_setdirty(dn, tx);
1730 }
1731 
1732 /*
1733  * Try to change the block size for the indicated dnode.  This can only
1734  * succeed if there are no blocks allocated or dirty beyond the first block.
1735  */
1736 int
1737 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
1738 {
1739 	dmu_buf_impl_t *db;
1740 	int err;
1741 
1742 	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
1743 	if (size == 0)
1744 		size = SPA_MINBLOCKSIZE;
1745 	else
1746 		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
1747 
1748 	if (ibs == dn->dn_indblkshift)
1749 		ibs = 0;
1750 
1751 	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
1752 		return (0);
1753 
1754 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1755 
1756 	/* Check for any allocated blocks beyond the first */
1757 	if (dn->dn_maxblkid != 0)
1758 		goto fail;
1759 
1760 	mutex_enter(&dn->dn_dbufs_mtx);
1761 	for (db = avl_first(&dn->dn_dbufs); db != NULL;
1762 	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
1763 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
1764 		    db->db_blkid != DMU_SPILL_BLKID) {
1765 			mutex_exit(&dn->dn_dbufs_mtx);
1766 			goto fail;
1767 		}
1768 	}
1769 	mutex_exit(&dn->dn_dbufs_mtx);
1770 
1771 	if (ibs && dn->dn_nlevels != 1)
1772 		goto fail;
1773 
1774 	/* resize the old block */
1775 	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
1776 	if (err == 0) {
1777 		dbuf_new_size(db, size, tx);
1778 	} else if (err != ENOENT) {
1779 		goto fail;
1780 	}
1781 
1782 	dnode_setdblksz(dn, size);
1783 	dnode_setdirty(dn, tx);
1784 	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1785 	if (ibs) {
1786 		dn->dn_indblkshift = ibs;
1787 		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1788 	}
1789 	/* rele after we have fixed the blocksize in the dnode */
1790 	if (db)
1791 		dbuf_rele(db, FTAG);
1792 
1793 	rw_exit(&dn->dn_struct_rwlock);
1794 	return (0);
1795 
1796 fail:
1797 	rw_exit(&dn->dn_struct_rwlock);
1798 	return (SET_ERROR(ENOTSUP));
1799 }
1800 
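/*
 * Raise the dnode's indirection depth to new_nlevels; the caller must
 * hold dn_struct_rwlock as writer.  The leftmost indirect block of the
 * first new level is dirtied and the old top level's dirty records are
 * re-parented beneath it.
 */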
1801 static void
1802 dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
1803 {
1804 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
1805 	int old_nlevels = dn->dn_nlevels;
1806 	dmu_buf_impl_t *db;
1807 	list_t *list;
1808 	dbuf_dirty_record_t *new, *dr, *dr_next;
1809 
1810 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1811 
1812 	dn->dn_nlevels = new_nlevels;
1813 
1814 	ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1815 	dn->dn_next_nlevels[txgoff] = new_nlevels;
1816 
1817 	/* dirty the left indirects */
1818 	db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1819 	ASSERT(db != NULL);
1820 	new = dbuf_dirty(db, tx);
1821 	dbuf_rele(db, FTAG);
1822 
1823 	/* transfer the dirty records to the new indirect */
1824 	mutex_enter(&dn->dn_mtx);
1825 	mutex_enter(&new->dt.di.dr_mtx);
1826 	list = &dn->dn_dirty_records[txgoff];
1827 	for (dr = list_head(list); dr; dr = dr_next) {
1828 		dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1829 		if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1830 		    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1831 		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1832 			ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1833 			list_remove(&dn->dn_dirty_records[txgoff], dr);
1834 			list_insert_tail(&new->dt.di.dr_children, dr);
1835 			dr->dr_parent = new;
1836 		}
1837 	}
1838 	mutex_exit(&new->dt.di.dr_mtx);
1839 	mutex_exit(&dn->dn_mtx);
1840 }
1841 
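/*
 * Set the dnode's indirection depth.  Requesting the current depth is a
 * no-op; shrinking is not allowed and returns EINVAL.
 */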
1842 int
1843 dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
1844 {
1845 	int ret = 0;
1846 
1847 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1848 
1849 	if (dn->dn_nlevels == nlevels) {
1850 		ret = 0;
1851 		goto out;
1852 	} else if (nlevels < dn->dn_nlevels) {
1853 		ret = SET_ERROR(EINVAL);
1854 		goto out;
1855 	}
1856 
1857 	dnode_set_nlevels_impl(dn, nlevels, tx);
1858 
1859 out:
1860 	rw_exit(&dn->dn_struct_rwlock);
1861 	return (ret);
1862 }
1863 
1864 /* read-holding callers must not rely on the lock being continuously held */
1865 void
1866 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
1867     boolean_t force)
1868 {
1869 	int epbs, new_nlevels;
1870 	uint64_t sz;
1871 
1872 	ASSERT(blkid != DMU_BONUS_BLKID);
1873 
1874 	ASSERT(have_read ?
1875 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
1876 	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
1877 
1878 	/*
1879 	 * if we have a read-lock, check to see if we need to do any work
1880 	 * before upgrading to a write-lock.
1881 	 */
1882 	if (have_read) {
1883 		if (blkid <= dn->dn_maxblkid)
1884 			return;
1885 
1886 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1887 			rw_exit(&dn->dn_struct_rwlock);
1888 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1889 		}
1890 	}
1891 
1892 	/*
1893 	 * Raw sends (indicated by the force flag) require that we take the
1894 	 * given blkid even if the value is lower than the current value.
1895 	 */
1896 	if (!force && blkid <= dn->dn_maxblkid)
1897 		goto out;
1898 
1899 	/*
1900 	 * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
1901 	 * to indicate that this field is set. This allows us to set the
1902 	 * maxblkid to 0 on an existing object in dnode_sync().
1903 	 */
1904 	dn->dn_maxblkid = blkid;
1905 	dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
1906 	    blkid | DMU_NEXT_MAXBLKID_SET;
1907 
1908 	/*
1909 	 * Compute the number of levels necessary to support the new maxblkid.
1910 	 * Raw sends will ensure nlevels is set correctly for us.
1911 	 */
1912 	new_nlevels = 1;
1913 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1914 	for (sz = dn->dn_nblkptr;
1915 	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1916 		new_nlevels++;
1917 
1918 	if (!force) {
1919 		if (new_nlevels > dn->dn_nlevels)
1920 			dnode_set_nlevels_impl(dn, new_nlevels, tx);
1921 	} else {
1922 		ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
1923 	}
1924 
1925 out:
1926 	if (have_read)
1927 		rw_downgrade(&dn->dn_struct_rwlock);
1928 }
1929 
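/* Hold the level-1 indirect block covering l1blkid and mark it dirty. */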
1930 static void
1931 dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
1932 {
1933 	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1934 	if (db != NULL) {
1935 		dmu_buf_will_dirty(&db->db, tx);
1936 		dbuf_rele(db, FTAG);
1937 	}
1938 }
1939 
1940 /*
1941  * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
1942  * and end_blkid.
1943  */
1944 static void
1945 dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1946     dmu_tx_t *tx)
1947 {
1948 	dmu_buf_impl_t db_search;
1949 	dmu_buf_impl_t *db;
1950 	avl_index_t where;
1951 
1952 	mutex_enter(&dn->dn_dbufs_mtx);
1953 
1954 	db_search.db_level = 1;
1955 	db_search.db_blkid = start_blkid + 1;
1956 	db_search.db_state = DB_SEARCH;
1957 	for (;;) {
1958 
1959 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
1960 		if (db == NULL)
1961 			db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1962 
1963 		if (db == NULL || db->db_level != 1 ||
1964 		    db->db_blkid >= end_blkid) {
1965 			break;
1966 		}
1967 
1968 		/*
1969 		 * Setup the next blkid we want to search for.
1970 		 * Set up the next blkid we want to search for.
1971 		db_search.db_blkid = db->db_blkid + 1;
1972 		ASSERT3U(db->db_blkid, >=, start_blkid);
1973 
1974 		/*
1975 		 * If the dbuf transitions to DB_EVICTING while we're trying
1976 		 * to dirty it, then we will be unable to discover it in
1977 		 * the dbuf hash table. This will result in a call to
1978 		 * dbuf_create() which needs to acquire the dn_dbufs_mtx
1979 		 * lock. To avoid a deadlock, we drop the lock before
1980 		 * dirtying the level-1 dbuf.
1981 		 */
1982 		mutex_exit(&dn->dn_dbufs_mtx);
1983 		dnode_dirty_l1(dn, db->db_blkid, tx);
1984 		mutex_enter(&dn->dn_dbufs_mtx);
1985 	}
1986 
1987 #ifdef ZFS_DEBUG
1988 	/*
1989 	 * Walk all the in-core level-1 dbufs and verify they have been dirtied.
1990 	 */
1991 	db_search.db_level = 1;
1992 	db_search.db_blkid = start_blkid + 1;
1993 	db_search.db_state = DB_SEARCH;
1994 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
1995 	if (db == NULL)
1996 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1997 	for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
1998 		if (db->db_level != 1 || db->db_blkid >= end_blkid)
1999 			break;
2000 		ASSERT(db->db_dirtycnt > 0);
2001 	}
2002 #endif
2003 	mutex_exit(&dn->dn_dbufs_mtx);
2004 }
2005 
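/*
 * Free the byte range [off, off + len) of the dnode in open context:
 * zero out any partial first and last blocks, dirty the affected level-1
 * indirect blocks, and record the whole blocks in dn_free_ranges so the
 * space is actually freed in syncing context.  A length of
 * DMU_OBJECT_END frees through the end of the object.
 */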
2006 void
2007 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
2008 {
2009 	dmu_buf_impl_t *db;
2010 	uint64_t blkoff, blkid, nblks;
2011 	int blksz, blkshift, head, tail;
2012 	int trunc = FALSE;
2013 	int epbs;
2014 
2015 	blksz = dn->dn_datablksz;
2016 	blkshift = dn->dn_datablkshift;
2017 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2018 
2019 	if (len == DMU_OBJECT_END) {
2020 		len = UINT64_MAX - off;
2021 		trunc = TRUE;
2022 	}
2023 
2024 	/*
2025 	 * First, block-align the region to free:
2026 	 */
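	/*
	 * For a power-of-two block size, "head" is the number of bytes from
	 * off up to the next block boundary and "blkoff" is the offset of
	 * off within its block.  Objects with a non-power-of-two block size
	 * have at most one block.
	 */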
2027 	if (ISP2(blksz)) {
2028 		head = P2NPHASE(off, blksz);
2029 		blkoff = P2PHASE(off, blksz);
2030 		if ((off >> blkshift) > dn->dn_maxblkid)
2031 			return;
2032 	} else {
2033 		ASSERT(dn->dn_maxblkid == 0);
2034 		if (off == 0 && len >= blksz) {
2035 			/*
2036 			 * Freeing the whole block; fast-track this request.
2037 			 */
2038 			blkid = 0;
2039 			nblks = 1;
2040 			if (dn->dn_nlevels > 1) {
2041 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2042 				dnode_dirty_l1(dn, 0, tx);
2043 				rw_exit(&dn->dn_struct_rwlock);
2044 			}
2045 			goto done;
2046 		} else if (off >= blksz) {
2047 			/* Freeing past end-of-data */
2048 			return;
2049 		} else {
2050 			/* Freeing part of the block. */
2051 			head = blksz - off;
2052 			ASSERT3U(head, >, 0);
2053 		}
2054 		blkoff = off;
2055 	}
2056 	/* zero out any partial block data at the start of the range */
2057 	if (head) {
2058 		int res;
2059 		ASSERT3U(blkoff + head, ==, blksz);
2060 		if (len < head)
2061 			head = len;
2062 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2063 		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
2064 		    TRUE, FALSE, FTAG, &db);
2065 		rw_exit(&dn->dn_struct_rwlock);
2066 		if (res == 0) {
2067 			caddr_t data;
2068 			boolean_t dirty;
2069 
2070 			db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
2071 			    FTAG);
2072 			/* don't dirty if it isn't on disk and isn't dirty */
2073 			dirty = db->db_last_dirty ||
2074 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
2075 			dmu_buf_unlock_parent(db, dblt, FTAG);
2076 			if (dirty) {
2077 				dmu_buf_will_dirty(&db->db, tx);
2078 				data = db->db.db_data;
2079 				bzero(data + blkoff, head);
2080 			}
2081 			dbuf_rele(db, FTAG);
2082 		}
2083 		off += head;
2084 		len -= head;
2085 	}
2086 
2087 	/* If the range was less than one block, we're done */
2088 	if (len == 0)
2089 		return;
2090 
2091 	/* If the remaining range is past end of file, we're done */
2092 	if ((off >> blkshift) > dn->dn_maxblkid)
2093 		return;
2094 
2095 	ASSERT(ISP2(blksz));
2096 	if (trunc)
2097 		tail = 0;
2098 	else
2099 		tail = P2PHASE(len, blksz);
2100 
2101 	ASSERT0(P2PHASE(off, blksz));
2102 	/* zero out any partial block data at the end of the range */
2103 	if (tail) {
2104 		int res;
2105 		if (len < tail)
2106 			tail = len;
2107 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2108 		res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
2109 		    TRUE, FALSE, FTAG, &db);
2110 		rw_exit(&dn->dn_struct_rwlock);
2111 		if (res == 0) {
2112 			boolean_t dirty;
2113 			/* don't dirty if not on disk and not dirty */
2114 			db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
2115 			    FTAG);
2116 			dirty = db->db_last_dirty ||
2117 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
2118 			dmu_buf_unlock_parent(db, type, FTAG);
2119 			if (dirty) {
2120 				dmu_buf_will_dirty(&db->db, tx);
2121 				bzero(db->db.db_data, tail);
2122 			}
2123 			dbuf_rele(db, FTAG);
2124 		}
2125 		len -= tail;
2126 	}
2127 
2128 	/* If the range did not include a full block, we are done */
2129 	if (len == 0)
2130 		return;
2131 
2132 	ASSERT(IS_P2ALIGNED(off, blksz));
2133 	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
2134 	blkid = off >> blkshift;
2135 	nblks = len >> blkshift;
2136 	if (trunc)
2137 		nblks += 1;
2138 
2139 	/*
2140 	 * Dirty all the indirect blocks in this range.  Note that only
2141 	 * the first and last indirect blocks can actually be written
2142 	 * (if they were partially freed) -- they must be dirtied, even if
2143 	 * they do not exist on disk yet.  The interior blocks will
2144 	 * be freed by free_children(), so they will not actually be written.
2145 	 * Even though these interior blocks will not be written, we
2146 	 * dirty them for two reasons:
2147 	 *
2148 	 *  - It ensures that the indirect blocks remain in memory until
2149 	 *    syncing context.  (They have already been prefetched by
2150 	 *    dmu_tx_hold_free(), so we don't have to worry about reading
2151 	 *    them serially here.)
2152 	 *
2153 	 *  - The dirty space accounting will put pressure on the txg sync
2154 	 *    mechanism to begin syncing, and to delay transactions if there
2155 	 *    is a large amount of freeing.  Even though these indirect
2156 	 *    blocks will not be written, we could need to write the same
2157 	 *    amount of space if we copy the freed BPs into deadlists.
2158 	 */
2159 	if (dn->dn_nlevels > 1) {
2160 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2161 		uint64_t first, last;
2162 
2163 		first = blkid >> epbs;
2164 		dnode_dirty_l1(dn, first, tx);
2165 		if (trunc)
2166 			last = dn->dn_maxblkid >> epbs;
2167 		else
2168 			last = (blkid + nblks - 1) >> epbs;
2169 		if (last != first)
2170 			dnode_dirty_l1(dn, last, tx);
2171 
2172 		dnode_dirty_l1range(dn, first, last, tx);
2173 
2174 		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
2175 		    SPA_BLKPTRSHIFT;
2176 		for (uint64_t i = first + 1; i < last; i++) {
2177 			/*
2178 			 * Set i to the blockid of the next non-hole
2179 			 * level-1 indirect block at or after i.  Note
2180 			 * that dnode_next_offset() operates in terms of
2181 			 * level-0-equivalent bytes.
2182 			 */
2183 			uint64_t ibyte = i << shift;
2184 			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
2185 			    &ibyte, 2, 1, 0);
2186 			i = ibyte >> shift;
2187 			if (i >= last)
2188 				break;
2189 
2190 			/*
2191 			 * Normally we should not see an error, either
2192 			 * from dnode_next_offset() or dbuf_hold_level()
2193 			 * (except for ESRCH from dnode_next_offset).
2194 			 * If there is an i/o error, then when we read
2195 			 * this block in syncing context, it will use
2196 			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
2197 			 * to the "failmode" property.  dnode_next_offset()
2198 			 * doesn't have a flag to indicate MUSTSUCCEED.
2199 			 */
2200 			if (err != 0)
2201 				break;
2202 
2203 			dnode_dirty_l1(dn, i, tx);
2204 		}
2205 		rw_exit(&dn->dn_struct_rwlock);
2206 	}
2207 
2208 done:
2209 	/*
2210 	 * Add this range to the dnode range list.
2211 	 * We will finish up this free operation in the syncing phase.
2212 	 */
2213 	mutex_enter(&dn->dn_mtx);
2214 	int txgoff = tx->tx_txg & TXG_MASK;
2215 	if (dn->dn_free_ranges[txgoff] == NULL) {
2216 		dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
2217 		    RANGE_SEG64, NULL, 0, 0);
2218 	}
2219 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
2220 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
2221 	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
2222 	    blkid, nblks, tx->tx_txg);
2223 	mutex_exit(&dn->dn_mtx);
2224 
2225 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
2226 	dnode_setdirty(dn, tx);
2227 }
2228 
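/*
 * Return B_TRUE if the dnode's spill block is slated for removal in any
 * open txg.
 */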
2229 static boolean_t
2230 dnode_spill_freed(dnode_t *dn)
2231 {
2232 	int i;
2233 
2234 	mutex_enter(&dn->dn_mtx);
2235 	for (i = 0; i < TXG_SIZE; i++) {
2236 		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
2237 			break;
2238 	}
2239 	mutex_exit(&dn->dn_mtx);
2240 	return (i < TXG_SIZE);
2241 }
2242 
2243 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
2244 uint64_t
2245 dnode_block_freed(dnode_t *dn, uint64_t blkid)
2246 {
2247 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
2248 	int i;
2249 
2250 	if (blkid == DMU_BONUS_BLKID)
2251 		return (FALSE);
2252 
2253 	/*
2254 	 * If we're in the process of opening the pool, dp will not be
2255 	 * set yet, but there shouldn't be anything dirty.
2256 	 */
2257 	if (dp == NULL)
2258 		return (FALSE);
2259 
2260 	if (dn->dn_free_txg)
2261 		return (TRUE);
2262 
2263 	if (blkid == DMU_SPILL_BLKID)
2264 		return (dnode_spill_freed(dn));
2265 
2266 	mutex_enter(&dn->dn_mtx);
2267 	for (i = 0; i < TXG_SIZE; i++) {
2268 		if (dn->dn_free_ranges[i] != NULL &&
2269 		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
2270 			break;
2271 	}
2272 	mutex_exit(&dn->dn_mtx);
2273 	return (i < TXG_SIZE);
2274 }
2275 
2276 /* call from syncing context when we actually write/free space for this dnode */
2277 void
2278 dnode_diduse_space(dnode_t *dn, int64_t delta)
2279 {
2280 	uint64_t space;
2281 	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
2282 	    dn, dn->dn_phys,
2283 	    (u_longlong_t)dn->dn_phys->dn_used,
2284 	    (longlong_t)delta);
2285 
2286 	mutex_enter(&dn->dn_mtx);
2287 	space = DN_USED_BYTES(dn->dn_phys);
2288 	if (delta > 0) {
2289 		ASSERT3U(space + delta, >=, space); /* no overflow */
2290 	} else {
2291 		ASSERT3U(space, >=, -delta); /* no underflow */
2292 	}
2293 	space += delta;
2294 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
2295 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
2296 		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
2297 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
2298 	} else {
2299 		dn->dn_phys->dn_used = space;
2300 		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
2301 	}
2302 	mutex_exit(&dn->dn_mtx);
2303 }
2304 
2305 /*
2306  * Scans a block at the indicated "level" looking for a hole or data,
2307  * depending on 'flags'.
2308  *
2309  * If level > 0, then we are scanning an indirect block looking at its
2310  * pointers.  If level == 0, then we are looking at a block of dnodes.
2311  *
2312  * If we don't find what we are looking for in the block, we return ESRCH.
2313  * Otherwise, return with *offset pointing to the beginning (if searching
2314  * forwards) or end (if searching backwards) of the range covered by the
2315  * block pointer we matched on (or dnode).
2316  *
2317  * The basic search algorithm used below by dnode_next_offset() is to
2318  * use this function to search up the block tree (widen the search) until
2319  * we find something (i.e., we don't return ESRCH) and then search back
2320  * down the tree (narrow the search) until we reach our original search
2321  * level.
2322  */
2323 static int
2324 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
2325     int lvl, uint64_t blkfill, uint64_t txg)
2326 {
2327 	dmu_buf_impl_t *db = NULL;
2328 	void *data = NULL;
2329 	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2330 	uint64_t epb = 1ULL << epbs;
2331 	uint64_t minfill, maxfill;
2332 	boolean_t hole;
2333 	int i, inc, error, span;
2334 
2335 	dprintf("probing object %llu offset %llx level %d of %u\n",
2336 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
2337 
2338 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2339 
2340 	hole = ((flags & DNODE_FIND_HOLE) != 0);
2341 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
2342 	ASSERT(txg == 0 || !hole);
2343 
2344 	if (lvl == dn->dn_phys->dn_nlevels) {
2345 		error = 0;
2346 		epb = dn->dn_phys->dn_nblkptr;
2347 		data = dn->dn_phys->dn_blkptr;
2348 	} else {
2349 		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
2350 		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
2351 		if (error) {
2352 			if (error != ENOENT)
2353 				return (error);
2354 			if (hole)
2355 				return (0);
2356 			/*
2357 			 * This can only happen when we are searching up
2358 			 * the block tree for data.  We don't really need to
2359 			 * adjust the offset, as we will just end up looking
2360 			 * at the pointer to this block in its parent, and it's
2361 			 * going to be unallocated, so we will skip over it.
2362 			 */
2363 			return (SET_ERROR(ESRCH));
2364 		}
2365 		error = dbuf_read(db, NULL,
2366 		    DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
2367 		if (error) {
2368 			dbuf_rele(db, FTAG);
2369 			return (error);
2370 		}
2371 		data = db->db.db_data;
2372 		rw_enter(&db->db_rwlock, RW_READER);
2373 	}
2374 
2375 	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
2376 	    db->db_blkptr->blk_birth <= txg ||
2377 	    BP_IS_HOLE(db->db_blkptr))) {
2378 		/*
2379 		 * This can only happen when we are searching up the tree
2380 		 * and these conditions mean that we need to keep climbing.
2381 		 */
2382 		error = SET_ERROR(ESRCH);
2383 	} else if (lvl == 0) {
2384 		dnode_phys_t *dnp = data;
2385 
2386 		ASSERT(dn->dn_type == DMU_OT_DNODE);
2387 		ASSERT(!(flags & DNODE_FIND_BACKWARDS));
2388 
2389 		for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
2390 		    i < blkfill; i += dnp[i].dn_extra_slots + 1) {
2391 			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
2392 				break;
2393 		}
2394 
2395 		if (i == blkfill)
2396 			error = SET_ERROR(ESRCH);
2397 
2398 		*offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
2399 		    (i << DNODE_SHIFT);
2400 	} else {
2401 		blkptr_t *bp = data;
2402 		uint64_t start = *offset;
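		/*
		 * "span" is the shift from a block index at this level to a
		 * level-0 byte offset.  minfill/maxfill bound the acceptable
		 * fill count of a block pointer: at least one entry when
		 * searching for data, less than completely full when
		 * searching for a hole.
		 */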
2403 		span = (lvl - 1) * epbs + dn->dn_datablkshift;
2404 		minfill = 0;
2405 		maxfill = blkfill << ((lvl - 1) * epbs);
2406 
2407 		if (hole)
2408 			maxfill--;
2409 		else
2410 			minfill++;
2411 
2412 		*offset = *offset >> span;
2413 		for (i = BF64_GET(*offset, 0, epbs);
2414 		    i >= 0 && i < epb; i += inc) {
2415 			if (BP_GET_FILL(&bp[i]) >= minfill &&
2416 			    BP_GET_FILL(&bp[i]) <= maxfill &&
2417 			    (hole || bp[i].blk_birth > txg))
2418 				break;
2419 			if (inc > 0 || *offset > 0)
2420 				*offset += inc;
2421 		}
2422 		*offset = *offset << span;
2423 		if (inc < 0) {
2424 			/* traversing backwards; position offset at the end */
2425 			ASSERT3U(*offset, <=, start);
2426 			*offset = MIN(*offset + (1ULL << span) - 1, start);
2427 		} else if (*offset < start) {
2428 			*offset = start;
2429 		}
2430 		if (i < 0 || i >= epb)
2431 			error = SET_ERROR(ESRCH);
2432 	}
2433 
2434 	if (db != NULL) {
2435 		rw_exit(&db->db_rwlock);
2436 		dbuf_rele(db, FTAG);
2437 	}
2438 
2439 	return (error);
2440 }
2441 
2442 /*
2443  * Find the next hole, data, or sparse region at or after *offset.
2444  * The value 'blkfill' tells us how many items we expect to find
2445  * in an L0 data block; this value is 1 for normal objects,
2446  * DNODES_PER_BLOCK for the meta dnode, and some fraction of
2447  * DNODES_PER_BLOCK when searching for sparse regions thereof.
2448  *
2449  * Examples:
2450  *
2451  * dnode_next_offset(dn, flags, offset, 1, 1, 0);
2452  *	Finds the next/previous hole/data in a file.
2453  *	Used in dmu_offset_next().
2454  *
2455  * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
2456  *	Finds the next free/allocated dnode in an objset's meta-dnode.
2457  *	Only finds objects that have new contents since txg (i.e.,
2458  *	bonus buffer changes and content removal are ignored).
2459  *	Used in dmu_object_next().
2460  *
2461  * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
2462  *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
2463  *	Used in dmu_object_alloc().
2464  */
2465 int
2466 dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
2467     int minlvl, uint64_t blkfill, uint64_t txg)
2468 {
2469 	uint64_t initial_offset = *offset;
2470 	int lvl, maxlvl;
2471 	int error = 0;
2472 
2473 	if (!(flags & DNODE_FIND_HAVELOCK))
2474 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2475 
2476 	if (dn->dn_phys->dn_nlevels == 0) {
2477 		error = SET_ERROR(ESRCH);
2478 		goto out;
2479 	}
2480 
2481 	if (dn->dn_datablkshift == 0) {
2482 		if (*offset < dn->dn_datablksz) {
2483 			if (flags & DNODE_FIND_HOLE)
2484 				*offset = dn->dn_datablksz;
2485 		} else {
2486 			error = SET_ERROR(ESRCH);
2487 		}
2488 		goto out;
2489 	}
2490 
2491 	maxlvl = dn->dn_phys->dn_nlevels;
2492 
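	/* Widen the search up the tree until we get something other than ESRCH. */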
2493 	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
2494 		error = dnode_next_offset_level(dn,
2495 		    flags, offset, lvl, blkfill, txg);
2496 		if (error != ESRCH)
2497 			break;
2498 	}
2499 
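	/* Narrow the search back down to the originally requested level. */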
2500 	while (error == 0 && --lvl >= minlvl) {
2501 		error = dnode_next_offset_level(dn,
2502 		    flags, offset, lvl, blkfill, txg);
2503 	}
2504 
2505 	/*
2506 	 * There's always a "virtual hole" at the end of the object, even
2507 	 * if all BPs that physically exist are non-holes.
2508 	 */
2509 	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2510 	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2511 		error = 0;
2512 	}
2513 
2514 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2515 	    initial_offset < *offset : initial_offset > *offset))
2516 		error = SET_ERROR(ESRCH);
2517 out:
2518 	if (!(flags & DNODE_FIND_HAVELOCK))
2519 		rw_exit(&dn->dn_struct_rwlock);
2520 
2521 	return (error);
2522 }
2523