xref: /freebsd/sys/contrib/openzfs/module/zfs/dbuf.c (revision 77013d11e6483b970af25e13c9b892075742f7e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27  * Copyright (c) 2019, Klara Inc.
28  * Copyright (c) 2019, Allan Jude
29  */
30 
31 #include <sys/zfs_context.h>
32 #include <sys/arc.h>
33 #include <sys/dmu.h>
34 #include <sys/dmu_send.h>
35 #include <sys/dmu_impl.h>
36 #include <sys/dbuf.h>
37 #include <sys/dmu_objset.h>
38 #include <sys/dsl_dataset.h>
39 #include <sys/dsl_dir.h>
40 #include <sys/dmu_tx.h>
41 #include <sys/spa.h>
42 #include <sys/zio.h>
43 #include <sys/dmu_zfetch.h>
44 #include <sys/sa.h>
45 #include <sys/sa_impl.h>
46 #include <sys/zfeature.h>
47 #include <sys/blkptr.h>
48 #include <sys/range_tree.h>
49 #include <sys/trace_zfs.h>
50 #include <sys/callb.h>
51 #include <sys/abd.h>
52 #include <sys/vdev.h>
53 #include <cityhash.h>
54 #include <sys/spa_impl.h>
55 #include <sys/wmsum.h>
56 
57 kstat_t *dbuf_ksp;
58 
59 typedef struct dbuf_stats {
60 	/*
61 	 * Various statistics about the size of the dbuf cache.
62 	 */
63 	kstat_named_t cache_count;
64 	kstat_named_t cache_size_bytes;
65 	kstat_named_t cache_size_bytes_max;
66 	/*
67 	 * Statistics regarding the bounds on the dbuf cache size.
68 	 */
69 	kstat_named_t cache_target_bytes;
70 	kstat_named_t cache_lowater_bytes;
71 	kstat_named_t cache_hiwater_bytes;
72 	/*
73 	 * Total number of dbuf cache evictions that have occurred.
74 	 */
75 	kstat_named_t cache_total_evicts;
76 	/*
77 	 * The distribution of dbuf levels in the dbuf cache and
78 	 * the total size of all dbufs at each level.
79 	 */
80 	kstat_named_t cache_levels[DN_MAX_LEVELS];
81 	kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
82 	/*
83 	 * Statistics about the dbuf hash table.
84 	 */
85 	kstat_named_t hash_hits;
86 	kstat_named_t hash_misses;
87 	kstat_named_t hash_collisions;
88 	kstat_named_t hash_elements;
89 	kstat_named_t hash_elements_max;
90 	/*
91 	 * Number of sublists containing more than one dbuf in the dbuf
92 	 * hash table. Keep track of the longest hash chain.
93 	 */
94 	kstat_named_t hash_chains;
95 	kstat_named_t hash_chain_max;
96 	/*
97 	 * Number of times a dbuf_create() discovers that a dbuf was
98 	 * already created and in the dbuf hash table.
99 	 */
100 	kstat_named_t hash_insert_race;
101 	/*
102 	 * Statistics about the size of the metadata dbuf cache.
103 	 */
104 	kstat_named_t metadata_cache_count;
105 	kstat_named_t metadata_cache_size_bytes;
106 	kstat_named_t metadata_cache_size_bytes_max;
107 	/*
108 	 * For diagnostic purposes, this is incremented whenever we can't add
109 	 * something to the metadata cache because it's full, and instead put
110 	 * the data in the regular dbuf cache.
111 	 */
112 	kstat_named_t metadata_cache_overflow;
113 } dbuf_stats_t;
114 
115 dbuf_stats_t dbuf_stats = {
116 	{ "cache_count",			KSTAT_DATA_UINT64 },
117 	{ "cache_size_bytes",			KSTAT_DATA_UINT64 },
118 	{ "cache_size_bytes_max",		KSTAT_DATA_UINT64 },
119 	{ "cache_target_bytes",			KSTAT_DATA_UINT64 },
120 	{ "cache_lowater_bytes",		KSTAT_DATA_UINT64 },
121 	{ "cache_hiwater_bytes",		KSTAT_DATA_UINT64 },
122 	{ "cache_total_evicts",			KSTAT_DATA_UINT64 },
123 	{ { "cache_levels_N",			KSTAT_DATA_UINT64 } },
124 	{ { "cache_levels_bytes_N",		KSTAT_DATA_UINT64 } },
125 	{ "hash_hits",				KSTAT_DATA_UINT64 },
126 	{ "hash_misses",			KSTAT_DATA_UINT64 },
127 	{ "hash_collisions",			KSTAT_DATA_UINT64 },
128 	{ "hash_elements",			KSTAT_DATA_UINT64 },
129 	{ "hash_elements_max",			KSTAT_DATA_UINT64 },
130 	{ "hash_chains",			KSTAT_DATA_UINT64 },
131 	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
132 	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
133 	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
134 	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
135 	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
136 	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
137 };
138 
139 struct {
140 	wmsum_t cache_count;
141 	wmsum_t cache_total_evicts;
142 	wmsum_t cache_levels[DN_MAX_LEVELS];
143 	wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
144 	wmsum_t hash_hits;
145 	wmsum_t hash_misses;
146 	wmsum_t hash_collisions;
147 	wmsum_t hash_chains;
148 	wmsum_t hash_insert_race;
149 	wmsum_t metadata_cache_count;
150 	wmsum_t metadata_cache_overflow;
151 } dbuf_sums;
152 
153 #define	DBUF_STAT_INCR(stat, val)	\
154 	wmsum_add(&dbuf_sums.stat, val);
155 #define	DBUF_STAT_DECR(stat, val)	\
156 	DBUF_STAT_INCR(stat, -(val));
157 #define	DBUF_STAT_BUMP(stat)		\
158 	DBUF_STAT_INCR(stat, 1);
159 #define	DBUF_STAT_BUMPDOWN(stat)	\
160 	DBUF_STAT_INCR(stat, -1);
161 #define	DBUF_STAT_MAX(stat, v) {					\
162 	uint64_t _m;							\
163 	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&		\
164 	    (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
165 		continue;						\
166 }
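
/*
 * Illustrative note (a sketch, not new functionality): DBUF_STAT_INCR() and
 * its wrappers feed the per-CPU wmsum counters in dbuf_sums, which
 * dbuf_kstat_update() later folds into dbuf_stats, while DBUF_STAT_MAX()
 * updates the kstat value directly with a compare-and-swap.  For example:
 *
 *	DBUF_STAT_BUMP(hash_hits);
 *		expands to wmsum_add(&dbuf_sums.hash_hits, 1);
 *	DBUF_STAT_DECR(cache_count, 1);
 *		expands to wmsum_add(&dbuf_sums.cache_count, -(1));
 *
 * The aggregated values are typically exposed to administrators through the
 * "dbufstats" kstat (e.g. /proc/spl/kstat/zfs/dbufstats on Linux or the
 * kstat.zfs.misc.dbufstats sysctls on FreeBSD); the exact paths are platform
 * details, not something defined in this file.
 */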
167 
168 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
169 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
170 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
171 static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
172 
173 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
174     dmu_buf_evict_func_t *evict_func_sync,
175     dmu_buf_evict_func_t *evict_func_async,
176     dmu_buf_t **clear_on_evict_dbufp);
177 
178 /*
179  * Global data structures and functions for the dbuf cache.
180  */
181 static kmem_cache_t *dbuf_kmem_cache;
182 static taskq_t *dbu_evict_taskq;
183 
184 static kthread_t *dbuf_cache_evict_thread;
185 static kmutex_t dbuf_evict_lock;
186 static kcondvar_t dbuf_evict_cv;
187 static boolean_t dbuf_evict_thread_exit;
188 
189 /*
190  * There are two dbuf caches; each dbuf can only be in one of them at a time.
191  *
192  * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
193  *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
194  *    that represent the metadata that describes filesystems/snapshots/
195  *    bookmarks/properties/etc. We only evict from this cache when we export a
196  *    pool, to short-circuit as much I/O as possible for all administrative
197  *    commands that need the metadata. There is no eviction policy for this
198  *    cache, because we try to only include types in it which would occupy a
199  *    very small amount of space per object but create a large impact on the
200  *    performance of these commands. Instead, after it reaches a maximum size
201  *    (which should only happen on very small memory systems with a very large
202  *    number of filesystem objects), we stop taking new dbufs into the
203  *    metadata cache, instead putting them in the normal dbuf cache.
204  *
205  * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
206  *    are not currently held but have been recently released. These dbufs
207  *    are not eligible for arc eviction until they are aged out of the cache.
208  *    Dbufs that are aged out of the cache will be immediately destroyed and
209  *    become eligible for arc eviction.
210  *
211  * Dbufs are added to these caches once the last hold is released. If a dbuf is
212  * later accessed and still exists in the dbuf cache, then it will be removed
213  * from the cache and later re-added to the head of the cache.
214  *
215  * If a given dbuf meets the requirements for the metadata cache, it will go
216  * there, otherwise it will be considered for the generic LRU dbuf cache. The
217  * caches and the refcounts tracking their sizes are stored in an array indexed
218  * by those caches' matching enum values (from dbuf_cached_state_t).
219  */
220 typedef struct dbuf_cache {
221 	multilist_t cache;
222 	zfs_refcount_t size ____cacheline_aligned;
223 } dbuf_cache_t;
224 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
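
/*
 * Illustrative example (a usage sketch, not a new interface): both caches are
 * always addressed through their dbuf_cached_state_t enum value, e.g.
 *
 *	multilist_t *lru = &dbuf_caches[DB_DBUF_CACHE].cache;
 *	uint64_t md_bytes = zfs_refcount_count(
 *	    &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
 *
 * Each dbuf records which cache currently holds it in db_caching_status,
 * using the same enum values (or DB_NO_CACHE when it is in neither).
 */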
225 
226 /* Size limits for the caches */
227 unsigned long dbuf_cache_max_bytes = ULONG_MAX;
228 unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
229 
230 /* Set the default sizes of the caches to log2 fraction of arc size */
231 int dbuf_cache_shift = 5;
232 int dbuf_metadata_cache_shift = 6;
233 
234 static unsigned long dbuf_cache_target_bytes(void);
235 static unsigned long dbuf_metadata_cache_target_bytes(void);
236 
237 /*
238  * The LRU dbuf cache uses a three-stage eviction policy:
239  *	- A low water marker designates when the dbuf eviction thread
240  *	should stop evicting from the dbuf cache.
241  *	- When we reach the maximum size (aka mid water mark), we
242  *	signal the eviction thread to run.
243  *	- The high water mark indicates when the eviction thread
244  *	is unable to keep up with the incoming load and eviction must
245  *	happen in the context of the calling thread.
246  *
247  * The dbuf cache:
248  *                                                 (max size)
249  *                                      low water   mid water   hi water
250  * +----------------------------------------+----------+----------+
251  * |                                        |          |          |
252  * |                                        |          |          |
253  * |                                        |          |          |
254  * |                                        |          |          |
255  * +----------------------------------------+----------+----------+
256  *                                        stop        signal     evict
257  *                                      evicting     eviction   directly
258  *                                                    thread
259  *
260  * The high and low water marks indicate the operating range for the eviction
261  * thread. The low water mark is, by default, 90% of the total size of the
262  * cache and the high water mark is at 110% (both of these percentages can be
263  * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
264  * respectively). The eviction thread will try to ensure that the cache remains
265  * within this range by waking up every second and checking if the cache is
266  * above the low water mark. The thread can also be woken up by callers adding
267  * elements into the cache if the cache is larger than the mid water (i.e., max
268  * cache size). Once the eviction thread is woken up and eviction is required,
269  * it will continue evicting buffers until it's able to reduce the cache size
270  * to the low water mark. If the cache size continues to grow and hits the high
271  * water mark, then callers adding elements to the cache will begin to evict
272  * directly from the cache until the cache is no longer above the high water
273  * mark.
274  */
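
/*
 * Worked example (the numbers are purely illustrative): assume a target size
 * of 100 MiB and the default dbuf_cache_lowater_pct/dbuf_cache_hiwater_pct of
 * 10.  The low water mark is then 100 MiB - 10% = 90 MiB and the high water
 * mark is 100 MiB + 10% = 110 MiB.  Growing past 100 MiB signals the eviction
 * thread, which evicts until the cache is back at or below 90 MiB; growing
 * past 110 MiB additionally makes the caller evict one buffer itself.
 */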
275 
276 /*
277  * The percentage above and below the maximum cache size.
278  */
279 uint_t dbuf_cache_hiwater_pct = 10;
280 uint_t dbuf_cache_lowater_pct = 10;
281 
282 /* ARGSUSED */
283 static int
284 dbuf_cons(void *vdb, void *unused, int kmflag)
285 {
286 	dmu_buf_impl_t *db = vdb;
287 	bzero(db, sizeof (dmu_buf_impl_t));
288 
289 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
290 	rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
291 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
292 	multilist_link_init(&db->db_cache_link);
293 	zfs_refcount_create(&db->db_holds);
294 
295 	return (0);
296 }
297 
298 /* ARGSUSED */
299 static void
300 dbuf_dest(void *vdb, void *unused)
301 {
302 	dmu_buf_impl_t *db = vdb;
303 	mutex_destroy(&db->db_mtx);
304 	rw_destroy(&db->db_rwlock);
305 	cv_destroy(&db->db_changed);
306 	ASSERT(!multilist_link_active(&db->db_cache_link));
307 	zfs_refcount_destroy(&db->db_holds);
308 }
309 
310 /*
311  * dbuf hash table routines
312  */
313 static dbuf_hash_table_t dbuf_hash_table;
314 
315 /*
316  * We use Cityhash for this. It's fast, and has good hash properties without
317  * requiring any large static buffers.
318  */
319 static uint64_t
320 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
321 {
322 	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
323 }
324 
325 #define	DTRACE_SET_STATE(db, why) \
326 	DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db,	\
327 	    const char *, why)
328 
329 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
330 	((dbuf)->db.db_object == (obj) &&		\
331 	(dbuf)->db_objset == (os) &&			\
332 	(dbuf)->db_level == (level) &&			\
333 	(dbuf)->db_blkid == (blkid))
334 
335 dmu_buf_impl_t *
336 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
337 {
338 	dbuf_hash_table_t *h = &dbuf_hash_table;
339 	uint64_t hv;
340 	uint64_t idx;
341 	dmu_buf_impl_t *db;
342 
343 	hv = dbuf_hash(os, obj, level, blkid);
344 	idx = hv & h->hash_table_mask;
345 
346 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
347 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
348 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
349 			mutex_enter(&db->db_mtx);
350 			if (db->db_state != DB_EVICTING) {
351 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
352 				return (db);
353 			}
354 			mutex_exit(&db->db_mtx);
355 		}
356 	}
357 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
358 	return (NULL);
359 }
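
/*
 * Usage sketch (illustrative only): dbuf_find() returns the dbuf with its
 * db_mtx held, so callers are expected to drop the mutex themselves, e.g.
 *
 *	dmu_buf_impl_t *db = dbuf_find(os, object, 0, blkid);
 *	if (db != NULL) {
 *		... inspect or update the dbuf under db->db_mtx ...
 *		mutex_exit(&db->db_mtx);
 *	}
 */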
360 
361 static dmu_buf_impl_t *
362 dbuf_find_bonus(objset_t *os, uint64_t object)
363 {
364 	dnode_t *dn;
365 	dmu_buf_impl_t *db = NULL;
366 
367 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
368 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
369 		if (dn->dn_bonus != NULL) {
370 			db = dn->dn_bonus;
371 			mutex_enter(&db->db_mtx);
372 		}
373 		rw_exit(&dn->dn_struct_rwlock);
374 		dnode_rele(dn, FTAG);
375 	}
376 	return (db);
377 }
378 
379 /*
380  * Insert an entry into the hash table.  If there is already an element
381  * equal to elem in the hash table, then the already existing element
382  * will be returned and the new element will not be inserted.
383  * Otherwise returns NULL.
384  */
385 static dmu_buf_impl_t *
386 dbuf_hash_insert(dmu_buf_impl_t *db)
387 {
388 	dbuf_hash_table_t *h = &dbuf_hash_table;
389 	objset_t *os = db->db_objset;
390 	uint64_t obj = db->db.db_object;
391 	int level = db->db_level;
392 	uint64_t blkid, hv, idx;
393 	dmu_buf_impl_t *dbf;
394 	uint32_t i;
395 
396 	blkid = db->db_blkid;
397 	hv = dbuf_hash(os, obj, level, blkid);
398 	idx = hv & h->hash_table_mask;
399 
400 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
401 	for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
402 	    dbf = dbf->db_hash_next, i++) {
403 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
404 			mutex_enter(&dbf->db_mtx);
405 			if (dbf->db_state != DB_EVICTING) {
406 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
407 				return (dbf);
408 			}
409 			mutex_exit(&dbf->db_mtx);
410 		}
411 	}
412 
413 	if (i > 0) {
414 		DBUF_STAT_BUMP(hash_collisions);
415 		if (i == 1)
416 			DBUF_STAT_BUMP(hash_chains);
417 
418 		DBUF_STAT_MAX(hash_chain_max, i);
419 	}
420 
421 	mutex_enter(&db->db_mtx);
422 	db->db_hash_next = h->hash_table[idx];
423 	h->hash_table[idx] = db;
424 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
425 	uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
426 	DBUF_STAT_MAX(hash_elements_max, he);
427 
428 	return (NULL);
429 }
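
/*
 * Illustrative caller pattern (a sketch of how the return value is meant to
 * be used, as in dbuf_create()): a non-NULL return means another thread won
 * the race; the existing dbuf is returned with its db_mtx held and should be
 * used instead of the one just constructed, which is what the
 * hash_insert_race statistic counts.
 *
 *	dmu_buf_impl_t *odb = dbuf_hash_insert(db);
 *	if (odb != NULL) {
 *		... tear down db, use odb, mutex_exit(&odb->db_mtx) ...
 *		DBUF_STAT_BUMP(hash_insert_race);
 *	}
 */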
430 
431 /*
432  * This returns whether this dbuf should be stored in the metadata cache, which
433  * is based on whether it's from one of the dnode types that store data related
434  * to traversing dataset hierarchies.
435  */
436 static boolean_t
437 dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
438 {
439 	DB_DNODE_ENTER(db);
440 	dmu_object_type_t type = DB_DNODE(db)->dn_type;
441 	DB_DNODE_EXIT(db);
442 
443 	/* Check if this dbuf is one of the types we care about */
444 	if (DMU_OT_IS_METADATA_CACHED(type)) {
445 		/* If we hit this, then we set something up wrong in dmu_ot */
446 		ASSERT(DMU_OT_IS_METADATA(type));
447 
448 		/*
449 		 * Sanity check for small-memory systems: don't allocate too
450 		 * much memory for this purpose.
451 		 */
452 		if (zfs_refcount_count(
453 		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
454 		    dbuf_metadata_cache_target_bytes()) {
455 			DBUF_STAT_BUMP(metadata_cache_overflow);
456 			return (B_FALSE);
457 		}
458 
459 		return (B_TRUE);
460 	}
461 
462 	return (B_FALSE);
463 }
464 
465 /*
466  * Remove an entry from the hash table.  It must be in the EVICTING state.
467  */
468 static void
469 dbuf_hash_remove(dmu_buf_impl_t *db)
470 {
471 	dbuf_hash_table_t *h = &dbuf_hash_table;
472 	uint64_t hv, idx;
473 	dmu_buf_impl_t *dbf, **dbp;
474 
475 	hv = dbuf_hash(db->db_objset, db->db.db_object,
476 	    db->db_level, db->db_blkid);
477 	idx = hv & h->hash_table_mask;
478 
479 	/*
480 	 * We mustn't hold db_mtx to maintain lock ordering:
481 	 * DBUF_HASH_MUTEX > db_mtx.
482 	 */
483 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
484 	ASSERT(db->db_state == DB_EVICTING);
485 	ASSERT(!MUTEX_HELD(&db->db_mtx));
486 
487 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
488 	dbp = &h->hash_table[idx];
489 	while ((dbf = *dbp) != db) {
490 		dbp = &dbf->db_hash_next;
491 		ASSERT(dbf != NULL);
492 	}
493 	*dbp = db->db_hash_next;
494 	db->db_hash_next = NULL;
495 	if (h->hash_table[idx] &&
496 	    h->hash_table[idx]->db_hash_next == NULL)
497 		DBUF_STAT_BUMPDOWN(hash_chains);
498 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
499 	atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
500 }
501 
502 typedef enum {
503 	DBVU_EVICTING,
504 	DBVU_NOT_EVICTING
505 } dbvu_verify_type_t;
506 
507 static void
508 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
509 {
510 #ifdef ZFS_DEBUG
511 	int64_t holds;
512 
513 	if (db->db_user == NULL)
514 		return;
515 
516 	/* Only data blocks support the attachment of user data. */
517 	ASSERT(db->db_level == 0);
518 
519 	/* Clients must resolve a dbuf before attaching user data. */
520 	ASSERT(db->db.db_data != NULL);
521 	ASSERT3U(db->db_state, ==, DB_CACHED);
522 
523 	holds = zfs_refcount_count(&db->db_holds);
524 	if (verify_type == DBVU_EVICTING) {
525 		/*
526 		 * Immediate eviction occurs when holds == dirtycnt.
527 		 * For normal eviction buffers, holds is zero on
528 		 * eviction, except when dbuf_fix_old_data() calls
529 		 * dbuf_clear_data().  However, the hold count can grow
530 		 * during eviction even though db_mtx is held (see
531 		 * dmu_bonus_hold() for an example), so we can only
532 		 * test the generic invariant that holds >= dirtycnt.
533 		 */
534 		ASSERT3U(holds, >=, db->db_dirtycnt);
535 	} else {
536 		if (db->db_user_immediate_evict == TRUE)
537 			ASSERT3U(holds, >=, db->db_dirtycnt);
538 		else
539 			ASSERT3U(holds, >, 0);
540 	}
541 #endif
542 }
543 
544 static void
545 dbuf_evict_user(dmu_buf_impl_t *db)
546 {
547 	dmu_buf_user_t *dbu = db->db_user;
548 
549 	ASSERT(MUTEX_HELD(&db->db_mtx));
550 
551 	if (dbu == NULL)
552 		return;
553 
554 	dbuf_verify_user(db, DBVU_EVICTING);
555 	db->db_user = NULL;
556 
557 #ifdef ZFS_DEBUG
558 	if (dbu->dbu_clear_on_evict_dbufp != NULL)
559 		*dbu->dbu_clear_on_evict_dbufp = NULL;
560 #endif
561 
562 	/*
563 	 * There are two eviction callbacks - one that we call synchronously
564 	 * and one that we invoke via a taskq.  The async one is useful for
565 	 * avoiding lock order reversals and limiting stack depth.
566 	 *
567 	 * Note that if we have a sync callback but no async callback,
568 	 * it's likely that the sync callback will free the structure
569 	 * containing the dbu.  In that case we need to take care to not
570 	 * dereference dbu after calling the sync evict func.
571 	 */
572 	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
573 
574 	if (dbu->dbu_evict_func_sync != NULL)
575 		dbu->dbu_evict_func_sync(dbu);
576 
577 	if (has_async) {
578 		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
579 		    dbu, 0, &dbu->dbu_tqent);
580 	}
581 }
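
/*
 * Consumer-side sketch (the my_* names are hypothetical): a client embeds a
 * dmu_buf_user_t in its own structure and registers its callbacks with
 * dmu_buf_init_user() (declared above) before attaching the user to a dbuf
 * with dmu_buf_set_user():
 *
 *	typedef struct my_user {
 *		dmu_buf_user_t mu_dbu;	(must be embedded, not a pointer)
 *		dmu_buf_t *mu_db;
 *	} my_user_t;
 *
 *	dmu_buf_init_user(&mu->mu_dbu, my_evict_sync, my_evict_async,
 *	    &mu->mu_db);
 *
 * Either callback may be left NULL; dbuf_evict_user() above invokes the sync
 * one in place and dispatches the async one to dbu_evict_taskq.
 */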
582 
583 boolean_t
584 dbuf_is_metadata(dmu_buf_impl_t *db)
585 {
586 	/*
587 	 * Consider indirect blocks and spill blocks to be metadata.
588 	 */
589 	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
590 		return (B_TRUE);
591 	} else {
592 		boolean_t is_metadata;
593 
594 		DB_DNODE_ENTER(db);
595 		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
596 		DB_DNODE_EXIT(db);
597 
598 		return (is_metadata);
599 	}
600 }
601 
602 
603 /*
604  * This function *must* return indices evenly distributed between all
605  * sublists of the multilist. This is needed due to how the dbuf eviction
606  * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
607  * distributed between all sublists and uses this assumption when
608  * deciding which sublist to evict from and how much to evict from it.
609  */
610 static unsigned int
611 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
612 {
613 	dmu_buf_impl_t *db = obj;
614 
615 	/*
616 	 * The assumption here is that the hash value for a given
617 	 * dmu_buf_impl_t will remain constant throughout its lifetime
618 	 * (i.e. its objset, object, level and blkid fields don't change).
619 	 * Thus, we don't need to store the dbuf's sublist index
620 	 * on insertion, as this index can be recalculated on removal.
621 	 *
622 	 * Also, the low order bits of the hash value are thought to be
623 	 * distributed evenly. Otherwise, in the case that the multilist
624 	 * has a power of two number of sublists, each sublist's usage
625 	 * would not be evenly distributed. In this context full 64bit
626 	 * division would be a waste of time, so limit it to 32 bits.
627 	 */
628 	return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
629 	    db->db_level, db->db_blkid) %
630 	    multilist_get_num_sublists(ml));
631 }
632 
633 /*
634  * The target size of the dbuf cache can grow with the ARC target,
635  * unless limited by the tunable dbuf_cache_max_bytes.
636  */
637 static inline unsigned long
638 dbuf_cache_target_bytes(void)
639 {
640 	return (MIN(dbuf_cache_max_bytes,
641 	    arc_target_bytes() >> dbuf_cache_shift));
642 }
643 
644 /*
645  * The target size of the dbuf metadata cache can grow with the ARC target,
646  * unless limited by the tunable dbuf_metadata_cache_max_bytes.
647  */
648 static inline unsigned long
649 dbuf_metadata_cache_target_bytes(void)
650 {
651 	return (MIN(dbuf_metadata_cache_max_bytes,
652 	    arc_target_bytes() >> dbuf_metadata_cache_shift));
653 }
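
/*
 * Worked example (illustrative numbers): with an ARC target of 4 GiB and the
 * default shifts above, the dbuf cache targets 4 GiB >> 5 = 128 MiB and the
 * metadata cache targets 4 GiB >> 6 = 64 MiB, unless dbuf_cache_max_bytes or
 * dbuf_metadata_cache_max_bytes imposes a smaller cap.
 */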
654 
655 static inline uint64_t
656 dbuf_cache_hiwater_bytes(void)
657 {
658 	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
659 	return (dbuf_cache_target +
660 	    (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
661 }
662 
663 static inline uint64_t
664 dbuf_cache_lowater_bytes(void)
665 {
666 	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
667 	return (dbuf_cache_target -
668 	    (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
669 }
670 
671 static inline boolean_t
672 dbuf_cache_above_lowater(void)
673 {
674 	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
675 	    dbuf_cache_lowater_bytes());
676 }
677 
678 /*
679  * Evict the oldest eligible dbuf from the dbuf cache.
680  */
681 static void
682 dbuf_evict_one(void)
683 {
684 	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
685 	multilist_sublist_t *mls = multilist_sublist_lock(
686 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
687 
688 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
689 
690 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
691 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
692 		db = multilist_sublist_prev(mls, db);
693 	}
694 
695 	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
696 	    multilist_sublist_t *, mls);
697 
698 	if (db != NULL) {
699 		multilist_sublist_remove(mls, db);
700 		multilist_sublist_unlock(mls);
701 		(void) zfs_refcount_remove_many(
702 		    &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
703 		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
704 		DBUF_STAT_BUMPDOWN(cache_count);
705 		DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
706 		    db->db.db_size);
707 		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
708 		db->db_caching_status = DB_NO_CACHE;
709 		dbuf_destroy(db);
710 		DBUF_STAT_BUMP(cache_total_evicts);
711 	} else {
712 		multilist_sublist_unlock(mls);
713 	}
714 }
715 
716 /*
717  * The dbuf evict thread is responsible for aging out dbufs from the
718  * cache. Once the cache has reached its maximum size, dbufs are removed
719  * and destroyed. The eviction thread will continue running until the size
720  * of the dbuf cache is at or below the low water mark. Once a dbuf is aged
721  * out of the cache it is destroyed and becomes eligible for arc eviction.
722  */
723 /* ARGSUSED */
724 static void
725 dbuf_evict_thread(void *unused)
726 {
727 	callb_cpr_t cpr;
728 
729 	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
730 
731 	mutex_enter(&dbuf_evict_lock);
732 	while (!dbuf_evict_thread_exit) {
733 		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
734 			CALLB_CPR_SAFE_BEGIN(&cpr);
735 			(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
736 			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
737 			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
738 		}
739 		mutex_exit(&dbuf_evict_lock);
740 
741 		/*
742 		 * Keep evicting as long as we're above the low water mark
743 		 * for the cache. We do this without holding the locks to
744 		 * minimize lock contention.
745 		 */
746 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
747 			dbuf_evict_one();
748 		}
749 
750 		mutex_enter(&dbuf_evict_lock);
751 	}
752 
753 	dbuf_evict_thread_exit = B_FALSE;
754 	cv_broadcast(&dbuf_evict_cv);
755 	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
756 	thread_exit();
757 }
758 
759 /*
760  * Wake up the dbuf eviction thread if the dbuf cache exceeds its max size.
761  * If the dbuf cache exceeds its high water mark, then evict a dbuf from the
762  * dbuf cache using the caller's context.
763  */
764 static void
765 dbuf_evict_notify(uint64_t size)
766 {
767 	/*
768 	 * We check if we should evict without holding the dbuf_evict_lock,
769 	 * because it's OK to occasionally make the wrong decision here,
770 	 * and grabbing the lock results in massive lock contention.
771 	 */
772 	if (size > dbuf_cache_target_bytes()) {
773 		if (size > dbuf_cache_hiwater_bytes())
774 			dbuf_evict_one();
775 		cv_signal(&dbuf_evict_cv);
776 	}
777 }
778 
779 static int
780 dbuf_kstat_update(kstat_t *ksp, int rw)
781 {
782 	dbuf_stats_t *ds = ksp->ks_data;
783 
784 	if (rw == KSTAT_WRITE)
785 		return (SET_ERROR(EACCES));
786 
787 	ds->cache_count.value.ui64 =
788 	    wmsum_value(&dbuf_sums.cache_count);
789 	ds->cache_size_bytes.value.ui64 =
790 	    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
791 	ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
792 	ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
793 	ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
794 	ds->cache_total_evicts.value.ui64 =
795 	    wmsum_value(&dbuf_sums.cache_total_evicts);
796 	for (int i = 0; i < DN_MAX_LEVELS; i++) {
797 		ds->cache_levels[i].value.ui64 =
798 		    wmsum_value(&dbuf_sums.cache_levels[i]);
799 		ds->cache_levels_bytes[i].value.ui64 =
800 		    wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
801 	}
802 	ds->hash_hits.value.ui64 =
803 	    wmsum_value(&dbuf_sums.hash_hits);
804 	ds->hash_misses.value.ui64 =
805 	    wmsum_value(&dbuf_sums.hash_misses);
806 	ds->hash_collisions.value.ui64 =
807 	    wmsum_value(&dbuf_sums.hash_collisions);
808 	ds->hash_chains.value.ui64 =
809 	    wmsum_value(&dbuf_sums.hash_chains);
810 	ds->hash_insert_race.value.ui64 =
811 	    wmsum_value(&dbuf_sums.hash_insert_race);
812 	ds->metadata_cache_count.value.ui64 =
813 	    wmsum_value(&dbuf_sums.metadata_cache_count);
814 	ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
815 	    &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
816 	ds->metadata_cache_overflow.value.ui64 =
817 	    wmsum_value(&dbuf_sums.metadata_cache_overflow);
818 	return (0);
819 }
820 
821 void
822 dbuf_init(void)
823 {
824 	uint64_t hsize = 1ULL << 16;
825 	dbuf_hash_table_t *h = &dbuf_hash_table;
826 	int i;
827 
828 	/*
829 	 * The hash table is big enough to fill one eighth of physical memory
830 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
831 	 * By default, the table will take up
832 	 * totalmem * sizeof(void*) / 64K (128KB per GB with 8-byte pointers).
833 	 */
834 	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
835 		hsize <<= 1;
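
	/*
	 * Worked example (illustrative): on a 64 GiB machine the loop above
	 * stops at hsize = (64 GiB / 8) / 8 KiB = 2^20 buckets, so the table
	 * of 8-byte pointers occupies 8 MiB, i.e. about 128 KiB per GiB of
	 * memory (rounded up to a power of two).
	 */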
836 
837 retry:
838 	h->hash_table_mask = hsize - 1;
839 #if defined(_KERNEL)
840 	/*
841 	 * Large allocations which do not require contiguous pages
842 	 * should use vmem_alloc() in the Linux kernel
843 	 */
844 	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
845 #else
846 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
847 #endif
848 	if (h->hash_table == NULL) {
849 		/* XXX - we should really return an error instead of assert */
850 		ASSERT(hsize > (1ULL << 10));
851 		hsize >>= 1;
852 		goto retry;
853 	}
854 
855 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
856 	    sizeof (dmu_buf_impl_t),
857 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
858 
859 	for (i = 0; i < DBUF_MUTEXES; i++)
860 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
861 
862 	dbuf_stats_init(h);
863 
864 	/*
865 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
866 	 * configuration is not required.
867 	 */
868 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
869 
870 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
871 		multilist_create(&dbuf_caches[dcs].cache,
872 		    sizeof (dmu_buf_impl_t),
873 		    offsetof(dmu_buf_impl_t, db_cache_link),
874 		    dbuf_cache_multilist_index_func);
875 		zfs_refcount_create(&dbuf_caches[dcs].size);
876 	}
877 
878 	dbuf_evict_thread_exit = B_FALSE;
879 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
880 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
881 	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
882 	    NULL, 0, &p0, TS_RUN, minclsyspri);
883 
884 	wmsum_init(&dbuf_sums.cache_count, 0);
885 	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
886 	for (i = 0; i < DN_MAX_LEVELS; i++) {
887 		wmsum_init(&dbuf_sums.cache_levels[i], 0);
888 		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
889 	}
890 	wmsum_init(&dbuf_sums.hash_hits, 0);
891 	wmsum_init(&dbuf_sums.hash_misses, 0);
892 	wmsum_init(&dbuf_sums.hash_collisions, 0);
893 	wmsum_init(&dbuf_sums.hash_chains, 0);
894 	wmsum_init(&dbuf_sums.hash_insert_race, 0);
895 	wmsum_init(&dbuf_sums.metadata_cache_count, 0);
896 	wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
897 
898 	dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
899 	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
900 	    KSTAT_FLAG_VIRTUAL);
901 	if (dbuf_ksp != NULL) {
902 		for (i = 0; i < DN_MAX_LEVELS; i++) {
903 			snprintf(dbuf_stats.cache_levels[i].name,
904 			    KSTAT_STRLEN, "cache_level_%d", i);
905 			dbuf_stats.cache_levels[i].data_type =
906 			    KSTAT_DATA_UINT64;
907 			snprintf(dbuf_stats.cache_levels_bytes[i].name,
908 			    KSTAT_STRLEN, "cache_level_%d_bytes", i);
909 			dbuf_stats.cache_levels_bytes[i].data_type =
910 			    KSTAT_DATA_UINT64;
911 		}
912 		dbuf_ksp->ks_data = &dbuf_stats;
913 		dbuf_ksp->ks_update = dbuf_kstat_update;
914 		kstat_install(dbuf_ksp);
915 	}
916 }
917 
918 void
919 dbuf_fini(void)
920 {
921 	dbuf_hash_table_t *h = &dbuf_hash_table;
922 	int i;
923 
924 	dbuf_stats_destroy();
925 
926 	for (i = 0; i < DBUF_MUTEXES; i++)
927 		mutex_destroy(&h->hash_mutexes[i]);
928 #if defined(_KERNEL)
929 	/*
930 	 * Large allocations which do not require contiguous pages
931 	 * should use vmem_free() in the Linux kernel
932 	 */
933 	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
934 #else
935 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
936 #endif
937 	kmem_cache_destroy(dbuf_kmem_cache);
938 	taskq_destroy(dbu_evict_taskq);
939 
940 	mutex_enter(&dbuf_evict_lock);
941 	dbuf_evict_thread_exit = B_TRUE;
942 	while (dbuf_evict_thread_exit) {
943 		cv_signal(&dbuf_evict_cv);
944 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
945 	}
946 	mutex_exit(&dbuf_evict_lock);
947 
948 	mutex_destroy(&dbuf_evict_lock);
949 	cv_destroy(&dbuf_evict_cv);
950 
951 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
952 		zfs_refcount_destroy(&dbuf_caches[dcs].size);
953 		multilist_destroy(&dbuf_caches[dcs].cache);
954 	}
955 
956 	if (dbuf_ksp != NULL) {
957 		kstat_delete(dbuf_ksp);
958 		dbuf_ksp = NULL;
959 	}
960 
961 	wmsum_fini(&dbuf_sums.cache_count);
962 	wmsum_fini(&dbuf_sums.cache_total_evicts);
963 	for (i = 0; i < DN_MAX_LEVELS; i++) {
964 		wmsum_fini(&dbuf_sums.cache_levels[i]);
965 		wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
966 	}
967 	wmsum_fini(&dbuf_sums.hash_hits);
968 	wmsum_fini(&dbuf_sums.hash_misses);
969 	wmsum_fini(&dbuf_sums.hash_collisions);
970 	wmsum_fini(&dbuf_sums.hash_chains);
971 	wmsum_fini(&dbuf_sums.hash_insert_race);
972 	wmsum_fini(&dbuf_sums.metadata_cache_count);
973 	wmsum_fini(&dbuf_sums.metadata_cache_overflow);
974 }
975 
976 /*
977  * Other stuff.
978  */
979 
980 #ifdef ZFS_DEBUG
981 static void
982 dbuf_verify(dmu_buf_impl_t *db)
983 {
984 	dnode_t *dn;
985 	dbuf_dirty_record_t *dr;
986 	uint32_t txg_prev;
987 
988 	ASSERT(MUTEX_HELD(&db->db_mtx));
989 
990 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
991 		return;
992 
993 	ASSERT(db->db_objset != NULL);
994 	DB_DNODE_ENTER(db);
995 	dn = DB_DNODE(db);
996 	if (dn == NULL) {
997 		ASSERT(db->db_parent == NULL);
998 		ASSERT(db->db_blkptr == NULL);
999 	} else {
1000 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
1001 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
1002 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
1003 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
1004 		    db->db_blkid == DMU_SPILL_BLKID ||
1005 		    !avl_is_empty(&dn->dn_dbufs));
1006 	}
1007 	if (db->db_blkid == DMU_BONUS_BLKID) {
1008 		ASSERT(dn != NULL);
1009 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1010 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
1011 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
1012 		ASSERT(dn != NULL);
1013 		ASSERT0(db->db.db_offset);
1014 	} else {
1015 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
1016 	}
1017 
1018 	if ((dr = list_head(&db->db_dirty_records)) != NULL) {
1019 		ASSERT(dr->dr_dbuf == db);
1020 		txg_prev = dr->dr_txg;
1021 		for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
1022 		    dr = list_next(&db->db_dirty_records, dr)) {
1023 			ASSERT(dr->dr_dbuf == db);
1024 			ASSERT(txg_prev > dr->dr_txg);
1025 			txg_prev = dr->dr_txg;
1026 		}
1027 	}
1028 
1029 	/*
1030 	 * We can't assert that db_size matches dn_datablksz because it
1031 	 * can be momentarily different when another thread is doing
1032 	 * dnode_set_blksz().
1033 	 */
1034 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
1035 		dr = db->db_data_pending;
1036 		/*
1037 		 * It should only be modified in syncing context, so
1038 		 * make sure we only have one copy of the data.
1039 		 */
1040 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
1041 	}
1042 
1043 	/* verify db->db_blkptr */
1044 	if (db->db_blkptr) {
1045 		if (db->db_parent == dn->dn_dbuf) {
1046 			/* db is pointed to by the dnode */
1047 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
1048 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
1049 				ASSERT(db->db_parent == NULL);
1050 			else
1051 				ASSERT(db->db_parent != NULL);
1052 			if (db->db_blkid != DMU_SPILL_BLKID)
1053 				ASSERT3P(db->db_blkptr, ==,
1054 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
1055 		} else {
1056 			/* db is pointed to by an indirect block */
1057 			int epb __maybe_unused = db->db_parent->db.db_size >>
1058 			    SPA_BLKPTRSHIFT;
1059 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
1060 			ASSERT3U(db->db_parent->db.db_object, ==,
1061 			    db->db.db_object);
1062 			/*
1063 			 * dnode_grow_indblksz() can make this fail if we don't
1064 			 * have the parent's rwlock.  XXX indblksz no longer
1065 			 * grows.  safe to do this now?
1066 			 */
1067 			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
1068 				ASSERT3P(db->db_blkptr, ==,
1069 				    ((blkptr_t *)db->db_parent->db.db_data +
1070 				    db->db_blkid % epb));
1071 			}
1072 		}
1073 	}
1074 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
1075 	    (db->db_buf == NULL || db->db_buf->b_data) &&
1076 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
1077 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
1078 		/*
1079 		 * If the blkptr isn't set but they have nonzero data,
1080 		 * it had better be dirty, otherwise we'll lose that
1081 		 * data when we evict this buffer.
1082 		 *
1083 		 * There is an exception to this rule for indirect blocks; in
1084 		 * this case, if the indirect block is a hole, we fill in a few
1085 		 * fields on each of the child blocks (importantly, birth time)
1086 		 * to prevent hole birth times from being lost when you
1087 		 * partially fill in a hole.
1088 		 */
1089 		if (db->db_dirtycnt == 0) {
1090 			if (db->db_level == 0) {
1091 				uint64_t *buf = db->db.db_data;
1092 				int i;
1093 
1094 				for (i = 0; i < db->db.db_size >> 3; i++) {
1095 					ASSERT(buf[i] == 0);
1096 				}
1097 			} else {
1098 				blkptr_t *bps = db->db.db_data;
1099 				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
1100 				    db->db.db_size);
1101 				/*
1102 				 * We want to verify that all the blkptrs in the
1103 				 * indirect block are holes, but we may have
1104 				 * automatically set up a few fields for them.
1105 				 * We iterate through each blkptr and verify
1106 				 * they only have those fields set.
1107 				 */
1108 				for (int i = 0;
1109 				    i < db->db.db_size / sizeof (blkptr_t);
1110 				    i++) {
1111 					blkptr_t *bp = &bps[i];
1112 					ASSERT(ZIO_CHECKSUM_IS_ZERO(
1113 					    &bp->blk_cksum));
1114 					ASSERT(
1115 					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
1116 					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
1117 					    DVA_IS_EMPTY(&bp->blk_dva[2]));
1118 					ASSERT0(bp->blk_fill);
1119 					ASSERT0(bp->blk_pad[0]);
1120 					ASSERT0(bp->blk_pad[1]);
1121 					ASSERT(!BP_IS_EMBEDDED(bp));
1122 					ASSERT(BP_IS_HOLE(bp));
1123 					ASSERT0(bp->blk_phys_birth);
1124 				}
1125 			}
1126 		}
1127 	}
1128 	DB_DNODE_EXIT(db);
1129 }
1130 #endif
1131 
1132 static void
1133 dbuf_clear_data(dmu_buf_impl_t *db)
1134 {
1135 	ASSERT(MUTEX_HELD(&db->db_mtx));
1136 	dbuf_evict_user(db);
1137 	ASSERT3P(db->db_buf, ==, NULL);
1138 	db->db.db_data = NULL;
1139 	if (db->db_state != DB_NOFILL) {
1140 		db->db_state = DB_UNCACHED;
1141 		DTRACE_SET_STATE(db, "clear data");
1142 	}
1143 }
1144 
1145 static void
1146 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
1147 {
1148 	ASSERT(MUTEX_HELD(&db->db_mtx));
1149 	ASSERT(buf != NULL);
1150 
1151 	db->db_buf = buf;
1152 	ASSERT(buf->b_data != NULL);
1153 	db->db.db_data = buf->b_data;
1154 }
1155 
1156 static arc_buf_t *
1157 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
1158 {
1159 	spa_t *spa = db->db_objset->os_spa;
1160 
1161 	return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
1162 }
1163 
1164 /*
1165  * Loan out an arc_buf for read.  Return the loaned arc_buf.
1166  */
1167 arc_buf_t *
1168 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
1169 {
1170 	arc_buf_t *abuf;
1171 
1172 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1173 	mutex_enter(&db->db_mtx);
1174 	if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
1175 		int blksz = db->db.db_size;
1176 		spa_t *spa = db->db_objset->os_spa;
1177 
1178 		mutex_exit(&db->db_mtx);
1179 		abuf = arc_loan_buf(spa, B_FALSE, blksz);
1180 		bcopy(db->db.db_data, abuf->b_data, blksz);
1181 	} else {
1182 		abuf = db->db_buf;
1183 		arc_loan_inuse_buf(abuf, db);
1184 		db->db_buf = NULL;
1185 		dbuf_clear_data(db);
1186 		mutex_exit(&db->db_mtx);
1187 	}
1188 	return (abuf);
1189 }
1190 
1191 /*
1192  * Calculate which level n block references the data at the level 0 offset
1193  * provided.
1194  */
1195 uint64_t
1196 dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
1197 {
1198 	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
1199 		/*
1200 		 * The level n blkid is equal to the level 0 blkid divided by
1201 		 * the number of level 0s in a level n block.
1202 		 *
1203 		 * The level 0 blkid is offset >> datablkshift =
1204 		 * offset / 2^datablkshift.
1205 		 *
1206 		 * The number of level 0s in a level n is the number of block
1207 		 * pointers in an indirect block, raised to the power of level.
1208 		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
1209 		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
1210 		 *
1211 		 * Thus, the level n blkid is: offset /
1212 		 * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
1213 		 * = offset / 2^(datablkshift + level *
1214 		 *   (indblkshift - SPA_BLKPTRSHIFT))
1215 		 * = offset >> (datablkshift + level *
1216 		 *   (indblkshift - SPA_BLKPTRSHIFT))
1217 		 */
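
		/*
		 * Worked example (illustrative numbers): with 128K data
		 * blocks (datablkshift = 17) and 128K indirect blocks
		 * (indblkshift = 17, i.e. 1024 block pointers per indirect
		 * block), a level 1 lookup uses exp = 17 + 1 * (17 - 7) = 27,
		 * so offset 512 MiB yields blkid 2^29 >> 27 = 4: the level 0
		 * blkid is 4096 and each level 1 block covers 1024 of them.
		 */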
1218 
1219 		const unsigned exp = dn->dn_datablkshift +
1220 		    level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
1221 
1222 		if (exp >= 8 * sizeof (offset)) {
1223 			/* This only happens on the highest indirection level */
1224 			ASSERT3U(level, ==, dn->dn_nlevels - 1);
1225 			return (0);
1226 		}
1227 
1228 		ASSERT3U(exp, <, 8 * sizeof (offset));
1229 
1230 		return (offset >> exp);
1231 	} else {
1232 		ASSERT3U(offset, <, dn->dn_datablksz);
1233 		return (0);
1234 	}
1235 }
1236 
1237 /*
1238  * This function is used to lock the parent of the provided dbuf. This should be
1239  * used when modifying or reading db_blkptr.
1240  */
1241 db_lock_type_t
1242 dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
1243 {
1244 	enum db_lock_type ret = DLT_NONE;
1245 	if (db->db_parent != NULL) {
1246 		rw_enter(&db->db_parent->db_rwlock, rw);
1247 		ret = DLT_PARENT;
1248 	} else if (dmu_objset_ds(db->db_objset) != NULL) {
1249 		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
1250 		    tag);
1251 		ret = DLT_OBJSET;
1252 	}
1253 	/*
1254 	 * We only return a DLT_NONE lock when it's the top-most indirect block
1255 	 * of the meta-dnode of the MOS.
1256 	 */
1257 	return (ret);
1258 }
1259 
1260 /*
1261  * We need to pass the lock type in because it's possible that the block will
1262  * move from being the topmost indirect block in a dnode (and thus have no
1263  * parent) to no longer being the topmost after an indirection increase. This
1264  * would cause a panic if we didn't pass the lock type in.
1265  */
1266 void
1267 dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
1268 {
1269 	if (type == DLT_PARENT)
1270 		rw_exit(&db->db_parent->db_rwlock);
1271 	else if (type == DLT_OBJSET)
1272 		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
1273 }
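
/*
 * Typical usage (a sketch; dbuf_read() below follows this pattern): take the
 * parent lock, remember which lock type was returned, and hand that type back
 * when unlocking:
 *
 *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 *	... read or modify db->db_blkptr ...
 *	dmu_buf_unlock_parent(db, dblt, FTAG);
 */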
1274 
1275 static void
1276 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
1277     arc_buf_t *buf, void *vdb)
1278 {
1279 	dmu_buf_impl_t *db = vdb;
1280 
1281 	mutex_enter(&db->db_mtx);
1282 	ASSERT3U(db->db_state, ==, DB_READ);
1283 	/*
1284 	 * All reads are synchronous, so we must have a hold on the dbuf
1285 	 */
1286 	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
1287 	ASSERT(db->db_buf == NULL);
1288 	ASSERT(db->db.db_data == NULL);
1289 	if (buf == NULL) {
1290 		/* i/o error */
1291 		ASSERT(zio == NULL || zio->io_error != 0);
1292 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1293 		ASSERT3P(db->db_buf, ==, NULL);
1294 		db->db_state = DB_UNCACHED;
1295 		DTRACE_SET_STATE(db, "i/o error");
1296 	} else if (db->db_level == 0 && db->db_freed_in_flight) {
1297 		/* freed in flight */
1298 		ASSERT(zio == NULL || zio->io_error == 0);
1299 		arc_release(buf, db);
1300 		bzero(buf->b_data, db->db.db_size);
1301 		arc_buf_freeze(buf);
1302 		db->db_freed_in_flight = FALSE;
1303 		dbuf_set_data(db, buf);
1304 		db->db_state = DB_CACHED;
1305 		DTRACE_SET_STATE(db, "freed in flight");
1306 	} else {
1307 		/* success */
1308 		ASSERT(zio == NULL || zio->io_error == 0);
1309 		dbuf_set_data(db, buf);
1310 		db->db_state = DB_CACHED;
1311 		DTRACE_SET_STATE(db, "successful read");
1312 	}
1313 	cv_broadcast(&db->db_changed);
1314 	dbuf_rele_and_unlock(db, NULL, B_FALSE);
1315 }
1316 
1317 /*
1318  * Shortcut for performing reads on bonus dbufs.  Returns
1319  * an error if we fail to verify the dnode associated with
1320  * a decrypted block. Otherwise success.
1321  */
1322 static int
1323 dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
1324 {
1325 	int bonuslen, max_bonuslen, err;
1326 
1327 	err = dbuf_read_verify_dnode_crypt(db, flags);
1328 	if (err)
1329 		return (err);
1330 
1331 	bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
1332 	max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1333 	ASSERT(MUTEX_HELD(&db->db_mtx));
1334 	ASSERT(DB_DNODE_HELD(db));
1335 	ASSERT3U(bonuslen, <=, db->db.db_size);
1336 	db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
1337 	arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
1338 	if (bonuslen < max_bonuslen)
1339 		bzero(db->db.db_data, max_bonuslen);
1340 	if (bonuslen)
1341 		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
1342 	db->db_state = DB_CACHED;
1343 	DTRACE_SET_STATE(db, "bonus buffer filled");
1344 	return (0);
1345 }
1346 
1347 static void
1348 dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
1349 {
1350 	blkptr_t *bps = db->db.db_data;
1351 	uint32_t indbs = 1ULL << dn->dn_indblkshift;
1352 	int n_bps = indbs >> SPA_BLKPTRSHIFT;
1353 
1354 	for (int i = 0; i < n_bps; i++) {
1355 		blkptr_t *bp = &bps[i];
1356 
1357 		ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
1358 		BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
1359 		    dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
1360 		BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
1361 		BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
1362 		BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
1363 	}
1364 }
1365 
1366 /*
1367  * Handle reads on dbufs that are holes, if necessary.  This function
1368  * requires that the dbuf's mutex is held. Returns success (0) if action
1369  * was taken, ENOENT if no action was taken.
1370  */
1371 static int
1372 dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
1373 {
1374 	ASSERT(MUTEX_HELD(&db->db_mtx));
1375 
1376 	int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
1377 	/*
1378 	 * For level 0 blocks only, if the above check fails:
1379 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
1380 	 * processes the delete record and clears the bp while we are waiting
1381 	 * for the dn_mtx (resulting in a "no" from block_freed).
1382 	 */
1383 	if (!is_hole && db->db_level == 0) {
1384 		is_hole = dnode_block_freed(dn, db->db_blkid) ||
1385 		    BP_IS_HOLE(db->db_blkptr);
1386 	}
1387 
1388 	if (is_hole) {
1389 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
1390 		bzero(db->db.db_data, db->db.db_size);
1391 
1392 		if (db->db_blkptr != NULL && db->db_level > 0 &&
1393 		    BP_IS_HOLE(db->db_blkptr) &&
1394 		    db->db_blkptr->blk_birth != 0) {
1395 			dbuf_handle_indirect_hole(db, dn);
1396 		}
1397 		db->db_state = DB_CACHED;
1398 		DTRACE_SET_STATE(db, "hole read satisfied");
1399 		return (0);
1400 	}
1401 	return (ENOENT);
1402 }
1403 
1404 /*
1405  * This function ensures that, when doing a decrypting read of a block,
1406  * we make sure we have decrypted the dnode associated with it. We must do
1407  * this so that we ensure we are fully authenticating the checksum-of-MACs
1408  * tree from the root of the objset down to this block. Indirect blocks are
1409  * always verified against their secure checksum-of-MACs assuming that the
1410  * dnode containing them is correct. Now that we are doing a decrypting read,
1411  * we can be sure that the key is loaded and verify that assumption. This is
1412  * especially important considering that we always read encrypted dnode
1413  * blocks as raw data (without verifying their MACs) to start, and
1414  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
1415  */
1416 static int
1417 dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
1418 {
1419 	int err = 0;
1420 	objset_t *os = db->db_objset;
1421 	arc_buf_t *dnode_abuf;
1422 	dnode_t *dn;
1423 	zbookmark_phys_t zb;
1424 
1425 	ASSERT(MUTEX_HELD(&db->db_mtx));
1426 
1427 	if (!os->os_encrypted || os->os_raw_receive ||
1428 	    (flags & DB_RF_NO_DECRYPT) != 0)
1429 		return (0);
1430 
1431 	DB_DNODE_ENTER(db);
1432 	dn = DB_DNODE(db);
1433 	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
1434 
1435 	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
1436 		DB_DNODE_EXIT(db);
1437 		return (0);
1438 	}
1439 
1440 	SET_BOOKMARK(&zb, dmu_objset_id(os),
1441 	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
1442 	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
1443 
1444 	/*
1445 	 * An error code of EACCES tells us that the key is still not
1446 	 * available. This is ok if we are only reading authenticated
1447 	 * (and therefore non-encrypted) blocks.
1448 	 */
1449 	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
1450 	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
1451 	    (db->db_blkid == DMU_BONUS_BLKID &&
1452 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
1453 		err = 0;
1454 
1455 	DB_DNODE_EXIT(db);
1456 
1457 	return (err);
1458 }
1459 
1460 /*
1461  * Drops db_mtx and the parent lock specified by dblt and tag before
1462  * returning.
1463  */
1464 static int
1465 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
1466     db_lock_type_t dblt, void *tag)
1467 {
1468 	dnode_t *dn;
1469 	zbookmark_phys_t zb;
1470 	uint32_t aflags = ARC_FLAG_NOWAIT;
1471 	int err, zio_flags;
1472 
1473 	err = zio_flags = 0;
1474 	DB_DNODE_ENTER(db);
1475 	dn = DB_DNODE(db);
1476 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1477 	ASSERT(MUTEX_HELD(&db->db_mtx));
1478 	ASSERT(db->db_state == DB_UNCACHED);
1479 	ASSERT(db->db_buf == NULL);
1480 	ASSERT(db->db_parent == NULL ||
1481 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));
1482 
1483 	if (db->db_blkid == DMU_BONUS_BLKID) {
1484 		err = dbuf_read_bonus(db, dn, flags);
1485 		goto early_unlock;
1486 	}
1487 
1488 	err = dbuf_read_hole(db, dn, flags);
1489 	if (err == 0)
1490 		goto early_unlock;
1491 
1492 	/*
1493 	 * Any attempt to read a redacted block should result in an error. This
1494 	 * will never happen under normal conditions, but can be useful for
1495 	 * debugging purposes.
1496 	 */
1497 	if (BP_IS_REDACTED(db->db_blkptr)) {
1498 		ASSERT(dsl_dataset_feature_is_active(
1499 		    db->db_objset->os_dsl_dataset,
1500 		    SPA_FEATURE_REDACTED_DATASETS));
1501 		err = SET_ERROR(EIO);
1502 		goto early_unlock;
1503 	}
1504 
1505 	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1506 	    db->db.db_object, db->db_level, db->db_blkid);
1507 
1508 	/*
1509 	 * All bps of an encrypted os should have the encryption bit set.
1510 	 * If this is not true it indicates tampering and we report an error.
1511 	 */
1512 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
1513 		spa_log_error(db->db_objset->os_spa, &zb);
1514 		zfs_panic_recover("unencrypted block in encrypted "
1515 		    "object set %llu", dmu_objset_id(db->db_objset));
1516 		err = SET_ERROR(EIO);
1517 		goto early_unlock;
1518 	}
1519 
1520 	err = dbuf_read_verify_dnode_crypt(db, flags);
1521 	if (err != 0)
1522 		goto early_unlock;
1523 
1524 	DB_DNODE_EXIT(db);
1525 
1526 	db->db_state = DB_READ;
1527 	DTRACE_SET_STATE(db, "read issued");
1528 	mutex_exit(&db->db_mtx);
1529 
1530 	if (DBUF_IS_L2CACHEABLE(db))
1531 		aflags |= ARC_FLAG_L2CACHE;
1532 
1533 	dbuf_add_ref(db, NULL);
1534 
1535 	zio_flags = (flags & DB_RF_CANFAIL) ?
1536 	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
1537 
1538 	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
1539 		zio_flags |= ZIO_FLAG_RAW;
1540 	/*
1541 	 * The zio layer will copy the provided blkptr later, but we need to
1542 	 * take a copy now so that we can release the parent's rwlock before
1543 	 * the read is issued: if dbuf_read_done is called synchronously (on
1544 	 * an l1 cache hit) we must not acquire the db_mtx while holding the
1545 	 * parent's rwlock, which would be a lock ordering violation.
1546 	 */
1547 	blkptr_t bp = *db->db_blkptr;
1548 	dmu_buf_unlock_parent(db, dblt, tag);
1549 	(void) arc_read(zio, db->db_objset->os_spa, &bp,
1550 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
1551 	    &aflags, &zb);
1552 	return (err);
1553 early_unlock:
1554 	DB_DNODE_EXIT(db);
1555 	mutex_exit(&db->db_mtx);
1556 	dmu_buf_unlock_parent(db, dblt, tag);
1557 	return (err);
1558 }
1559 
1560 /*
1561  * This is our just-in-time copy function.  It makes a copy of buffers that
1562  * have been modified in a previous transaction group before we access them in
1563  * the current active group.
1564  *
1565  * This function is used in three places: when we are dirtying a buffer for the
1566  * first time in a txg, when we are freeing a range in a dnode that includes
1567  * this buffer, and when we are accessing a buffer which was received compressed
1568  * and later referenced in a WRITE_BYREF record.
1569  *
1570  * Note that when we are called from dbuf_free_range() we do not put a hold on
1571  * the buffer, we just traverse the active dbuf list for the dnode.
1572  */
1573 static void
1574 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
1575 {
1576 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
1577 
1578 	ASSERT(MUTEX_HELD(&db->db_mtx));
1579 	ASSERT(db->db.db_data != NULL);
1580 	ASSERT(db->db_level == 0);
1581 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
1582 
1583 	if (dr == NULL ||
1584 	    (dr->dt.dl.dr_data !=
1585 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
1586 		return;
1587 
1588 	/*
1589 	 * If the last dirty record for this dbuf has not yet synced
1590 	 * and its referencing the dbuf data, either:
1591 	 * and is referencing the dbuf data, either:
1592 	 *	reset the reference to point to a new copy,
1593 	 * or (if there are no active holders)
1594 	 */
1595 	ASSERT3U(dr->dr_txg, >=, txg - 2);
1596 	if (db->db_blkid == DMU_BONUS_BLKID) {
1597 		dnode_t *dn = DB_DNODE(db);
1598 		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
1599 		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
1600 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1601 		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
1602 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
1603 		dnode_t *dn = DB_DNODE(db);
1604 		int size = arc_buf_size(db->db_buf);
1605 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1606 		spa_t *spa = db->db_objset->os_spa;
1607 		enum zio_compress compress_type =
1608 		    arc_get_compression(db->db_buf);
1609 		uint8_t complevel = arc_get_complevel(db->db_buf);
1610 
1611 		if (arc_is_encrypted(db->db_buf)) {
1612 			boolean_t byteorder;
1613 			uint8_t salt[ZIO_DATA_SALT_LEN];
1614 			uint8_t iv[ZIO_DATA_IV_LEN];
1615 			uint8_t mac[ZIO_DATA_MAC_LEN];
1616 
1617 			arc_get_raw_params(db->db_buf, &byteorder, salt,
1618 			    iv, mac);
1619 			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
1620 			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
1621 			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
1622 			    compress_type, complevel);
1623 		} else if (compress_type != ZIO_COMPRESS_OFF) {
1624 			ASSERT3U(type, ==, ARC_BUFC_DATA);
1625 			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
1626 			    size, arc_buf_lsize(db->db_buf), compress_type,
1627 			    complevel);
1628 		} else {
1629 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
1630 		}
1631 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
1632 	} else {
1633 		db->db_buf = NULL;
1634 		dbuf_clear_data(db);
1635 	}
1636 }
1637 
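/*
 * Read in the contents of this dbuf, if necessary.  On a cache hit the
 * buffer is decrypted and decompressed in place as needed.  On a miss a
 * read is issued; if the caller did not supply a parent zio, a root zio
 * may be created and waited on here.  If a read is already in flight we
 * wait for it to complete, unless DB_RF_NEVERWAIT is set.
 */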
1638 int
1639 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
1640 {
1641 	int err = 0;
1642 	boolean_t prefetch;
1643 	dnode_t *dn;
1644 
1645 	/*
1646 	 * We don't have to hold the mutex to check db_state because it
1647 	 * can't be freed while we have a hold on the buffer.
1648 	 */
1649 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1650 
1651 	if (db->db_state == DB_NOFILL)
1652 		return (SET_ERROR(EIO));
1653 
1654 	DB_DNODE_ENTER(db);
1655 	dn = DB_DNODE(db);
1656 
1657 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1658 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
1659 	    DBUF_IS_CACHEABLE(db);
1660 
1661 	mutex_enter(&db->db_mtx);
1662 	if (db->db_state == DB_CACHED) {
1663 		spa_t *spa = dn->dn_objset->os_spa;
1664 
1665 		/*
1666 		 * Ensure that this block's dnode has been decrypted if
1667 		 * the caller has requested decrypted data.
1668 		 */
1669 		err = dbuf_read_verify_dnode_crypt(db, flags);
1670 
1671 		/*
1672 		 * If the arc buf is compressed or encrypted and the caller
1673 		 * requested uncompressed data, we need to untransform it
1674 		 * before returning. We also call arc_untransform() on any
1675 		 * unauthenticated blocks, which will verify their MAC if
1676 		 * the key is now available.
1677 		 */
1678 		if (err == 0 && db->db_buf != NULL &&
1679 		    (flags & DB_RF_NO_DECRYPT) == 0 &&
1680 		    (arc_is_encrypted(db->db_buf) ||
1681 		    arc_is_unauthenticated(db->db_buf) ||
1682 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
1683 			zbookmark_phys_t zb;
1684 
1685 			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
1686 			    db->db.db_object, db->db_level, db->db_blkid);
1687 			dbuf_fix_old_data(db, spa_syncing_txg(spa));
1688 			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
1689 			dbuf_set_data(db, db->db_buf);
1690 		}
1691 		mutex_exit(&db->db_mtx);
1692 		if (err == 0 && prefetch) {
1693 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
1694 			    B_FALSE, flags & DB_RF_HAVESTRUCT);
1695 		}
1696 		DB_DNODE_EXIT(db);
1697 		DBUF_STAT_BUMP(hash_hits);
1698 	} else if (db->db_state == DB_UNCACHED) {
1699 		spa_t *spa = dn->dn_objset->os_spa;
1700 		boolean_t need_wait = B_FALSE;
1701 
1702 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
1703 
1704 		if (zio == NULL &&
1705 		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1706 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
1707 			need_wait = B_TRUE;
1708 		}
1709 		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
1710 		/*
1711 		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
1712 		 * for us
1713 		 */
1714 		if (!err && prefetch) {
1715 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
1716 			    db->db_state != DB_CACHED,
1717 			    flags & DB_RF_HAVESTRUCT);
1718 		}
1719 
1720 		DB_DNODE_EXIT(db);
1721 		DBUF_STAT_BUMP(hash_misses);
1722 
1723 		/*
1724 		 * If we created a zio_root we must execute it to avoid
1725 		 * leaking it, even if it isn't attached to any work due
1726 		 * to an error in dbuf_read_impl().
1727 		 */
1728 		if (need_wait) {
1729 			if (err == 0)
1730 				err = zio_wait(zio);
1731 			else
1732 				VERIFY0(zio_wait(zio));
1733 		}
1734 	} else {
1735 		/*
1736 		 * Another reader came in while the dbuf was in flight
1737 		 * between UNCACHED and CACHED.  Either a writer will finish
1738 		 * writing the buffer (sending the dbuf to CACHED) or the
1739 		 * first reader's request will reach the read_done callback
1740 		 * and send the dbuf to CACHED.  Otherwise, a failure
1741 		 * occurred and the dbuf went to UNCACHED.
1742 		 */
1743 		mutex_exit(&db->db_mtx);
1744 		if (prefetch) {
1745 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
1746 			    B_TRUE, flags & DB_RF_HAVESTRUCT);
1747 		}
1748 		DB_DNODE_EXIT(db);
1749 		DBUF_STAT_BUMP(hash_misses);
1750 
1751 		/* Skip the wait per the caller's request. */
1752 		if ((flags & DB_RF_NEVERWAIT) == 0) {
1753 			mutex_enter(&db->db_mtx);
1754 			while (db->db_state == DB_READ ||
1755 			    db->db_state == DB_FILL) {
1756 				ASSERT(db->db_state == DB_READ ||
1757 				    (flags & DB_RF_HAVESTRUCT) == 0);
1758 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
1759 				    db, zio_t *, zio);
1760 				cv_wait(&db->db_changed, &db->db_mtx);
1761 			}
1762 			if (db->db_state == DB_UNCACHED)
1763 				err = SET_ERROR(EIO);
1764 			mutex_exit(&db->db_mtx);
1765 		}
1766 	}
1767 
1768 	return (err);
1769 }
1770 
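/*
 * Prepare the dbuf to be entirely overwritten without reading in its
 * current contents: wait for any in-flight read or fill, then (if the
 * dbuf is uncached) attach a fresh ARC buffer and move it to DB_FILL.
 * A dbuf that is already cached keeps its existing buffer.
 */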
1771 static void
1772 dbuf_noread(dmu_buf_impl_t *db)
1773 {
1774 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1775 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1776 	mutex_enter(&db->db_mtx);
1777 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1778 		cv_wait(&db->db_changed, &db->db_mtx);
1779 	if (db->db_state == DB_UNCACHED) {
1780 		ASSERT(db->db_buf == NULL);
1781 		ASSERT(db->db.db_data == NULL);
1782 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
1783 		db->db_state = DB_FILL;
1784 		DTRACE_SET_STATE(db, "assigning filled buffer");
1785 	} else if (db->db_state == DB_NOFILL) {
1786 		dbuf_clear_data(db);
1787 	} else {
1788 		ASSERT3U(db->db_state, ==, DB_CACHED);
1789 	}
1790 	mutex_exit(&db->db_mtx);
1791 }
1792 
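/*
 * Undo a dmu_sync() override on this dirty record: free the block that
 * was already written (unless it was a no-op write) and return the
 * record to the NOT_OVERRIDDEN state so the data will be written
 * normally when its txg syncs.
 */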
1793 void
1794 dbuf_unoverride(dbuf_dirty_record_t *dr)
1795 {
1796 	dmu_buf_impl_t *db = dr->dr_dbuf;
1797 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
1798 	uint64_t txg = dr->dr_txg;
1799 
1800 	ASSERT(MUTEX_HELD(&db->db_mtx));
1801 	/*
1802 	 * This assert is valid because dmu_sync() expects to be called by
1803 	 * a zilog's get_data while holding a range lock.  This call only
1804 	 * comes from dbuf_dirty() callers who must also hold a range lock.
1805 	 */
1806 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
1807 	ASSERT(db->db_level == 0);
1808 
1809 	if (db->db_blkid == DMU_BONUS_BLKID ||
1810 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
1811 		return;
1812 
1813 	ASSERT(db->db_data_pending != dr);
1814 
1815 	/* free this block */
1816 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
1817 		zio_free(db->db_objset->os_spa, txg, bp);
1818 
1819 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1820 	dr->dt.dl.dr_nopwrite = B_FALSE;
1821 	dr->dt.dl.dr_has_raw_params = B_FALSE;
1822 
1823 	/*
1824 	 * Release the already-written buffer, so we leave it in
1825 	 * a consistent dirty state.  Note that all callers are
1826 	 * modifying the buffer, so they will immediately do
1827 	 * another (redundant) arc_release().  Therefore, leave
1828 	 * the buf thawed to save the effort of freezing &
1829 	 * immediately re-thawing it.
1830 	 */
1831 	arc_release(dr->dt.dl.dr_data, db);
1832 }
1833 
1834 /*
1835  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
1836  * data blocks in the free range, so that any future readers will find
1837  * empty blocks.
1838  */
1839 void
1840 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1841     dmu_tx_t *tx)
1842 {
1843 	dmu_buf_impl_t *db_search;
1844 	dmu_buf_impl_t *db, *db_next;
1845 	uint64_t txg = tx->tx_txg;
1846 	avl_index_t where;
1847 	dbuf_dirty_record_t *dr;
1848 
1849 	if (end_blkid > dn->dn_maxblkid &&
1850 	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
1851 		end_blkid = dn->dn_maxblkid;
1852 	dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
1853 	    (u_longlong_t)end_blkid);
1854 
1855 	db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
1856 	db_search->db_level = 0;
1857 	db_search->db_blkid = start_blkid;
1858 	db_search->db_state = DB_SEARCH;
1859 
1860 	mutex_enter(&dn->dn_dbufs_mtx);
1861 	db = avl_find(&dn->dn_dbufs, db_search, &where);
1862 	ASSERT3P(db, ==, NULL);
1863 
1864 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1865 
1866 	for (; db != NULL; db = db_next) {
1867 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
1868 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1869 
1870 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
1871 			break;
1872 		}
1873 		ASSERT3U(db->db_blkid, >=, start_blkid);
1874 
1875 		/* found a level 0 buffer in the range */
1876 		mutex_enter(&db->db_mtx);
1877 		if (dbuf_undirty(db, tx)) {
1878 			/* mutex has been dropped and dbuf destroyed */
1879 			continue;
1880 		}
1881 
1882 		if (db->db_state == DB_UNCACHED ||
1883 		    db->db_state == DB_NOFILL ||
1884 		    db->db_state == DB_EVICTING) {
1885 			ASSERT(db->db.db_data == NULL);
1886 			mutex_exit(&db->db_mtx);
1887 			continue;
1888 		}
1889 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
1890 			/* will be handled in dbuf_read_done or dbuf_rele */
1891 			db->db_freed_in_flight = TRUE;
1892 			mutex_exit(&db->db_mtx);
1893 			continue;
1894 		}
1895 		if (zfs_refcount_count(&db->db_holds) == 0) {
1896 			ASSERT(db->db_buf);
1897 			dbuf_destroy(db);
1898 			continue;
1899 		}
1900 		/* The dbuf is referenced */
1901 
1902 		dr = list_head(&db->db_dirty_records);
1903 		if (dr != NULL) {
1904 			if (dr->dr_txg == txg) {
1905 				/*
1906 				 * This buffer is "in-use"; re-adjust the file
1907 				 * size to reflect that this buffer may
1908 				 * contain new data when we sync.
1909 				 */
1910 				if (db->db_blkid != DMU_SPILL_BLKID &&
1911 				    db->db_blkid > dn->dn_maxblkid)
1912 					dn->dn_maxblkid = db->db_blkid;
1913 				dbuf_unoverride(dr);
1914 			} else {
1915 				/*
1916 				 * This dbuf is not dirty in the open context.
1917 				 * Either uncache it (if it's not referenced in
1918 				 * the open context) or reset its contents to
1919 				 * empty.
1920 				 */
1921 				dbuf_fix_old_data(db, txg);
1922 			}
1923 		}
1924 		/* clear the contents if it's cached */
1925 		if (db->db_state == DB_CACHED) {
1926 			ASSERT(db->db.db_data != NULL);
1927 			arc_release(db->db_buf, db);
1928 			rw_enter(&db->db_rwlock, RW_WRITER);
1929 			bzero(db->db.db_data, db->db.db_size);
1930 			rw_exit(&db->db_rwlock);
1931 			arc_buf_freeze(db->db_buf);
1932 		}
1933 
1934 		mutex_exit(&db->db_mtx);
1935 	}
1936 
1937 	kmem_free(db_search, sizeof (dmu_buf_impl_t));
1938 	mutex_exit(&dn->dn_dbufs_mtx);
1939 }
1940 
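/*
 * Resize a dbuf: allocate a new ARC buffer of the requested size, copy
 * the old contents into it (zero-filling any growth), and account for
 * the size change in the current transaction.
 */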
1941 void
1942 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1943 {
1944 	arc_buf_t *buf, *old_buf;
1945 	dbuf_dirty_record_t *dr;
1946 	int osize = db->db.db_size;
1947 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1948 	dnode_t *dn;
1949 
1950 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1951 
1952 	DB_DNODE_ENTER(db);
1953 	dn = DB_DNODE(db);
1954 
1955 	/*
1956 	 * XXX we should be doing a dbuf_read, checking the return
1957 	 * value and returning that up to our callers
1958 	 */
1959 	dmu_buf_will_dirty(&db->db, tx);
1960 
1961 	/* create the data buffer for the new block */
1962 	buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
1963 
1964 	/* copy old block data to the new block */
1965 	old_buf = db->db_buf;
1966 	bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
1967 	/* zero the remainder */
1968 	if (size > osize)
1969 		bzero((uint8_t *)buf->b_data + osize, size - osize);
1970 
1971 	mutex_enter(&db->db_mtx);
1972 	dbuf_set_data(db, buf);
1973 	arc_buf_destroy(old_buf, db);
1974 	db->db.db_size = size;
1975 
1976 	dr = list_head(&db->db_dirty_records);
1977 	/* dirty record added by dmu_buf_will_dirty() */
1978 	VERIFY(dr != NULL);
1979 	if (db->db_level == 0)
1980 		dr->dt.dl.dr_data = buf;
1981 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
1982 	ASSERT3U(dr->dr_accounted, ==, osize);
1983 	dr->dr_accounted = size;
1984 	mutex_exit(&db->db_mtx);
1985 
1986 	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
1987 	DB_DNODE_EXIT(db);
1988 }
1989 
1990 void
1991 dbuf_release_bp(dmu_buf_impl_t *db)
1992 {
1993 	objset_t *os __maybe_unused = db->db_objset;
1994 
1995 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1996 	ASSERT(arc_released(os->os_phys_buf) ||
1997 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1998 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1999 
2000 	(void) arc_release(db->db_buf, db);
2001 }
2002 
2003 /*
2004  * We already have a dirty record for this TXG, and we are being
2005  * dirtied again.
2006  */
2007 static void
2008 dbuf_redirty(dbuf_dirty_record_t *dr)
2009 {
2010 	dmu_buf_impl_t *db = dr->dr_dbuf;
2011 
2012 	ASSERT(MUTEX_HELD(&db->db_mtx));
2013 
2014 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
2015 		/*
2016 		 * If this buffer has already been written out,
2017 		 * we now need to reset its state.
2018 		 */
2019 		dbuf_unoverride(dr);
2020 		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
2021 		    db->db_state != DB_NOFILL) {
2022 			/* Already released on initial dirty, so just thaw. */
2023 			ASSERT(arc_released(db->db_buf));
2024 			arc_buf_thaw(db->db_buf);
2025 		}
2026 	}
2027 }
2028 
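/*
 * Create a "lightweight" dirty record for the given block of this dnode.
 * Lightweight dirty records carry the data to be written in the record
 * itself instead of in a dbuf, so no dbuf may exist for the block.  The
 * parent indirect block (or the dnode itself) is dirtied to reference
 * it.  Returns NULL if the parent could not be read.
 */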
2029 dbuf_dirty_record_t *
2030 dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
2031 {
2032 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
2033 	IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
2034 	dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
2035 	ASSERT(dn->dn_maxblkid >= blkid);
2036 
2037 	dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
2038 	list_link_init(&dr->dr_dirty_node);
2039 	list_link_init(&dr->dr_dbuf_node);
2040 	dr->dr_dnode = dn;
2041 	dr->dr_txg = tx->tx_txg;
2042 	dr->dt.dll.dr_blkid = blkid;
2043 	dr->dr_accounted = dn->dn_datablksz;
2044 
2045 	/*
2046 	 * There should not be any dbuf for the block that we're dirtying.
2047 	 * Otherwise the buffer contents could be inconsistent between the
2048 	 * dbuf and the lightweight dirty record.
2049 	 */
2050 	ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
2051 
2052 	mutex_enter(&dn->dn_mtx);
2053 	int txgoff = tx->tx_txg & TXG_MASK;
2054 	if (dn->dn_free_ranges[txgoff] != NULL) {
2055 		range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
2056 	}
2057 
2058 	if (dn->dn_nlevels == 1) {
2059 		ASSERT3U(blkid, <, dn->dn_nblkptr);
2060 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2061 		mutex_exit(&dn->dn_mtx);
2062 		rw_exit(&dn->dn_struct_rwlock);
2063 		dnode_setdirty(dn, tx);
2064 	} else {
2065 		mutex_exit(&dn->dn_mtx);
2066 
2067 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2068 		dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
2069 		    1, blkid >> epbs, FTAG);
2070 		rw_exit(&dn->dn_struct_rwlock);
2071 		if (parent_db == NULL) {
2072 			kmem_free(dr, sizeof (*dr));
2073 			return (NULL);
2074 		}
2075 		int err = dbuf_read(parent_db, NULL,
2076 		    (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2077 		if (err != 0) {
2078 			dbuf_rele(parent_db, FTAG);
2079 			kmem_free(dr, sizeof (*dr));
2080 			return (NULL);
2081 		}
2082 
2083 		dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
2084 		dbuf_rele(parent_db, FTAG);
2085 		mutex_enter(&parent_dr->dt.di.dr_mtx);
2086 		ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
2087 		list_insert_tail(&parent_dr->dt.di.dr_children, dr);
2088 		mutex_exit(&parent_dr->dt.di.dr_mtx);
2089 		dr->dr_parent = parent_dr;
2090 	}
2091 
2092 	dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
2093 
2094 	return (dr);
2095 }
2096 
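/*
 * Mark this dbuf as dirty in the given transaction.  A new dirty record
 * is created (unless one already exists for this txg, in which case it
 * is reused) and the parent indirect block, and ultimately the dnode,
 * are dirtied as well.  Returns the dirty record for this txg.
 */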
2097 dbuf_dirty_record_t *
2098 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2099 {
2100 	dnode_t *dn;
2101 	objset_t *os;
2102 	dbuf_dirty_record_t *dr, *dr_next, *dr_head;
2103 	int txgoff = tx->tx_txg & TXG_MASK;
2104 	boolean_t drop_struct_rwlock = B_FALSE;
2105 
2106 	ASSERT(tx->tx_txg != 0);
2107 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2108 	DMU_TX_DIRTY_BUF(tx, db);
2109 
2110 	DB_DNODE_ENTER(db);
2111 	dn = DB_DNODE(db);
2112 	/*
2113 	 * Shouldn't dirty a regular buffer in syncing context.  Private
2114 	 * objects may be dirtied in syncing context, but only if they
2115 	 * were already pre-dirtied in open context.
2116 	 */
2117 #ifdef ZFS_DEBUG
2118 	if (dn->dn_objset->os_dsl_dataset != NULL) {
2119 		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
2120 		    RW_READER, FTAG);
2121 	}
2122 	ASSERT(!dmu_tx_is_syncing(tx) ||
2123 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
2124 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2125 	    dn->dn_objset->os_dsl_dataset == NULL);
2126 	if (dn->dn_objset->os_dsl_dataset != NULL)
2127 		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
2128 #endif
2129 	/*
2130 	 * We make this assert for private objects as well, but after we
2131 	 * check if we're already dirty.  They are allowed to re-dirty
2132 	 * in syncing context.
2133 	 */
2134 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2135 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
2136 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
2137 
2138 	mutex_enter(&db->db_mtx);
2139 	/*
2140 	 * XXX make this true for indirects too?  The problem is that
2141 	 * transactions created with dmu_tx_create_assigned() from
2142 	 * syncing context don't bother holding ahead.
2143 	 */
2144 	ASSERT(db->db_level != 0 ||
2145 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
2146 	    db->db_state == DB_NOFILL);
2147 
2148 	mutex_enter(&dn->dn_mtx);
2149 	dnode_set_dirtyctx(dn, tx, db);
2150 	if (tx->tx_txg > dn->dn_dirty_txg)
2151 		dn->dn_dirty_txg = tx->tx_txg;
2152 	mutex_exit(&dn->dn_mtx);
2153 
2154 	if (db->db_blkid == DMU_SPILL_BLKID)
2155 		dn->dn_have_spill = B_TRUE;
2156 
2157 	/*
2158 	 * If this buffer is already dirty, we're done.
2159 	 */
2160 	dr_head = list_head(&db->db_dirty_records);
2161 	ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
2162 	    db->db.db_object == DMU_META_DNODE_OBJECT);
2163 	dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
2164 	if (dr_next && dr_next->dr_txg == tx->tx_txg) {
2165 		DB_DNODE_EXIT(db);
2166 
2167 		dbuf_redirty(dr_next);
2168 		mutex_exit(&db->db_mtx);
2169 		return (dr_next);
2170 	}
2171 
2172 	/*
2173 	 * Only valid if not already dirty.
2174 	 */
2175 	ASSERT(dn->dn_object == 0 ||
2176 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
2177 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
2178 
2179 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
2180 
2181 	/*
2182 	 * We should only be dirtying in syncing context if it's the
2183 	 * mos or we're initializing the os or it's a special object.
2184 	 * However, we are allowed to dirty in syncing context provided
2185 	 * we already dirtied it in open context.  Hence we must make
2186 	 * this assertion only if we're not already dirty.
2187 	 */
2188 	os = dn->dn_objset;
2189 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
2190 #ifdef ZFS_DEBUG
2191 	if (dn->dn_objset->os_dsl_dataset != NULL)
2192 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
2193 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2194 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
2195 	if (dn->dn_objset->os_dsl_dataset != NULL)
2196 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
2197 #endif
2198 	ASSERT(db->db.db_size != 0);
2199 
2200 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2201 
2202 	if (db->db_blkid != DMU_BONUS_BLKID) {
2203 		dmu_objset_willuse_space(os, db->db.db_size, tx);
2204 	}
2205 
2206 	/*
2207 	 * If this buffer is dirty in an old transaction group we need
2208 	 * to make a copy of it so that the changes we make in this
2209 	 * transaction group won't leak out when we sync the older txg.
2210 	 */
2211 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
2212 	list_link_init(&dr->dr_dirty_node);
2213 	list_link_init(&dr->dr_dbuf_node);
2214 	dr->dr_dnode = dn;
2215 	if (db->db_level == 0) {
2216 		void *data_old = db->db_buf;
2217 
2218 		if (db->db_state != DB_NOFILL) {
2219 			if (db->db_blkid == DMU_BONUS_BLKID) {
2220 				dbuf_fix_old_data(db, tx->tx_txg);
2221 				data_old = db->db.db_data;
2222 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
2223 				/*
2224 				 * Release the data buffer from the cache so
2225 				 * that we can modify it without impacting
2226 				 * other possible users of this cached data
2227 				 * block.  Note that indirect blocks and
2228 				 * private objects are not released until the
2229 				 * syncing state (since they are only modified
2230 				 * then).
2231 				 */
2232 				arc_release(db->db_buf, db);
2233 				dbuf_fix_old_data(db, tx->tx_txg);
2234 				data_old = db->db_buf;
2235 			}
2236 			ASSERT(data_old != NULL);
2237 		}
2238 		dr->dt.dl.dr_data = data_old;
2239 	} else {
2240 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
2241 		list_create(&dr->dt.di.dr_children,
2242 		    sizeof (dbuf_dirty_record_t),
2243 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
2244 	}
2245 	if (db->db_blkid != DMU_BONUS_BLKID)
2246 		dr->dr_accounted = db->db.db_size;
2247 	dr->dr_dbuf = db;
2248 	dr->dr_txg = tx->tx_txg;
2249 	list_insert_before(&db->db_dirty_records, dr_next, dr);
2250 
2251 	/*
2252 	 * We could have been freed_in_flight between the dbuf_noread
2253 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
2254 	 * happened after the free.
2255 	 */
2256 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2257 	    db->db_blkid != DMU_SPILL_BLKID) {
2258 		mutex_enter(&dn->dn_mtx);
2259 		if (dn->dn_free_ranges[txgoff] != NULL) {
2260 			range_tree_clear(dn->dn_free_ranges[txgoff],
2261 			    db->db_blkid, 1);
2262 		}
2263 		mutex_exit(&dn->dn_mtx);
2264 		db->db_freed_in_flight = FALSE;
2265 	}
2266 
2267 	/*
2268 	 * This buffer is now part of this txg
2269 	 */
2270 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
2271 	db->db_dirtycnt += 1;
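	/* At most 3 txgs (open, quiescing, syncing) can be dirty at once. */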
2272 	ASSERT3U(db->db_dirtycnt, <=, 3);
2273 
2274 	mutex_exit(&db->db_mtx);
2275 
2276 	if (db->db_blkid == DMU_BONUS_BLKID ||
2277 	    db->db_blkid == DMU_SPILL_BLKID) {
2278 		mutex_enter(&dn->dn_mtx);
2279 		ASSERT(!list_link_active(&dr->dr_dirty_node));
2280 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2281 		mutex_exit(&dn->dn_mtx);
2282 		dnode_setdirty(dn, tx);
2283 		DB_DNODE_EXIT(db);
2284 		return (dr);
2285 	}
2286 
2287 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
2288 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
2289 		drop_struct_rwlock = B_TRUE;
2290 	}
2291 
2292 	/*
2293 	 * If we are overwriting a dedup BP, then unless it is snapshotted,
2294 	 * when we get to syncing context we will need to decrement its
2295 	 * refcount in the DDT.  Prefetch the relevant DDT block so that
2296 	 * syncing context won't have to wait for the i/o.
2297 	 */
2298 	if (db->db_blkptr != NULL) {
2299 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2300 		ddt_prefetch(os->os_spa, db->db_blkptr);
2301 		dmu_buf_unlock_parent(db, dblt, FTAG);
2302 	}
2303 
2304 	/*
2305 	 * We need to hold the dn_struct_rwlock to make this assertion,
2306 	 * because it protects dn_phys / dn_next_nlevels from changing.
2307 	 */
2308 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
2309 	    dn->dn_phys->dn_nlevels > db->db_level ||
2310 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
2311 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
2312 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
2313 
2314 
2315 	if (db->db_level == 0) {
2316 		ASSERT(!db->db_objset->os_raw_receive ||
2317 		    dn->dn_maxblkid >= db->db_blkid);
2318 		dnode_new_blkid(dn, db->db_blkid, tx,
2319 		    drop_struct_rwlock, B_FALSE);
2320 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
2321 	}
2322 
2323 	if (db->db_level+1 < dn->dn_nlevels) {
2324 		dmu_buf_impl_t *parent = db->db_parent;
2325 		dbuf_dirty_record_t *di;
2326 		int parent_held = FALSE;
2327 
2328 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
2329 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2330 			parent = dbuf_hold_level(dn, db->db_level + 1,
2331 			    db->db_blkid >> epbs, FTAG);
2332 			ASSERT(parent != NULL);
2333 			parent_held = TRUE;
2334 		}
2335 		if (drop_struct_rwlock)
2336 			rw_exit(&dn->dn_struct_rwlock);
2337 		ASSERT3U(db->db_level + 1, ==, parent->db_level);
2338 		di = dbuf_dirty(parent, tx);
2339 		if (parent_held)
2340 			dbuf_rele(parent, FTAG);
2341 
2342 		mutex_enter(&db->db_mtx);
2343 		/*
2344 		 * Since we've dropped the mutex, it's possible that
2345 		 * dbuf_undirty() might have changed this out from under us.
2346 		 */
2347 		if (list_head(&db->db_dirty_records) == dr ||
2348 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
2349 			mutex_enter(&di->dt.di.dr_mtx);
2350 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
2351 			ASSERT(!list_link_active(&dr->dr_dirty_node));
2352 			list_insert_tail(&di->dt.di.dr_children, dr);
2353 			mutex_exit(&di->dt.di.dr_mtx);
2354 			dr->dr_parent = di;
2355 		}
2356 		mutex_exit(&db->db_mtx);
2357 	} else {
2358 		ASSERT(db->db_level + 1 == dn->dn_nlevels);
2359 		ASSERT(db->db_blkid < dn->dn_nblkptr);
2360 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
2361 		mutex_enter(&dn->dn_mtx);
2362 		ASSERT(!list_link_active(&dr->dr_dirty_node));
2363 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2364 		mutex_exit(&dn->dn_mtx);
2365 		if (drop_struct_rwlock)
2366 			rw_exit(&dn->dn_struct_rwlock);
2367 	}
2368 
2369 	dnode_setdirty(dn, tx);
2370 	DB_DNODE_EXIT(db);
2371 	return (dr);
2372 }
2373 
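/*
 * Tear down a bonus buffer's dirty record: free the private copy of the
 * bonus data (if dbuf_fix_old_data() made one), remove the record from
 * the dbuf's dirty list and decrement its dirty count.
 */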
2374 static void
2375 dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
2376 {
2377 	dmu_buf_impl_t *db = dr->dr_dbuf;
2378 
2379 	if (dr->dt.dl.dr_data != db->db.db_data) {
2380 		struct dnode *dn = dr->dr_dnode;
2381 		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
2382 
2383 		kmem_free(dr->dt.dl.dr_data, max_bonuslen);
2384 		arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
2385 	}
2386 	db->db_data_pending = NULL;
2387 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
2388 	list_remove(&db->db_dirty_records, dr);
2389 	if (dr->dr_dbuf->db_level != 0) {
2390 		mutex_destroy(&dr->dt.di.dr_mtx);
2391 		list_destroy(&dr->dt.di.dr_children);
2392 	}
2393 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2394 	ASSERT3U(db->db_dirtycnt, >, 0);
2395 	db->db_dirtycnt -= 1;
2396 }
2397 
2398 /*
2399  * Undirty a buffer in the transaction group referenced by the given
2400  * transaction.  Return whether this evicted the dbuf.
2401  */
2402 static boolean_t
2403 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2404 {
2405 	uint64_t txg = tx->tx_txg;
2406 
2407 	ASSERT(txg != 0);
2408 
2409 	/*
2410 	 * Due to our use of dn_nlevels below, this can only be called
2411 	 * in open context, unless we are operating on the MOS.
2412 	 * From syncing context, dn_nlevels may be different from the
2413 	 * dn_nlevels used when dbuf was dirtied.
2414 	 */
2415 	ASSERT(db->db_objset ==
2416 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
2417 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
2418 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2419 	ASSERT0(db->db_level);
2420 	ASSERT(MUTEX_HELD(&db->db_mtx));
2421 
2422 	/*
2423 	 * If this buffer is not dirty, we're done.
2424 	 */
2425 	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
2426 	if (dr == NULL)
2427 		return (B_FALSE);
2428 	ASSERT(dr->dr_dbuf == db);
2429 
2430 	dnode_t *dn = dr->dr_dnode;
2431 
2432 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2433 
2434 	ASSERT(db->db.db_size != 0);
2435 
2436 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
2437 	    dr->dr_accounted, txg);
2438 
2439 	list_remove(&db->db_dirty_records, dr);
2440 
2441 	/*
2442 	 * Note that there are three places in dbuf_dirty()
2443 	 * where this dirty record may be put on a list.
2444 	 * Make sure to do a list_remove corresponding to
2445 	 * every one of those list_insert calls.
2446 	 */
2447 	if (dr->dr_parent) {
2448 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
2449 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
2450 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
2451 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
2452 	    db->db_level + 1 == dn->dn_nlevels) {
2453 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
2454 		mutex_enter(&dn->dn_mtx);
2455 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
2456 		mutex_exit(&dn->dn_mtx);
2457 	}
2458 
2459 	if (db->db_state != DB_NOFILL) {
2460 		dbuf_unoverride(dr);
2461 
2462 		ASSERT(db->db_buf != NULL);
2463 		ASSERT(dr->dt.dl.dr_data != NULL);
2464 		if (dr->dt.dl.dr_data != db->db_buf)
2465 			arc_buf_destroy(dr->dt.dl.dr_data, db);
2466 	}
2467 
2468 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2469 
2470 	ASSERT(db->db_dirtycnt > 0);
2471 	db->db_dirtycnt -= 1;
2472 
2473 	if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
2474 		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
2475 		dbuf_destroy(db);
2476 		return (B_TRUE);
2477 	}
2478 
2479 	return (B_FALSE);
2480 }
2481 
2482 static void
2483 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
2484 {
2485 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2486 
2487 	ASSERT(tx->tx_txg != 0);
2488 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2489 
2490 	/*
2491 	 * Quick check for dirtiness.  For already dirty blocks, this
2492 	 * reduces the runtime of this function by >90%, and improves overall
2493 	 * performance by 50% for some workloads (e.g. file deletion with
2494 	 * indirect blocks cached).
2495 	 */
2496 	mutex_enter(&db->db_mtx);
2497 
2498 	if (db->db_state == DB_CACHED) {
2499 		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2500 		/*
2501 		 * It's possible that it is already dirty but not cached,
2502 		 * because there are some calls to dbuf_dirty() that don't
2503 		 * go through dmu_buf_will_dirty().
2504 		 */
2505 		if (dr != NULL) {
2506 			/* This dbuf is already dirty and cached. */
2507 			dbuf_redirty(dr);
2508 			mutex_exit(&db->db_mtx);
2509 			return;
2510 		}
2511 	}
2512 	mutex_exit(&db->db_mtx);
2513 
2514 	DB_DNODE_ENTER(db);
2515 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2516 		flags |= DB_RF_HAVESTRUCT;
2517 	DB_DNODE_EXIT(db);
2518 	(void) dbuf_read(db, NULL, flags);
2519 	(void) dbuf_dirty(db, tx);
2520 }
2521 
2522 void
2523 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2524 {
2525 	dmu_buf_will_dirty_impl(db_fake,
2526 	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
2527 }
2528 
2529 boolean_t
2530 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2531 {
2532 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2533 	dbuf_dirty_record_t *dr;
2534 
2535 	mutex_enter(&db->db_mtx);
2536 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2537 	mutex_exit(&db->db_mtx);
2538 	return (dr != NULL);
2539 }
2540 
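/*
 * Dirty the dbuf without attaching a data buffer: the block's contents
 * will be supplied by other means (e.g. an override or embedded block
 * pointer, as in dmu_buf_write_embedded() and dmu_buf_redact()), so
 * nothing is read or filled in.
 */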
2541 void
2542 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2543 {
2544 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2545 
2546 	db->db_state = DB_NOFILL;
2547 	DTRACE_SET_STATE(db, "allocating NOFILL buffer");
2548 	dmu_buf_will_fill(db_fake, tx);
2549 }
2550 
2551 void
2552 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2553 {
2554 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2555 
2556 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2557 	ASSERT(tx->tx_txg != 0);
2558 	ASSERT(db->db_level == 0);
2559 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2560 
2561 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
2562 	    dmu_tx_private_ok(tx));
2563 
2564 	dbuf_noread(db);
2565 	(void) dbuf_dirty(db, tx);
2566 }
2567 
2568 /*
2569  * This function is effectively the same as dmu_buf_will_dirty(), but
2570  * indicates the caller expects raw encrypted data in the db, and provides
2571  * the crypt params (byteorder, salt, iv, mac) which should be stored in the
2572  * blkptr_t when this dbuf is written.  This is only used for blocks of
2573  * dnodes, during raw receive.
2574  */
2575 void
2576 dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
2577     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
2578 {
2579 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2580 	dbuf_dirty_record_t *dr;
2581 
2582 	/*
2583 	 * dr_has_raw_params is only processed for blocks of dnodes
2584 	 * (see dbuf_sync_dnode_leaf_crypt()).
2585 	 */
2586 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
2587 	ASSERT3U(db->db_level, ==, 0);
2588 	ASSERT(db->db_objset->os_raw_receive);
2589 
2590 	dmu_buf_will_dirty_impl(db_fake,
2591 	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
2592 
2593 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2594 
2595 	ASSERT3P(dr, !=, NULL);
2596 
2597 	dr->dt.dl.dr_has_raw_params = B_TRUE;
2598 	dr->dt.dl.dr_byteorder = byteorder;
2599 	bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
2600 	bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
2601 	bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
2602 }
2603 
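/*
 * Mark the current txg's dirty record as overridden by the given block
 * pointer, so the dbuf's contents will not be written; the supplied bp
 * is used instead when the txg syncs.
 */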
2604 static void
2605 dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
2606 {
2607 	struct dirty_leaf *dl;
2608 	dbuf_dirty_record_t *dr;
2609 
2610 	dr = list_head(&db->db_dirty_records);
2611 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2612 	dl = &dr->dt.dl;
2613 	dl->dr_overridden_by = *bp;
2614 	dl->dr_override_state = DR_OVERRIDDEN;
2615 	dl->dr_overridden_by.blk_birth = dr->dr_txg;
2616 }
2617 
2618 /* ARGSUSED */
2619 void
2620 dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
2621 {
2622 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2623 	dbuf_states_t old_state;
2624 	mutex_enter(&db->db_mtx);
2625 	DBUF_VERIFY(db);
2626 
2627 	old_state = db->db_state;
2628 	db->db_state = DB_CACHED;
2629 	if (old_state == DB_FILL) {
2630 		if (db->db_level == 0 && db->db_freed_in_flight) {
2631 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2632 			/* we were freed while filling */
2633 			/* XXX dbuf_undirty? */
2634 			bzero(db->db.db_data, db->db.db_size);
2635 			db->db_freed_in_flight = FALSE;
2636 			DTRACE_SET_STATE(db,
2637 			    "fill done handling freed in flight");
2638 		} else {
2639 			DTRACE_SET_STATE(db, "fill done");
2640 		}
2641 		cv_broadcast(&db->db_changed);
2642 	}
2643 	mutex_exit(&db->db_mtx);
2644 }
2645 
2646 void
2647 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
2648     bp_embedded_type_t etype, enum zio_compress comp,
2649     int uncompressed_size, int compressed_size, int byteorder,
2650     dmu_tx_t *tx)
2651 {
2652 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2653 	struct dirty_leaf *dl;
2654 	dmu_object_type_t type;
2655 	dbuf_dirty_record_t *dr;
2656 
2657 	if (etype == BP_EMBEDDED_TYPE_DATA) {
2658 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
2659 		    SPA_FEATURE_EMBEDDED_DATA));
2660 	}
2661 
2662 	DB_DNODE_ENTER(db);
2663 	type = DB_DNODE(db)->dn_type;
2664 	DB_DNODE_EXIT(db);
2665 
2666 	ASSERT0(db->db_level);
2667 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2668 
2669 	dmu_buf_will_not_fill(dbuf, tx);
2670 
2671 	dr = list_head(&db->db_dirty_records);
2672 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2673 	dl = &dr->dt.dl;
2674 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
2675 	    data, comp, uncompressed_size, compressed_size);
2676 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
2677 	BP_SET_TYPE(&dl->dr_overridden_by, type);
2678 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
2679 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
2680 
2681 	dl->dr_override_state = DR_OVERRIDDEN;
2682 	dl->dr_overridden_by.blk_birth = dr->dr_txg;
2683 }
2684 
2685 void
2686 dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
2687 {
2688 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2689 	dmu_object_type_t type;
2690 	ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
2691 	    SPA_FEATURE_REDACTED_DATASETS));
2692 
2693 	DB_DNODE_ENTER(db);
2694 	type = DB_DNODE(db)->dn_type;
2695 	DB_DNODE_EXIT(db);
2696 
2697 	ASSERT0(db->db_level);
2698 	dmu_buf_will_not_fill(dbuf, tx);
2699 
2700 	blkptr_t bp = { { { {0} } } };
2701 	BP_SET_TYPE(&bp, type);
2702 	BP_SET_LEVEL(&bp, 0);
2703 	BP_SET_BIRTH(&bp, tx->tx_txg, 0);
2704 	BP_SET_REDACTED(&bp);
2705 	BPE_SET_LSIZE(&bp, dbuf->db_size);
2706 
2707 	dbuf_override_impl(db, &bp, tx);
2708 }
2709 
2710 /*
2711  * Directly assign a provided arc buf to a given dbuf if it's not referenced
2712  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
2713  */
2714 void
2715 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
2716 {
2717 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2718 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2719 	ASSERT(db->db_level == 0);
2720 	ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
2721 	ASSERT(buf != NULL);
2722 	ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
2723 	ASSERT(tx->tx_txg != 0);
2724 
2725 	arc_return_buf(buf, db);
2726 	ASSERT(arc_released(buf));
2727 
2728 	mutex_enter(&db->db_mtx);
2729 
2730 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
2731 		cv_wait(&db->db_changed, &db->db_mtx);
2732 
2733 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
2734 
2735 	if (db->db_state == DB_CACHED &&
2736 	    zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
2737 		/*
2738 		 * In practice, we will never have a case where we have an
2739 		 * encrypted arc buffer while additional holds exist on the
2740 		 * dbuf. We don't handle this here so we simply assert that
2741 		 * fact instead.
2742 		 */
2743 		ASSERT(!arc_is_encrypted(buf));
2744 		mutex_exit(&db->db_mtx);
2745 		(void) dbuf_dirty(db, tx);
2746 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
2747 		arc_buf_destroy(buf, db);
2748 		return;
2749 	}
2750 
2751 	if (db->db_state == DB_CACHED) {
2752 		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
2753 
2754 		ASSERT(db->db_buf != NULL);
2755 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
2756 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
2757 
2758 			if (!arc_released(db->db_buf)) {
2759 				ASSERT(dr->dt.dl.dr_override_state ==
2760 				    DR_OVERRIDDEN);
2761 				arc_release(db->db_buf, db);
2762 			}
2763 			dr->dt.dl.dr_data = buf;
2764 			arc_buf_destroy(db->db_buf, db);
2765 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
2766 			arc_release(db->db_buf, db);
2767 			arc_buf_destroy(db->db_buf, db);
2768 		}
2769 		db->db_buf = NULL;
2770 	}
2771 	ASSERT(db->db_buf == NULL);
2772 	dbuf_set_data(db, buf);
2773 	db->db_state = DB_FILL;
2774 	DTRACE_SET_STATE(db, "filling assigned arcbuf");
2775 	mutex_exit(&db->db_mtx);
2776 	(void) dbuf_dirty(db, tx);
2777 	dmu_buf_fill_done(&db->db, tx);
2778 }
2779 
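/*
 * Tear down a dbuf that has no remaining holds: free its data, remove
 * it from the dbuf cache, the hash table and the dnode's dbuf tree, and
 * drop the holds it had on its parent dbuf and on the dnode.
 */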
2780 void
2781 dbuf_destroy(dmu_buf_impl_t *db)
2782 {
2783 	dnode_t *dn;
2784 	dmu_buf_impl_t *parent = db->db_parent;
2785 	dmu_buf_impl_t *dndb;
2786 
2787 	ASSERT(MUTEX_HELD(&db->db_mtx));
2788 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
2789 
2790 	if (db->db_buf != NULL) {
2791 		arc_buf_destroy(db->db_buf, db);
2792 		db->db_buf = NULL;
2793 	}
2794 
2795 	if (db->db_blkid == DMU_BONUS_BLKID) {
2796 		int slots = DB_DNODE(db)->dn_num_slots;
2797 		int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
2798 		if (db->db.db_data != NULL) {
2799 			kmem_free(db->db.db_data, bonuslen);
2800 			arc_space_return(bonuslen, ARC_SPACE_BONUS);
2801 			db->db_state = DB_UNCACHED;
2802 			DTRACE_SET_STATE(db, "buffer cleared");
2803 		}
2804 	}
2805 
2806 	dbuf_clear_data(db);
2807 
2808 	if (multilist_link_active(&db->db_cache_link)) {
2809 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2810 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
2811 
2812 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
2813 		(void) zfs_refcount_remove_many(
2814 		    &dbuf_caches[db->db_caching_status].size,
2815 		    db->db.db_size, db);
2816 
2817 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
2818 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
2819 		} else {
2820 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
2821 			DBUF_STAT_BUMPDOWN(cache_count);
2822 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
2823 			    db->db.db_size);
2824 		}
2825 		db->db_caching_status = DB_NO_CACHE;
2826 	}
2827 
2828 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2829 	ASSERT(db->db_data_pending == NULL);
2830 	ASSERT(list_is_empty(&db->db_dirty_records));
2831 
2832 	db->db_state = DB_EVICTING;
2833 	DTRACE_SET_STATE(db, "buffer eviction started");
2834 	db->db_blkptr = NULL;
2835 
2836 	/*
2837 	 * Now that db_state is DB_EVICTING, nobody else can find this via
2838 	 * the hash table.  We can now drop db_mtx, which allows us to
2839 	 * acquire the dn_dbufs_mtx.
2840 	 */
2841 	mutex_exit(&db->db_mtx);
2842 
2843 	DB_DNODE_ENTER(db);
2844 	dn = DB_DNODE(db);
2845 	dndb = dn->dn_dbuf;
2846 	if (db->db_blkid != DMU_BONUS_BLKID) {
2847 		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2848 		if (needlock)
2849 			mutex_enter_nested(&dn->dn_dbufs_mtx,
2850 			    NESTED_SINGLE);
2851 		avl_remove(&dn->dn_dbufs, db);
2852 		membar_producer();
2853 		DB_DNODE_EXIT(db);
2854 		if (needlock)
2855 			mutex_exit(&dn->dn_dbufs_mtx);
2856 		/*
2857 		 * Decrementing the dbuf count means that the hold corresponding
2858 		 * to the removed dbuf is no longer discounted in dnode_move(),
2859 		 * so the dnode cannot be moved until after we release the hold.
2860 		 * The membar_producer() ensures visibility of the decremented
2861 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2862 		 * release any lock.
2863 		 */
2864 		mutex_enter(&dn->dn_mtx);
2865 		dnode_rele_and_unlock(dn, db, B_TRUE);
2866 		db->db_dnode_handle = NULL;
2867 
2868 		dbuf_hash_remove(db);
2869 	} else {
2870 		DB_DNODE_EXIT(db);
2871 	}
2872 
2873 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
2874 
2875 	db->db_parent = NULL;
2876 
2877 	ASSERT(db->db_buf == NULL);
2878 	ASSERT(db->db.db_data == NULL);
2879 	ASSERT(db->db_hash_next == NULL);
2880 	ASSERT(db->db_blkptr == NULL);
2881 	ASSERT(db->db_data_pending == NULL);
2882 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
2883 	ASSERT(!multilist_link_active(&db->db_cache_link));
2884 
2885 	kmem_cache_free(dbuf_kmem_cache, db);
2886 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
2887 
2888 	/*
2889 	 * If this dbuf is referenced from an indirect dbuf,
2890 	 * decrement the ref count on the indirect dbuf.
2891 	 */
2892 	if (parent && parent != dndb) {
2893 		mutex_enter(&parent->db_mtx);
2894 		dbuf_rele_and_unlock(parent, db, B_TRUE);
2895 	}
2896 }
2897 
2898 /*
2899  * Note: While bpp will always be updated if the function returns success,
2900  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2901  * this happens when the dnode is the meta-dnode, or {user|group|project}used
2902  * object.
2903  */
2904 __attribute__((always_inline))
2905 static inline int
2906 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
2907     dmu_buf_impl_t **parentp, blkptr_t **bpp)
2908 {
2909 	*parentp = NULL;
2910 	*bpp = NULL;
2911 
2912 	ASSERT(blkid != DMU_BONUS_BLKID);
2913 
2914 	if (blkid == DMU_SPILL_BLKID) {
2915 		mutex_enter(&dn->dn_mtx);
2916 		if (dn->dn_have_spill &&
2917 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2918 			*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
2919 		else
2920 			*bpp = NULL;
2921 		dbuf_add_ref(dn->dn_dbuf, NULL);
2922 		*parentp = dn->dn_dbuf;
2923 		mutex_exit(&dn->dn_mtx);
2924 		return (0);
2925 	}
2926 
2927 	int nlevels =
2928 	    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
2929 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2930 
2931 	ASSERT3U(level * epbs, <, 64);
2932 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2933 	/*
2934 	 * This assertion shouldn't trip as long as the max indirect block size
2935 	 * is less than 1M.  The reason for this is that up to that point,
2936 	 * the number of levels required to address an entire object with blocks
2937 	 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.  In
2938 	 * other words, even if N * epbs + 1 > 64, as long as we also have
2939 	 * (N-1) * epbs + 1 > 55 (i.e. N-1 levels can address the entire
2940 	 * object), objects will use at most N-1 levels and the assertion
2941 	 * won't overflow.  However, once epbs is 13, 4 * 13 + 1 = 53 but
2942 	 * 5 * 13 + 1 = 66: 4 levels are then not enough to address an entire
2943 	 * object, so objects will have 5 levels and this assertion overflows.
2944 	 *
2945 	 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
2946 	 * need to redo this logic to handle overflows.
2947 	 */
2948 	ASSERT(level >= nlevels ||
2949 	    ((nlevels - level - 1) * epbs) +
2950 	    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
2951 	if (level >= nlevels ||
2952 	    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
2953 	    ((nlevels - level - 1) * epbs)) ||
2954 	    (fail_sparse &&
2955 	    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
2956 		/* the buffer has no parent yet */
2957 		return (SET_ERROR(ENOENT));
2958 	} else if (level < nlevels-1) {
2959 		/* this block is referenced from an indirect block */
2960 		int err;
2961 
2962 		err = dbuf_hold_impl(dn, level + 1,
2963 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
2964 
2965 		if (err)
2966 			return (err);
2967 		err = dbuf_read(*parentp, NULL,
2968 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2969 		if (err) {
2970 			dbuf_rele(*parentp, NULL);
2971 			*parentp = NULL;
2972 			return (err);
2973 		}
2974 		rw_enter(&(*parentp)->db_rwlock, RW_READER);
2975 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
2976 		    (blkid & ((1ULL << epbs) - 1));
2977 		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
2978 			ASSERT(BP_IS_HOLE(*bpp));
2979 		rw_exit(&(*parentp)->db_rwlock);
2980 		return (0);
2981 	} else {
2982 		/* the block is referenced from the dnode */
2983 		ASSERT3U(level, ==, nlevels-1);
2984 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
2985 		    blkid < dn->dn_phys->dn_nblkptr);
2986 		if (dn->dn_dbuf) {
2987 			dbuf_add_ref(dn->dn_dbuf, NULL);
2988 			*parentp = dn->dn_dbuf;
2989 		}
2990 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
2991 		return (0);
2992 	}
2993 }
2994 
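/*
 * Allocate and initialize a dbuf for the given block of this dnode.
 * All dbufs except the bonus buffer are inserted into the hash table
 * and the dnode's dbuf tree; if another thread created the same dbuf
 * first, ours is freed and the existing one is returned.
 */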
2995 static dmu_buf_impl_t *
2996 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
2997     dmu_buf_impl_t *parent, blkptr_t *blkptr)
2998 {
2999 	objset_t *os = dn->dn_objset;
3000 	dmu_buf_impl_t *db, *odb;
3001 
3002 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3003 	ASSERT(dn->dn_type != DMU_OT_NONE);
3004 
3005 	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
3006 
3007 	list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
3008 	    offsetof(dbuf_dirty_record_t, dr_dbuf_node));
3009 
3010 	db->db_objset = os;
3011 	db->db.db_object = dn->dn_object;
3012 	db->db_level = level;
3013 	db->db_blkid = blkid;
3014 	db->db_dirtycnt = 0;
3015 	db->db_dnode_handle = dn->dn_handle;
3016 	db->db_parent = parent;
3017 	db->db_blkptr = blkptr;
3018 
3019 	db->db_user = NULL;
3020 	db->db_user_immediate_evict = FALSE;
3021 	db->db_freed_in_flight = FALSE;
3022 	db->db_pending_evict = FALSE;
3023 
3024 	if (blkid == DMU_BONUS_BLKID) {
3025 		ASSERT3P(parent, ==, dn->dn_dbuf);
3026 		db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
3027 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
3028 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
3029 		db->db.db_offset = DMU_BONUS_BLKID;
3030 		db->db_state = DB_UNCACHED;
3031 		DTRACE_SET_STATE(db, "bonus buffer created");
3032 		db->db_caching_status = DB_NO_CACHE;
3033 		/* the bonus dbuf is not placed in the hash table */
3034 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3035 		return (db);
3036 	} else if (blkid == DMU_SPILL_BLKID) {
3037 		db->db.db_size = (blkptr != NULL) ?
3038 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
3039 		db->db.db_offset = 0;
3040 	} else {
3041 		int blocksize =
3042 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
3043 		db->db.db_size = blocksize;
3044 		db->db.db_offset = db->db_blkid * blocksize;
3045 	}
3046 
3047 	/*
3048 	 * Hold the dn_dbufs_mtx while we insert the new dbuf
3049 	 * into the hash table *and* add it to the dn_dbufs list.
3050 	 * This prevents a possible deadlock with someone
3051 	 * trying to look up this dbuf before it's added to the
3052 	 * dn_dbufs list.
3053 	 */
3054 	mutex_enter(&dn->dn_dbufs_mtx);
3055 	db->db_state = DB_EVICTING; /* not worth logging this state change */
3056 	if ((odb = dbuf_hash_insert(db)) != NULL) {
3057 		/* someone else inserted it first */
3058 		mutex_exit(&dn->dn_dbufs_mtx);
3059 		kmem_cache_free(dbuf_kmem_cache, db);
3060 		DBUF_STAT_BUMP(hash_insert_race);
3061 		return (odb);
3062 	}
3063 	avl_add(&dn->dn_dbufs, db);
3064 
3065 	db->db_state = DB_UNCACHED;
3066 	DTRACE_SET_STATE(db, "regular buffer created");
3067 	db->db_caching_status = DB_NO_CACHE;
3068 	mutex_exit(&dn->dn_dbufs_mtx);
3069 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
3070 
3071 	if (parent && parent != dn->dn_dbuf)
3072 		dbuf_add_ref(parent, db);
3073 
3074 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
3075 	    zfs_refcount_count(&dn->dn_holds) > 0);
3076 	(void) zfs_refcount_add(&dn->dn_holds, db);
3077 
3078 	dprintf_dbuf(db, "db=%p\n", db);
3079 
3080 	return (db);
3081 }
3082 
3083 /*
3084  * This function returns a block pointer and information about the object,
3085  * given a dnode and a block.  This is a publicly accessible version of
3086  * dbuf_findbp that only returns some information, rather than the
3087  * dbuf.  Note that the dnode passed in must be held, and the dn_struct_rwlock
3088  * should be locked as (at least) a reader.
3089  */
3090 int
3091 dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
3092     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
3093 {
3094 	dmu_buf_impl_t *dbp = NULL;
3095 	blkptr_t *bp2;
3096 	int err = 0;
3097 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3098 
3099 	err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
3100 	if (err == 0) {
3101 		*bp = *bp2;
3102 		if (dbp != NULL)
3103 			dbuf_rele(dbp, NULL);
3104 		if (datablkszsec != NULL)
3105 			*datablkszsec = dn->dn_phys->dn_datablkszsec;
3106 		if (indblkshift != NULL)
3107 			*indblkshift = dn->dn_phys->dn_indblkshift;
3108 	}
3109 
3110 	return (err);
3111 }
3112 
3113 typedef struct dbuf_prefetch_arg {
3114 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
3115 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
3116 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
3117 	int dpa_curlevel; /* The current level that we're reading */
3118 	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
3119 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
3120 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
3121 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
3122 	dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
3123 	void *dpa_arg; /* prefetch completion arg */
3124 } dbuf_prefetch_arg_t;
3125 
3126 static void
3127 dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
3128 {
3129 	if (dpa->dpa_cb != NULL)
3130 		dpa->dpa_cb(dpa->dpa_arg, io_done);
3131 	kmem_free(dpa, sizeof (*dpa));
3132 }
3133 
3134 static void
3135 dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
3136     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3137 {
3138 	dbuf_prefetch_arg_t *dpa = private;
3139 
3140 	dbuf_prefetch_fini(dpa, B_TRUE);
3141 	if (abuf != NULL)
3142 		arc_buf_destroy(abuf, private);
3143 }
3144 
3145 /*
3146  * Actually issue the prefetch read for the block given.
3147  */
3148 static void
3149 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
3150 {
3151 	ASSERT(!BP_IS_REDACTED(bp) ||
3152 	    dsl_dataset_feature_is_active(
3153 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3154 	    SPA_FEATURE_REDACTED_DATASETS));
3155 
3156 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
3157 		return (dbuf_prefetch_fini(dpa, B_FALSE));
3158 
3159 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
3160 	arc_flags_t aflags =
3161 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
3162 	    ARC_FLAG_NO_BUF;
3163 
3164 	/* dnodes are always read as raw and then converted later */
3165 	if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
3166 	    dpa->dpa_curlevel == 0)
3167 		zio_flags |= ZIO_FLAG_RAW;
3168 
3169 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3170 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
3171 	ASSERT(dpa->dpa_zio != NULL);
3172 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
3173 	    dbuf_issue_final_prefetch_done, dpa,
3174 	    dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
3175 }
3176 
3177 /*
3178  * Called when an indirect block above our prefetch target is read in.  This
3179  * will either read in the next indirect block down the tree or issue the actual
3180  * prefetch if the next block down is our target.
3181  */
3182 static void
3183 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
3184     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
3185 {
3186 	dbuf_prefetch_arg_t *dpa = private;
3187 
3188 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
3189 	ASSERT3S(dpa->dpa_curlevel, >, 0);
3190 
3191 	if (abuf == NULL) {
3192 		ASSERT(zio == NULL || zio->io_error != 0);
3193 		return (dbuf_prefetch_fini(dpa, B_TRUE));
3194 	}
3195 	ASSERT(zio == NULL || zio->io_error == 0);
3196 
3197 	/*
3198 	 * The dpa_dnode is only valid if we are called with a NULL
3199 	 * zio. This indicates that the arc_read() returned without
3200 	 * first calling zio_read() to issue a physical read. Once
3201 	 * a physical read is made the dpa_dnode must be invalidated
3202 	 * as the locks guarding it may have been dropped. If the
3203 	 * dpa_dnode is still valid, then we want to add it to the dbuf
3204 	 * cache. To do so, we must hold the dbuf associated with the block
3205 	 * we just prefetched, read its contents so that we associate it
3206 	 * with an arc_buf_t, and then release it.
3207 	 */
3208 	if (zio != NULL) {
3209 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
3210 		if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
3211 			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
3212 		} else {
3213 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
3214 		}
3215 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
3216 
3217 		dpa->dpa_dnode = NULL;
3218 	} else if (dpa->dpa_dnode != NULL) {
3219 		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
3220 		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
3221 		    dpa->dpa_zb.zb_level));
3222 		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
3223 		    dpa->dpa_curlevel, curblkid, FTAG);
3224 		if (db == NULL) {
3225 			arc_buf_destroy(abuf, private);
3226 			return (dbuf_prefetch_fini(dpa, B_TRUE));
3227 		}
3228 		(void) dbuf_read(db, NULL,
3229 		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
3230 		dbuf_rele(db, FTAG);
3231 	}
3232 
3233 	dpa->dpa_curlevel--;
3234 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
3235 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
3236 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
3237 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
3238 
3239 	ASSERT(!BP_IS_REDACTED(bp) ||
3240 	    dsl_dataset_feature_is_active(
3241 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3242 	    SPA_FEATURE_REDACTED_DATASETS));
3243 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
3244 		dbuf_prefetch_fini(dpa, B_TRUE);
3245 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
3246 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
3247 		dbuf_issue_final_prefetch(dpa, bp);
3248 	} else {
3249 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3250 		zbookmark_phys_t zb;
3251 
3252 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
3253 		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
3254 			iter_aflags |= ARC_FLAG_L2CACHE;
3255 
3256 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3257 
3258 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
3259 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
3260 
3261 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3262 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
3263 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3264 		    &iter_aflags, &zb);
3265 	}
3266 
3267 	arc_buf_destroy(abuf, private);
3268 }
3269 
3270 /*
3271  * Issue prefetch reads for the given block on the given level.  If the indirect
3272  * blocks above that block are not in memory, we will read them in
3273  * asynchronously.  As a result, this call never blocks waiting for a read to
3274  * complete. Note that the prefetch might fail if the dataset is encrypted and
3275  * the encryption key is unmapped before the IO completes.
3276  */
3277 int
3278 dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
3279     zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
3280     void *arg)
3281 {
3282 	blkptr_t bp;
3283 	int epbs, nlevels, curlevel;
3284 	uint64_t curblkid;
3285 
3286 	ASSERT(blkid != DMU_BONUS_BLKID);
3287 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3288 
3289 	if (blkid > dn->dn_maxblkid)
3290 		goto no_issue;
3291 
3292 	if (level == 0 && dnode_block_freed(dn, blkid))
3293 		goto no_issue;
3294 
3295 	/*
3296 	 * This dnode hasn't been written to disk yet, so there's nothing to
3297 	 * prefetch.
3298 	 */
3299 	nlevels = dn->dn_phys->dn_nlevels;
3300 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
3301 		goto no_issue;
3302 
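	/*
	 * Make sure the on-disk dnode actually covers the block being
	 * prefetched; otherwise there is nothing on disk to read.
	 */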
3303 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3304 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
3305 		goto no_issue;
3306 
3307 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
3308 	    level, blkid);
3309 	if (db != NULL) {
3310 		mutex_exit(&db->db_mtx);
3311 		/*
3312 		 * This dbuf already exists.  It is either CACHED, or
3313 		 * (we assume) about to be read or filled.
3314 		 */
3315 		goto no_issue;
3316 	}
3317 
3318 	/*
3319 	 * Find the closest ancestor (indirect block) of the target block
3320 	 * that is present in the cache.  In this indirect block, we will
3321 	 * find the bp that is at curlevel, curblkid.
3322 	 */
3323 	curlevel = level;
3324 	curblkid = blkid;
3325 	while (curlevel < nlevels - 1) {
3326 		int parent_level = curlevel + 1;
3327 		uint64_t parent_blkid = curblkid >> epbs;
3328 		dmu_buf_impl_t *db;
3329 
3330 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
3331 		    FALSE, TRUE, FTAG, &db) == 0) {
3332 			blkptr_t *bpp = db->db_buf->b_data;
3333 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
3334 			dbuf_rele(db, FTAG);
3335 			break;
3336 		}
3337 
3338 		curlevel = parent_level;
3339 		curblkid = parent_blkid;
3340 	}
3341 
3342 	if (curlevel == nlevels - 1) {
3343 		/* No cached indirect blocks found. */
3344 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
3345 		bp = dn->dn_phys->dn_blkptr[curblkid];
3346 	}
3347 	ASSERT(!BP_IS_REDACTED(&bp) ||
3348 	    dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
3349 	    SPA_FEATURE_REDACTED_DATASETS));
3350 	if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
3351 		goto no_issue;
3352 
3353 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
3354 
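	/*
	 * Set up a parent zio and the prefetch state that will track this
	 * prefetch as it walks down through the indirect blocks.
	 */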
3355 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
3356 	    ZIO_FLAG_CANFAIL);
3357 
3358 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
3359 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
3360 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3361 	    dn->dn_object, level, blkid);
3362 	dpa->dpa_curlevel = curlevel;
3363 	dpa->dpa_prio = prio;
3364 	dpa->dpa_aflags = aflags;
3365 	dpa->dpa_spa = dn->dn_objset->os_spa;
3366 	dpa->dpa_dnode = dn;
3367 	dpa->dpa_epbs = epbs;
3368 	dpa->dpa_zio = pio;
3369 	dpa->dpa_cb = cb;
3370 	dpa->dpa_arg = arg;
3371 
3372 	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
3373 	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
3374 		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
3375 
3376 	/*
3377 	 * If we have the indirect just above us, no need to do the asynchronous
3378 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
3379 	 * a higher level, though, we want to issue the prefetches for all the
3380 	 * indirect blocks asynchronously, so we can go on with whatever we were
3381 	 * doing.
3382 	 */
3383 	if (curlevel == level) {
3384 		ASSERT3U(curblkid, ==, blkid);
3385 		dbuf_issue_final_prefetch(dpa, &bp);
3386 	} else {
3387 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3388 		zbookmark_phys_t zb;
3389 
3390 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
3391 		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
3392 			iter_aflags |= ARC_FLAG_L2CACHE;
3393 
3394 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3395 		    dn->dn_object, curlevel, curblkid);
3396 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3397 		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
3398 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3399 		    &iter_aflags, &zb);
3400 	}
3401 	/*
3402 	 * We use pio here instead of dpa_zio since it's possible that
3403 	 * dpa may have already been freed.
3404 	 */
3405 	zio_nowait(pio);
3406 	return (1);
3407 no_issue:
3408 	if (cb != NULL)
3409 		cb(arg, B_FALSE);
3410 	return (0);
3411 }
3412 
3413 int
3414 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
3415     arc_flags_t aflags)
3416 {
3417 
3418 	return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
3419 }
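
/*
 * Illustrative caller sketch (hypothetical, not taken from this file):
 * a typical level-0 prefetch, assuming the caller already holds the dnode
 * and takes dn_struct_rwlock as asserted in dbuf_prefetch_impl():
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	(void) dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ,
 *	    ARC_FLAG_PREDICTIVE_PREFETCH);
 *	rw_exit(&dn->dn_struct_rwlock);
 */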
3420 
3421 /*
3422  * Helper function for dbuf_hold_impl() to copy a buffer. Handles
3423  * the case of encrypted, compressed and uncompressed buffers by
3424  * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
3425  * arc_alloc_compressed_buf() or arc_alloc_buf().
3426  *
3427  * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
3428  */
3429 noinline static void
3430 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
3431 {
3432 	dbuf_dirty_record_t *dr = db->db_data_pending;
3433 	arc_buf_t *data = dr->dt.dl.dr_data;
3434 	enum zio_compress compress_type = arc_get_compression(data);
3435 	uint8_t complevel = arc_get_complevel(data);
3436 
3437 	if (arc_is_encrypted(data)) {
3438 		boolean_t byteorder;
3439 		uint8_t salt[ZIO_DATA_SALT_LEN];
3440 		uint8_t iv[ZIO_DATA_IV_LEN];
3441 		uint8_t mac[ZIO_DATA_MAC_LEN];
3442 
3443 		arc_get_raw_params(data, &byteorder, salt, iv, mac);
3444 		dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
3445 		    dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
3446 		    dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
3447 		    compress_type, complevel));
3448 	} else if (compress_type != ZIO_COMPRESS_OFF) {
3449 		dbuf_set_data(db, arc_alloc_compressed_buf(
3450 		    dn->dn_objset->os_spa, db, arc_buf_size(data),
3451 		    arc_buf_lsize(data), compress_type, complevel));
3452 	} else {
3453 		dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
3454 		    DBUF_GET_BUFC_TYPE(db), db->db.db_size));
3455 	}
3456 
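	/*
	 * Copy the contents of the syncing buffer into the newly allocated
	 * buffer while holding db_rwlock as writer, so readers never see a
	 * partially copied db_data.
	 */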
3457 	rw_enter(&db->db_rwlock, RW_WRITER);
3458 	bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
3459 	rw_exit(&db->db_rwlock);
3460 }
3461 
3462 /*
3463  * Returns with db_holds incremented, and db_mtx not held.
3464  * Note: dn_struct_rwlock must be held.
3465  */
3466 int
3467 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
3468     boolean_t fail_sparse, boolean_t fail_uncached,
3469     void *tag, dmu_buf_impl_t **dbp)
3470 {
3471 	dmu_buf_impl_t *db, *parent = NULL;
3472 
3473 	/* If the pool has been created, verify the tx_sync_lock is not held */
3474 	spa_t *spa = dn->dn_objset->os_spa;
3475 	dsl_pool_t *dp = spa->spa_dsl_pool;
3476 	if (dp != NULL) {
3477 		ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
3478 	}
3479 
3480 	ASSERT(blkid != DMU_BONUS_BLKID);
3481 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3482 	ASSERT3U(dn->dn_nlevels, >, level);
3483 
3484 	*dbp = NULL;
3485 
3486 	/* dbuf_find() returns with db_mtx held */
3487 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
3488 
3489 	if (db == NULL) {
3490 		blkptr_t *bp = NULL;
3491 		int err;
3492 
3493 		if (fail_uncached)
3494 			return (SET_ERROR(ENOENT));
3495 
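		/*
		 * Find this dbuf's parent and on-disk block pointer so a
		 * new dbuf can be created for it.
		 */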
3496 		ASSERT3P(parent, ==, NULL);
3497 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
3498 		if (fail_sparse) {
3499 			if (err == 0 && bp && BP_IS_HOLE(bp))
3500 				err = SET_ERROR(ENOENT);
3501 			if (err) {
3502 				if (parent)
3503 					dbuf_rele(parent, NULL);
3504 				return (err);
3505 			}
3506 		}
3507 		if (err && err != ENOENT)
3508 			return (err);
3509 		db = dbuf_create(dn, level, blkid, parent, bp);
3510 	}
3511 
3512 	if (fail_uncached && db->db_state != DB_CACHED) {
3513 		mutex_exit(&db->db_mtx);
3514 		return (SET_ERROR(ENOENT));
3515 	}
3516 
3517 	if (db->db_buf != NULL) {
3518 		arc_buf_access(db->db_buf);
3519 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
3520 	}
3521 
3522 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
3523 
3524 	/*
3525 	 * If this buffer is currently syncing out, and we are
3526 	 * still referencing it from db_data, we need to make a copy
3527 	 * of it in case we decide we want to dirty it again in this txg.
3528 	 */
3529 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
3530 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
3531 	    db->db_state == DB_CACHED && db->db_data_pending) {
3532 		dbuf_dirty_record_t *dr = db->db_data_pending;
3533 		if (dr->dt.dl.dr_data == db->db_buf)
3534 			dbuf_hold_copy(dn, db);
3535 	}
3536 
3537 	if (multilist_link_active(&db->db_cache_link)) {
3538 		ASSERT(zfs_refcount_is_zero(&db->db_holds));
3539 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3540 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
3541 
3542 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
3543 		(void) zfs_refcount_remove_many(
3544 		    &dbuf_caches[db->db_caching_status].size,
3545 		    db->db.db_size, db);
3546 
3547 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
3548 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
3549 		} else {
3550 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
3551 			DBUF_STAT_BUMPDOWN(cache_count);
3552 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
3553 			    db->db.db_size);
3554 		}
3555 		db->db_caching_status = DB_NO_CACHE;
3556 	}
3557 	(void) zfs_refcount_add(&db->db_holds, tag);
3558 	DBUF_VERIFY(db);
3559 	mutex_exit(&db->db_mtx);
3560 
3561 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
3562 	if (parent)
3563 		dbuf_rele(parent, NULL);
3564 
3565 	ASSERT3P(DB_DNODE(db), ==, dn);
3566 	ASSERT3U(db->db_blkid, ==, blkid);
3567 	ASSERT3U(db->db_level, ==, level);
3568 	*dbp = db;
3569 
3570 	return (0);
3571 }
3572 
3573 dmu_buf_impl_t *
3574 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
3575 {
3576 	return (dbuf_hold_level(dn, 0, blkid, tag));
3577 }
3578 
3579 dmu_buf_impl_t *
3580 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
3581 {
3582 	dmu_buf_impl_t *db;
3583 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
3584 	return (err ? NULL : db);
3585 }
3586 
3587 void
3588 dbuf_create_bonus(dnode_t *dn)
3589 {
3590 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
3591 
3592 	ASSERT(dn->dn_bonus == NULL);
3593 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
3594 }
3595 
3596 int
3597 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
3598 {
3599 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3600 
3601 	if (db->db_blkid != DMU_SPILL_BLKID)
3602 		return (SET_ERROR(ENOTSUP));
3603 	if (blksz == 0)
3604 		blksz = SPA_MINBLOCKSIZE;
3605 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
3606 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
3607 
3608 	dbuf_new_size(db, blksz, tx);
3609 
3610 	return (0);
3611 }
3612 
3613 void
3614 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
3615 {
3616 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
3617 }
3618 
3619 #pragma weak dmu_buf_add_ref = dbuf_add_ref
3620 void
3621 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
3622 {
3623 	int64_t holds = zfs_refcount_add(&db->db_holds, tag);
3624 	VERIFY3S(holds, >, 1);
3625 }
3626 
3627 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
3628 boolean_t
3629 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
3630     void *tag)
3631 {
3632 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3633 	dmu_buf_impl_t *found_db;
3634 	boolean_t result = B_FALSE;
3635 
3636 	if (blkid == DMU_BONUS_BLKID)
3637 		found_db = dbuf_find_bonus(os, obj);
3638 	else
3639 		found_db = dbuf_find(os, obj, 0, blkid);
3640 
3641 	if (found_db != NULL) {
3642 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
3643 			(void) zfs_refcount_add(&db->db_holds, tag);
3644 			result = B_TRUE;
3645 		}
3646 		mutex_exit(&found_db->db_mtx);
3647 	}
3648 	return (result);
3649 }
3650 
3651 /*
3652  * If you call dbuf_rele() you had better not be referencing the dnode handle
3653  * unless you have some other direct or indirect hold on the dnode. (An indirect
3654  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
3655  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
3656  * dnode's parent dbuf evicting its dnode handles.
3657  */
3658 void
3659 dbuf_rele(dmu_buf_impl_t *db, void *tag)
3660 {
3661 	mutex_enter(&db->db_mtx);
3662 	dbuf_rele_and_unlock(db, tag, B_FALSE);
3663 }
3664 
3665 void
3666 dmu_buf_rele(dmu_buf_t *db, void *tag)
3667 {
3668 	dbuf_rele((dmu_buf_impl_t *)db, tag);
3669 }
3670 
3671 /*
3672  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
3673  * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
3674  * argument should be set if we are already in the dbuf-evicting code
3675  * path, in which case we don't want to recursively evict.  This allows us to
3676  * avoid deeply nested stacks that would have a call flow similar to this:
3677  *
3678  * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
3679  *	^						|
3680  *	|						|
3681  *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
3682  *
3683  */
3684 void
3685 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
3686 {
3687 	int64_t holds;
3688 	uint64_t size;
3689 
3690 	ASSERT(MUTEX_HELD(&db->db_mtx));
3691 	DBUF_VERIFY(db);
3692 
3693 	/*
3694 	 * Remove the reference to the dbuf before removing its hold on the
3695 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
3696 	 * buffer has a corresponding dnode hold.
3697 	 */
3698 	holds = zfs_refcount_remove(&db->db_holds, tag);
3699 	ASSERT(holds >= 0);
3700 
3701 	/*
3702 	 * We can't freeze indirects if there is a possibility that they
3703 	 * may be modified in the current syncing context.
3704 	 */
3705 	if (db->db_buf != NULL &&
3706 	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
3707 		arc_buf_freeze(db->db_buf);
3708 	}
3709 
3710 	if (holds == db->db_dirtycnt &&
3711 	    db->db_level == 0 && db->db_user_immediate_evict)
3712 		dbuf_evict_user(db);
3713 
3714 	if (holds == 0) {
3715 		if (db->db_blkid == DMU_BONUS_BLKID) {
3716 			dnode_t *dn;
3717 			boolean_t evict_dbuf = db->db_pending_evict;
3718 
3719 			/*
3720 			 * If the dnode moves here, we cannot cross this
3721 			 * barrier until the move completes.
3722 			 */
3723 			DB_DNODE_ENTER(db);
3724 
3725 			dn = DB_DNODE(db);
3726 			atomic_dec_32(&dn->dn_dbufs_count);
3727 
3728 			/*
3729 			 * Decrementing the dbuf count means that the bonus
3730 			 * buffer's dnode hold is no longer discounted in
3731 			 * dnode_move(). The dnode cannot move until after
3732 			 * the dnode_rele() below.
3733 			 */
3734 			DB_DNODE_EXIT(db);
3735 
3736 			/*
3737 			 * Do not reference db after its lock is dropped.
3738 			 * Another thread may evict it.
3739 			 */
3740 			mutex_exit(&db->db_mtx);
3741 
3742 			if (evict_dbuf)
3743 				dnode_evict_bonus(dn);
3744 
3745 			dnode_rele(dn, db);
3746 		} else if (db->db_buf == NULL) {
3747 			/*
3748 			 * This is a special case: we never associated this
3749 			 * dbuf with any data allocated from the ARC.
3750 			 */
3751 			ASSERT(db->db_state == DB_UNCACHED ||
3752 			    db->db_state == DB_NOFILL);
3753 			dbuf_destroy(db);
3754 		} else if (arc_released(db->db_buf)) {
3755 			/*
3756 			 * This dbuf has anonymous data associated with it.
3757 			 */
3758 			dbuf_destroy(db);
3759 		} else {
3760 			boolean_t do_arc_evict = B_FALSE;
3761 			blkptr_t bp;
3762 			spa_t *spa = dmu_objset_spa(db->db_objset);
3763 
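			/*
			 * If this dbuf will not be cached, save its BP now
			 * (the dbuf, and thus db_blkptr, may be destroyed
			 * below) so we can notify the ARC via arc_freed()
			 * once we are done with the dbuf.
			 */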
3764 			if (!DBUF_IS_CACHEABLE(db) &&
3765 			    db->db_blkptr != NULL &&
3766 			    !BP_IS_HOLE(db->db_blkptr) &&
3767 			    !BP_IS_EMBEDDED(db->db_blkptr)) {
3768 				do_arc_evict = B_TRUE;
3769 				bp = *db->db_blkptr;
3770 			}
3771 
3772 			if (!DBUF_IS_CACHEABLE(db) ||
3773 			    db->db_pending_evict) {
3774 				dbuf_destroy(db);
3775 			} else if (!multilist_link_active(&db->db_cache_link)) {
3776 				ASSERT3U(db->db_caching_status, ==,
3777 				    DB_NO_CACHE);
3778 
3779 				dbuf_cached_state_t dcs =
3780 				    dbuf_include_in_metadata_cache(db) ?
3781 				    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
3782 				db->db_caching_status = dcs;
3783 
3784 				multilist_insert(&dbuf_caches[dcs].cache, db);
3785 				uint64_t db_size = db->db.db_size;
3786 				size = zfs_refcount_add_many(
3787 				    &dbuf_caches[dcs].size, db_size, db);
3788 				uint8_t db_level = db->db_level;
3789 				mutex_exit(&db->db_mtx);
3790 
3791 				if (dcs == DB_DBUF_METADATA_CACHE) {
3792 					DBUF_STAT_BUMP(metadata_cache_count);
3793 					DBUF_STAT_MAX(
3794 					    metadata_cache_size_bytes_max,
3795 					    size);
3796 				} else {
3797 					DBUF_STAT_BUMP(cache_count);
3798 					DBUF_STAT_MAX(cache_size_bytes_max,
3799 					    size);
3800 					DBUF_STAT_BUMP(cache_levels[db_level]);
3801 					DBUF_STAT_INCR(
3802 					    cache_levels_bytes[db_level],
3803 					    db_size);
3804 				}
3805 
3806 				if (dcs == DB_DBUF_CACHE && !evicting)
3807 					dbuf_evict_notify(size);
3808 			}
3809 
3810 			if (do_arc_evict)
3811 				arc_freed(spa, &bp);
3812 		}
3813 	} else {
3814 		mutex_exit(&db->db_mtx);
3815 	}
3816 
3817 }
3818 
3819 #pragma weak dmu_buf_refcount = dbuf_refcount
3820 uint64_t
3821 dbuf_refcount(dmu_buf_impl_t *db)
3822 {
3823 	return (zfs_refcount_count(&db->db_holds));
3824 }
3825 
3826 uint64_t
3827 dmu_buf_user_refcount(dmu_buf_t *db_fake)
3828 {
3829 	uint64_t holds;
3830 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3831 
3832 	mutex_enter(&db->db_mtx);
3833 	ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
3834 	holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
3835 	mutex_exit(&db->db_mtx);
3836 
3837 	return (holds);
3838 }
3839 
3840 void *
3841 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
3842     dmu_buf_user_t *new_user)
3843 {
3844 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3845 
3846 	mutex_enter(&db->db_mtx);
3847 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3848 	if (db->db_user == old_user)
3849 		db->db_user = new_user;
3850 	else
3851 		old_user = db->db_user;
3852 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3853 	mutex_exit(&db->db_mtx);
3854 
3855 	return (old_user);
3856 }
3857 
3858 void *
3859 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3860 {
3861 	return (dmu_buf_replace_user(db_fake, NULL, user));
3862 }
3863 
3864 void *
3865 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3866 {
3867 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3868 
3869 	db->db_user_immediate_evict = TRUE;
3870 	return (dmu_buf_set_user(db_fake, user));
3871 }
3872 
3873 void *
3874 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3875 {
3876 	return (dmu_buf_replace_user(db_fake, user, NULL));
3877 }
3878 
3879 void *
3880 dmu_buf_get_user(dmu_buf_t *db_fake)
3881 {
3882 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3883 
3884 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
3885 	return (db->db_user);
3886 }
3887 
3888 void
3889 dmu_buf_user_evict_wait(void)
3890 {
3891 	taskq_wait(dbu_evict_taskq);
3892 }
3893 
3894 blkptr_t *
3895 dmu_buf_get_blkptr(dmu_buf_t *db)
3896 {
3897 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3898 	return (dbi->db_blkptr);
3899 }
3900 
3901 objset_t *
3902 dmu_buf_get_objset(dmu_buf_t *db)
3903 {
3904 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3905 	return (dbi->db_objset);
3906 }
3907 
3908 dnode_t *
3909 dmu_buf_dnode_enter(dmu_buf_t *db)
3910 {
3911 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3912 	DB_DNODE_ENTER(dbi);
3913 	return (DB_DNODE(dbi));
3914 }
3915 
3916 void
3917 dmu_buf_dnode_exit(dmu_buf_t *db)
3918 {
3919 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3920 	DB_DNODE_EXIT(dbi);
3921 }
3922 
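/*
 * Make sure db->db_blkptr points at the location that will hold this
 * dbuf's block pointer: the dnode's spill slot, one of the blkptrs
 * embedded in the dnode, or a slot in the parent indirect block.
 */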
3923 static void
3924 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
3925 {
3926 	/* ASSERT(dmu_tx_is_syncing(tx)) */
3927 	ASSERT(MUTEX_HELD(&db->db_mtx));
3928 
3929 	if (db->db_blkptr != NULL)
3930 		return;
3931 
3932 	if (db->db_blkid == DMU_SPILL_BLKID) {
3933 		db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
3934 		BP_ZERO(db->db_blkptr);
3935 		return;
3936 	}
3937 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
3938 		/*
3939 		 * This buffer was allocated at a time when there were
3940 		 * no available blkptrs from the dnode, or it was
3941 		 * inappropriate to hook it in (i.e., nlevels mismatch).
3942 		 */
3943 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
3944 		ASSERT(db->db_parent == NULL);
3945 		db->db_parent = dn->dn_dbuf;
3946 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
3947 		DBUF_VERIFY(db);
3948 	} else {
3949 		dmu_buf_impl_t *parent = db->db_parent;
3950 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3951 
3952 		ASSERT(dn->dn_phys->dn_nlevels > 1);
3953 		if (parent == NULL) {
3954 			mutex_exit(&db->db_mtx);
3955 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
3956 			parent = dbuf_hold_level(dn, db->db_level + 1,
3957 			    db->db_blkid >> epbs, db);
3958 			rw_exit(&dn->dn_struct_rwlock);
3959 			mutex_enter(&db->db_mtx);
3960 			db->db_parent = parent;
3961 		}
3962 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
3963 		    (db->db_blkid & ((1ULL << epbs) - 1));
3964 		DBUF_VERIFY(db);
3965 	}
3966 }
3967 
3968 static void
3969 dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3970 {
3971 	dmu_buf_impl_t *db = dr->dr_dbuf;
3972 	void *data = dr->dt.dl.dr_data;
3973 
3974 	ASSERT0(db->db_level);
3975 	ASSERT(MUTEX_HELD(&db->db_mtx));
3976 	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
3977 	ASSERT(data != NULL);
3978 
3979 	dnode_t *dn = dr->dr_dnode;
3980 	ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
3981 	    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
3982 	bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
3983 
3984 	dbuf_sync_leaf_verify_bonus_dnode(dr);
3985 
3986 	dbuf_undirty_bonus(dr);
3987 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
3988 }
3989 
3990 /*
3991  * When syncing out a block of dnodes, adjust the block to deal with
3992  * encryption.  Normally, we make sure the block is decrypted before writing
3993  * it.  If we have crypt params, then we are writing a raw (encrypted) block,
3994  * from a raw receive.  In this case, set the ARC buf's crypt params so
3995  * that the BP will be filled with the correct byteorder, salt, iv, and mac.
3996  */
3997 static void
3998 dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
3999 {
4000 	int err;
4001 	dmu_buf_impl_t *db = dr->dr_dbuf;
4002 
4003 	ASSERT(MUTEX_HELD(&db->db_mtx));
4004 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
4005 	ASSERT3U(db->db_level, ==, 0);
4006 
4007 	if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
4008 		zbookmark_phys_t zb;
4009 
4010 		/*
4011 		 * Unfortunately, there is currently no mechanism for
4012 		 * syncing context to handle decryption errors. An error
4013 		 * here is only possible if an attacker maliciously
4014 		 * changed a dnode block and updated the associated
4015 		 * checksums going up the block tree.
4016 		 */
4017 		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
4018 		    db->db.db_object, db->db_level, db->db_blkid);
4019 		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
4020 		    &zb, B_TRUE);
4021 		if (err)
4022 			panic("Invalid dnode block MAC");
4023 	} else if (dr->dt.dl.dr_has_raw_params) {
4024 		(void) arc_release(dr->dt.dl.dr_data, db);
4025 		arc_convert_to_raw(dr->dt.dl.dr_data,
4026 		    dmu_objset_id(db->db_objset),
4027 		    dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
4028 		    dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
4029 	}
4030 }
4031 
4032 /*
4033  * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
4034  * is critical the we not allow the compiler to inline this function in to
4035  * dbuf_sync_list() thereby drastically bloating the stack usage.
4036  */
4037 noinline static void
4038 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4039 {
4040 	dmu_buf_impl_t *db = dr->dr_dbuf;
4041 	dnode_t *dn = dr->dr_dnode;
4042 
4043 	ASSERT(dmu_tx_is_syncing(tx));
4044 
4045 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4046 
4047 	mutex_enter(&db->db_mtx);
4048 
4049 	ASSERT(db->db_level > 0);
4050 	DBUF_VERIFY(db);
4051 
4052 	/* Read the block if it hasn't been read yet. */
4053 	if (db->db_buf == NULL) {
4054 		mutex_exit(&db->db_mtx);
4055 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
4056 		mutex_enter(&db->db_mtx);
4057 	}
4058 	ASSERT3U(db->db_state, ==, DB_CACHED);
4059 	ASSERT(db->db_buf != NULL);
4060 
4061 	/* Indirect block size must match what the dnode thinks it is. */
4062 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4063 	dbuf_check_blkptr(dn, db);
4064 
4065 	/* Provide the pending dirty record to child dbufs */
4066 	db->db_data_pending = dr;
4067 
4068 	mutex_exit(&db->db_mtx);
4069 
4070 	dbuf_write(dr, db->db_buf, tx);
4071 
4072 	zio_t *zio = dr->dr_zio;
4073 	mutex_enter(&dr->dt.di.dr_mtx);
4074 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
4075 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4076 	mutex_exit(&dr->dt.di.dr_mtx);
4077 	zio_nowait(zio);
4078 }
4079 
4080 /*
4081  * Verify that the size of the data in our bonus buffer does not exceed
4082  * its recorded size.
4083  *
4084  * The purpose of this verification is to catch any cases in development
4085  * where the size of a phys structure (e.g. space_map_phys_t) grows and,
4086  * due to incorrect feature management, older pools expect to read more
4087  * data even though they didn't actually write it to begin with.
4088  *
4089  * For example, this would catch an error in the feature logic where we
4090  * open an older pool and we expect to write the space map histogram of
4091  * a space map with size SPACE_MAP_SIZE_V0.
4092  */
4093 static void
4094 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
4095 {
4096 #ifdef ZFS_DEBUG
4097 	dnode_t *dn = dr->dr_dnode;
4098 
4099 	/*
4100 	 * Encrypted bonus buffers can have data past their bonuslen.
4101 	 * Skip the verification of these blocks.
4102 	 */
4103 	if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
4104 		return;
4105 
4106 	uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
4107 	uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
4108 	ASSERT3U(bonuslen, <=, maxbonuslen);
4109 
4110 	arc_buf_t *datap = dr->dt.dl.dr_data;
4111 	char *datap_end = ((char *)datap) + bonuslen;
4112 	char *datap_max = ((char *)datap) + maxbonuslen;
4113 
4114 	/* ensure that everything is zero after our data */
4115 	for (; datap_end < datap_max; datap_end++)
4116 		ASSERT(*datap_end == 0);
4117 #endif
4118 }
4119 
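/*
 * Return a pointer to the slot (in the dnode itself or in the parent
 * indirect block) that holds the block pointer for this lightweight
 * dirty record.
 */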
4120 static blkptr_t *
4121 dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
4122 {
4123 	/* This must be a lightweight dirty record. */
4124 	ASSERT3P(dr->dr_dbuf, ==, NULL);
4125 	dnode_t *dn = dr->dr_dnode;
4126 
4127 	if (dn->dn_phys->dn_nlevels == 1) {
4128 		VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
4129 		return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
4130 	} else {
4131 		dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4132 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
4133 		VERIFY3U(parent_db->db_level, ==, 1);
4134 		VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
4135 		VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4136 		blkptr_t *bp = parent_db->db.db_data;
4137 		return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
4138 	}
4139 }
4140 
4141 static void
4142 dbuf_lightweight_ready(zio_t *zio)
4143 {
4144 	dbuf_dirty_record_t *dr = zio->io_private;
4145 	blkptr_t *bp = zio->io_bp;
4146 
4147 	if (zio->io_error != 0)
4148 		return;
4149 
4150 	dnode_t *dn = dr->dr_dnode;
4151 
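	/*
	 * Charge the dnode for the difference in allocated space between
	 * the block we just wrote and the block it replaces.
	 */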
4152 	blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
4153 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
4154 	int64_t delta = bp_get_dsize_sync(spa, bp) -
4155 	    bp_get_dsize_sync(spa, bp_orig);
4156 	dnode_diduse_space(dn, delta);
4157 
4158 	uint64_t blkid = dr->dt.dll.dr_blkid;
4159 	mutex_enter(&dn->dn_mtx);
4160 	if (blkid > dn->dn_phys->dn_maxblkid) {
4161 		ASSERT0(dn->dn_objset->os_raw_receive);
4162 		dn->dn_phys->dn_maxblkid = blkid;
4163 	}
4164 	mutex_exit(&dn->dn_mtx);
4165 
4166 	if (!BP_IS_EMBEDDED(bp)) {
4167 		uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
4168 		BP_SET_FILL(bp, fill);
4169 	}
4170 
4171 	dmu_buf_impl_t *parent_db;
4172 	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4173 	if (dr->dr_parent == NULL) {
4174 		parent_db = dn->dn_dbuf;
4175 	} else {
4176 		parent_db = dr->dr_parent->dr_dbuf;
4177 	}
4178 	rw_enter(&parent_db->db_rwlock, RW_WRITER);
4179 	*bp_orig = *bp;
4180 	rw_exit(&parent_db->db_rwlock);
4181 }
4182 
4183 static void
4184 dbuf_lightweight_physdone(zio_t *zio)
4185 {
4186 	dbuf_dirty_record_t *dr = zio->io_private;
4187 	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
4188 	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
4189 
4190 	/*
4191 	 * The callback will be called io_phys_children times.  Retire one
4192 	 * portion of our dirty space each time we are called.  Any rounding
4193 	 * error will be cleaned up by dbuf_lightweight_done().
4194 	 */
4195 	int delta = dr->dr_accounted / zio->io_phys_children;
4196 	dsl_pool_undirty_space(dp, delta, zio->io_txg);
4197 }
4198 
4199 static void
4200 dbuf_lightweight_done(zio_t *zio)
4201 {
4202 	dbuf_dirty_record_t *dr = zio->io_private;
4203 
4204 	VERIFY0(zio->io_error);
4205 
4206 	objset_t *os = dr->dr_dnode->dn_objset;
4207 	dmu_tx_t *tx = os->os_synctx;
4208 
4209 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4210 		ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
4211 	} else {
4212 		dsl_dataset_t *ds = os->os_dsl_dataset;
4213 		(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
4214 		dsl_dataset_block_born(ds, zio->io_bp, tx);
4215 	}
4216 
4217 	/*
4218 	 * See comment in dbuf_write_done().
4219 	 */
4220 	if (zio->io_phys_children == 0) {
4221 		dsl_pool_undirty_space(dmu_objset_pool(os),
4222 		    dr->dr_accounted, zio->io_txg);
4223 	} else {
4224 		dsl_pool_undirty_space(dmu_objset_pool(os),
4225 		    dr->dr_accounted % zio->io_phys_children, zio->io_txg);
4226 	}
4227 
4228 	abd_free(dr->dt.dll.dr_abd);
4229 	kmem_free(dr, sizeof (*dr));
4230 }
4231 
4232 noinline static void
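/*
 * Issue the write for a lightweight (dbuf-less) dirty record.  The dirty
 * data is carried in an ABD rather than an ARC buffer, so the write is
 * issued with zio_write() instead of arc_write().
 */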
4233 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4234 {
4235 	dnode_t *dn = dr->dr_dnode;
4236 	zio_t *pio;
4237 	if (dn->dn_phys->dn_nlevels == 1) {
4238 		pio = dn->dn_zio;
4239 	} else {
4240 		pio = dr->dr_parent->dr_zio;
4241 	}
4242 
4243 	zbookmark_phys_t zb = {
4244 		.zb_objset = dmu_objset_id(dn->dn_objset),
4245 		.zb_object = dn->dn_object,
4246 		.zb_level = 0,
4247 		.zb_blkid = dr->dt.dll.dr_blkid,
4248 	};
4249 
4250 	/*
4251 	 * See comment in dbuf_write().  This is so that zio->io_bp_orig
4252 	 * will have the old BP in dbuf_lightweight_done().
4253 	 */
4254 	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
4255 
4256 	dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
4257 	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
4258 	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
4259 	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
4260 	    dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
4261 	    ZIO_PRIORITY_ASYNC_WRITE,
4262 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
4263 
4264 	zio_nowait(dr->dr_zio);
4265 }
4266 
4267 /*
4268  * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
4269  * critical the we not allow the compiler to inline this function in to
4270  * dbuf_sync_list() thereby drastically bloating the stack usage.
4271  */
4272 noinline static void
4273 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
4274 {
4275 	arc_buf_t **datap = &dr->dt.dl.dr_data;
4276 	dmu_buf_impl_t *db = dr->dr_dbuf;
4277 	dnode_t *dn = dr->dr_dnode;
4278 	objset_t *os;
4279 	uint64_t txg = tx->tx_txg;
4280 
4281 	ASSERT(dmu_tx_is_syncing(tx));
4282 
4283 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
4284 
4285 	mutex_enter(&db->db_mtx);
4286 	/*
4287 	 * To be synced, we must be dirtied.  But we
4288 	 * might have been freed after the dirty.
4289 	 */
4290 	if (db->db_state == DB_UNCACHED) {
4291 		/* This buffer has been freed since it was dirtied */
4292 		ASSERT(db->db.db_data == NULL);
4293 	} else if (db->db_state == DB_FILL) {
4294 		/* This buffer was freed and is now being re-filled */
4295 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
4296 	} else {
4297 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
4298 	}
4299 	DBUF_VERIFY(db);
4300 
4301 	if (db->db_blkid == DMU_SPILL_BLKID) {
4302 		mutex_enter(&dn->dn_mtx);
4303 		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
4304 			/*
4305 			 * In the previous transaction group, the bonus buffer
4306 			 * was entirely used to store the attributes for the
4307 			 * dnode which overrode the dn_spill field.  However,
4308 			 * when adding more attributes to the file a spill
4309 			 * block was required to hold the extra attributes.
4310 			 *
4311 			 * Make sure to clear the garbage left in the dn_spill
4312 			 * field from the previous attributes in the bonus
4313 			 * buffer.  Otherwise, after writing out the spill
4314 			 * block to the newly allocated dva, it will free
4315 			 * the old block pointed to by the invalid dn_spill.
4316 			 */
4317 			db->db_blkptr = NULL;
4318 		}
4319 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
4320 		mutex_exit(&dn->dn_mtx);
4321 	}
4322 
4323 	/*
4324 	 * If this is a bonus buffer, simply copy the bonus data into the
4325 	 * dnode.  It will be written out when the dnode is synced (and it
4326 	 * will be synced, since it must have been dirty for dbuf_sync to
4327 	 * be called).
4328 	 */
4329 	if (db->db_blkid == DMU_BONUS_BLKID) {
4330 		ASSERT(dr->dr_dbuf == db);
4331 		dbuf_sync_bonus(dr, tx);
4332 		return;
4333 	}
4334 
4335 	os = dn->dn_objset;
4336 
4337 	/*
4338 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
4339 	 * operation to sneak in. As a result, we need to ensure that we
4340 	 * don't check the dr_override_state until we have returned from
4341 	 * dbuf_check_blkptr.
4342 	 */
4343 	dbuf_check_blkptr(dn, db);
4344 
4345 	/*
4346 	 * If this buffer is in the middle of an immediate write,
4347 	 * wait for the synchronous IO to complete.
4348 	 */
4349 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
4350 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
4351 		cv_wait(&db->db_changed, &db->db_mtx);
4352 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
4353 	}
4354 
4355 	/*
4356 	 * If this is a dnode block, ensure it is appropriately encrypted
4357 	 * or decrypted, depending on what we are writing to it this txg.
4358 	 */
4359 	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
4360 		dbuf_prepare_encrypted_dnode_leaf(dr);
4361 
4362 	if (db->db_state != DB_NOFILL &&
4363 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
4364 	    zfs_refcount_count(&db->db_holds) > 1 &&
4365 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
4366 	    *datap == db->db_buf) {
4367 		/*
4368 		 * If this buffer is currently "in use" (i.e., there
4369 		 * are active holds and db_data still references it),
4370 		 * then make a copy before we start the write so that
4371 		 * any modifications from the open txg will not leak
4372 		 * into this write.
4373 		 *
4374 		 * NOTE: this copy does not need to be made for
4375 		 * objects only modified in the syncing context (e.g.
4376 		 * DNONE_DNODE blocks).
4377 		 */
4378 		int psize = arc_buf_size(*datap);
4379 		int lsize = arc_buf_lsize(*datap);
4380 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
4381 		enum zio_compress compress_type = arc_get_compression(*datap);
4382 		uint8_t complevel = arc_get_complevel(*datap);
4383 
4384 		if (arc_is_encrypted(*datap)) {
4385 			boolean_t byteorder;
4386 			uint8_t salt[ZIO_DATA_SALT_LEN];
4387 			uint8_t iv[ZIO_DATA_IV_LEN];
4388 			uint8_t mac[ZIO_DATA_MAC_LEN];
4389 
4390 			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
4391 			*datap = arc_alloc_raw_buf(os->os_spa, db,
4392 			    dmu_objset_id(os), byteorder, salt, iv, mac,
4393 			    dn->dn_type, psize, lsize, compress_type,
4394 			    complevel);
4395 		} else if (compress_type != ZIO_COMPRESS_OFF) {
4396 			ASSERT3U(type, ==, ARC_BUFC_DATA);
4397 			*datap = arc_alloc_compressed_buf(os->os_spa, db,
4398 			    psize, lsize, compress_type, complevel);
4399 		} else {
4400 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
4401 		}
4402 		bcopy(db->db.db_data, (*datap)->b_data, psize);
4403 	}
4404 	db->db_data_pending = dr;
4405 
4406 	mutex_exit(&db->db_mtx);
4407 
4408 	dbuf_write(dr, *datap, tx);
4409 
4410 	ASSERT(!list_link_active(&dr->dr_dirty_node));
4411 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
4412 		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
4413 	} else {
4414 		zio_nowait(dr->dr_zio);
4415 	}
4416 }
4417 
4418 void
4419 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
4420 {
4421 	dbuf_dirty_record_t *dr;
4422 
4423 	while ((dr = list_head(list))) {
4424 		if (dr->dr_zio != NULL) {
4425 			/*
4426 			 * If we find an already initialized zio then we
4427 			 * are processing the meta-dnode, and we have finished.
4428 			 * The dbufs for all dnodes are put back on the list
4429 			 * during processing, so that we can zio_wait()
4430 			 * these IOs after initiating all child IOs.
4431 			 */
4432 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
4433 			    DMU_META_DNODE_OBJECT);
4434 			break;
4435 		}
4436 		list_remove(list, dr);
4437 		if (dr->dr_dbuf == NULL) {
4438 			dbuf_sync_lightweight(dr, tx);
4439 		} else {
4440 			if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
4441 			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
4442 				VERIFY3U(dr->dr_dbuf->db_level, ==, level);
4443 			}
4444 			if (dr->dr_dbuf->db_level > 0)
4445 				dbuf_sync_indirect(dr, tx);
4446 			else
4447 				dbuf_sync_leaf(dr, tx);
4448 		}
4449 	}
4450 }
4451 
4452 /* ARGSUSED */
4453 static void
4454 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4455 {
4456 	dmu_buf_impl_t *db = vdb;
4457 	dnode_t *dn;
4458 	blkptr_t *bp = zio->io_bp;
4459 	blkptr_t *bp_orig = &zio->io_bp_orig;
4460 	spa_t *spa = zio->io_spa;
4461 	int64_t delta;
4462 	uint64_t fill = 0;
4463 	int i;
4464 
4465 	ASSERT3P(db->db_blkptr, !=, NULL);
4466 	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
4467 
4468 	DB_DNODE_ENTER(db);
4469 	dn = DB_DNODE(db);
4470 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
4471 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
4472 	zio->io_prev_space_delta = delta;
4473 
4474 	if (bp->blk_birth != 0) {
4475 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
4476 		    BP_GET_TYPE(bp) == dn->dn_type) ||
4477 		    (db->db_blkid == DMU_SPILL_BLKID &&
4478 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
4479 		    BP_IS_EMBEDDED(bp));
4480 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
4481 	}
4482 
4483 	mutex_enter(&db->db_mtx);
4484 
4485 #ifdef ZFS_DEBUG
4486 	if (db->db_blkid == DMU_SPILL_BLKID) {
4487 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4488 		ASSERT(!(BP_IS_HOLE(bp)) &&
4489 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4490 	}
4491 #endif
4492 
4493 	if (db->db_level == 0) {
4494 		mutex_enter(&dn->dn_mtx);
4495 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
4496 		    db->db_blkid != DMU_SPILL_BLKID) {
4497 			ASSERT0(db->db_objset->os_raw_receive);
4498 			dn->dn_phys->dn_maxblkid = db->db_blkid;
4499 		}
4500 		mutex_exit(&dn->dn_mtx);
4501 
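		/*
		 * Compute the BP's fill count: for a block of dnodes this is
		 * the number of allocated dnodes it holds, otherwise it is
		 * 0 for a hole and 1 for a data block.
		 */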
4502 		if (dn->dn_type == DMU_OT_DNODE) {
4503 			i = 0;
4504 			while (i < db->db.db_size) {
4505 				dnode_phys_t *dnp =
4506 				    (void *)(((char *)db->db.db_data) + i);
4507 
4508 				i += DNODE_MIN_SIZE;
4509 				if (dnp->dn_type != DMU_OT_NONE) {
4510 					fill++;
4511 					i += dnp->dn_extra_slots *
4512 					    DNODE_MIN_SIZE;
4513 				}
4514 			}
4515 		} else {
4516 			if (BP_IS_HOLE(bp)) {
4517 				fill = 0;
4518 			} else {
4519 				fill = 1;
4520 			}
4521 		}
4522 	} else {
4523 		blkptr_t *ibp = db->db.db_data;
4524 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4525 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
4526 			if (BP_IS_HOLE(ibp))
4527 				continue;
4528 			fill += BP_GET_FILL(ibp);
4529 		}
4530 	}
4531 	DB_DNODE_EXIT(db);
4532 
4533 	if (!BP_IS_EMBEDDED(bp))
4534 		BP_SET_FILL(bp, fill);
4535 
4536 	mutex_exit(&db->db_mtx);
4537 
4538 	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
4539 	*db->db_blkptr = *bp;
4540 	dmu_buf_unlock_parent(db, dblt, FTAG);
4541 }
4542 
4543 /* ARGSUSED */
4544 /*
4545  * This function gets called just prior to running through the compression
4546  * stage of the zio pipeline. If we're an indirect block comprised of only
4547  * holes, then we want this indirect to be compressed away to a hole. In
4548  * order to do that we must zero out any information about the holes that
4549  * this indirect points to prior to before we try to compress it.
4550  */
4551 static void
4552 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4553 {
4554 	dmu_buf_impl_t *db = vdb;
4555 	dnode_t *dn;
4556 	blkptr_t *bp;
4557 	unsigned int epbs, i;
4558 
4559 	ASSERT3U(db->db_level, >, 0);
4560 	DB_DNODE_ENTER(db);
4561 	dn = DB_DNODE(db);
4562 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
4563 	ASSERT3U(epbs, <, 31);
4564 
4565 	/* Determine if all our children are holes */
4566 	for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
4567 		if (!BP_IS_HOLE(bp))
4568 			break;
4569 	}
4570 
4571 	/*
4572 	 * If all the children are holes, then zero them all out so that
4573 	 * this indirect block may be compressed away.
4574 	 */
4575 	if (i == 1ULL << epbs) {
4576 		/*
4577 		 * We only found holes. Grab the rwlock to prevent
4578 		 * anybody from reading the blocks we're about to
4579 		 * zero out.
4580 		 */
4581 		rw_enter(&db->db_rwlock, RW_WRITER);
4582 		bzero(db->db.db_data, db->db.db_size);
4583 		rw_exit(&db->db_rwlock);
4584 	}
4585 	DB_DNODE_EXIT(db);
4586 }
4587 
4588 /*
4589  * The SPA will call this callback several times for each zio - once
4590  * for every physical child i/o (zio->io_phys_children times).  This
4591  * allows the DMU to monitor the progress of each logical i/o.  For example,
4592  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
4593  * block.  There may be a long delay before all copies/fragments are completed,
4594  * so this callback allows us to retire dirty space gradually, as the physical
4595  * i/os complete.
4596  */
4597 /* ARGSUSED */
4598 static void
4599 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
4600 {
4601 	dmu_buf_impl_t *db = arg;
4602 	objset_t *os = db->db_objset;
4603 	dsl_pool_t *dp = dmu_objset_pool(os);
4604 	dbuf_dirty_record_t *dr;
4605 	int delta = 0;
4606 
4607 	dr = db->db_data_pending;
4608 	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
4609 
4610 	/*
4611 	 * The callback will be called io_phys_children times.  Retire one
4612 	 * portion of our dirty space each time we are called.  Any rounding
4613 	 * error will be cleaned up by dbuf_write_done().
4614 	 */
4615 	delta = dr->dr_accounted / zio->io_phys_children;
4616 	dsl_pool_undirty_space(dp, delta, zio->io_txg);
4617 }
4618 
4619 /* ARGSUSED */
4620 static void
4621 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
4622 {
4623 	dmu_buf_impl_t *db = vdb;
4624 	blkptr_t *bp_orig = &zio->io_bp_orig;
4625 	blkptr_t *bp = db->db_blkptr;
4626 	objset_t *os = db->db_objset;
4627 	dmu_tx_t *tx = os->os_synctx;
4628 
4629 	ASSERT0(zio->io_error);
4630 	ASSERT(db->db_blkptr == bp);
4631 
4632 	/*
4633 	 * For nopwrites and rewrites we ensure that the bp matches our
4634 	 * original and bypass all the accounting.
4635 	 */
4636 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4637 		ASSERT(BP_EQUAL(bp, bp_orig));
4638 	} else {
4639 		dsl_dataset_t *ds = os->os_dsl_dataset;
4640 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
4641 		dsl_dataset_block_born(ds, bp, tx);
4642 	}
4643 
4644 	mutex_enter(&db->db_mtx);
4645 
4646 	DBUF_VERIFY(db);
4647 
4648 	dbuf_dirty_record_t *dr = db->db_data_pending;
4649 	dnode_t *dn = dr->dr_dnode;
4650 	ASSERT(!list_link_active(&dr->dr_dirty_node));
4651 	ASSERT(dr->dr_dbuf == db);
4652 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
4653 	list_remove(&db->db_dirty_records, dr);
4654 
4655 #ifdef ZFS_DEBUG
4656 	if (db->db_blkid == DMU_SPILL_BLKID) {
4657 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4658 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
4659 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4660 	}
4661 #endif
4662 
4663 	if (db->db_level == 0) {
4664 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
4665 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
4666 		if (db->db_state != DB_NOFILL) {
4667 			if (dr->dt.dl.dr_data != db->db_buf)
4668 				arc_buf_destroy(dr->dt.dl.dr_data, db);
4669 		}
4670 	} else {
4671 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4672 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
4673 		if (!BP_IS_HOLE(db->db_blkptr)) {
4674 			int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
4675 			    SPA_BLKPTRSHIFT;
4676 			ASSERT3U(db->db_blkid, <=,
4677 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
4678 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
4679 			    db->db.db_size);
4680 		}
4681 		mutex_destroy(&dr->dt.di.dr_mtx);
4682 		list_destroy(&dr->dt.di.dr_children);
4683 	}
4684 
4685 	cv_broadcast(&db->db_changed);
4686 	ASSERT(db->db_dirtycnt > 0);
4687 	db->db_dirtycnt -= 1;
4688 	db->db_data_pending = NULL;
4689 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
4690 
4691 	/*
4692 	 * If we didn't do a physical write in this ZIO and we
4693 	 * still ended up here, it means that the space of the
4694 	 * dbuf that we just released (and undirtied) above hasn't
4695 	 * been marked as undirtied in the pool's accounting.
4696 	 *
4697 	 * Thus, we undirty that space in the pool's view of the
4698 	 * world here. For physical writes this type of update
4699 	 * happens in dbuf_write_physdone().
4700 	 *
4701 	 * If we did a physical write, cleanup any rounding errors
4702 	 * that came up due to writing multiple copies of a block
4703 	 * on disk [see dbuf_write_physdone()].
4704 	 */
4705 	if (zio->io_phys_children == 0) {
4706 		dsl_pool_undirty_space(dmu_objset_pool(os),
4707 		    dr->dr_accounted, zio->io_txg);
4708 	} else {
4709 		dsl_pool_undirty_space(dmu_objset_pool(os),
4710 		    dr->dr_accounted % zio->io_phys_children, zio->io_txg);
4711 	}
4712 
4713 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
4714 }
4715 
4716 static void
4717 dbuf_write_nofill_ready(zio_t *zio)
4718 {
4719 	dbuf_write_ready(zio, NULL, zio->io_private);
4720 }
4721 
4722 static void
4723 dbuf_write_nofill_done(zio_t *zio)
4724 {
4725 	dbuf_write_done(zio, NULL, zio->io_private);
4726 }
4727 
4728 static void
4729 dbuf_write_override_ready(zio_t *zio)
4730 {
4731 	dbuf_dirty_record_t *dr = zio->io_private;
4732 	dmu_buf_impl_t *db = dr->dr_dbuf;
4733 
4734 	dbuf_write_ready(zio, NULL, db);
4735 }
4736 
4737 static void
4738 dbuf_write_override_done(zio_t *zio)
4739 {
4740 	dbuf_dirty_record_t *dr = zio->io_private;
4741 	dmu_buf_impl_t *db = dr->dr_dbuf;
4742 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
4743 
4744 	mutex_enter(&db->db_mtx);
4745 	if (!BP_EQUAL(zio->io_bp, obp)) {
4746 		if (!BP_IS_HOLE(obp))
4747 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
4748 		arc_release(dr->dt.dl.dr_data, db);
4749 	}
4750 	mutex_exit(&db->db_mtx);
4751 
4752 	dbuf_write_done(zio, NULL, db);
4753 
4754 	if (zio->io_abd != NULL)
4755 		abd_free(zio->io_abd);
4756 }
4757 
4758 typedef struct dbuf_remap_impl_callback_arg {
4759 	objset_t	*drica_os;
4760 	uint64_t	drica_blk_birth;
4761 	dmu_tx_t	*drica_tx;
4762 } dbuf_remap_impl_callback_arg_t;
4763 
4764 static void
4765 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
4766     void *arg)
4767 {
4768 	dbuf_remap_impl_callback_arg_t *drica = arg;
4769 	objset_t *os = drica->drica_os;
4770 	spa_t *spa = dmu_objset_spa(os);
4771 	dmu_tx_t *tx = drica->drica_tx;
4772 
4773 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4774 
4775 	if (os == spa_meta_objset(spa)) {
4776 		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
4777 	} else {
4778 		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
4779 		    size, drica->drica_blk_birth, tx);
4780 	}
4781 }
4782 
4783 static void
4784 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
4785 {
4786 	blkptr_t bp_copy = *bp;
4787 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
4788 	dbuf_remap_impl_callback_arg_t drica;
4789 
4790 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4791 
4792 	drica.drica_os = dn->dn_objset;
4793 	drica.drica_blk_birth = bp->blk_birth;
4794 	drica.drica_tx = tx;
4795 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
4796 	    &drica)) {
4797 		/*
4798 		 * If the blkptr being remapped is tracked by a livelist,
4799 		 * then we need to make sure the livelist reflects the update.
4800 		 * First, cancel out the old blkptr by appending a 'FREE'
4801 		 * entry. Next, add an 'ALLOC' to track the new version. This
4802 		 * way we avoid trying to free an inaccurate blkptr at delete.
4803 		 * Note that embedded blkptrs are not tracked in livelists.
4804 		 */
4805 		if (dn->dn_objset != spa_meta_objset(spa)) {
4806 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
4807 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
4808 			    bp->blk_birth > ds->ds_dir->dd_origin_txg) {
4809 				ASSERT(!BP_IS_EMBEDDED(bp));
4810 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
4811 				ASSERT(spa_feature_is_enabled(spa,
4812 				    SPA_FEATURE_LIVELIST));
4813 				bplist_append(&ds->ds_dir->dd_pending_frees,
4814 				    bp);
4815 				bplist_append(&ds->ds_dir->dd_pending_allocs,
4816 				    &bp_copy);
4817 			}
4818 		}
4819 
4820 		/*
4821 		 * The db_rwlock prevents dbuf_read_impl() from
4822 		 * dereferencing the BP while we are changing it.  To
4823 		 * avoid lock contention, only grab it when we are actually
4824 		 * changing the BP.
4825 		 */
4826 		if (rw != NULL)
4827 			rw_enter(rw, RW_WRITER);
4828 		*bp = bp_copy;
4829 		if (rw != NULL)
4830 			rw_exit(rw);
4831 	}
4832 }
4833 
4834 /*
4835  * Remap any existing BP's to concrete vdevs, if possible.
4836  */
4837 static void
4838 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
4839 {
4840 	spa_t *spa = dmu_objset_spa(db->db_objset);
4841 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4842 
4843 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
4844 		return;
4845 
4846 	if (db->db_level > 0) {
4847 		blkptr_t *bp = db->db.db_data;
4848 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
4849 			dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
4850 		}
4851 	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
4852 		dnode_phys_t *dnp = db->db.db_data;
4853 		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
4854 		    DMU_OT_DNODE);
4855 		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
4856 		    i += dnp[i].dn_extra_slots + 1) {
4857 			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
4858 				krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
4859 				    &dn->dn_dbuf->db_rwlock);
4860 				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
4861 				    tx);
4862 			}
4863 		}
4864 	}
4865 }
4866 
4867 
4868 /* Issue I/O to commit a dirty buffer to disk. */
4869 static void
4870 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
4871 {
4872 	dmu_buf_impl_t *db = dr->dr_dbuf;
4873 	dnode_t *dn = dr->dr_dnode;
4874 	objset_t *os;
4875 	dmu_buf_impl_t *parent = db->db_parent;
4876 	uint64_t txg = tx->tx_txg;
4877 	zbookmark_phys_t zb;
4878 	zio_prop_t zp;
4879 	zio_t *pio; /* parent I/O */
4880 	int wp_flag = 0;
4881 
4882 	ASSERT(dmu_tx_is_syncing(tx));
4883 
4884 	os = dn->dn_objset;
4885 
4886 	if (db->db_state != DB_NOFILL) {
4887 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
4888 			/*
4889 			 * Private object buffers are released here rather
4890 			 * than in dbuf_dirty() since they are only modified
4891 			 * in the syncing context and we don't want the
4892 			 * overhead of making multiple copies of the data.
4893 			 */
4894 			if (BP_IS_HOLE(db->db_blkptr)) {
4895 				arc_buf_thaw(data);
4896 			} else {
4897 				dbuf_release_bp(db);
4898 			}
4899 			dbuf_remap(dn, db, tx);
4900 		}
4901 	}
4902 
4903 	if (parent != dn->dn_dbuf) {
4904 		/* Our parent is an indirect block. */
4905 		/* We have a dirty parent that has been scheduled for write. */
4906 		ASSERT(parent && parent->db_data_pending);
4907 		/* Our parent's buffer is one level closer to the dnode. */
4908 		ASSERT(db->db_level == parent->db_level-1);
4909 		/*
4910 		 * We're about to modify our parent's db_data by modifying
4911 		 * our block pointer, so the parent must be released.
4912 		 */
4913 		ASSERT(arc_released(parent->db_buf));
4914 		pio = parent->db_data_pending->dr_zio;
4915 	} else {
4916 		/* Our parent is the dnode itself. */
4917 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
4918 		    db->db_blkid != DMU_SPILL_BLKID) ||
4919 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
4920 		if (db->db_blkid != DMU_SPILL_BLKID)
4921 			ASSERT3P(db->db_blkptr, ==,
4922 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
4923 		pio = dn->dn_zio;
4924 	}
4925 
4926 	ASSERT(db->db_level == 0 || data == db->db_buf);
4927 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
4928 	ASSERT(pio);
4929 
4930 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
4931 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
4932 	    db->db.db_object, db->db_level, db->db_blkid);
4933 
4934 	if (db->db_blkid == DMU_SPILL_BLKID)
4935 		wp_flag = WP_SPILL;
4936 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
4937 
4938 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
4939 
4940 	/*
4941 	 * We copy the blkptr now (rather than when we instantiate the dirty
4942 	 * record), because its value can change between open context and
4943 	 * syncing context. We do not need to hold dn_struct_rwlock to read
4944 	 * db_blkptr because we are in syncing context.
4945 	 */
4946 	dr->dr_bp_copy = *db->db_blkptr;
4947 
4948 	if (db->db_level == 0 &&
4949 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
4950 		/*
4951 		 * The BP for this block has been provided by open context
4952 		 * (by dmu_sync() or dmu_buf_write_embedded()).
4953 		 */
4954 		abd_t *contents = (data != NULL) ?
4955 		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
4956 
4957 		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
4958 		    contents, db->db.db_size, db->db.db_size, &zp,
4959 		    dbuf_write_override_ready, NULL, NULL,
4960 		    dbuf_write_override_done,
4961 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
4962 		mutex_enter(&db->db_mtx);
4963 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
4964 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
4965 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
4966 		mutex_exit(&db->db_mtx);
4967 	} else if (db->db_state == DB_NOFILL) {
4968 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
4969 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
4970 		dr->dr_zio = zio_write(pio, os->os_spa, txg,
4971 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
4972 		    dbuf_write_nofill_ready, NULL, NULL,
4973 		    dbuf_write_nofill_done, db,
4974 		    ZIO_PRIORITY_ASYNC_WRITE,
4975 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
4976 	} else {
4977 		ASSERT(arc_released(data));
4978 
4979 		/*
4980 		 * For indirect blocks, we want to setup the children
4981 		 * ready callback so that we can properly handle an indirect
4982 		 * block that only contains holes.
4983 		 */
4984 		arc_write_done_func_t *children_ready_cb = NULL;
4985 		if (db->db_level != 0)
4986 			children_ready_cb = dbuf_write_children_ready;
4987 
4988 		dr->dr_zio = arc_write(pio, os->os_spa, txg,
4989 		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
4990 		    &zp, dbuf_write_ready,
4991 		    children_ready_cb, dbuf_write_physdone,
4992 		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
4993 		    ZIO_FLAG_MUSTSUCCEED, &zb);
4994 	}
4995 }
4996 
4997 EXPORT_SYMBOL(dbuf_find);
4998 EXPORT_SYMBOL(dbuf_is_metadata);
4999 EXPORT_SYMBOL(dbuf_destroy);
5000 EXPORT_SYMBOL(dbuf_loan_arcbuf);
5001 EXPORT_SYMBOL(dbuf_whichblock);
5002 EXPORT_SYMBOL(dbuf_read);
5003 EXPORT_SYMBOL(dbuf_unoverride);
5004 EXPORT_SYMBOL(dbuf_free_range);
5005 EXPORT_SYMBOL(dbuf_new_size);
5006 EXPORT_SYMBOL(dbuf_release_bp);
5007 EXPORT_SYMBOL(dbuf_dirty);
5008 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
5009 EXPORT_SYMBOL(dmu_buf_will_dirty);
5010 EXPORT_SYMBOL(dmu_buf_is_dirty);
5011 EXPORT_SYMBOL(dmu_buf_will_not_fill);
5012 EXPORT_SYMBOL(dmu_buf_will_fill);
5013 EXPORT_SYMBOL(dmu_buf_fill_done);
5014 EXPORT_SYMBOL(dmu_buf_rele);
5015 EXPORT_SYMBOL(dbuf_assign_arcbuf);
5016 EXPORT_SYMBOL(dbuf_prefetch);
5017 EXPORT_SYMBOL(dbuf_hold_impl);
5018 EXPORT_SYMBOL(dbuf_hold);
5019 EXPORT_SYMBOL(dbuf_hold_level);
5020 EXPORT_SYMBOL(dbuf_create_bonus);
5021 EXPORT_SYMBOL(dbuf_spill_set_blksz);
5022 EXPORT_SYMBOL(dbuf_rm_spill);
5023 EXPORT_SYMBOL(dbuf_add_ref);
5024 EXPORT_SYMBOL(dbuf_rele);
5025 EXPORT_SYMBOL(dbuf_rele_and_unlock);
5026 EXPORT_SYMBOL(dbuf_refcount);
5027 EXPORT_SYMBOL(dbuf_sync_list);
5028 EXPORT_SYMBOL(dmu_buf_set_user);
5029 EXPORT_SYMBOL(dmu_buf_set_user_ie);
5030 EXPORT_SYMBOL(dmu_buf_get_user);
5031 EXPORT_SYMBOL(dmu_buf_get_blkptr);
5032 
5033 /* BEGIN CSTYLED */
5034 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
5035 	"Maximum size in bytes of the dbuf cache.");
5036 
5037 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
5038 	"Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
5039 	"directly.");
5040 
5041 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
5042 	"Percentage below dbuf_cache_max_bytes when the evict thread stops "
5043 	"evicting dbufs.");
5044 
5045 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
5046 	"Maximum size in bytes of the dbuf metadata cache.");
5047 
5048 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
5049 	"Set the size of the dbuf cache to a log2 fraction of arc size.");
5050 
5051 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
5052 	"Set the size of the dbuf metadata cache to a log2 fraction of arc "
5053 	"size.");
5054 /* END CSTYLED */
5055