xref: /illumos-gate/usr/src/uts/common/fs/zfs/dbuf.c (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1*fa9e4066Sahrens /*
2*fa9e4066Sahrens  * CDDL HEADER START
3*fa9e4066Sahrens  *
4*fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5*fa9e4066Sahrens  * Common Development and Distribution License, Version 1.0 only
6*fa9e4066Sahrens  * (the "License").  You may not use this file except in compliance
7*fa9e4066Sahrens  * with the License.
8*fa9e4066Sahrens  *
9*fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
11*fa9e4066Sahrens  * See the License for the specific language governing permissions
12*fa9e4066Sahrens  * and limitations under the License.
13*fa9e4066Sahrens  *
14*fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*fa9e4066Sahrens  *
20*fa9e4066Sahrens  * CDDL HEADER END
21*fa9e4066Sahrens  */
22*fa9e4066Sahrens /*
23*fa9e4066Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*fa9e4066Sahrens  * Use is subject to license terms.
25*fa9e4066Sahrens  */
26*fa9e4066Sahrens 
27*fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*fa9e4066Sahrens 
29*fa9e4066Sahrens #include <sys/zfs_context.h>
30*fa9e4066Sahrens #include <sys/dmu.h>
31*fa9e4066Sahrens #include <sys/dmu_impl.h>
32*fa9e4066Sahrens #include <sys/dbuf.h>
33*fa9e4066Sahrens #include <sys/dmu_objset.h>
34*fa9e4066Sahrens #include <sys/dsl_dataset.h>
35*fa9e4066Sahrens #include <sys/dsl_dir.h>
36*fa9e4066Sahrens #include <sys/dmu_tx.h>
37*fa9e4066Sahrens #include <sys/spa.h>
38*fa9e4066Sahrens #include <sys/zio.h>
39*fa9e4066Sahrens #include <sys/dmu_zfetch.h>
40*fa9e4066Sahrens 
41*fa9e4066Sahrens static void dbuf_destroy(dmu_buf_impl_t *db);
42*fa9e4066Sahrens static void dbuf_verify(dmu_buf_impl_t *db);
43*fa9e4066Sahrens static void dbuf_evict_user(dmu_buf_impl_t *db);
44*fa9e4066Sahrens static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
45*fa9e4066Sahrens static arc_done_func_t dbuf_read_done;
46*fa9e4066Sahrens static arc_done_func_t dbuf_write_done;
47*fa9e4066Sahrens 
48*fa9e4066Sahrens /*
49*fa9e4066Sahrens  * Global data structures and functions for the dbuf cache.
50*fa9e4066Sahrens  */
51*fa9e4066Sahrens taskq_t *dbuf_tq;
52*fa9e4066Sahrens static kmem_cache_t *dbuf_cache;
53*fa9e4066Sahrens 
54*fa9e4066Sahrens /* ARGSUSED */
55*fa9e4066Sahrens static int
56*fa9e4066Sahrens dbuf_cons(void *vdb, void *unused, int kmflag)
57*fa9e4066Sahrens {
58*fa9e4066Sahrens 	dmu_buf_impl_t *db = vdb;
59*fa9e4066Sahrens 	bzero(db, sizeof (dmu_buf_impl_t));
60*fa9e4066Sahrens 
61*fa9e4066Sahrens 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
62*fa9e4066Sahrens 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
63*fa9e4066Sahrens 	refcount_create(&db->db_holds);
64*fa9e4066Sahrens 	return (0);
65*fa9e4066Sahrens }
66*fa9e4066Sahrens 
67*fa9e4066Sahrens /* ARGSUSED */
68*fa9e4066Sahrens static void
69*fa9e4066Sahrens dbuf_dest(void *vdb, void *unused)
70*fa9e4066Sahrens {
71*fa9e4066Sahrens 	dmu_buf_impl_t *db = vdb;
72*fa9e4066Sahrens 	mutex_destroy(&db->db_mtx);
73*fa9e4066Sahrens 	cv_destroy(&db->db_changed);
74*fa9e4066Sahrens 	refcount_destroy(&db->db_holds);
75*fa9e4066Sahrens }
76*fa9e4066Sahrens 
77*fa9e4066Sahrens /*
78*fa9e4066Sahrens  * dbuf hash table routines
79*fa9e4066Sahrens  */
80*fa9e4066Sahrens static dbuf_hash_table_t dbuf_hash_table;
81*fa9e4066Sahrens 
82*fa9e4066Sahrens static uint64_t dbuf_hash_count;
83*fa9e4066Sahrens 
84*fa9e4066Sahrens static uint64_t
85*fa9e4066Sahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
86*fa9e4066Sahrens {
87*fa9e4066Sahrens 	uintptr_t osv = (uintptr_t)os;
88*fa9e4066Sahrens 	uint64_t crc = -1ULL;
89*fa9e4066Sahrens 
90*fa9e4066Sahrens 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
91*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
92*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
93*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
94*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
95*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
96*fa9e4066Sahrens 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
97*fa9e4066Sahrens 
98*fa9e4066Sahrens 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
99*fa9e4066Sahrens 
100*fa9e4066Sahrens 	return (crc);
101*fa9e4066Sahrens }
102*fa9e4066Sahrens 
/*
 * Hash a dbuf's identity.  Note: no trailing semicolon -- the original
 * definition ended in one, so every use expanded to a stray empty
 * statement (e.g. "uint64_t hv = dbuf_hash(...);;"), which is invalid
 * between declarations in C90 and a macro-hygiene bug in general
 * (CERT PRE11-C).  Arguments are parenthesized against operator-
 * precedence surprises at expansion sites.
 */
#define	DBUF_HASH(os, obj, level, blkid) \
	dbuf_hash((os), (obj), (level), (blkid))

/* True iff dbuf's identity matches the (os, obj, level, blkid) tuple. */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
110*fa9e4066Sahrens 
/*
 * Look up the dbuf identified by (dn's objset, dn's object, level, blkid)
 * in the global dbuf hash table.  If a matching dbuf with at least one
 * hold is found, it is returned with its db_mtx held; otherwise NULL.
 */
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			/*
			 * Only return a dbuf that is actively held; a
			 * zero-hold match is skipped (presumably being
			 * torn down -- note dbuf_hash_remove() asserts
			 * zero holds).  The returned dbuf keeps db_mtx.
			 */
			if (!refcount_is_zero(&db->db_holds)) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}
135*fa9e4066Sahrens 
/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 *
 * Locking contract (note carefully): on successful insert (NULL return)
 * this function returns with db->db_mtx held; when a live duplicate is
 * found, that duplicate is returned with *its* db_mtx held instead.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			/*
			 * A match only counts if it is still held;
			 * an unreferenced duplicate is skipped, same
			 * as in dbuf_find().
			 */
			if (!refcount_is_zero(&dbf->db_holds)) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	/* No live duplicate: link db at the head of its hash chain. */
	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}
174*fa9e4066Sahrens 
/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_dnode != NULL);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	/*
	 * Walk the chain with a pointer-to-pointer so unlinking is a
	 * single store; the ASSERT fires if db is not on its chain
	 * (dbf == NULL means we walked off the end).
	 */
	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}
207*fa9e4066Sahrens 
208*fa9e4066Sahrens static int dbuf_evictable(dmu_buf_impl_t *db);
209*fa9e4066Sahrens static void dbuf_clear(dmu_buf_impl_t *db);
210*fa9e4066Sahrens 
211*fa9e4066Sahrens void
212*fa9e4066Sahrens dbuf_evict(dmu_buf_impl_t *db)
213*fa9e4066Sahrens {
214*fa9e4066Sahrens 	int err;
215*fa9e4066Sahrens 
216*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
217*fa9e4066Sahrens 	err = dbuf_evictable(db);
218*fa9e4066Sahrens 	ASSERT(err == TRUE);
219*fa9e4066Sahrens 	dbuf_clear(db);
220*fa9e4066Sahrens 	dbuf_destroy(db);
221*fa9e4066Sahrens }
222*fa9e4066Sahrens 
223*fa9e4066Sahrens static void
224*fa9e4066Sahrens dbuf_evict_user(dmu_buf_impl_t *db)
225*fa9e4066Sahrens {
226*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
227*fa9e4066Sahrens 
228*fa9e4066Sahrens 	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
229*fa9e4066Sahrens 		return;
230*fa9e4066Sahrens 
231*fa9e4066Sahrens 	if (db->db_d.db_user_data_ptr_ptr)
232*fa9e4066Sahrens 		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
233*fa9e4066Sahrens 	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
234*fa9e4066Sahrens 	db->db_d.db_user_ptr = NULL;
235*fa9e4066Sahrens 	db->db_d.db_user_data_ptr_ptr = NULL;
236*fa9e4066Sahrens 	db->db_d.db_evict_func = NULL;
237*fa9e4066Sahrens }
238*fa9e4066Sahrens 
/*
 * One-time module setup: size and allocate the global dbuf hash table,
 * create the dmu_buf_impl_t kmem cache, the dbuf taskq, and the
 * per-bucket-group hash mutexes.
 */
void
dbuf_init(void)
{
	uint64_t hsize = 1;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64k block size.  The table will take up
	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
	 * pointers).  Doubling keeps hsize a power of two so that
	 * (hash & hash_table_mask) works as the bucket index.
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;

	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
	/*
	 * 8 worker threads at maxclsyspri; the 50/INT_MAX values are
	 * presumably the taskq min/max alloc counts -- confirm against
	 * taskq_create(9F) if tuning these.
	 */
	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
	    TASKQ_PREPOPULATE);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}
267*fa9e4066Sahrens 
268*fa9e4066Sahrens void
269*fa9e4066Sahrens dbuf_fini(void)
270*fa9e4066Sahrens {
271*fa9e4066Sahrens 	dbuf_hash_table_t *h = &dbuf_hash_table;
272*fa9e4066Sahrens 	int i;
273*fa9e4066Sahrens 
274*fa9e4066Sahrens 	taskq_destroy(dbuf_tq);
275*fa9e4066Sahrens 	dbuf_tq = NULL;
276*fa9e4066Sahrens 
277*fa9e4066Sahrens 	for (i = 0; i < DBUF_MUTEXES; i++)
278*fa9e4066Sahrens 		mutex_destroy(&h->hash_mutexes[i]);
279*fa9e4066Sahrens 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
280*fa9e4066Sahrens 	kmem_cache_destroy(dbuf_cache);
281*fa9e4066Sahrens }
282*fa9e4066Sahrens 
283*fa9e4066Sahrens /*
284*fa9e4066Sahrens  * Other stuff.
285*fa9e4066Sahrens  */
286*fa9e4066Sahrens 
/*
 * Debug-build consistency checks for a dbuf.  Compiled out entirely
 * unless ZFS_DEBUG is defined, and a no-op at runtime unless
 * ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.  Caller holds db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
#ifdef ZFS_DEBUG
	int i;
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		/* A dnode-less dbuf must be fully disconnected. */
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		/* The dbuf's identity must agree with its dnode. */
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT(list_head(&dn->dn_dbufs));
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		void **udpp = db->db_d.db_user_data_ptr_ptr;
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (udpp) {
			/*
			 * The user's data pointer tracks db_data while
			 * holds exist, and is NULL once all holds drop
			 * (see dbuf_update_data() / dbuf_evict_user()).
			 */
			ASSERT((refcount_is_zero(&db->db_holds) &&
			    *udpp == NULL) ||
			    (!refcount_is_zero(&db->db_holds) &&
			    *udpp == db->db.db_data));
		}

		if (IS_DNODE_DNODE(db->db.db_object)) {
			for (i = 0; i < TXG_SIZE; i++) {
				/*
				 * it should only be modified in syncing
				 * context, so make sure we only have
				 * one copy of the data.
				 */
				ASSERT(db->db_d.db_data_old[i] == NULL ||
				    db->db_d.db_data_old[i] == db->db_buf);
			}
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (IS_DNODE_DNODE(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			/* Scan the buffer 8 bytes at a time; all zero. */
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
#endif
}
391*fa9e4066Sahrens 
392*fa9e4066Sahrens static void
393*fa9e4066Sahrens dbuf_update_data(dmu_buf_impl_t *db)
394*fa9e4066Sahrens {
395*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
396*fa9e4066Sahrens 	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
397*fa9e4066Sahrens 		ASSERT(!refcount_is_zero(&db->db_holds));
398*fa9e4066Sahrens 		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
399*fa9e4066Sahrens 	}
400*fa9e4066Sahrens }
401*fa9e4066Sahrens 
402*fa9e4066Sahrens static void
403*fa9e4066Sahrens dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
404*fa9e4066Sahrens {
405*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
406*fa9e4066Sahrens 	ASSERT(buf->b_data != NULL);
407*fa9e4066Sahrens 	db->db_buf = buf;
408*fa9e4066Sahrens 	db->db.db_data = buf->b_data;
409*fa9e4066Sahrens 	dbuf_update_data(db);
410*fa9e4066Sahrens }
411*fa9e4066Sahrens 
412*fa9e4066Sahrens uint64_t
413*fa9e4066Sahrens dbuf_whichblock(dnode_t *dn, uint64_t offset)
414*fa9e4066Sahrens {
415*fa9e4066Sahrens 	if (dn->dn_datablkshift) {
416*fa9e4066Sahrens 		return (offset >> dn->dn_datablkshift);
417*fa9e4066Sahrens 	} else {
418*fa9e4066Sahrens 		ASSERT3U(offset, <, dn->dn_datablksz);
419*fa9e4066Sahrens 		return (0);
420*fa9e4066Sahrens 	}
421*fa9e4066Sahrens }
422*fa9e4066Sahrens 
/*
 * arc_read() completion callback: move the dbuf from DB_READ to
 * DB_CACHED (or back to DB_UNCACHED on I/O error) and wake any threads
 * blocked on db_changed.
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		db->db_d.db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		/* Successful read: hand the arc buffer to the dbuf. */
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* I/O error: free the buffer and revert to uncached. */
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		arc_buf_free(buf, db);
		db->db_state = DB_UNCACHED;
		ASSERT3P(db->db_buf, ==, NULL);
	}
	/* Wake waiters in dbuf_read_generic()/dbuf_noread(). */
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
}
454*fa9e4066Sahrens 
/*
 * Start filling in db's contents: from the dnode's bonus area, by
 * zeroing (hole or freed block), or by issuing an arc_read().  Bonus
 * and hole cases complete synchronously in DB_CACHED; the arc_read()
 * path leaves the dbuf in DB_READ, completed by dbuf_read_done().
 */
void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	arc_buf_t *buf;
	blkptr_t *bp;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * prefetch only data blocks (level 0) -- don't prefetch indirect
	 * blocks
	 */
	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
		flags |= DB_RF_NOPREFETCH;
	}

	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
		    db->db.db_size);
	}

	/*
	 * NOTE(review): unlocked state check -- relies on DB_CACHED being
	 * stable while our hold is outstanding (same rationale as the
	 * comment in dbuf_read_generic()).
	 */
	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		return;
	}

	mutex_enter(&db->db_mtx);

	/* Another thread may have started (or finished) the read already. */
	if (db->db_state != DB_UNCACHED) {
		mutex_exit(&db->db_mtx);
		return;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/*
		 * Bonus buffers are copied out of dn_phys rather than read
		 * from disk; the buffer is allocated at DN_MAX_BONUSLEN and
		 * zero-padded past the dnode's actual bonus length.
		 */
		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    DN_MAX_BONUSLEN, db);
		if (db->db.db_size < DN_MAX_BONUSLEN)
			bzero(buf->b_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
		    db->db.db_size);
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/* A freed level-0 block reads back as if it were a hole. */
	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
		bp = NULL;
	else
		bp = db->db_blkptr;

	if (bp == NULL)
		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
	else
		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");

	if (bp == NULL || BP_IS_HOLE(bp)) {
		/* Hole: satisfy the read with a zero-filled buffer. */
		ASSERT(bp == NULL || BP_IS_HOLE(bp));
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    db->db_level > 0 ? byteswap_uint64_array :
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    ARC_NOWAIT);
}
537*fa9e4066Sahrens 
/*
 * Common read entry point: issue the read (if needed) under a root zio,
 * wait for it, then block until the dbuf reaches DB_CACHED.  Returns 0
 * on success or the zio error (only possible via DB_RF_CANFAIL callers).
 */
static int
dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
{
	zio_t *zio;
	int err;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));
	if (db->db_state == DB_CACHED)
		return (0);

	if (db->db_state == DB_UNCACHED) {
		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
		/* Take struct_rwlock unless the caller already holds it. */
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
		dbuf_read_impl(db, zio, flags);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
		err = zio_wait(zio);
		if (err)
			return (err);
	}

	/* Wait out any read or fill still in progress on another thread. */
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
		ASSERT(db->db_state == DB_READ ||
		    (flags & DB_RF_HAVESTRUCT) == 0);
		cv_wait(&db->db_changed, &db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	mutex_exit(&db->db_mtx);

	return (0);
}
576*fa9e4066Sahrens 
577*fa9e4066Sahrens #pragma weak dmu_buf_read = dbuf_read
578*fa9e4066Sahrens void
579*fa9e4066Sahrens dbuf_read(dmu_buf_impl_t *db)
580*fa9e4066Sahrens {
581*fa9e4066Sahrens 	int err;
582*fa9e4066Sahrens 
583*fa9e4066Sahrens 	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
584*fa9e4066Sahrens 	ASSERT(err == 0);
585*fa9e4066Sahrens }
586*fa9e4066Sahrens 
587*fa9e4066Sahrens #pragma weak dmu_buf_read_canfail = dbuf_read_canfail
588*fa9e4066Sahrens int
589*fa9e4066Sahrens dbuf_read_canfail(dmu_buf_impl_t *db)
590*fa9e4066Sahrens {
591*fa9e4066Sahrens 	return (dbuf_read_generic(db, DB_RF_CANFAIL));
592*fa9e4066Sahrens }
593*fa9e4066Sahrens 
594*fa9e4066Sahrens void
595*fa9e4066Sahrens dbuf_read_havestruct(dmu_buf_impl_t *db)
596*fa9e4066Sahrens {
597*fa9e4066Sahrens 	int err;
598*fa9e4066Sahrens 
599*fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
600*fa9e4066Sahrens 	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
601*fa9e4066Sahrens 	ASSERT(err == 0);
602*fa9e4066Sahrens }
603*fa9e4066Sahrens 
/*
 * Prepare db to be overwritten without reading it from disk: wait for
 * any in-flight read/fill, then, if it is still uncached, attach a
 * fresh buffer and move to DB_FILL.  An already-cached dbuf is left
 * alone.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		/* Bonus buffers are always allocated at full size. */
		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
		    DN_MAX_BONUSLEN : db->db.db_size;
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    blksz, db));
		db->db_state = DB_FILL;
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}
623*fa9e4066Sahrens 
624*fa9e4066Sahrens /*
625*fa9e4066Sahrens  * This is our just-in-time copy function.  It makes a copy of
626*fa9e4066Sahrens  * buffers, that have been modified in a previous transaction
627*fa9e4066Sahrens  * group, before we modify them in the current active group.
628*fa9e4066Sahrens  *
629*fa9e4066Sahrens  * This function is used in two places: when we are dirtying a
630*fa9e4066Sahrens  * buffer for the first time in a txg, and when we are freeing
631*fa9e4066Sahrens  * a range in a dnode that includes this buffer.
632*fa9e4066Sahrens  *
633*fa9e4066Sahrens  * Note that when we are called from dbuf_free_range() we do
634*fa9e4066Sahrens  * not put a hold on the buffer, we just traverse the active
635*fa9e4066Sahrens  * dbuf list for the dnode.
636*fa9e4066Sahrens  */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	arc_buf_t **quiescing, **syncing;
	/* Bonus buffers are tracked at their full allocated size. */
	int size = (db->db_blkid == DB_BONUS_BLKID) ?
	    DN_MAX_BONUSLEN : db->db.db_size;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);

	/*
	 * txg is the currently-open txg, so txg-1 indexes the quiescing
	 * txg's old data and txg-2 the syncing txg's.
	 */
	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];

	/*
	 * If this buffer is referenced from the current quiescing
	 * transaction group: either make a copy and reset the reference
	 * to point to the copy, or (if there a no active holders) just
	 * null out the current db_data pointer.
	 */
	if (*quiescing == db->db_buf) {
		/*
		 * If the quiescing txg is "dirty", then we better not
		 * be referencing the same buffer from the syncing txg.
		 */
		ASSERT(*syncing != db->db_buf);
		/* Holds beyond the dirty refs mean someone may read it. */
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			*quiescing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db);
			bcopy(db->db.db_data, (*quiescing)->b_data, size);
		} else {
			/* No readers: the old txg keeps sole ownership. */
			db->db.db_data = NULL;
			db->db_buf = NULL;
			db->db_state = DB_UNCACHED;
		}
		return;
	}

	/*
	 * If this buffer is referenced from the current syncing
	 * transaction group: either
	 *	1 - make a copy and reset the reference, or
	 *	2 - if there are no holders, just null the current db_data.
	 */
	if (*syncing == db->db_buf) {
		ASSERT3P(*quiescing, ==, NULL);
		ASSERT3U(db->db_dirtycnt, ==, 1);
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			/* we can't copy if we have already started a write */
			ASSERT(*syncing != db->db_data_pending);
			*syncing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db);
			bcopy(db->db.db_data, (*syncing)->b_data, size);
		} else {
			db->db.db_data = NULL;
			db->db_buf = NULL;
			db->db_state = DB_UNCACHED;
		}
	}
}
696*fa9e4066Sahrens 
697*fa9e4066Sahrens void
698*fa9e4066Sahrens dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
699*fa9e4066Sahrens {
700*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
701*fa9e4066Sahrens 	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
702*fa9e4066Sahrens 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
703*fa9e4066Sahrens 	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
704*fa9e4066Sahrens 		/* free this block */
705*fa9e4066Sahrens 		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
706*fa9e4066Sahrens 		    db->db_dnode->dn_free_txg == txg);
707*fa9e4066Sahrens 		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
708*fa9e4066Sahrens 			/* XXX can get silent EIO here */
709*fa9e4066Sahrens 			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
710*fa9e4066Sahrens 			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
711*fa9e4066Sahrens 			    NULL, NULL, ARC_WAIT);
712*fa9e4066Sahrens 		}
713*fa9e4066Sahrens 		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
714*fa9e4066Sahrens 		    sizeof (blkptr_t));
715*fa9e4066Sahrens 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
716*fa9e4066Sahrens 		/* release the already-written buffer */
717*fa9e4066Sahrens 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
718*fa9e4066Sahrens 	}
719*fa9e4066Sahrens }
720*fa9e4066Sahrens 
/*
 * Walk all of dn's dbufs and clean up the level-0 buffers that fall in
 * the block range [blkid, blkid+nblks) being freed in tx's txg: undirty
 * them where possible, flag in-flight reads/fills for deferred cleanup,
 * and zero any remaining cached data.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		/* grab the next link first: dbuf_undirty() may evict db */
		db_next = list_next(&dn->dn_dbufs, db);
		/* only level-0, non-bonus buffers can be in the range */
		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;	/* returns 1 when the dbuf was evicted */

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED) {
			/* no cached data to zero out */
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ) {
			/* this will be handled in dbuf_read_done() */
			db->db_d.db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_FILL) {
			/* this will be handled in dbuf_rele() */
			db->db_d.db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}

		/* make a copy of the data if necessary */
		dbuf_fix_old_data(db, txg);

		if (db->db.db_data) {
			/* fill in with appropriate data */
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
		}
		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}
773*fa9e4066Sahrens 
774*fa9e4066Sahrens static int
775*fa9e4066Sahrens dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
776*fa9e4066Sahrens {
777*fa9e4066Sahrens 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
778*fa9e4066Sahrens 	uint64_t birth_txg = 0;
779*fa9e4066Sahrens 
780*fa9e4066Sahrens 	/* Don't count meta-objects */
781*fa9e4066Sahrens 	if (ds == NULL)
782*fa9e4066Sahrens 		return (FALSE);
783*fa9e4066Sahrens 
784*fa9e4066Sahrens 	/*
785*fa9e4066Sahrens 	 * We don't need any locking to protect db_blkptr:
786*fa9e4066Sahrens 	 * If it's syncing, then db_dirtied will be set so we'll
787*fa9e4066Sahrens 	 * ignore db_blkptr.
788*fa9e4066Sahrens 	 */
789*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
790*fa9e4066Sahrens 	/* If we have been dirtied since the last snapshot, its not new */
791*fa9e4066Sahrens 	if (db->db_dirtied)
792*fa9e4066Sahrens 		birth_txg = db->db_dirtied;
793*fa9e4066Sahrens 	else if (db->db_blkptr)
794*fa9e4066Sahrens 		birth_txg = db->db_blkptr->blk_birth;
795*fa9e4066Sahrens 
796*fa9e4066Sahrens 	if (birth_txg)
797*fa9e4066Sahrens 		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
798*fa9e4066Sahrens 	else
799*fa9e4066Sahrens 		return (TRUE);
800*fa9e4066Sahrens }
801*fa9e4066Sahrens 
802*fa9e4066Sahrens void
803*fa9e4066Sahrens dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
804*fa9e4066Sahrens {
805*fa9e4066Sahrens 	arc_buf_t *buf, *obuf;
806*fa9e4066Sahrens 	int osize = db->db.db_size;
807*fa9e4066Sahrens 
808*fa9e4066Sahrens 	/* XXX does *this* func really need the lock? */
809*fa9e4066Sahrens 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
810*fa9e4066Sahrens 
811*fa9e4066Sahrens 	ASSERT3U(osize, <=, size);
812*fa9e4066Sahrens 	if (osize == size)
813*fa9e4066Sahrens 		return;
814*fa9e4066Sahrens 
815*fa9e4066Sahrens 	/*
816*fa9e4066Sahrens 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
817*fa9e4066Sahrens 	 * is OK, because there can be no other references to the db
818*fa9e4066Sahrens 	 * when we are changing its size, so no concurrent DB_FILL can
819*fa9e4066Sahrens 	 * be happening.
820*fa9e4066Sahrens 	 */
821*fa9e4066Sahrens 	/* Make a copy of the data if necessary */
822*fa9e4066Sahrens 	dbuf_will_dirty(db, tx);
823*fa9e4066Sahrens 
824*fa9e4066Sahrens 	/* create the data buffer for the new block */
825*fa9e4066Sahrens 	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
826*fa9e4066Sahrens 
827*fa9e4066Sahrens 	/* copy old block data to the new block */
828*fa9e4066Sahrens 	obuf = db->db_buf;
829*fa9e4066Sahrens 	bcopy(obuf->b_data, buf->b_data, osize);
830*fa9e4066Sahrens 	/* zero the remainder */
831*fa9e4066Sahrens 	bzero((uint8_t *)buf->b_data + osize, size - osize);
832*fa9e4066Sahrens 
833*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
834*fa9e4066Sahrens 	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
835*fa9e4066Sahrens 	dbuf_set_data(db, buf);
836*fa9e4066Sahrens 	arc_buf_free(obuf, db);
837*fa9e4066Sahrens 	db->db.db_size = size;
838*fa9e4066Sahrens 
839*fa9e4066Sahrens 	/* fix up the dirty info */
840*fa9e4066Sahrens 	if (db->db_level == 0)
841*fa9e4066Sahrens 		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
842*fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
843*fa9e4066Sahrens 
844*fa9e4066Sahrens 	dnode_willuse_space(db->db_dnode, size-osize, tx);
845*fa9e4066Sahrens }
846*fa9e4066Sahrens 
/*
 * Mark db as modified in tx's transaction group: make a private copy of
 * the data if it is also dirty in an older txg (level 0 only), link the
 * dbuf onto the dnode's per-txg dirty list, update space accounting,
 * take a hold on behalf of the txg, and recursively dirty the parent
 * indirect block (or the dnode itself) so the change is reachable at
 * sync time.
 */
void
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	dmu_tx_dirty_buf(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!(dmu_tx_is_syncing(tx) &&
	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
	    dn->dn_objset->os_dsl_dataset != NULL &&
	    !dsl_dir_is_private(
	    dn->dn_objset->os_dsl_dataset->ds_dir)));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED ||
	    dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/* XXX make this true for indirects too? */
	ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
	    db->db_state == DB_FILL);

	/*
	 * If this buffer is currently part of an "overridden" region,
	 * we now need to remove it from that region.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_d.db_overridden_by[txgoff] != NULL) {
		dbuf_unoverride(db, tx->tx_txg);
	}

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		/* token allocation whose caller is recorded for debugging */
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	if (list_link_active(&db->db_dirty_node[txgoff])) {
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(&os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_level == 0) {
		/*
		 * Release the data buffer from the cache so that we
		 * can modify it without impacting possible other users
		 * of this cached data block.  Note that indirect blocks
		 * and private objects are not released until the syncing
		 * state (since they are only modified then).
		 *
		 * If this buffer is dirty in an old transaction group we need
		 * to make a copy of it so that the changes we make in this
		 * transaction group won't leak out when we sync the older txg.
		 */
		ASSERT(db->db_buf != NULL);
		ASSERT(db->db.db_data != NULL);
		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
		if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			ASSERT(db->db_buf != NULL);
		}
		db->db_d.db_data_old[txgoff] = db->db_buf;
	}

	mutex_enter(&dn->dn_mtx);
	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		db->db_d.db_freed_in_flight = FALSE;
	}

	db->db_dirtied = tx->tx_txg;
	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
	mutex_exit(&dn->dn_mtx);

	/*
	 * If writing this buffer will consume a new block on disk,
	 * then update the accounting.
	 */
	if (db->db_blkid != DB_BONUS_BLKID) {
		if (!dbuf_new_block(db, tx) && db->db_blkptr) {
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn,
			    -BP_GET_ASIZE(db->db_blkptr), tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	/* the bonus buffer has no parent block: just dirty the dnode */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dnode_setdirty(dn, tx);
		return;
	}

	if (db->db_level == 0)
		dnode_new_blkid(dn, db->db_blkid, tx);

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	/* recursively dirty the parent indirect, or fall back to the dnode */
	if (db->db_level < dn->dn_nlevels-1) {
		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
		dmu_buf_impl_t *parent;
		parent = dbuf_hold_level(dn, db->db_level+1,
		    db->db_blkid >> epbs, FTAG);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		dbuf_dirty(parent, tx);
		dbuf_remove_ref(parent, FTAG);
	} else {
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
}
1043*fa9e4066Sahrens 
/*
 * Undo the work of dbuf_dirty() for tx's transaction group.  Returns 1
 * if dropping the txg's hold left the dbuf with no holds and it was
 * evicted; 0 otherwise (including when the dbuf was not dirty, or is
 * held by someone else and cannot be undirtied).
 */
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);

	mutex_enter(&db->db_mtx);

	/*
	 * If this buffer is not dirty, we're done.
	 */
	if (!list_link_active(&db->db_dirty_node[txgoff])) {
		mutex_exit(&db->db_mtx);
		return (0);
	}

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 * XXX - this check assumes we are being called from
	 * dbuf_free_range(), perhaps we should move it there?
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* record the free in the dnode's range state instead */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	/* discard any block-pointer override (e.g. from dmu_sync()) */
	dbuf_unoverride(db, tx->tx_txg);

	ASSERT(db->db.db_size != 0);
	if (db->db_level == 0) {
		ASSERT(db->db_buf != NULL);
		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
		/* free this txg's private copy of the data, if one was made */
		if (db->db_d.db_data_old[txgoff] != db->db_buf)
			arc_buf_free(db->db_d.db_data_old[txgoff], db);
		db->db_d.db_data_old[txgoff] = NULL;
	}

	/* XXX would be nice to fix up dn_towrite_space[] */
	/* XXX undo db_dirtied? but how? */
	/* db->db_dirtied = tx->tx_txg; */

	mutex_enter(&dn->dn_mtx);
	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
	mutex_exit(&dn->dn_mtx);

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/* drop the hold that dbuf_dirty() took on behalf of this txg */
	if (refcount_remove(&db->db_holds,
	    (void *)(uintptr_t)tx->tx_txg) == 0) {
		/* make dbuf_verify() happy */
		if (db->db.db_data)
			bzero(db->db.db_data, db->db.db_size);

		/* NOTE(review): dbuf_evict() presumably handles db_mtx */
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}
1115*fa9e4066Sahrens 
1116*fa9e4066Sahrens #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1117*fa9e4066Sahrens void
1118*fa9e4066Sahrens dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1119*fa9e4066Sahrens {
1120*fa9e4066Sahrens 	int rf = DB_RF_MUST_SUCCEED;
1121*fa9e4066Sahrens 
1122*fa9e4066Sahrens 	ASSERT(tx->tx_txg != 0);
1123*fa9e4066Sahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
1124*fa9e4066Sahrens 
1125*fa9e4066Sahrens 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1126*fa9e4066Sahrens 		rf |= DB_RF_HAVESTRUCT;
1127*fa9e4066Sahrens 	(void) dbuf_read_generic(db, rf);
1128*fa9e4066Sahrens 	dbuf_dirty(db, tx);
1129*fa9e4066Sahrens }
1130*fa9e4066Sahrens 
1131*fa9e4066Sahrens #pragma weak dmu_buf_will_fill = dbuf_will_fill
1132*fa9e4066Sahrens void
1133*fa9e4066Sahrens dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
1134*fa9e4066Sahrens {
1135*fa9e4066Sahrens 	ASSERT(tx->tx_txg != 0);
1136*fa9e4066Sahrens 	ASSERT(db->db_level == 0);
1137*fa9e4066Sahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
1138*fa9e4066Sahrens 
1139*fa9e4066Sahrens 	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
1140*fa9e4066Sahrens 	    dmu_tx_private_ok(tx));
1141*fa9e4066Sahrens 
1142*fa9e4066Sahrens 	dbuf_noread(db);
1143*fa9e4066Sahrens 	dbuf_dirty(db, tx);
1144*fa9e4066Sahrens }
1145*fa9e4066Sahrens 
1146*fa9e4066Sahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done
1147*fa9e4066Sahrens /* ARGSUSED */
1148*fa9e4066Sahrens void
1149*fa9e4066Sahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1150*fa9e4066Sahrens {
1151*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
1152*fa9e4066Sahrens 	dbuf_verify(db);
1153*fa9e4066Sahrens 
1154*fa9e4066Sahrens 	if (db->db_state == DB_FILL) {
1155*fa9e4066Sahrens 		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
1156*fa9e4066Sahrens 			/* we were freed while filling */
1157*fa9e4066Sahrens 			/* XXX dbuf_undirty? */
1158*fa9e4066Sahrens 			bzero(db->db.db_data, db->db.db_size);
1159*fa9e4066Sahrens 			db->db_d.db_freed_in_flight = FALSE;
1160*fa9e4066Sahrens 		}
1161*fa9e4066Sahrens 		db->db_state = DB_CACHED;
1162*fa9e4066Sahrens 		cv_broadcast(&db->db_changed);
1163*fa9e4066Sahrens 	}
1164*fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
1165*fa9e4066Sahrens }
1166*fa9e4066Sahrens 
1167*fa9e4066Sahrens 
/*
 * Dissociate an unreferenced dbuf from the rest of the system: free
 * its cached data, drop its reference on the parent indirect dbuf and
 * its dnode, and remove it from the dnode's dbuf list and the dbuf
 * hash table.  Caller must hold dn_dbufs_mtx and db_mtx; db_mtx is
 * dropped here before the list/hash manipulation.
 */
static void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_CACHED) {
		/* return the cached data to the ARC */
		ASSERT(db->db_buf != NULL);
		arc_buf_free(db->db_buf, db);
		db->db.db_data = NULL;
		db->db_buf = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (db->db_parent && db->db_parent != dn->dn_dbuf)
		dbuf_remove_ref(db->db_parent, db);

	/* remove from dn_dbufs */
	list_remove(&dn->dn_dbufs, db);

	dnode_rele(dn, db);

	dbuf_hash_remove(db);

	/* sever all remaining links; dbuf_destroy() asserts these are NULL */
	db->db_dnode = NULL;
	db->db_parent = NULL;
	db->db_blkptr = NULL;
}
1209*fa9e4066Sahrens 
1210*fa9e4066Sahrens static int
1211*fa9e4066Sahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1212*fa9e4066Sahrens     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1213*fa9e4066Sahrens {
1214*fa9e4066Sahrens 	int nlevels, epbs;
1215*fa9e4066Sahrens 
1216*fa9e4066Sahrens 	if (dn->dn_phys->dn_nlevels == 0)
1217*fa9e4066Sahrens 		nlevels = 1;
1218*fa9e4066Sahrens 	else
1219*fa9e4066Sahrens 		nlevels = dn->dn_phys->dn_nlevels;
1220*fa9e4066Sahrens 
1221*fa9e4066Sahrens 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1222*fa9e4066Sahrens 
1223*fa9e4066Sahrens 	ASSERT3U(level * epbs, <, 64);
1224*fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1225*fa9e4066Sahrens 	if (blkid == DB_BONUS_BLKID) {
1226*fa9e4066Sahrens 		/* this is the bonus buffer */
1227*fa9e4066Sahrens 		*parentp = NULL;
1228*fa9e4066Sahrens 		*bpp = NULL;
1229*fa9e4066Sahrens 		return (0);
1230*fa9e4066Sahrens 	} else if (level >= nlevels ||
1231*fa9e4066Sahrens 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1232*fa9e4066Sahrens 		/* the buffer has no parent yet */
1233*fa9e4066Sahrens 		*parentp = NULL;
1234*fa9e4066Sahrens 		*bpp = NULL;
1235*fa9e4066Sahrens 		return (ENOENT);
1236*fa9e4066Sahrens 	} else if (level < nlevels-1) {
1237*fa9e4066Sahrens 		/* this block is referenced from an indirect block */
1238*fa9e4066Sahrens 		int err = dbuf_hold_impl(dn, level+1,
1239*fa9e4066Sahrens 		    blkid >> epbs, fail_sparse, NULL, parentp);
1240*fa9e4066Sahrens 		if (err)
1241*fa9e4066Sahrens 			return (err);
1242*fa9e4066Sahrens 		dbuf_read_havestruct(*parentp);
1243*fa9e4066Sahrens 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1244*fa9e4066Sahrens 		    (blkid & ((1ULL << epbs) - 1));
1245*fa9e4066Sahrens 		return (0);
1246*fa9e4066Sahrens 	} else {
1247*fa9e4066Sahrens 		/* the block is referenced from the dnode */
1248*fa9e4066Sahrens 		ASSERT3U(level, ==, nlevels-1);
1249*fa9e4066Sahrens 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1250*fa9e4066Sahrens 		    blkid < dn->dn_phys->dn_nblkptr);
1251*fa9e4066Sahrens 		*parentp = dn->dn_dbuf;
1252*fa9e4066Sahrens 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1253*fa9e4066Sahrens 		return (0);
1254*fa9e4066Sahrens 	}
1255*fa9e4066Sahrens }
1256*fa9e4066Sahrens 
/*
 * Allocate and initialize a new dbuf for block (level, blkid) of dn,
 * insert it into the dbuf hash table and the dnode's dbuf list, and
 * hook it up to its parent and block pointer.  If another thread races
 * us into the hash table, the fresh dbuf is discarded and the winner's
 * dbuf is returned instead.  Caller must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_state = DB_UNCACHED;

	if (db->db_blkid == DB_BONUS_BLKID) {
		/* the bonus buffer's size comes from the dnode itself */
		db->db.db_size = dn->dn_bonuslen;
		db->db.db_offset = DB_BONUS_BLKID;
	} else {
		/* indirect blocks are 1<<dn_indblkshift; data blocks vary */
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	db->db_dirtied = 0;
	db->db_dirtycnt = 0;

	bzero(&db->db_d, sizeof (db->db_d));

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	mutex_exit(&dn->dn_dbufs_mtx);

	/* a child dbuf holds a reference on its parent indirect dbuf */
	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	/* the dbuf itself holds a reference on its dnode */
	(void) refcount_add(&dn->dn_holds, db);

	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
1320*fa9e4066Sahrens 
1321*fa9e4066Sahrens static int
1322*fa9e4066Sahrens dbuf_evictable(dmu_buf_impl_t *db)
1323*fa9e4066Sahrens {
1324*fa9e4066Sahrens 	int i;
1325*fa9e4066Sahrens 
1326*fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&db->db_mtx));
1327*fa9e4066Sahrens 	dbuf_verify(db);
1328*fa9e4066Sahrens 
1329*fa9e4066Sahrens 	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
1330*fa9e4066Sahrens 		return (FALSE);
1331*fa9e4066Sahrens 
1332*fa9e4066Sahrens 	if (!refcount_is_zero(&db->db_holds))
1333*fa9e4066Sahrens 		return (FALSE);
1334*fa9e4066Sahrens 
1335*fa9e4066Sahrens #ifdef ZFS_DEBUG
1336*fa9e4066Sahrens 	for (i = 0; i < TXG_SIZE; i++) {
1337*fa9e4066Sahrens 		ASSERT(!list_link_active(&db->db_dirty_node[i]));
1338*fa9e4066Sahrens 		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
1339*fa9e4066Sahrens 	}
1340*fa9e4066Sahrens #endif
1341*fa9e4066Sahrens 
1342*fa9e4066Sahrens 	/*
1343*fa9e4066Sahrens 	 * Now we know we want to free it.
1344*fa9e4066Sahrens 	 * This call must be done last, since it has side effects -
1345*fa9e4066Sahrens 	 * calling the db_evict_func().
1346*fa9e4066Sahrens 	 */
1347*fa9e4066Sahrens 	dbuf_evict_user(db);
1348*fa9e4066Sahrens 	return (TRUE);
1349*fa9e4066Sahrens }
1350*fa9e4066Sahrens 
/*
 * Return a fully-disconnected dbuf to the kmem cache.  The asserts
 * verify that every link to the rest of the system (data, dnode,
 * parent, hash chain, blkptr, pending write) has already been severed
 * (see dbuf_clear()).
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_dnode == NULL);
	ASSERT(db->db_parent == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
}
1365*fa9e4066Sahrens 
1366*fa9e4066Sahrens void
1367*fa9e4066Sahrens dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1368*fa9e4066Sahrens {
1369*fa9e4066Sahrens 	dmu_buf_impl_t *db, *parent = NULL;
1370*fa9e4066Sahrens 	blkptr_t *bp = NULL;
1371*fa9e4066Sahrens 
1372*fa9e4066Sahrens 	ASSERT(blkid != DB_BONUS_BLKID);
1373*fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1374*fa9e4066Sahrens 
1375*fa9e4066Sahrens 	if (dnode_block_freed(dn, blkid))
1376*fa9e4066Sahrens 		return;
1377*fa9e4066Sahrens 
1378*fa9e4066Sahrens 	/* dbuf_find() returns with db_mtx held */
1379*fa9e4066Sahrens 	if (db = dbuf_find(dn, 0, blkid)) {
1380*fa9e4066Sahrens 		/*
1381*fa9e4066Sahrens 		 * This dbuf is already in the cache.  We assume that
1382*fa9e4066Sahrens 		 * it is already CACHED, or else about to be either
1383*fa9e4066Sahrens 		 * read or filled.
1384*fa9e4066Sahrens 		 */
1385*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1386*fa9e4066Sahrens 		return;
1387*fa9e4066Sahrens 	}
1388*fa9e4066Sahrens 
1389*fa9e4066Sahrens 	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
1390*fa9e4066Sahrens 		if (bp && !BP_IS_HOLE(bp)) {
1391*fa9e4066Sahrens 			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
1392*fa9e4066Sahrens 			    dmu_ot[dn->dn_type].ot_byteswap,
1393*fa9e4066Sahrens 			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1394*fa9e4066Sahrens 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1395*fa9e4066Sahrens 			    (ARC_NOWAIT | ARC_PREFETCH));
1396*fa9e4066Sahrens 		}
1397*fa9e4066Sahrens 		if (parent && parent != dn->dn_dbuf)
1398*fa9e4066Sahrens 			dbuf_rele(parent);
1399*fa9e4066Sahrens 	}
1400*fa9e4066Sahrens }
1401*fa9e4066Sahrens 
1402*fa9e4066Sahrens /*
1403*fa9e4066Sahrens  * Returns with db_holds incremented, and db_mtx not held.
1404*fa9e4066Sahrens  * Note: dn_struct_rwlock must be held.
1405*fa9e4066Sahrens  */
1406*fa9e4066Sahrens int
1407*fa9e4066Sahrens dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1408*fa9e4066Sahrens     void *tag, dmu_buf_impl_t **dbp)
1409*fa9e4066Sahrens {
1410*fa9e4066Sahrens 	dmu_buf_impl_t *db, *parent = NULL;
1411*fa9e4066Sahrens 
1412*fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1413*fa9e4066Sahrens 	ASSERT3U(dn->dn_nlevels, >, level);
1414*fa9e4066Sahrens 
1415*fa9e4066Sahrens 	*dbp = NULL;
1416*fa9e4066Sahrens 
1417*fa9e4066Sahrens 	/* dbuf_find() returns with db_mtx held */
1418*fa9e4066Sahrens 	db = dbuf_find(dn, level, blkid);
1419*fa9e4066Sahrens 
1420*fa9e4066Sahrens 	if (db == NULL) {
1421*fa9e4066Sahrens 		blkptr_t *bp = NULL;
1422*fa9e4066Sahrens 		int err;
1423*fa9e4066Sahrens 
1424*fa9e4066Sahrens 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1425*fa9e4066Sahrens 		if (fail_sparse) {
1426*fa9e4066Sahrens 			if (err == 0 && bp && BP_IS_HOLE(bp))
1427*fa9e4066Sahrens 				err = ENOENT;
1428*fa9e4066Sahrens 			if (err) {
1429*fa9e4066Sahrens 				if (parent && parent != dn->dn_dbuf)
1430*fa9e4066Sahrens 					dbuf_rele(parent);
1431*fa9e4066Sahrens 				return (err);
1432*fa9e4066Sahrens 			}
1433*fa9e4066Sahrens 		}
1434*fa9e4066Sahrens 		db = dbuf_create(dn, level, blkid, parent, bp);
1435*fa9e4066Sahrens 	}
1436*fa9e4066Sahrens 
1437*fa9e4066Sahrens 	/*
1438*fa9e4066Sahrens 	 * If this buffer is currently syncing out, and we are
1439*fa9e4066Sahrens 	 * are still referencing it from db_data, we need to make
1440*fa9e4066Sahrens 	 * a copy of it in case we decide we want to dirty it
1441*fa9e4066Sahrens 	 * again in this txg.
1442*fa9e4066Sahrens 	 */
1443*fa9e4066Sahrens 	if (db->db_level == 0 && db->db_state == DB_CACHED &&
1444*fa9e4066Sahrens 	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
1445*fa9e4066Sahrens 	    db->db_data_pending == db->db_buf) {
1446*fa9e4066Sahrens 		int size = (db->db_blkid == DB_BONUS_BLKID) ?
1447*fa9e4066Sahrens 		    DN_MAX_BONUSLEN : db->db.db_size;
1448*fa9e4066Sahrens 
1449*fa9e4066Sahrens 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1450*fa9e4066Sahrens 		    size, db));
1451*fa9e4066Sahrens 		bcopy(db->db_data_pending->b_data, db->db.db_data,
1452*fa9e4066Sahrens 		    db->db.db_size);
1453*fa9e4066Sahrens 	}
1454*fa9e4066Sahrens 
1455*fa9e4066Sahrens 	dbuf_add_ref(db, tag);
1456*fa9e4066Sahrens 	dbuf_update_data(db);
1457*fa9e4066Sahrens 	dbuf_verify(db);
1458*fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
1459*fa9e4066Sahrens 
1460*fa9e4066Sahrens 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1461*fa9e4066Sahrens 	if (parent && parent != dn->dn_dbuf)
1462*fa9e4066Sahrens 		dbuf_rele(parent);
1463*fa9e4066Sahrens 
1464*fa9e4066Sahrens 	ASSERT3P(db->db_dnode, ==, dn);
1465*fa9e4066Sahrens 	ASSERT3U(db->db_blkid, ==, blkid);
1466*fa9e4066Sahrens 	ASSERT3U(db->db_level, ==, level);
1467*fa9e4066Sahrens 	*dbp = db;
1468*fa9e4066Sahrens 
1469*fa9e4066Sahrens 	return (0);
1470*fa9e4066Sahrens }
1471*fa9e4066Sahrens 
1472*fa9e4066Sahrens dmu_buf_impl_t *
1473*fa9e4066Sahrens dbuf_hold(dnode_t *dn, uint64_t blkid)
1474*fa9e4066Sahrens {
1475*fa9e4066Sahrens 	dmu_buf_impl_t *db;
1476*fa9e4066Sahrens 	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
1477*fa9e4066Sahrens 	return (db);
1478*fa9e4066Sahrens }
1479*fa9e4066Sahrens 
1480*fa9e4066Sahrens dmu_buf_impl_t *
1481*fa9e4066Sahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1482*fa9e4066Sahrens {
1483*fa9e4066Sahrens 	dmu_buf_impl_t *db;
1484*fa9e4066Sahrens 	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1485*fa9e4066Sahrens 	return (db);
1486*fa9e4066Sahrens }
1487*fa9e4066Sahrens 
1488*fa9e4066Sahrens dmu_buf_impl_t *
1489*fa9e4066Sahrens dbuf_hold_bonus(dnode_t *dn, void *tag)
1490*fa9e4066Sahrens {
1491*fa9e4066Sahrens 	dmu_buf_impl_t *db;
1492*fa9e4066Sahrens 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1493*fa9e4066Sahrens 	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
1494*fa9e4066Sahrens 	rw_exit(&dn->dn_struct_rwlock);
1495*fa9e4066Sahrens 	return (db);
1496*fa9e4066Sahrens }
1497*fa9e4066Sahrens 
1498*fa9e4066Sahrens void
1499*fa9e4066Sahrens dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1500*fa9e4066Sahrens {
1501*fa9e4066Sahrens 	(void) refcount_add(&db->db_holds, tag);
1502*fa9e4066Sahrens 	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
1503*fa9e4066Sahrens }
1504*fa9e4066Sahrens 
1505*fa9e4066Sahrens void
1506*fa9e4066Sahrens dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
1507*fa9e4066Sahrens {
1508*fa9e4066Sahrens 	int64_t holds;
1509*fa9e4066Sahrens 	dnode_t *dn = db->db_dnode;
1510*fa9e4066Sahrens 	int need_mutex;
1511*fa9e4066Sahrens 
1512*fa9e4066Sahrens 	ASSERT(dn != NULL);
1513*fa9e4066Sahrens 	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
1514*fa9e4066Sahrens 
1515*fa9e4066Sahrens 	if (need_mutex) {
1516*fa9e4066Sahrens 		dnode_add_ref(dn, FTAG);
1517*fa9e4066Sahrens 		mutex_enter(&dn->dn_dbufs_mtx);
1518*fa9e4066Sahrens 	}
1519*fa9e4066Sahrens 
1520*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
1521*fa9e4066Sahrens 	dbuf_verify(db);
1522*fa9e4066Sahrens 
1523*fa9e4066Sahrens 	holds = refcount_remove(&db->db_holds, tag);
1524*fa9e4066Sahrens 
1525*fa9e4066Sahrens 	if (holds == 0) {
1526*fa9e4066Sahrens 		ASSERT3U(db->db_state, !=, DB_FILL);
1527*fa9e4066Sahrens 		if (db->db_level == 0 &&
1528*fa9e4066Sahrens 		    db->db_d.db_user_data_ptr_ptr != NULL)
1529*fa9e4066Sahrens 			*db->db_d.db_user_data_ptr_ptr = NULL;
1530*fa9e4066Sahrens 		dbuf_evict(db);
1531*fa9e4066Sahrens 	} else {
1532*fa9e4066Sahrens 		if (holds == db->db_dirtycnt &&
1533*fa9e4066Sahrens 		    db->db_level == 0 && db->db_d.db_immediate_evict)
1534*fa9e4066Sahrens 			dbuf_evict_user(db);
1535*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1536*fa9e4066Sahrens 	}
1537*fa9e4066Sahrens 
1538*fa9e4066Sahrens 	if (need_mutex) {
1539*fa9e4066Sahrens 		mutex_exit(&dn->dn_dbufs_mtx);
1540*fa9e4066Sahrens 		dnode_rele(dn, FTAG);
1541*fa9e4066Sahrens 	}
1542*fa9e4066Sahrens }
1543*fa9e4066Sahrens 
1544*fa9e4066Sahrens void
1545*fa9e4066Sahrens dbuf_rele(dmu_buf_impl_t *db)
1546*fa9e4066Sahrens {
1547*fa9e4066Sahrens 	dbuf_remove_ref(db, NULL);
1548*fa9e4066Sahrens }
1549*fa9e4066Sahrens 
1550*fa9e4066Sahrens #pragma weak dmu_buf_refcount = dbuf_refcount
1551*fa9e4066Sahrens uint64_t
1552*fa9e4066Sahrens dbuf_refcount(dmu_buf_impl_t *db)
1553*fa9e4066Sahrens {
1554*fa9e4066Sahrens 	return (refcount_count(&db->db_holds));
1555*fa9e4066Sahrens }
1556*fa9e4066Sahrens 
1557*fa9e4066Sahrens void *
1558*fa9e4066Sahrens dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1559*fa9e4066Sahrens     dmu_buf_evict_func_t *evict_func)
1560*fa9e4066Sahrens {
1561*fa9e4066Sahrens 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1562*fa9e4066Sahrens 	    user_data_ptr_ptr, evict_func));
1563*fa9e4066Sahrens }
1564*fa9e4066Sahrens 
1565*fa9e4066Sahrens void *
1566*fa9e4066Sahrens dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1567*fa9e4066Sahrens     dmu_buf_evict_func_t *evict_func)
1568*fa9e4066Sahrens {
1569*fa9e4066Sahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1570*fa9e4066Sahrens 
1571*fa9e4066Sahrens 	db->db_d.db_immediate_evict = TRUE;
1572*fa9e4066Sahrens 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1573*fa9e4066Sahrens 	    user_data_ptr_ptr, evict_func));
1574*fa9e4066Sahrens }
1575*fa9e4066Sahrens 
1576*fa9e4066Sahrens void *
1577*fa9e4066Sahrens dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1578*fa9e4066Sahrens     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1579*fa9e4066Sahrens {
1580*fa9e4066Sahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1581*fa9e4066Sahrens 	ASSERT(db->db_level == 0);
1582*fa9e4066Sahrens 
1583*fa9e4066Sahrens 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
1584*fa9e4066Sahrens 
1585*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
1586*fa9e4066Sahrens 
1587*fa9e4066Sahrens 	if (db->db_d.db_user_ptr == old_user_ptr) {
1588*fa9e4066Sahrens 		db->db_d.db_user_ptr = user_ptr;
1589*fa9e4066Sahrens 		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
1590*fa9e4066Sahrens 		db->db_d.db_evict_func = evict_func;
1591*fa9e4066Sahrens 
1592*fa9e4066Sahrens 		dbuf_update_data(db);
1593*fa9e4066Sahrens 	} else {
1594*fa9e4066Sahrens 		old_user_ptr = db->db_d.db_user_ptr;
1595*fa9e4066Sahrens 	}
1596*fa9e4066Sahrens 
1597*fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
1598*fa9e4066Sahrens 	return (old_user_ptr);
1599*fa9e4066Sahrens }
1600*fa9e4066Sahrens 
1601*fa9e4066Sahrens void *
1602*fa9e4066Sahrens dmu_buf_get_user(dmu_buf_t *db_fake)
1603*fa9e4066Sahrens {
1604*fa9e4066Sahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1605*fa9e4066Sahrens 	ASSERT(!refcount_is_zero(&db->db_holds));
1606*fa9e4066Sahrens 
1607*fa9e4066Sahrens 	return (db->db_d.db_user_ptr);
1608*fa9e4066Sahrens }
1609*fa9e4066Sahrens 
1610*fa9e4066Sahrens void
1611*fa9e4066Sahrens dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
1612*fa9e4066Sahrens {
1613*fa9e4066Sahrens 	arc_buf_t **data;
1614*fa9e4066Sahrens 	uint64_t txg = tx->tx_txg;
1615*fa9e4066Sahrens 	dnode_t *dn = db->db_dnode;
1616*fa9e4066Sahrens 	objset_impl_t *os = dn->dn_objset;
1617*fa9e4066Sahrens 	int blksz;
1618*fa9e4066Sahrens 
1619*fa9e4066Sahrens 	ASSERT(dmu_tx_is_syncing(tx));
1620*fa9e4066Sahrens 
1621*fa9e4066Sahrens 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1622*fa9e4066Sahrens 
1623*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
1624*fa9e4066Sahrens 	/*
1625*fa9e4066Sahrens 	 * To be synced, we must be dirtied.  But we
1626*fa9e4066Sahrens 	 * might have been freed after the dirty.
1627*fa9e4066Sahrens 	 */
1628*fa9e4066Sahrens 	if (db->db_state == DB_UNCACHED) {
1629*fa9e4066Sahrens 		/* This buffer has been freed since it was dirtied */
1630*fa9e4066Sahrens 		ASSERT(db->db.db_data == NULL);
1631*fa9e4066Sahrens 	} else if (db->db_state == DB_FILL) {
1632*fa9e4066Sahrens 		/* This buffer was freed and is now being re-filled */
1633*fa9e4066Sahrens 		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
1634*fa9e4066Sahrens 	} else {
1635*fa9e4066Sahrens 		ASSERT3U(db->db_state, ==, DB_CACHED);
1636*fa9e4066Sahrens 	}
1637*fa9e4066Sahrens 	dbuf_verify(db);
1638*fa9e4066Sahrens 
1639*fa9e4066Sahrens 	/*
1640*fa9e4066Sahrens 	 * Don't need a lock on db_dirty (dn_mtx), because it can't
1641*fa9e4066Sahrens 	 * be modified yet.
1642*fa9e4066Sahrens 	 */
1643*fa9e4066Sahrens 
1644*fa9e4066Sahrens 	if (db->db_level == 0) {
1645*fa9e4066Sahrens 		data = &db->db_d.db_data_old[txg&TXG_MASK];
1646*fa9e4066Sahrens 		blksz = arc_buf_size(*data);
1647*fa9e4066Sahrens 		/*
1648*fa9e4066Sahrens 		 * If this buffer is currently "in use" (i.e., there are
1649*fa9e4066Sahrens 		 * active holds and db_data still references it), then make
1650*fa9e4066Sahrens 		 * a copy before we start the write so that any modifications
1651*fa9e4066Sahrens 		 * from the open txg will not leak into this write.
1652*fa9e4066Sahrens 		 *
1653*fa9e4066Sahrens 		 * NOTE: this copy does not need to be made for objects only
1654*fa9e4066Sahrens 		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
1655*fa9e4066Sahrens 		 * or if there is no actual write involved (bonus blocks).
1656*fa9e4066Sahrens 		 */
1657*fa9e4066Sahrens 		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
1658*fa9e4066Sahrens 		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
1659*fa9e4066Sahrens 		    db->db_blkid != DB_BONUS_BLKID) {
1660*fa9e4066Sahrens 			if (refcount_count(&db->db_holds) > 1 &&
1661*fa9e4066Sahrens 			    *data == db->db_buf) {
1662*fa9e4066Sahrens 				*data = arc_buf_alloc(
1663*fa9e4066Sahrens 				    db->db_dnode->dn_objset->os_spa, blksz, db);
1664*fa9e4066Sahrens 				bcopy(db->db.db_data, (*data)->b_data, blksz);
1665*fa9e4066Sahrens 			}
1666*fa9e4066Sahrens 			db->db_data_pending = *data;
1667*fa9e4066Sahrens 		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
1668*fa9e4066Sahrens 			/*
1669*fa9e4066Sahrens 			 * Private object buffers are released here rather
1670*fa9e4066Sahrens 			 * than in dbuf_dirty() since they are only modified
1671*fa9e4066Sahrens 			 * in the syncing context and we don't want the
1672*fa9e4066Sahrens 			 * overhead of making multiple copies of the data.
1673*fa9e4066Sahrens 			 */
1674*fa9e4066Sahrens 			arc_release(db->db_buf, db);
1675*fa9e4066Sahrens 		}
1676*fa9e4066Sahrens 	} else {
1677*fa9e4066Sahrens 		data = &db->db_buf;
1678*fa9e4066Sahrens 		if (*data == NULL) {
1679*fa9e4066Sahrens 			/*
1680*fa9e4066Sahrens 			 * This can happen if we dirty and then free
1681*fa9e4066Sahrens 			 * the level-0 data blocks in the same txg. So
1682*fa9e4066Sahrens 			 * this indirect remains unchanged.
1683*fa9e4066Sahrens 			 */
1684*fa9e4066Sahrens 			if (db->db_dirtied == txg)
1685*fa9e4066Sahrens 				db->db_dirtied = 0;
1686*fa9e4066Sahrens 			ASSERT(db->db_dirtycnt > 0);
1687*fa9e4066Sahrens 			db->db_dirtycnt -= 1;
1688*fa9e4066Sahrens 			mutex_exit(&db->db_mtx);
1689*fa9e4066Sahrens 			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
1690*fa9e4066Sahrens 			return;
1691*fa9e4066Sahrens 		}
1692*fa9e4066Sahrens 		blksz = db->db.db_size;
1693*fa9e4066Sahrens 		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
1694*fa9e4066Sahrens 	}
1695*fa9e4066Sahrens 
1696*fa9e4066Sahrens 	ASSERT(*data != NULL);
1697*fa9e4066Sahrens 
1698*fa9e4066Sahrens 	if (db->db_blkid == DB_BONUS_BLKID) {
1699*fa9e4066Sahrens 		/*
1700*fa9e4066Sahrens 		 * Simply copy the bonus data into the dnode.  It will
1701*fa9e4066Sahrens 		 * be written out when the dnode is synced (and it will
1702*fa9e4066Sahrens 		 * be synced, since it must have been dirty for dbuf_sync
1703*fa9e4066Sahrens 		 * to be called).  The bonus data will be byte swapped
1704*fa9e4066Sahrens 		 * in dnode_byteswap.
1705*fa9e4066Sahrens 		 */
1706*fa9e4066Sahrens 		/*
1707*fa9e4066Sahrens 		 * Use dn_phys->dn_bonuslen since db.db_size is the length
1708*fa9e4066Sahrens 		 * of the bonus buffer in the open transaction rather than
1709*fa9e4066Sahrens 		 * the syncing transaction.
1710*fa9e4066Sahrens 		 */
1711*fa9e4066Sahrens 		ASSERT3U(db->db_level, ==, 0);
1712*fa9e4066Sahrens 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
1713*fa9e4066Sahrens 		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
1714*fa9e4066Sahrens 		    dn->dn_phys->dn_bonuslen);
1715*fa9e4066Sahrens 		if (*data != db->db_buf)
1716*fa9e4066Sahrens 			arc_buf_free(*data, db);
1717*fa9e4066Sahrens 		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
1718*fa9e4066Sahrens 		db->db_data_pending = NULL;
1719*fa9e4066Sahrens 		if (db->db_dirtied == txg)
1720*fa9e4066Sahrens 			db->db_dirtied = 0;
1721*fa9e4066Sahrens 		ASSERT(db->db_dirtycnt > 0);
1722*fa9e4066Sahrens 		db->db_dirtycnt -= 1;
1723*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1724*fa9e4066Sahrens 		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
1725*fa9e4066Sahrens 		return;
1726*fa9e4066Sahrens 	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
1727*fa9e4066Sahrens 		/*
1728*fa9e4066Sahrens 		 * This indirect buffer was marked dirty, but
1729*fa9e4066Sahrens 		 * never modified (if it had been modified, then
1730*fa9e4066Sahrens 		 * we would have released the buffer).  There is
1731*fa9e4066Sahrens 		 * no reason to write anything.
1732*fa9e4066Sahrens 		 */
1733*fa9e4066Sahrens 		db->db_data_pending = NULL;
1734*fa9e4066Sahrens 		if (db->db_dirtied == txg)
1735*fa9e4066Sahrens 			db->db_dirtied = 0;
1736*fa9e4066Sahrens 		ASSERT(db->db_dirtycnt > 0);
1737*fa9e4066Sahrens 		db->db_dirtycnt -= 1;
1738*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1739*fa9e4066Sahrens 		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
1740*fa9e4066Sahrens 		return;
1741*fa9e4066Sahrens 	} else if (db->db_blkptr == NULL &&
1742*fa9e4066Sahrens 	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
1743*fa9e4066Sahrens 	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
1744*fa9e4066Sahrens 		/*
1745*fa9e4066Sahrens 		 * This buffer was allocated at a time when there was
1746*fa9e4066Sahrens 		 * no available blkptrs from the dnode, or it was
1747*fa9e4066Sahrens 		 * inappropriate to hook it in (i.e., nlevels mis-match).
1748*fa9e4066Sahrens 		 */
1749*fa9e4066Sahrens 		ASSERT(db->db_blkptr == NULL);
1750*fa9e4066Sahrens 		ASSERT(db->db_parent == NULL);
1751*fa9e4066Sahrens 		db->db_parent = dn->dn_dbuf;
1752*fa9e4066Sahrens 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1753*fa9e4066Sahrens 		dbuf_verify(db);
1754*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1755*fa9e4066Sahrens 	} else if (db->db_blkptr == NULL) {
1756*fa9e4066Sahrens 		dmu_buf_impl_t *parent = db->db_parent;
1757*fa9e4066Sahrens 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1758*fa9e4066Sahrens 
1759*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1760*fa9e4066Sahrens 		ASSERT(dn->dn_phys->dn_nlevels > 1);
1761*fa9e4066Sahrens 		if (parent == NULL) {
1762*fa9e4066Sahrens 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
1763*fa9e4066Sahrens 			(void) dbuf_hold_impl(dn, db->db_level+1,
1764*fa9e4066Sahrens 			    db->db_blkid >> epbs, FALSE, NULL, &parent);
1765*fa9e4066Sahrens 			rw_exit(&dn->dn_struct_rwlock);
1766*fa9e4066Sahrens 			dbuf_add_ref(parent, db);
1767*fa9e4066Sahrens 			db->db_parent = parent;
1768*fa9e4066Sahrens 			dbuf_rele(parent);
1769*fa9e4066Sahrens 		}
1770*fa9e4066Sahrens 		dbuf_read(parent);
1771*fa9e4066Sahrens 	} else {
1772*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1773*fa9e4066Sahrens 	}
1774*fa9e4066Sahrens 
1775*fa9e4066Sahrens 	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
1776*fa9e4066Sahrens 
1777*fa9e4066Sahrens 	if (db->db_parent != dn->dn_dbuf) {
1778*fa9e4066Sahrens 		dmu_buf_impl_t *parent = db->db_parent;
1779*fa9e4066Sahrens 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1780*fa9e4066Sahrens 
1781*fa9e4066Sahrens 		mutex_enter(&db->db_mtx);
1782*fa9e4066Sahrens 		ASSERT(db->db_level == parent->db_level-1);
1783*fa9e4066Sahrens 		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
1784*fa9e4066Sahrens 		/*
1785*fa9e4066Sahrens 		 * We may have read this block after we dirtied it,
1786*fa9e4066Sahrens 		 * so never released it from the cache.
1787*fa9e4066Sahrens 		 */
1788*fa9e4066Sahrens 		arc_release(parent->db_buf, parent);
1789*fa9e4066Sahrens 
1790*fa9e4066Sahrens 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
1791*fa9e4066Sahrens 		    (db->db_blkid & ((1ULL << epbs) - 1));
1792*fa9e4066Sahrens 		dbuf_verify(db);
1793*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1794*fa9e4066Sahrens 	}
1795*fa9e4066Sahrens 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1796*fa9e4066Sahrens 
1797*fa9e4066Sahrens #ifdef ZFS_DEBUG
1798*fa9e4066Sahrens 	if (db->db_parent == dn->dn_dbuf) {
1799*fa9e4066Sahrens 		/*
1800*fa9e4066Sahrens 		 * We don't need to dnode_setdirty(dn) because if we got
1801*fa9e4066Sahrens 		 * here then the parent is already dirty.
1802*fa9e4066Sahrens 		 */
1803*fa9e4066Sahrens 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
1804*fa9e4066Sahrens 		ASSERT3P(db->db_blkptr, ==,
1805*fa9e4066Sahrens 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
1806*fa9e4066Sahrens 	}
1807*fa9e4066Sahrens #endif
1808*fa9e4066Sahrens 	if (db->db_level == 0 &&
1809*fa9e4066Sahrens 	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
1810*fa9e4066Sahrens 		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
1811*fa9e4066Sahrens 		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
1812*fa9e4066Sahrens 		int old_size = BP_GET_ASIZE(db->db_blkptr);
1813*fa9e4066Sahrens 		int new_size = BP_GET_ASIZE(*bpp);
1814*fa9e4066Sahrens 
1815*fa9e4066Sahrens 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
1816*fa9e4066Sahrens 
1817*fa9e4066Sahrens 		dnode_diduse_space(dn, new_size-old_size);
1818*fa9e4066Sahrens 		mutex_enter(&dn->dn_mtx);
1819*fa9e4066Sahrens 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
1820*fa9e4066Sahrens 			dn->dn_phys->dn_maxblkid = db->db_blkid;
1821*fa9e4066Sahrens 		mutex_exit(&dn->dn_mtx);
1822*fa9e4066Sahrens 
1823*fa9e4066Sahrens 		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
1824*fa9e4066Sahrens 		if (!BP_IS_HOLE(db->db_blkptr))
1825*fa9e4066Sahrens 			dsl_dataset_block_kill(os->os_dsl_dataset,
1826*fa9e4066Sahrens 			    db->db_blkptr, os->os_synctx);
1827*fa9e4066Sahrens 
1828*fa9e4066Sahrens 		mutex_enter(&db->db_mtx);
1829*fa9e4066Sahrens 		*db->db_blkptr = **bpp;
1830*fa9e4066Sahrens 		kmem_free(*bpp, sizeof (blkptr_t));
1831*fa9e4066Sahrens 		*bpp = NULL;
1832*fa9e4066Sahrens 
1833*fa9e4066Sahrens 		if (*old != db->db_buf)
1834*fa9e4066Sahrens 			arc_buf_free(*old, db);
1835*fa9e4066Sahrens 		*old = NULL;
1836*fa9e4066Sahrens 		db->db_data_pending = NULL;
1837*fa9e4066Sahrens 
1838*fa9e4066Sahrens 		cv_broadcast(&db->db_changed);
1839*fa9e4066Sahrens 
1840*fa9e4066Sahrens 		ASSERT(db->db_dirtycnt > 0);
1841*fa9e4066Sahrens 		db->db_dirtycnt -= 1;
1842*fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
1843*fa9e4066Sahrens 		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
1844*fa9e4066Sahrens 	} else {
1845*fa9e4066Sahrens 		int checksum, compress;
1846*fa9e4066Sahrens 
1847*fa9e4066Sahrens 		if (db->db_level > 0) {
1848*fa9e4066Sahrens 			/*
1849*fa9e4066Sahrens 			 * XXX -- we should design a compression algorithm
1850*fa9e4066Sahrens 			 * that specializes in arrays of bps.
1851*fa9e4066Sahrens 			 */
1852*fa9e4066Sahrens 			checksum = ZIO_CHECKSUM_FLETCHER_4;
1853*fa9e4066Sahrens 			compress = ZIO_COMPRESS_LZJB;
1854*fa9e4066Sahrens 		} else {
1855*fa9e4066Sahrens 			/*
1856*fa9e4066Sahrens 			 * Allow dnode settings to override objset settings,
1857*fa9e4066Sahrens 			 * except for metadata checksums.
1858*fa9e4066Sahrens 			 */
1859*fa9e4066Sahrens 			if (dmu_ot[dn->dn_type].ot_metadata) {
1860*fa9e4066Sahrens 				checksum = os->os_md_checksum;
1861*fa9e4066Sahrens 				compress = zio_compress_select(dn->dn_compress,
1862*fa9e4066Sahrens 				    os->os_md_compress);
1863*fa9e4066Sahrens 			} else {
1864*fa9e4066Sahrens 				checksum = zio_checksum_select(dn->dn_checksum,
1865*fa9e4066Sahrens 				    os->os_checksum);
1866*fa9e4066Sahrens 				compress = zio_compress_select(dn->dn_compress,
1867*fa9e4066Sahrens 				    os->os_compress);
1868*fa9e4066Sahrens 			}
1869*fa9e4066Sahrens 		}
1870*fa9e4066Sahrens #ifdef ZFS_DEBUG
1871*fa9e4066Sahrens 		if (db->db_parent) {
1872*fa9e4066Sahrens 			ASSERT(list_link_active(
1873*fa9e4066Sahrens 			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
1874*fa9e4066Sahrens 			ASSERT(db->db_parent == dn->dn_dbuf ||
1875*fa9e4066Sahrens 			    db->db_parent->db_level > 0);
1876*fa9e4066Sahrens 			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
1877*fa9e4066Sahrens 			    db->db_level > 0)
1878*fa9e4066Sahrens 				ASSERT(*data == db->db_buf);
1879*fa9e4066Sahrens 		}
1880*fa9e4066Sahrens #endif
1881*fa9e4066Sahrens 		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
1882*fa9e4066Sahrens 		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
1883*fa9e4066Sahrens 		    db->db_blkptr, *data, dbuf_write_done, db,
1884*fa9e4066Sahrens 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
1885*fa9e4066Sahrens 		/*
1886*fa9e4066Sahrens 		 * We can't access db after arc_write, since it could finish
1887*fa9e4066Sahrens 		 * and be freed, and we have no locks on it.
1888*fa9e4066Sahrens 		 */
1889*fa9e4066Sahrens 	}
1890*fa9e4066Sahrens }
1891*fa9e4066Sahrens 
1892*fa9e4066Sahrens struct dbuf_arg {
1893*fa9e4066Sahrens 	objset_impl_t *os;
1894*fa9e4066Sahrens 	blkptr_t bp;
1895*fa9e4066Sahrens };
1896*fa9e4066Sahrens 
1897*fa9e4066Sahrens static void
1898*fa9e4066Sahrens dbuf_do_born(void *arg)
1899*fa9e4066Sahrens {
1900*fa9e4066Sahrens 	struct dbuf_arg *da = arg;
1901*fa9e4066Sahrens 	dsl_dataset_block_born(da->os->os_dsl_dataset,
1902*fa9e4066Sahrens 	    &da->bp, da->os->os_synctx);
1903*fa9e4066Sahrens 	kmem_free(da, sizeof (struct dbuf_arg));
1904*fa9e4066Sahrens }
1905*fa9e4066Sahrens 
1906*fa9e4066Sahrens static void
1907*fa9e4066Sahrens dbuf_do_kill(void *arg)
1908*fa9e4066Sahrens {
1909*fa9e4066Sahrens 	struct dbuf_arg *da = arg;
1910*fa9e4066Sahrens 	dsl_dataset_block_kill(da->os->os_dsl_dataset,
1911*fa9e4066Sahrens 	    &da->bp, da->os->os_synctx);
1912*fa9e4066Sahrens 	kmem_free(da, sizeof (struct dbuf_arg));
1913*fa9e4066Sahrens }
1914*fa9e4066Sahrens 
1915*fa9e4066Sahrens /* ARGSUSED */
1916*fa9e4066Sahrens static void
1917*fa9e4066Sahrens dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
1918*fa9e4066Sahrens {
1919*fa9e4066Sahrens 	dmu_buf_impl_t *db = vdb;
1920*fa9e4066Sahrens 	dnode_t *dn = db->db_dnode;
1921*fa9e4066Sahrens 	objset_impl_t *os = dn->dn_objset;
1922*fa9e4066Sahrens 	uint64_t txg = zio->io_txg;
1923*fa9e4066Sahrens 	uint64_t fill = 0;
1924*fa9e4066Sahrens 	int i;
1925*fa9e4066Sahrens 	int old_size, new_size;
1926*fa9e4066Sahrens 
1927*fa9e4066Sahrens 	ASSERT3U(zio->io_error, ==, 0);
1928*fa9e4066Sahrens 
1929*fa9e4066Sahrens 	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
1930*fa9e4066Sahrens 
1931*fa9e4066Sahrens 	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
1932*fa9e4066Sahrens 	new_size = BP_GET_ASIZE(zio->io_bp);
1933*fa9e4066Sahrens 
1934*fa9e4066Sahrens 	dnode_diduse_space(dn, new_size-old_size);
1935*fa9e4066Sahrens 
1936*fa9e4066Sahrens 	mutex_enter(&db->db_mtx);
1937*fa9e4066Sahrens 
1938*fa9e4066Sahrens 	if (db->db_dirtied == txg)
1939*fa9e4066Sahrens 		db->db_dirtied = 0;
1940*fa9e4066Sahrens 
1941*fa9e4066Sahrens 	if (db->db_level == 0) {
1942*fa9e4066Sahrens 		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
1943*fa9e4066Sahrens 
1944*fa9e4066Sahrens 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
1945*fa9e4066Sahrens 
1946*fa9e4066Sahrens 		if (*old != db->db_buf)
1947*fa9e4066Sahrens 			arc_buf_free(*old, db);
1948*fa9e4066Sahrens 		*old = NULL;
1949*fa9e4066Sahrens 		db->db_data_pending = NULL;
1950*fa9e4066Sahrens 
1951*fa9e4066Sahrens 		mutex_enter(&dn->dn_mtx);
1952*fa9e4066Sahrens 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
1953*fa9e4066Sahrens 		    !BP_IS_HOLE(db->db_blkptr))
1954*fa9e4066Sahrens 			dn->dn_phys->dn_maxblkid = db->db_blkid;
1955*fa9e4066Sahrens 		mutex_exit(&dn->dn_mtx);
1956*fa9e4066Sahrens 
1957*fa9e4066Sahrens 		if (dn->dn_type == DMU_OT_DNODE) {
1958*fa9e4066Sahrens 			dnode_phys_t *dnp = db->db.db_data;
1959*fa9e4066Sahrens 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
1960*fa9e4066Sahrens 			    i--, dnp++) {
1961*fa9e4066Sahrens 				if (dnp->dn_type != DMU_OT_NONE)
1962*fa9e4066Sahrens 					fill++;
1963*fa9e4066Sahrens 			}
1964*fa9e4066Sahrens 		} else {
1965*fa9e4066Sahrens 			if (!BP_IS_HOLE(db->db_blkptr))
1966*fa9e4066Sahrens 				fill = 1;
1967*fa9e4066Sahrens 		}
1968*fa9e4066Sahrens 	} else {
1969*fa9e4066Sahrens 		blkptr_t *bp = db->db.db_data;
1970*fa9e4066Sahrens 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
1971*fa9e4066Sahrens 		if (!BP_IS_HOLE(db->db_blkptr)) {
1972*fa9e4066Sahrens 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
1973*fa9e4066Sahrens 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
1974*fa9e4066Sahrens 			    db->db.db_size);
1975*fa9e4066Sahrens 		}
1976*fa9e4066Sahrens 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
1977*fa9e4066Sahrens 			if (BP_IS_HOLE(bp))
1978*fa9e4066Sahrens 				continue;
1979*fa9e4066Sahrens 			ASSERT3U(BP_GET_LSIZE(bp), ==,
1980*fa9e4066Sahrens 			    db->db_level == 1 ? dn->dn_datablksz :
1981*fa9e4066Sahrens 			    (1<<dn->dn_phys->dn_indblkshift));
1982*fa9e4066Sahrens 			fill += bp->blk_fill;
1983*fa9e4066Sahrens 		}
1984*fa9e4066Sahrens 	}
1985*fa9e4066Sahrens 
1986*fa9e4066Sahrens 	if (!BP_IS_HOLE(db->db_blkptr)) {
1987*fa9e4066Sahrens 		db->db_blkptr->blk_fill = fill;
1988*fa9e4066Sahrens 		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
1989*fa9e4066Sahrens 		BP_SET_LEVEL(db->db_blkptr, db->db_level);
1990*fa9e4066Sahrens 	} else {
1991*fa9e4066Sahrens 		ASSERT3U(fill, ==, 0);
1992*fa9e4066Sahrens 		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
1993*fa9e4066Sahrens 	}
1994*fa9e4066Sahrens 
1995*fa9e4066Sahrens 	dprintf_dbuf_bp(db, db->db_blkptr,
1996*fa9e4066Sahrens 	    "wrote %llu bytes to blkptr:", zio->io_size);
1997*fa9e4066Sahrens 
1998*fa9e4066Sahrens 	ASSERT(db->db_parent == NULL ||
1999*fa9e4066Sahrens 	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
2000*fa9e4066Sahrens 	cv_broadcast(&db->db_changed);
2001*fa9e4066Sahrens 	ASSERT(db->db_dirtycnt > 0);
2002*fa9e4066Sahrens 	db->db_dirtycnt -= 1;
2003*fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
2004*fa9e4066Sahrens 
2005*fa9e4066Sahrens 	/* We must do this after we've set the bp's type and level */
2006*fa9e4066Sahrens 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
2007*fa9e4066Sahrens 	    BP_IDENTITY(&zio->io_bp_orig))) {
2008*fa9e4066Sahrens 		struct dbuf_arg *da;
2009*fa9e4066Sahrens 		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
2010*fa9e4066Sahrens 		da->os = os;
2011*fa9e4066Sahrens 		da->bp = *zio->io_bp;
2012*fa9e4066Sahrens 		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
2013*fa9e4066Sahrens 		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
2014*fa9e4066Sahrens 			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
2015*fa9e4066Sahrens 			da->os = os;
2016*fa9e4066Sahrens 			da->bp = zio->io_bp_orig;
2017*fa9e4066Sahrens 			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
2018*fa9e4066Sahrens 		}
2019*fa9e4066Sahrens 	}
2020*fa9e4066Sahrens 
2021*fa9e4066Sahrens 	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
2022*fa9e4066Sahrens }
2023