xref: /titanic_52/usr/src/uts/common/fs/zfs/dbuf.c (revision 6185db853e024a486ff8837e6784dd290d866112)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dbuf.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/spa.h>
37 #include <sys/zio.h>
38 #include <sys/dmu_zfetch.h>
39 
40 static void dbuf_destroy(dmu_buf_impl_t *db);
41 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
42 static arc_done_func_t dbuf_write_done;
43 
44 int zfs_mdcomp_disable = 0;
45 
46 /*
47  * Global data structures and functions for the dbuf cache.
48  */
49 taskq_t *dbuf_tq;
50 static kmem_cache_t *dbuf_cache;
51 
52 /* ARGSUSED */
53 static int
54 dbuf_cons(void *vdb, void *unused, int kmflag)
55 {
56 	dmu_buf_impl_t *db = vdb;
57 	bzero(db, sizeof (dmu_buf_impl_t));
58 
59 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
60 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
61 	refcount_create(&db->db_holds);
62 	return (0);
63 }
64 
65 /* ARGSUSED */
66 static void
67 dbuf_dest(void *vdb, void *unused)
68 {
69 	dmu_buf_impl_t *db = vdb;
70 	mutex_destroy(&db->db_mtx);
71 	cv_destroy(&db->db_changed);
72 	refcount_destroy(&db->db_holds);
73 }
74 
75 /*
76  * dbuf hash table routines
77  */
78 static dbuf_hash_table_t dbuf_hash_table;
79 
80 static uint64_t dbuf_hash_count;
81 
82 static uint64_t
83 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
84 {
85 	uintptr_t osv = (uintptr_t)os;
86 	uint64_t crc = -1ULL;
87 
88 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
89 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
90 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
91 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
92 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
93 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
94 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
95 
96 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
97 
98 	return (crc);
99 }
100 
/*
 * Hash a dbuf identity.  Expands to a plain expression (no trailing
 * semicolon) so it can be used anywhere a function call could be,
 * including in initializers and larger expressions.
 */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
102 
/*
 * True when dbuf has exactly the given identity: same object number,
 * same objset, same level and same block id (checked in that order).
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	(!((dbuf)->db.db_object != (obj) ||		\
	(dbuf)->db_objset != (os) ||			\
	(dbuf)->db_level != (level) ||			\
	(dbuf)->db_blkid != (blkid)))
108 
109 dmu_buf_impl_t *
110 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
111 {
112 	dbuf_hash_table_t *h = &dbuf_hash_table;
113 	objset_impl_t *os = dn->dn_objset;
114 	uint64_t obj = dn->dn_object;
115 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
116 	uint64_t idx = hv & h->hash_table_mask;
117 	dmu_buf_impl_t *db;
118 
119 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
120 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
121 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
122 			mutex_enter(&db->db_mtx);
123 			if (db->db_state != DB_EVICTING) {
124 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
125 				return (db);
126 			}
127 			mutex_exit(&db->db_mtx);
128 		}
129 	}
130 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
131 	return (NULL);
132 }
133 
134 /*
135  * Insert an entry into the hash table.  If there is already an element
136  * equal to elem in the hash table, then the already existing element
137  * will be returned and the new element will not be inserted.
138  * Otherwise returns NULL.
139  */
140 static dmu_buf_impl_t *
141 dbuf_hash_insert(dmu_buf_impl_t *db)
142 {
143 	dbuf_hash_table_t *h = &dbuf_hash_table;
144 	objset_impl_t *os = db->db_objset;
145 	uint64_t obj = db->db.db_object;
146 	int level = db->db_level;
147 	uint64_t blkid = db->db_blkid;
148 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
149 	uint64_t idx = hv & h->hash_table_mask;
150 	dmu_buf_impl_t *dbf;
151 
152 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
153 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
154 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
155 			mutex_enter(&dbf->db_mtx);
156 			if (dbf->db_state != DB_EVICTING) {
157 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
158 				return (dbf);
159 			}
160 			mutex_exit(&dbf->db_mtx);
161 		}
162 	}
163 
164 	mutex_enter(&db->db_mtx);
165 	db->db_hash_next = h->hash_table[idx];
166 	h->hash_table[idx] = db;
167 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
168 	atomic_add_64(&dbuf_hash_count, 1);
169 
170 	return (NULL);
171 }
172 
173 /*
174  * Remove an entry from the hash table.  This operation will
175  * fail if there are any existing holds on the db.
176  */
177 static void
178 dbuf_hash_remove(dmu_buf_impl_t *db)
179 {
180 	dbuf_hash_table_t *h = &dbuf_hash_table;
181 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
182 	    db->db_level, db->db_blkid);
183 	uint64_t idx = hv & h->hash_table_mask;
184 	dmu_buf_impl_t *dbf, **dbp;
185 
186 	/*
187 	 * We musn't hold db_mtx to maintin lock ordering:
188 	 * DBUF_HASH_MUTEX > db_mtx.
189 	 */
190 	ASSERT(refcount_is_zero(&db->db_holds));
191 	ASSERT(db->db_state == DB_EVICTING);
192 	ASSERT(!MUTEX_HELD(&db->db_mtx));
193 
194 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
195 	dbp = &h->hash_table[idx];
196 	while ((dbf = *dbp) != db) {
197 		dbp = &dbf->db_hash_next;
198 		ASSERT(dbf != NULL);
199 	}
200 	*dbp = db->db_hash_next;
201 	db->db_hash_next = NULL;
202 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
203 	atomic_add_64(&dbuf_hash_count, -1);
204 }
205 
206 static arc_evict_func_t dbuf_do_evict;
207 
208 static void
209 dbuf_evict_user(dmu_buf_impl_t *db)
210 {
211 	ASSERT(MUTEX_HELD(&db->db_mtx));
212 
213 	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
214 		return;
215 
216 	if (db->db_d.db_user_data_ptr_ptr)
217 		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
218 	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
219 	db->db_d.db_user_ptr = NULL;
220 	db->db_d.db_user_data_ptr_ptr = NULL;
221 	db->db_d.db_evict_func = NULL;
222 }
223 
224 void
225 dbuf_evict(dmu_buf_impl_t *db)
226 {
227 	int i;
228 
229 	ASSERT(MUTEX_HELD(&db->db_mtx));
230 	ASSERT(db->db_buf == NULL);
231 
232 #ifdef ZFS_DEBUG
233 	for (i = 0; i < TXG_SIZE; i++) {
234 		ASSERT(!list_link_active(&db->db_dirty_node[i]));
235 		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
236 	}
237 #endif
238 	dbuf_clear(db);
239 	dbuf_destroy(db);
240 }
241 
242 void
243 dbuf_init(void)
244 {
245 	uint64_t hsize = 1ULL << 16;
246 	dbuf_hash_table_t *h = &dbuf_hash_table;
247 	int i;
248 
249 	/*
250 	 * The hash table is big enough to fill all of physical memory
251 	 * with an average 4K block size.  The table will take up
252 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
253 	 */
254 	while (hsize * 4096 < physmem * PAGESIZE)
255 		hsize <<= 1;
256 
257 retry:
258 	h->hash_table_mask = hsize - 1;
259 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
260 	if (h->hash_table == NULL) {
261 		/* XXX - we should really return an error instead of assert */
262 		ASSERT(hsize > (1ULL << 10));
263 		hsize >>= 1;
264 		goto retry;
265 	}
266 
267 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
268 	    sizeof (dmu_buf_impl_t),
269 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
270 	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
271 	    TASKQ_PREPOPULATE);
272 
273 	for (i = 0; i < DBUF_MUTEXES; i++)
274 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
275 }
276 
277 void
278 dbuf_fini(void)
279 {
280 	dbuf_hash_table_t *h = &dbuf_hash_table;
281 	int i;
282 
283 	taskq_destroy(dbuf_tq);
284 	dbuf_tq = NULL;
285 
286 	for (i = 0; i < DBUF_MUTEXES; i++)
287 		mutex_destroy(&h->hash_mutexes[i]);
288 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
289 	kmem_cache_destroy(dbuf_cache);
290 }
291 
292 /*
293  * Other stuff.
294  */
295 
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf against its dnode, its parent
 * and its block pointer.  Only active when ZFS_DEBUG_DBUF_VERIFY is
 * set in zfs_flags; every check is an assertion.
 * Caller must hold db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	int t;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if ((zfs_flags & ZFS_DEBUG_DBUF_VERIFY) == 0)
		return;

	/* identity: must agree with the dnode, if there still is one */
	ASSERT(db->db_objset != NULL);
	if (dn != NULL) {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	} else {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	}

	/* offset and size */
	if (db->db_blkid != DB_BONUS_BLKID) {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	} else {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			/*
			 * The meta-dnode should only be modified in
			 * syncing context, so there must be just one
			 * copy of its data in every txg slot.
			 */
			for (t = 0; t < TXG_SIZE; t++) {
				ASSERT(db->db_d.db_data_old[t] == NULL ||
				    db->db_d.db_data_old[t] == db->db_buf);
			}
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT) {
				ASSERT(db->db_parent == NULL);
			} else {
				ASSERT(db->db_parent != NULL);
			}
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;

			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}

	/*
	 * If the blkptr isn't set but there is data that is not dirty,
	 * the data had better be all zeros; otherwise it would be lost
	 * when this buffer is evicted.
	 */
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg &&
	    db->db_dirtycnt == 0) {
		uint64_t *words = db->db.db_data;

		/* db_size >> 3: walk the buffer as 64-bit words */
		for (t = 0; t < db->db.db_size >> 3; t++) {
			ASSERT(words[t] == 0);
		}
	}
}
#endif
393 
394 static void
395 dbuf_update_data(dmu_buf_impl_t *db)
396 {
397 	ASSERT(MUTEX_HELD(&db->db_mtx));
398 	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
399 		ASSERT(!refcount_is_zero(&db->db_holds));
400 		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
401 	}
402 }
403 
404 static void
405 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
406 {
407 	ASSERT(MUTEX_HELD(&db->db_mtx));
408 	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
409 	db->db_buf = buf;
410 	if (buf != NULL) {
411 		ASSERT(buf->b_data != NULL);
412 		db->db.db_data = buf->b_data;
413 		if (!arc_released(buf))
414 			arc_set_callback(buf, dbuf_do_evict, db);
415 		dbuf_update_data(db);
416 	} else {
417 		dbuf_evict_user(db);
418 		db->db.db_data = NULL;
419 		db->db_state = DB_UNCACHED;
420 	}
421 }
422 
423 uint64_t
424 dbuf_whichblock(dnode_t *dn, uint64_t offset)
425 {
426 	if (dn->dn_datablkshift) {
427 		return (offset >> dn->dn_datablkshift);
428 	} else {
429 		ASSERT3U(offset, <, dn->dn_datablksz);
430 		return (0);
431 	}
432 }
433 
434 static void
435 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
436 {
437 	dmu_buf_impl_t *db = vdb;
438 
439 	mutex_enter(&db->db_mtx);
440 	ASSERT3U(db->db_state, ==, DB_READ);
441 	/*
442 	 * All reads are synchronous, so we must have a hold on the dbuf
443 	 */
444 	ASSERT(refcount_count(&db->db_holds) > 0);
445 	ASSERT(db->db_buf == NULL);
446 	ASSERT(db->db.db_data == NULL);
447 	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
448 		/* we were freed in flight; disregard any error */
449 		arc_release(buf, db);
450 		bzero(buf->b_data, db->db.db_size);
451 		db->db_d.db_freed_in_flight = FALSE;
452 		dbuf_set_data(db, buf);
453 		db->db_state = DB_CACHED;
454 	} else if (zio == NULL || zio->io_error == 0) {
455 		dbuf_set_data(db, buf);
456 		db->db_state = DB_CACHED;
457 	} else {
458 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
459 		ASSERT3P(db->db_buf, ==, NULL);
460 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
461 		db->db_state = DB_UNCACHED;
462 	}
463 	cv_broadcast(&db->db_changed);
464 	mutex_exit(&db->db_mtx);
465 	dbuf_rele(db, NULL);
466 }
467 
468 static void
469 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
470 {
471 	blkptr_t *bp;
472 	zbookmark_t zb;
473 	uint32_t aflags = ARC_NOWAIT;
474 
475 	ASSERT(!refcount_is_zero(&db->db_holds));
476 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
477 	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
478 	ASSERT(MUTEX_HELD(&db->db_mtx));
479 	ASSERT(db->db_state == DB_UNCACHED);
480 	ASSERT(db->db_buf == NULL);
481 
482 	if (db->db_blkid == DB_BONUS_BLKID) {
483 		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
484 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
485 		if (db->db.db_size < DN_MAX_BONUSLEN)
486 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
487 		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
488 		    db->db.db_size);
489 		dbuf_update_data(db);
490 		db->db_state = DB_CACHED;
491 		mutex_exit(&db->db_mtx);
492 		return;
493 	}
494 
495 	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
496 		bp = NULL;
497 	else
498 		bp = db->db_blkptr;
499 
500 	if (bp == NULL)
501 		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
502 	else
503 		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
504 
505 	if (bp == NULL || BP_IS_HOLE(bp)) {
506 		ASSERT(bp == NULL || BP_IS_HOLE(bp));
507 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
508 		    db->db.db_size, db));
509 		bzero(db->db.db_data, db->db.db_size);
510 		db->db_state = DB_CACHED;
511 		*flags |= DB_RF_CACHED;
512 		mutex_exit(&db->db_mtx);
513 		return;
514 	}
515 
516 	db->db_state = DB_READ;
517 	mutex_exit(&db->db_mtx);
518 
519 	zb.zb_objset = db->db_objset->os_dsl_dataset ?
520 	    db->db_objset->os_dsl_dataset->ds_object : 0;
521 	zb.zb_object = db->db.db_object;
522 	zb.zb_level = db->db_level;
523 	zb.zb_blkid = db->db_blkid;
524 
525 	dbuf_add_ref(db, NULL);
526 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
527 	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
528 	    db->db_level > 0 ? byteswap_uint64_array :
529 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
530 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
531 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
532 	    &aflags, &zb);
533 	if (aflags & ARC_CACHED)
534 		*flags |= DB_RF_CACHED;
535 }
536 
537 int
538 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
539 {
540 	int err = 0;
541 	int havepzio = (zio != NULL);
542 	int prefetch;
543 
544 	/*
545 	 * We don't have to hold the mutex to check db_state because it
546 	 * can't be freed while we have a hold on the buffer.
547 	 */
548 	ASSERT(!refcount_is_zero(&db->db_holds));
549 
550 	if ((flags & DB_RF_HAVESTRUCT) == 0)
551 		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
552 
553 	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
554 	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
555 
556 	mutex_enter(&db->db_mtx);
557 	if (db->db_state == DB_CACHED) {
558 		mutex_exit(&db->db_mtx);
559 		if (prefetch)
560 			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
561 			    db->db.db_size, TRUE);
562 		if ((flags & DB_RF_HAVESTRUCT) == 0)
563 			rw_exit(&db->db_dnode->dn_struct_rwlock);
564 	} else if (db->db_state == DB_UNCACHED) {
565 		if (zio == NULL) {
566 			zio = zio_root(db->db_dnode->dn_objset->os_spa,
567 			    NULL, NULL, ZIO_FLAG_CANFAIL);
568 		}
569 		dbuf_read_impl(db, zio, &flags);
570 
571 		/* dbuf_read_impl has dropped db_mtx for us */
572 
573 		if (prefetch)
574 			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
575 			    db->db.db_size, flags & DB_RF_CACHED);
576 
577 		if ((flags & DB_RF_HAVESTRUCT) == 0)
578 			rw_exit(&db->db_dnode->dn_struct_rwlock);
579 
580 		if (!havepzio)
581 			err = zio_wait(zio);
582 	} else {
583 		mutex_exit(&db->db_mtx);
584 		if (prefetch)
585 			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
586 			    db->db.db_size, TRUE);
587 		if ((flags & DB_RF_HAVESTRUCT) == 0)
588 			rw_exit(&db->db_dnode->dn_struct_rwlock);
589 
590 		mutex_enter(&db->db_mtx);
591 		if ((flags & DB_RF_NEVERWAIT) == 0) {
592 			while (db->db_state == DB_READ ||
593 			    db->db_state == DB_FILL) {
594 				ASSERT(db->db_state == DB_READ ||
595 				    (flags & DB_RF_HAVESTRUCT) == 0);
596 				cv_wait(&db->db_changed, &db->db_mtx);
597 			}
598 			if (db->db_state == DB_UNCACHED)
599 				err = EIO;
600 		}
601 		mutex_exit(&db->db_mtx);
602 	}
603 
604 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
605 	return (err);
606 }
607 
608 static void
609 dbuf_noread(dmu_buf_impl_t *db)
610 {
611 	ASSERT(!refcount_is_zero(&db->db_holds));
612 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
613 	mutex_enter(&db->db_mtx);
614 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
615 		cv_wait(&db->db_changed, &db->db_mtx);
616 	if (db->db_state == DB_UNCACHED) {
617 		ASSERT(db->db_buf == NULL);
618 		ASSERT(db->db.db_data == NULL);
619 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
620 		    db->db.db_size, db));
621 		db->db_state = DB_FILL;
622 	} else {
623 		ASSERT3U(db->db_state, ==, DB_CACHED);
624 	}
625 	mutex_exit(&db->db_mtx);
626 }
627 
628 /*
629  * This is our just-in-time copy function.  It makes a copy of
630  * buffers, that have been modified in a previous transaction
631  * group, before we modify them in the current active group.
632  *
633  * This function is used in two places: when we are dirtying a
634  * buffer for the first time in a txg, and when we are freeing
635  * a range in a dnode that includes this buffer.
636  *
637  * Note that when we are called from dbuf_free_range() we do
638  * not put a hold on the buffer, we just traverse the active
639  * dbuf list for the dnode.
640  */
641 static void
642 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
643 {
644 	arc_buf_t **quiescing, **syncing;
645 
646 	ASSERT(MUTEX_HELD(&db->db_mtx));
647 	ASSERT(db->db.db_data != NULL);
648 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
649 
650 	quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
651 	syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
652 
653 	/*
654 	 * If this buffer is referenced from the current quiescing
655 	 * transaction group: either make a copy and reset the reference
656 	 * to point to the copy, or (if there a no active holders) just
657 	 * null out the current db_data pointer.
658 	 */
659 	if (*quiescing == db->db_buf) {
660 		/*
661 		 * If the quiescing txg is "dirty", then we better not
662 		 * be referencing the same buffer from the syncing txg.
663 		 */
664 		ASSERT(*syncing != db->db_buf);
665 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
666 			int size = db->db.db_size;
667 			*quiescing = arc_buf_alloc(
668 			    db->db_dnode->dn_objset->os_spa, size, db);
669 			bcopy(db->db.db_data, (*quiescing)->b_data, size);
670 		} else {
671 			dbuf_set_data(db, NULL);
672 		}
673 		return;
674 	}
675 
676 	/*
677 	 * If this buffer is referenced from the current syncing
678 	 * transaction group: either
679 	 *	1 - make a copy and reset the reference, or
680 	 *	2 - if there are no holders, just null the current db_data.
681 	 */
682 	if (*syncing == db->db_buf) {
683 		ASSERT3P(*quiescing, ==, NULL);
684 		ASSERT3U(db->db_dirtycnt, ==, 1);
685 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
686 			int size = db->db.db_size;
687 			/* we can't copy if we have already started a write */
688 			ASSERT(*syncing != db->db_data_pending);
689 			*syncing = arc_buf_alloc(
690 			    db->db_dnode->dn_objset->os_spa, size, db);
691 			bcopy(db->db.db_data, (*syncing)->b_data, size);
692 		} else {
693 			dbuf_set_data(db, NULL);
694 		}
695 	}
696 }
697 
698 /*
699  * This is the "bonus buffer" version of the above routine
700  */
701 static void
702 dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
703 {
704 	void **quiescing, **syncing;
705 
706 	ASSERT(MUTEX_HELD(&db->db_mtx));
707 	ASSERT(db->db.db_data != NULL);
708 	ASSERT(db->db_blkid == DB_BONUS_BLKID);
709 
710 	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
711 	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
712 
713 	if (*quiescing == db->db.db_data) {
714 		ASSERT(*syncing != db->db.db_data);
715 		*quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
716 		bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
717 	} else if (*syncing == db->db.db_data) {
718 		ASSERT3P(*quiescing, ==, NULL);
719 		ASSERT3U(db->db_dirtycnt, ==, 1);
720 		*syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
721 		bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
722 	}
723 }
724 
725 void
726 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
727 {
728 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
729 	ASSERT(MUTEX_HELD(&db->db_mtx));
730 	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);
731 
732 	if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
733 		/* free this block */
734 		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
735 		    db->db_dnode->dn_free_txg == txg);
736 		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
737 			/* XXX can get silent EIO here */
738 			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
739 			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
740 			    NULL, NULL, ARC_WAIT);
741 		}
742 		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
743 		    sizeof (blkptr_t));
744 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
745 		/* release the already-written buffer */
746 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
747 	}
748 }
749 
750 void
751 dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
752 {
753 	dmu_buf_impl_t *db, *db_next;
754 	uint64_t txg = tx->tx_txg;
755 
756 	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
757 	mutex_enter(&dn->dn_dbufs_mtx);
758 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
759 		db_next = list_next(&dn->dn_dbufs, db);
760 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
761 		if (db->db_level != 0)
762 			continue;
763 		dprintf_dbuf(db, "found buf %s\n", "");
764 		if (db->db_blkid < blkid ||
765 		    db->db_blkid >= blkid+nblks)
766 			continue;
767 
768 		/* found a level 0 buffer in the range */
769 		if (dbuf_undirty(db, tx))
770 			continue;
771 
772 		mutex_enter(&db->db_mtx);
773 		if (db->db_state == DB_UNCACHED ||
774 		    db->db_state == DB_EVICTING) {
775 			ASSERT(db->db.db_data == NULL);
776 			mutex_exit(&db->db_mtx);
777 			continue;
778 		}
779 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
780 			/* will be handled in dbuf_read_done or dbuf_rele */
781 			db->db_d.db_freed_in_flight = TRUE;
782 			mutex_exit(&db->db_mtx);
783 			continue;
784 		}
785 		if (refcount_count(&db->db_holds) == 0) {
786 			ASSERT(db->db_buf);
787 			dbuf_clear(db);
788 			continue;
789 		}
790 		/* The dbuf is CACHED and referenced */
791 
792 		if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
793 			/*
794 			 * This dbuf is not currently dirty.  Either
795 			 * uncache it (if its not referenced in the open
796 			 * context) or reset its contents to empty.
797 			 */
798 			dbuf_fix_old_data(db, txg);
799 		} else {
800 			if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
801 				/*
802 				 * This dbuf is overridden.  Clear that state.
803 				 */
804 				dbuf_unoverride(db, txg);
805 			}
806 			if (db->db_blkid > dn->dn_maxblkid)
807 				dn->dn_maxblkid = db->db_blkid;
808 		}
809 		/* fill in with appropriate data */
810 		if (db->db_state == DB_CACHED) {
811 			ASSERT(db->db.db_data != NULL);
812 			arc_release(db->db_buf, db);
813 			bzero(db->db.db_data, db->db.db_size);
814 		}
815 
816 		mutex_exit(&db->db_mtx);
817 	}
818 	mutex_exit(&dn->dn_dbufs_mtx);
819 }
820 
821 static int
822 dbuf_new_block(dmu_buf_impl_t *db)
823 {
824 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
825 	uint64_t birth_txg = 0;
826 
827 	/* Don't count meta-objects */
828 	if (ds == NULL)
829 		return (FALSE);
830 
831 	/*
832 	 * We don't need any locking to protect db_blkptr:
833 	 * If it's syncing, then db_dirtied will be set so we'll
834 	 * ignore db_blkptr.
835 	 */
836 	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
837 	/* If we have been dirtied since the last snapshot, its not new */
838 	if (db->db_dirtied)
839 		birth_txg = db->db_dirtied;
840 	else if (db->db_blkptr)
841 		birth_txg = db->db_blkptr->blk_birth;
842 
843 	if (birth_txg)
844 		return (!dsl_dataset_block_freeable(ds, birth_txg));
845 	else
846 		return (TRUE);
847 }
848 
849 void
850 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
851 {
852 	arc_buf_t *buf, *obuf;
853 	int osize = db->db.db_size;
854 
855 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
856 
857 	/* XXX does *this* func really need the lock? */
858 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
859 
860 	/*
861 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
862 	 * is OK, because there can be no other references to the db
863 	 * when we are changing its size, so no concurrent DB_FILL can
864 	 * be happening.
865 	 */
866 	/*
867 	 * XXX we should be doing a dbuf_read, checking the return
868 	 * value and returning that up to our callers
869 	 */
870 	dbuf_will_dirty(db, tx);
871 
872 	/* create the data buffer for the new block */
873 	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
874 
875 	/* copy old block data to the new block */
876 	obuf = db->db_buf;
877 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
878 	/* zero the remainder */
879 	if (size > osize)
880 		bzero((uint8_t *)buf->b_data + osize, size - osize);
881 
882 	mutex_enter(&db->db_mtx);
883 	dbuf_set_data(db, buf);
884 	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
885 	db->db.db_size = size;
886 
887 	if (db->db_level == 0)
888 		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
889 	mutex_exit(&db->db_mtx);
890 
891 	dnode_willuse_space(db->db_dnode, size-osize, tx);
892 }
893 
894 void
895 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
896 {
897 	dnode_t *dn = db->db_dnode;
898 	objset_impl_t *os = dn->dn_objset;
899 	int drop_struct_lock = FALSE;
900 	int txgoff = tx->tx_txg & TXG_MASK;
901 
902 	ASSERT(tx->tx_txg != 0);
903 	ASSERT(!refcount_is_zero(&db->db_holds));
904 	DMU_TX_DIRTY_BUF(tx, db);
905 
906 	/*
907 	 * Shouldn't dirty a regular buffer in syncing context.  Private
908 	 * objects may be dirtied in syncing context, but only if they
909 	 * were already pre-dirtied in open context.
910 	 * XXX We may want to prohibit dirtying in syncing context even
911 	 * if they did pre-dirty.
912 	 */
913 	ASSERT(!(dmu_tx_is_syncing(tx) &&
914 	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
915 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
916 	    dn->dn_objset->os_dsl_dataset != NULL &&
917 	    !dsl_dir_is_private(
918 	    dn->dn_objset->os_dsl_dataset->ds_dir)));
919 
920 	/*
921 	 * We make this assert for private objects as well, but after we
922 	 * check if we're already dirty.  They are allowed to re-dirty
923 	 * in syncing context.
924 	 */
925 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
926 	    dn->dn_dirtyctx == DN_UNDIRTIED ||
927 	    dn->dn_dirtyctx ==
928 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
929 
930 	mutex_enter(&db->db_mtx);
931 	/* XXX make this true for indirects too? */
932 	ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
933 	    db->db_state == DB_FILL);
934 
935 	/*
936 	 * If this buffer is currently part of an "overridden" region,
937 	 * we now need to remove it from that region.
938 	 */
939 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
940 	    db->db_d.db_overridden_by[txgoff] != NULL) {
941 		dbuf_unoverride(db, tx->tx_txg);
942 	}
943 
944 	mutex_enter(&dn->dn_mtx);
945 	/*
946 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
947 	 * initialize the objset.
948 	 */
949 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
950 	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
951 		dn->dn_dirtyctx =
952 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
953 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
954 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
955 	}
956 	mutex_exit(&dn->dn_mtx);
957 
958 	/*
959 	 * If this buffer is already dirty, we're done.
960 	 */
961 	if (list_link_active(&db->db_dirty_node[txgoff])) {
962 		mutex_exit(&db->db_mtx);
963 		return;
964 	}
965 
966 	/*
967 	 * Only valid if not already dirty.
968 	 */
969 	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
970 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
971 
972 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
973 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
974 	    dn->dn_phys->dn_nlevels > db->db_level ||
975 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
976 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
977 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
978 
979 	/*
980 	 * We should only be dirtying in syncing context if it's the
981 	 * mos, a spa os, or we're initializing the os.  However, we are
982 	 * allowed to dirty in syncing context provided we already
983 	 * dirtied it in open context.  Hence we must make this
984 	 * assertion only if we're not already dirty.
985 	 */
986 	ASSERT(!dmu_tx_is_syncing(tx) ||
987 	    os->os_dsl_dataset == NULL ||
988 	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
989 	    !BP_IS_HOLE(&os->os_rootbp));
990 	ASSERT(db->db.db_size != 0);
991 
992 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
993 
994 	/*
995 	 * If this buffer is dirty in an old transaction group we need
996 	 * to make a copy of it so that the changes we make in this
997 	 * transaction group won't leak out when we sync the older txg.
998 	 */
999 	if (db->db_blkid == DB_BONUS_BLKID) {
1000 		ASSERT(db->db.db_data != NULL);
1001 		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
1002 		dbuf_fix_old_bonus_data(db, tx->tx_txg);
1003 		db->db_d.db_data_old[txgoff] = db->db.db_data;
1004 	} else if (db->db_level == 0) {
1005 		/*
1006 		 * Release the data buffer from the cache so that we
1007 		 * can modify it without impacting possible other users
1008 		 * of this cached data block.  Note that indirect blocks
1009 		 * and private objects are not released until the syncing
1010 		 * state (since they are only modified then).
1011 		 */
1012 		ASSERT(db->db_buf != NULL);
1013 		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
1014 		if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1015 			arc_release(db->db_buf, db);
1016 			dbuf_fix_old_data(db, tx->tx_txg);
1017 			ASSERT(db->db_buf != NULL);
1018 		}
1019 		db->db_d.db_data_old[txgoff] = db->db_buf;
1020 	}
1021 
1022 	mutex_enter(&dn->dn_mtx);
1023 	/*
1024 	 * We could have been freed_in_flight between the dbuf_noread
1025 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1026 	 * happened after the free.
1027 	 */
1028 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
1029 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1030 		db->db_d.db_freed_in_flight = FALSE;
1031 	}
1032 
1033 	db->db_dirtied = tx->tx_txg;
1034 	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
1035 	mutex_exit(&dn->dn_mtx);
1036 
1037 	if (db->db_blkid != DB_BONUS_BLKID) {
1038 		/*
1039 		 * Update the accounting.
1040 		 */
1041 		if (!dbuf_new_block(db) && db->db_blkptr) {
1042 			/*
1043 			 * This is only a guess -- if the dbuf is dirty
1044 			 * in a previous txg, we don't know how much
1045 			 * space it will use on disk yet.  We should
1046 			 * really have the struct_rwlock to access
1047 			 * db_blkptr, but since this is just a guess,
1048 			 * it's OK if we get an odd answer.
1049 			 */
1050 			dnode_willuse_space(dn,
1051 			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
1052 		}
1053 		dnode_willuse_space(dn, db->db.db_size, tx);
1054 	}
1055 
1056 	/*
1057 	 * This buffer is now part of this txg
1058 	 */
1059 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1060 	db->db_dirtycnt += 1;
1061 	ASSERT3U(db->db_dirtycnt, <=, 3);
1062 
1063 	mutex_exit(&db->db_mtx);
1064 
1065 	if (db->db_blkid == DB_BONUS_BLKID) {
1066 		dnode_setdirty(dn, tx);
1067 		return;
1068 	}
1069 
1070 	if (db->db_level == 0) {
1071 		dnode_new_blkid(dn, db->db_blkid, tx);
1072 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1073 	}
1074 
1075 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1076 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1077 		drop_struct_lock = TRUE;
1078 	}
1079 
1080 	if (db->db_level+1 < dn->dn_nlevels) {
1081 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1082 		dmu_buf_impl_t *parent;
1083 		parent = dbuf_hold_level(dn, db->db_level+1,
1084 		    db->db_blkid >> epbs, FTAG);
1085 		if (drop_struct_lock)
1086 			rw_exit(&dn->dn_struct_rwlock);
1087 		dbuf_dirty(parent, tx);
1088 		dbuf_rele(parent, FTAG);
1089 	} else {
1090 		if (drop_struct_lock)
1091 			rw_exit(&dn->dn_struct_rwlock);
1092 	}
1093 
1094 	dnode_setdirty(dn, tx);
1095 }
1096 
1097 static int
1098 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1099 {
1100 	dnode_t *dn = db->db_dnode;
1101 	int txgoff = tx->tx_txg & TXG_MASK;
1102 	int64_t holds;
1103 
1104 	ASSERT(tx->tx_txg != 0);
1105 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1106 
1107 	mutex_enter(&db->db_mtx);
1108 
1109 	/*
1110 	 * If this buffer is not dirty, we're done.
1111 	 */
1112 	if (!list_link_active(&db->db_dirty_node[txgoff])) {
1113 		mutex_exit(&db->db_mtx);
1114 		return (0);
1115 	}
1116 
1117 	/*
1118 	 * If this buffer is currently held, we cannot undirty
1119 	 * it, since one of the current holders may be in the
1120 	 * middle of an update.  Note that users of dbuf_undirty()
1121 	 * should not place a hold on the dbuf before the call.
1122 	 */
1123 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1124 		mutex_exit(&db->db_mtx);
1125 		/* Make sure we don't toss this buffer at sync phase */
1126 		mutex_enter(&dn->dn_mtx);
1127 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1128 		mutex_exit(&dn->dn_mtx);
1129 		return (0);
1130 	}
1131 
1132 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1133 
1134 	dbuf_unoverride(db, tx->tx_txg);
1135 
1136 	ASSERT(db->db.db_size != 0);
1137 	if (db->db_level == 0) {
1138 		ASSERT(db->db_buf != NULL);
1139 		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
1140 		if (db->db_d.db_data_old[txgoff] != db->db_buf)
1141 			VERIFY(arc_buf_remove_ref(
1142 			    db->db_d.db_data_old[txgoff], db) == 1);
1143 		db->db_d.db_data_old[txgoff] = NULL;
1144 	}
1145 
1146 	/* XXX would be nice to fix up dn_towrite_space[] */
1147 	/* XXX undo db_dirtied? but how? */
1148 	/* db->db_dirtied = tx->tx_txg; */
1149 
1150 	mutex_enter(&dn->dn_mtx);
1151 	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
1152 	mutex_exit(&dn->dn_mtx);
1153 
1154 	ASSERT(db->db_dirtycnt > 0);
1155 	db->db_dirtycnt -= 1;
1156 
1157 	if ((holds = refcount_remove(&db->db_holds,
1158 	    (void *)(uintptr_t)tx->tx_txg)) == 0) {
1159 		arc_buf_t *buf = db->db_buf;
1160 
1161 		ASSERT(arc_released(buf));
1162 		dbuf_set_data(db, NULL);
1163 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
1164 		dbuf_evict(db);
1165 		return (1);
1166 	}
1167 	ASSERT(holds > 0);
1168 
1169 	mutex_exit(&db->db_mtx);
1170 	return (0);
1171 }
1172 
1173 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1174 void
1175 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1176 {
1177 	int rf = DB_RF_MUST_SUCCEED;
1178 
1179 	ASSERT(tx->tx_txg != 0);
1180 	ASSERT(!refcount_is_zero(&db->db_holds));
1181 
1182 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1183 		rf |= DB_RF_HAVESTRUCT;
1184 	(void) dbuf_read(db, NULL, rf);
1185 	dbuf_dirty(db, tx);
1186 }
1187 
1188 void
1189 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1190 {
1191 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1192 
1193 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1194 	ASSERT(tx->tx_txg != 0);
1195 	ASSERT(db->db_level == 0);
1196 	ASSERT(!refcount_is_zero(&db->db_holds));
1197 
1198 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1199 	    dmu_tx_private_ok(tx));
1200 
1201 	dbuf_noread(db);
1202 	dbuf_dirty(db, tx);
1203 }
1204 
1205 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1206 /* ARGSUSED */
1207 void
1208 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1209 {
1210 	mutex_enter(&db->db_mtx);
1211 	DBUF_VERIFY(db);
1212 
1213 	if (db->db_state == DB_FILL) {
1214 		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
1215 			ASSERT(db->db_blkid != DB_BONUS_BLKID);
1216 			/* we were freed while filling */
1217 			/* XXX dbuf_undirty? */
1218 			bzero(db->db.db_data, db->db.db_size);
1219 			db->db_d.db_freed_in_flight = FALSE;
1220 		}
1221 		db->db_state = DB_CACHED;
1222 		cv_broadcast(&db->db_changed);
1223 	}
1224 	mutex_exit(&db->db_mtx);
1225 }
1226 
1227 /*
1228  * "Clear" the contents of this dbuf.  This will mark the dbuf
1229  * EVICTING and clear *most* of its references.  Unfortunetely,
1230  * when we are not holding the dn_dbufs_mtx, we can't clear the
1231  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1232  * in this case.  For callers from the DMU we will usually see:
1233  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1234  * For the arc callback, we will usually see:
1235  * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1236  * Sometimes, though, we will get a mix of these two:
1237  *	DMU: dbuf_clear()->arc_buf_evict()
1238  *	ARC: dbuf_do_evict()->dbuf_destroy()
1239  */
1240 void
1241 dbuf_clear(dmu_buf_impl_t *db)
1242 {
1243 	dnode_t *dn = db->db_dnode;
1244 	dmu_buf_impl_t *parent = db->db_parent;
1245 	dmu_buf_impl_t *dndb = dn->dn_dbuf;
1246 	int dbuf_gone = FALSE;
1247 
1248 	ASSERT(MUTEX_HELD(&db->db_mtx));
1249 	ASSERT(refcount_is_zero(&db->db_holds));
1250 
1251 	dbuf_evict_user(db);
1252 
1253 	if (db->db_state == DB_CACHED) {
1254 		ASSERT(db->db.db_data != NULL);
1255 		if (db->db_blkid == DB_BONUS_BLKID)
1256 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1257 		db->db.db_data = NULL;
1258 		db->db_state = DB_UNCACHED;
1259 	}
1260 
1261 	ASSERT3U(db->db_state, ==, DB_UNCACHED);
1262 	ASSERT(db->db_data_pending == NULL);
1263 
1264 	db->db_state = DB_EVICTING;
1265 	db->db_blkptr = NULL;
1266 
1267 	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1268 		list_remove(&dn->dn_dbufs, db);
1269 		dnode_rele(dn, db);
1270 	}
1271 
1272 	if (db->db_buf)
1273 		dbuf_gone = arc_buf_evict(db->db_buf);
1274 
1275 	if (!dbuf_gone)
1276 		mutex_exit(&db->db_mtx);
1277 
1278 	/*
1279 	 * If this dbuf is referened from an indirect dbuf,
1280 	 * decrement the ref count on the indirect dbuf.
1281 	 */
1282 	if (parent && parent != dndb)
1283 		dbuf_rele(parent, db);
1284 }
1285 
1286 static int
1287 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1288     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1289 {
1290 	int nlevels, epbs;
1291 
1292 	*parentp = NULL;
1293 	*bpp = NULL;
1294 
1295 	ASSERT(blkid != DB_BONUS_BLKID);
1296 
1297 	if (dn->dn_phys->dn_nlevels == 0)
1298 		nlevels = 1;
1299 	else
1300 		nlevels = dn->dn_phys->dn_nlevels;
1301 
1302 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1303 
1304 	ASSERT3U(level * epbs, <, 64);
1305 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1306 	if (level >= nlevels ||
1307 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1308 		/* the buffer has no parent yet */
1309 		return (ENOENT);
1310 	} else if (level < nlevels-1) {
1311 		/* this block is referenced from an indirect block */
1312 		int err = dbuf_hold_impl(dn, level+1,
1313 		    blkid >> epbs, fail_sparse, NULL, parentp);
1314 		if (err)
1315 			return (err);
1316 		err = dbuf_read(*parentp, NULL,
1317 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1318 		if (err) {
1319 			dbuf_rele(*parentp, NULL);
1320 			*parentp = NULL;
1321 			return (err);
1322 		}
1323 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1324 		    (blkid & ((1ULL << epbs) - 1));
1325 		return (0);
1326 	} else {
1327 		/* the block is referenced from the dnode */
1328 		ASSERT3U(level, ==, nlevels-1);
1329 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1330 		    blkid < dn->dn_phys->dn_nblkptr);
1331 		if (dn->dn_dbuf) {
1332 			dbuf_add_ref(dn->dn_dbuf, NULL);
1333 			*parentp = dn->dn_dbuf;
1334 		}
1335 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1336 		return (0);
1337 	}
1338 }
1339 
1340 static dmu_buf_impl_t *
1341 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1342     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1343 {
1344 	objset_impl_t *os = dn->dn_objset;
1345 	dmu_buf_impl_t *db, *odb;
1346 
1347 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1348 	ASSERT(dn->dn_type != DMU_OT_NONE);
1349 
1350 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1351 
1352 	db->db_objset = os;
1353 	db->db.db_object = dn->dn_object;
1354 	db->db_level = level;
1355 	db->db_blkid = blkid;
1356 	db->db_dirtied = 0;
1357 	db->db_dirtycnt = 0;
1358 	db->db_dnode = dn;
1359 	db->db_parent = parent;
1360 	db->db_blkptr = blkptr;
1361 
1362 	bzero(&db->db_d, sizeof (db->db_d));
1363 
1364 	if (blkid == DB_BONUS_BLKID) {
1365 		ASSERT3P(parent, ==, dn->dn_dbuf);
1366 		db->db.db_size = dn->dn_bonuslen;
1367 		db->db.db_offset = DB_BONUS_BLKID;
1368 		db->db_state = DB_UNCACHED;
1369 		/* the bonus dbuf is not placed in the hash table */
1370 		return (db);
1371 	} else {
1372 		int blocksize =
1373 		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1374 		db->db.db_size = blocksize;
1375 		db->db.db_offset = db->db_blkid * blocksize;
1376 	}
1377 
1378 	/*
1379 	 * Hold the dn_dbufs_mtx while we get the new dbuf
1380 	 * in the hash table *and* added to the dbufs list.
1381 	 * This prevents a possible deadlock with someone
1382 	 * trying to look up this dbuf before its added to the
1383 	 * dn_dbufs list.
1384 	 */
1385 	mutex_enter(&dn->dn_dbufs_mtx);
1386 	db->db_state = DB_EVICTING;
1387 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1388 		/* someone else inserted it first */
1389 		kmem_cache_free(dbuf_cache, db);
1390 		mutex_exit(&dn->dn_dbufs_mtx);
1391 		return (odb);
1392 	}
1393 	list_insert_head(&dn->dn_dbufs, db);
1394 	db->db_state = DB_UNCACHED;
1395 	mutex_exit(&dn->dn_dbufs_mtx);
1396 
1397 	if (parent && parent != dn->dn_dbuf)
1398 		dbuf_add_ref(parent, db);
1399 
1400 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1401 	    refcount_count(&dn->dn_holds) > 0);
1402 	(void) refcount_add(&dn->dn_holds, db);
1403 
1404 	dprintf_dbuf(db, "db=%p\n", db);
1405 
1406 	return (db);
1407 }
1408 
1409 static int
1410 dbuf_do_evict(void *private)
1411 {
1412 	arc_buf_t *buf = private;
1413 	dmu_buf_impl_t *db = buf->b_private;
1414 
1415 	if (!MUTEX_HELD(&db->db_mtx))
1416 		mutex_enter(&db->db_mtx);
1417 
1418 	ASSERT(refcount_is_zero(&db->db_holds));
1419 
1420 	if (db->db_state != DB_EVICTING) {
1421 		ASSERT(db->db_state == DB_CACHED);
1422 		DBUF_VERIFY(db);
1423 		db->db_buf = NULL;
1424 		dbuf_evict(db);
1425 	} else {
1426 		mutex_exit(&db->db_mtx);
1427 		dbuf_destroy(db);
1428 	}
1429 	return (0);
1430 }
1431 
1432 static void
1433 dbuf_destroy(dmu_buf_impl_t *db)
1434 {
1435 	ASSERT(refcount_is_zero(&db->db_holds));
1436 
1437 	if (db->db_blkid != DB_BONUS_BLKID) {
1438 		dnode_t *dn = db->db_dnode;
1439 
1440 		/*
1441 		 * If this dbuf is still on the dn_dbufs list,
1442 		 * remove it from that list.
1443 		 */
1444 		if (list_link_active(&db->db_link)) {
1445 			mutex_enter(&dn->dn_dbufs_mtx);
1446 			list_remove(&dn->dn_dbufs, db);
1447 			mutex_exit(&dn->dn_dbufs_mtx);
1448 
1449 			dnode_rele(dn, db);
1450 		}
1451 		dbuf_hash_remove(db);
1452 	}
1453 	db->db_parent = NULL;
1454 	db->db_dnode = NULL;
1455 	db->db_buf = NULL;
1456 
1457 	ASSERT(db->db.db_data == NULL);
1458 	ASSERT(db->db_hash_next == NULL);
1459 	ASSERT(db->db_blkptr == NULL);
1460 	ASSERT(db->db_data_pending == NULL);
1461 
1462 	kmem_cache_free(dbuf_cache, db);
1463 }
1464 
1465 void
1466 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1467 {
1468 	dmu_buf_impl_t *db = NULL;
1469 	blkptr_t *bp = NULL;
1470 
1471 	ASSERT(blkid != DB_BONUS_BLKID);
1472 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1473 
1474 	if (dnode_block_freed(dn, blkid))
1475 		return;
1476 
1477 	/* dbuf_find() returns with db_mtx held */
1478 	if (db = dbuf_find(dn, 0, blkid)) {
1479 		if (refcount_count(&db->db_holds) > 0) {
1480 			/*
1481 			 * This dbuf is active.  We assume that it is
1482 			 * already CACHED, or else about to be either
1483 			 * read or filled.
1484 			 */
1485 			mutex_exit(&db->db_mtx);
1486 			return;
1487 		}
1488 		mutex_exit(&db->db_mtx);
1489 		db = NULL;
1490 	}
1491 
1492 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1493 		if (bp && !BP_IS_HOLE(bp)) {
1494 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1495 			zbookmark_t zb;
1496 			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
1497 			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
1498 			zb.zb_object = dn->dn_object;
1499 			zb.zb_level = 0;
1500 			zb.zb_blkid = blkid;
1501 
1502 			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
1503 			    dmu_ot[dn->dn_type].ot_byteswap,
1504 			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1505 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1506 			    &aflags, &zb);
1507 		}
1508 		if (db)
1509 			dbuf_rele(db, NULL);
1510 	}
1511 }
1512 
1513 /*
1514  * Returns with db_holds incremented, and db_mtx not held.
1515  * Note: dn_struct_rwlock must be held.
1516  */
1517 int
1518 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1519     void *tag, dmu_buf_impl_t **dbp)
1520 {
1521 	dmu_buf_impl_t *db, *parent = NULL;
1522 
1523 	ASSERT(blkid != DB_BONUS_BLKID);
1524 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1525 	ASSERT3U(dn->dn_nlevels, >, level);
1526 
1527 	*dbp = NULL;
1528 top:
1529 	/* dbuf_find() returns with db_mtx held */
1530 	db = dbuf_find(dn, level, blkid);
1531 
1532 	if (db == NULL) {
1533 		blkptr_t *bp = NULL;
1534 		int err;
1535 
1536 		ASSERT3P(parent, ==, NULL);
1537 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1538 		if (fail_sparse) {
1539 			if (err == 0 && bp && BP_IS_HOLE(bp))
1540 				err = ENOENT;
1541 			if (err) {
1542 				if (parent)
1543 					dbuf_rele(parent, NULL);
1544 				return (err);
1545 			}
1546 		}
1547 		if (err && err != ENOENT)
1548 			return (err);
1549 		db = dbuf_create(dn, level, blkid, parent, bp);
1550 	}
1551 
1552 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1553 		arc_buf_add_ref(db->db_buf, db);
1554 		if (db->db_buf->b_data == NULL) {
1555 			dbuf_clear(db);
1556 			if (parent) {
1557 				dbuf_rele(parent, NULL);
1558 				parent = NULL;
1559 			}
1560 			goto top;
1561 		}
1562 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1563 	}
1564 
1565 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1566 
1567 	/*
1568 	 * If this buffer is currently syncing out, and we are
1569 	 * are still referencing it from db_data, we need to make
1570 	 * a copy of it in case we decide we want to dirty it
1571 	 * again in this txg.
1572 	 */
1573 	if (db->db_level == 0 && db->db_state == DB_CACHED &&
1574 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1575 	    db->db_data_pending == db->db_buf) {
1576 		int size = (db->db_blkid == DB_BONUS_BLKID) ?
1577 		    DN_MAX_BONUSLEN : db->db.db_size;
1578 
1579 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1580 		    size, db));
1581 		bcopy(db->db_data_pending->b_data, db->db.db_data,
1582 		    db->db.db_size);
1583 	}
1584 
1585 	(void) refcount_add(&db->db_holds, tag);
1586 	dbuf_update_data(db);
1587 	DBUF_VERIFY(db);
1588 	mutex_exit(&db->db_mtx);
1589 
1590 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1591 	if (parent)
1592 		dbuf_rele(parent, NULL);
1593 
1594 	ASSERT3P(db->db_dnode, ==, dn);
1595 	ASSERT3U(db->db_blkid, ==, blkid);
1596 	ASSERT3U(db->db_level, ==, level);
1597 	*dbp = db;
1598 
1599 	return (0);
1600 }
1601 
1602 dmu_buf_impl_t *
1603 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1604 {
1605 	dmu_buf_impl_t *db;
1606 	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1607 	return (err ? NULL : db);
1608 }
1609 
1610 dmu_buf_impl_t *
1611 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1612 {
1613 	dmu_buf_impl_t *db;
1614 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1615 	return (err ? NULL : db);
1616 }
1617 
1618 dmu_buf_impl_t *
1619 dbuf_create_bonus(dnode_t *dn)
1620 {
1621 	dmu_buf_impl_t *db = dn->dn_bonus;
1622 
1623 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1624 
1625 	ASSERT(dn->dn_bonus == NULL);
1626 	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
1627 	return (db);
1628 }
1629 
1630 #pragma weak dmu_buf_add_ref = dbuf_add_ref
1631 void
1632 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1633 {
1634 	int64_t holds = refcount_add(&db->db_holds, tag);
1635 	ASSERT(holds > 1);
1636 }
1637 
1638 #pragma weak dmu_buf_rele = dbuf_rele
1639 void
1640 dbuf_rele(dmu_buf_impl_t *db, void *tag)
1641 {
1642 	int64_t holds;
1643 
1644 	mutex_enter(&db->db_mtx);
1645 	DBUF_VERIFY(db);
1646 
1647 	holds = refcount_remove(&db->db_holds, tag);
1648 	ASSERT(holds >= 0);
1649 
1650 	if (holds == db->db_dirtycnt &&
1651 	    db->db_level == 0 && db->db_d.db_immediate_evict)
1652 		dbuf_evict_user(db);
1653 
1654 	if (holds == 0) {
1655 		if (db->db_blkid == DB_BONUS_BLKID) {
1656 			mutex_exit(&db->db_mtx);
1657 			dnode_rele(db->db_dnode, db);
1658 		} else if (db->db_buf == NULL) {
1659 			/*
1660 			 * This is a special case: we never associated this
1661 			 * dbuf with any data allocated from the ARC.
1662 			 */
1663 			ASSERT3U(db->db_state, ==, DB_UNCACHED);
1664 			dbuf_evict(db);
1665 		} else  if (arc_released(db->db_buf)) {
1666 			arc_buf_t *buf = db->db_buf;
1667 			/*
1668 			 * This dbuf has anonymous data associated with it.
1669 			 */
1670 			dbuf_set_data(db, NULL);
1671 			VERIFY(arc_buf_remove_ref(buf, db) == 1);
1672 			dbuf_evict(db);
1673 		} else {
1674 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
1675 			mutex_exit(&db->db_mtx);
1676 		}
1677 	} else {
1678 		mutex_exit(&db->db_mtx);
1679 	}
1680 }
1681 
1682 #pragma weak dmu_buf_refcount = dbuf_refcount
1683 uint64_t
1684 dbuf_refcount(dmu_buf_impl_t *db)
1685 {
1686 	return (refcount_count(&db->db_holds));
1687 }
1688 
1689 void *
1690 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1691     dmu_buf_evict_func_t *evict_func)
1692 {
1693 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1694 	    user_data_ptr_ptr, evict_func));
1695 }
1696 
1697 void *
1698 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1699     dmu_buf_evict_func_t *evict_func)
1700 {
1701 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1702 
1703 	db->db_d.db_immediate_evict = TRUE;
1704 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1705 	    user_data_ptr_ptr, evict_func));
1706 }
1707 
1708 void *
1709 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1710     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1711 {
1712 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1713 	ASSERT(db->db_level == 0);
1714 
1715 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
1716 
1717 	mutex_enter(&db->db_mtx);
1718 
1719 	if (db->db_d.db_user_ptr == old_user_ptr) {
1720 		db->db_d.db_user_ptr = user_ptr;
1721 		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
1722 		db->db_d.db_evict_func = evict_func;
1723 
1724 		dbuf_update_data(db);
1725 	} else {
1726 		old_user_ptr = db->db_d.db_user_ptr;
1727 	}
1728 
1729 	mutex_exit(&db->db_mtx);
1730 	return (old_user_ptr);
1731 }
1732 
1733 void *
1734 dmu_buf_get_user(dmu_buf_t *db_fake)
1735 {
1736 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1737 	ASSERT(!refcount_is_zero(&db->db_holds));
1738 
1739 	return (db->db_d.db_user_ptr);
1740 }
1741 
1742 void
1743 dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
1744 {
1745 	arc_buf_t **data;
1746 	uint64_t txg = tx->tx_txg;
1747 	dnode_t *dn = db->db_dnode;
1748 	objset_impl_t *os = dn->dn_objset;
1749 	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1750 	int checksum, compress;
1751 	zbookmark_t zb;
1752 	int blksz;
1753 
1754 	ASSERT(dmu_tx_is_syncing(tx));
1755 
1756 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1757 
1758 	mutex_enter(&db->db_mtx);
1759 	/*
1760 	 * To be synced, we must be dirtied.  But we
1761 	 * might have been freed after the dirty.
1762 	 */
1763 	if (db->db_state == DB_UNCACHED) {
1764 		/* This buffer has been freed since it was dirtied */
1765 		ASSERT(db->db.db_data == NULL);
1766 	} else if (db->db_state == DB_FILL) {
1767 		/* This buffer was freed and is now being re-filled */
1768 		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
1769 	} else {
1770 		ASSERT3U(db->db_state, ==, DB_CACHED);
1771 	}
1772 	DBUF_VERIFY(db);
1773 
1774 	/*
1775 	 * Don't need a lock on db_dirty (dn_mtx), because it can't
1776 	 * be modified yet.
1777 	 */
1778 
1779 	if (db->db_blkid == DB_BONUS_BLKID) {
1780 		void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
1781 		/*
1782 		 * Simply copy the bonus data into the dnode.  It will
1783 		 * be written out when the dnode is synced (and it will
1784 		 * be synced, since it must have been dirty for dbuf_sync
1785 		 * to be called).
1786 		 */
1787 		/*
1788 		 * Use dn_phys->dn_bonuslen since db.db_size is the length
1789 		 * of the bonus buffer in the open transaction rather than
1790 		 * the syncing transaction.
1791 		 */
1792 		ASSERT(*datap != NULL);
1793 		ASSERT3U(db->db_level, ==, 0);
1794 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
1795 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
1796 		if (*datap != db->db.db_data)
1797 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
1798 		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
1799 		db->db_data_pending = NULL;
1800 		if (db->db_dirtied == txg)
1801 			db->db_dirtied = 0;
1802 		ASSERT(db->db_dirtycnt > 0);
1803 		db->db_dirtycnt -= 1;
1804 		mutex_exit(&db->db_mtx);
1805 		dbuf_rele(db, (void *)(uintptr_t)txg);
1806 		return;
1807 	}
1808 
1809 	if (db->db_level == 0) {
1810 		data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
1811 		blksz = arc_buf_size(*data);
1812 
1813 		/*
1814 		 * This buffer is in the middle of an immediate write.
1815 		 * Wait for the synchronous IO to complete.
1816 		 */
1817 		while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
1818 			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1819 			cv_wait(&db->db_changed, &db->db_mtx);
1820 			ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
1821 		}
1822 		/*
1823 		 * If this buffer is currently "in use" (i.e., there are
1824 		 * active holds and db_data still references it), then make
1825 		 * a copy before we start the write so that any modifications
1826 		 * from the open txg will not leak into this write.
1827 		 *
1828 		 * NOTE: this copy does not need to be made for objects only
1829 		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
1830 		 * or if there is no actual write involved (bonus blocks).
1831 		 */
1832 		if (dn->dn_object != DMU_META_DNODE_OBJECT &&
1833 		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
1834 			if (refcount_count(&db->db_holds) > 1 &&
1835 			    *data == db->db_buf) {
1836 				*data = arc_buf_alloc(os->os_spa, blksz, db);
1837 				bcopy(db->db.db_data, (*data)->b_data, blksz);
1838 			}
1839 			db->db_data_pending = *data;
1840 		} else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
1841 			/*
1842 			 * Private object buffers are released here rather
1843 			 * than in dbuf_dirty() since they are only modified
1844 			 * in the syncing context and we don't want the
1845 			 * overhead of making multiple copies of the data.
1846 			 */
1847 			arc_release(db->db_buf, db);
1848 		}
1849 	} else {
1850 		data = &db->db_buf;
1851 		if (*data == NULL) {
1852 			/*
1853 			 * This can happen if we dirty and then free
1854 			 * the level-0 data blocks in the same txg. So
1855 			 * this indirect remains unchanged.
1856 			 */
1857 			if (db->db_dirtied == txg)
1858 				db->db_dirtied = 0;
1859 			ASSERT(db->db_dirtycnt > 0);
1860 			db->db_dirtycnt -= 1;
1861 			mutex_exit(&db->db_mtx);
1862 			dbuf_rele(db, (void *)(uintptr_t)txg);
1863 			return;
1864 		}
1865 		blksz = db->db.db_size;
1866 		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
1867 	}
1868 
1869 	ASSERT(*data != NULL);
1870 
1871 	if (db->db_level > 0 && !arc_released(db->db_buf)) {
1872 		/*
1873 		 * This indirect buffer was marked dirty, but
1874 		 * never modified (if it had been modified, then
1875 		 * we would have released the buffer).  There is
1876 		 * no reason to write anything.
1877 		 */
1878 		db->db_data_pending = NULL;
1879 		if (db->db_dirtied == txg)
1880 			db->db_dirtied = 0;
1881 		ASSERT(db->db_dirtycnt > 0);
1882 		db->db_dirtycnt -= 1;
1883 		mutex_exit(&db->db_mtx);
1884 		dbuf_rele(db, (void *)(uintptr_t)txg);
1885 		return;
1886 	} else if (db->db_blkptr == NULL &&
1887 	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
1888 	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
1889 		/*
1890 		 * This buffer was allocated at a time when there was
1891 		 * no available blkptrs from the dnode, or it was
1892 		 * inappropriate to hook it in (i.e., nlevels mis-match).
1893 		 */
1894 		ASSERT(db->db_blkptr == NULL);
1895 		ASSERT(db->db_parent == NULL);
1896 		db->db_parent = dn->dn_dbuf;
1897 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1898 		DBUF_VERIFY(db);
1899 		mutex_exit(&db->db_mtx);
1900 	} else if (db->db_blkptr == NULL) {
1901 		dmu_buf_impl_t *parent = db->db_parent;
1902 
1903 		mutex_exit(&db->db_mtx);
1904 		ASSERT(dn->dn_phys->dn_nlevels > 1);
1905 		if (parent == NULL) {
1906 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
1907 			(void) dbuf_hold_impl(dn, db->db_level+1,
1908 			    db->db_blkid >> epbs, FALSE, FTAG, &parent);
1909 			rw_exit(&dn->dn_struct_rwlock);
1910 			dbuf_add_ref(parent, db);
1911 			db->db_parent = parent;
1912 			dbuf_rele(parent, FTAG);
1913 		}
1914 		(void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
1915 	} else {
1916 		mutex_exit(&db->db_mtx);
1917 	}
1918 
1919 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
1920 
1921 	if (db->db_level > 0 &&
1922 	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
1923 		/*
1924 		 * Don't write indirect blocks past EOF.
1925 		 * We get these when we truncate a file *after* dirtying
1926 		 * blocks in the truncate range (we undirty the level 0
1927 		 * blocks in dbuf_free_range(), but not the indirects).
1928 		 */
1929 #ifdef ZFS_DEBUG
1930 		/*
1931 		 * Verify that this indirect block is empty.
1932 		 */
1933 		blkptr_t *bplist;
1934 		int i;
1935 
1936 		mutex_enter(&db->db_mtx);
1937 		bplist = db->db.db_data;
1938 		for (i = 0; i < (1 << epbs); i++) {
1939 			if (!BP_IS_HOLE(&bplist[i])) {
1940 				panic("data past EOF: "
1941 				    "db=%p level=%d id=%llu i=%d\n",
1942 				    db, db->db_level,
1943 				    (u_longlong_t)db->db_blkid, i);
1944 			}
1945 		}
1946 		mutex_exit(&db->db_mtx);
1947 #endif
1948 		ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
1949 		mutex_enter(&db->db_mtx);
1950 		db->db_dirtycnt -= 1;
1951 		mutex_exit(&db->db_mtx);
1952 		dbuf_rele(db, (void *)(uintptr_t)txg);
1953 		return;
1954 	}
1955 
1956 	if (db->db_parent != dn->dn_dbuf) {
1957 		dmu_buf_impl_t *parent = db->db_parent;
1958 
1959 		mutex_enter(&db->db_mtx);
1960 		ASSERT(db->db_level == parent->db_level-1);
1961 		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
1962 		/*
1963 		 * We may have read this indirect block after we dirtied it,
1964 		 * so never released it from the cache.
1965 		 */
1966 		arc_release(parent->db_buf, db->db_parent);
1967 
1968 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
1969 		    (db->db_blkid & ((1ULL << epbs) - 1));
1970 		DBUF_VERIFY(db);
1971 		mutex_exit(&db->db_mtx);
1972 #ifdef ZFS_DEBUG
1973 	} else {
1974 		/*
1975 		 * We don't need to dnode_setdirty(dn) because if we got
1976 		 * here then the parent is already dirty.
1977 		 */
1978 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
1979 		ASSERT3P(db->db_blkptr, ==,
1980 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
1981 #endif
1982 	}
1983 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1984 
1985 	if (db->db_level == 0 &&
1986 	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
1987 		arc_buf_t **old =
1988 		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
1989 		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
1990 		int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
1991 		int new_size = bp_get_dasize(os->os_spa, *bpp);
1992 
1993 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
1994 
1995 		dnode_diduse_space(dn, new_size-old_size);
1996 		mutex_enter(&dn->dn_mtx);
1997 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
1998 			dn->dn_phys->dn_maxblkid = db->db_blkid;
1999 		mutex_exit(&dn->dn_mtx);
2000 
2001 		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
2002 		if (!BP_IS_HOLE(db->db_blkptr))
2003 			dsl_dataset_block_kill(os->os_dsl_dataset,
2004 			    db->db_blkptr, os->os_synctx);
2005 
2006 		mutex_enter(&db->db_mtx);
2007 		*db->db_blkptr = **bpp;
2008 		kmem_free(*bpp, sizeof (blkptr_t));
2009 		*bpp = NULL;
2010 
2011 		if (*old != db->db_buf)
2012 			VERIFY(arc_buf_remove_ref(*old, db) == 1);
2013 		else if (!BP_IS_HOLE(db->db_blkptr))
2014 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2015 		else
2016 			ASSERT(arc_released(db->db_buf));
2017 		*old = NULL;
2018 		db->db_data_pending = NULL;
2019 
2020 		cv_broadcast(&db->db_changed);
2021 
2022 		ASSERT(db->db_dirtycnt > 0);
2023 		db->db_dirtycnt -= 1;
2024 		mutex_exit(&db->db_mtx);
2025 		dbuf_rele(db, (void *)(uintptr_t)txg);
2026 		return;
2027 	}
2028 
2029 	if (db->db_level > 0) {
2030 		/*
2031 		 * XXX -- we should design a compression algorithm
2032 		 * that specializes in arrays of bps.
2033 		 */
2034 		checksum = ZIO_CHECKSUM_FLETCHER_4;
2035 		if (zfs_mdcomp_disable)
2036 			compress = ZIO_COMPRESS_EMPTY;
2037 		else
2038 			compress = ZIO_COMPRESS_LZJB;
2039 	} else {
2040 		/*
2041 		 * Allow dnode settings to override objset settings,
2042 		 * except for metadata checksums.
2043 		 */
2044 		if (dmu_ot[dn->dn_type].ot_metadata) {
2045 			checksum = os->os_md_checksum;
2046 			compress = zio_compress_select(dn->dn_compress,
2047 			    os->os_md_compress);
2048 		} else {
2049 			checksum = zio_checksum_select(dn->dn_checksum,
2050 			    os->os_checksum);
2051 			compress = zio_compress_select(dn->dn_compress,
2052 			    os->os_compress);
2053 		}
2054 	}
2055 #ifdef ZFS_DEBUG
2056 	if (db->db_parent) {
2057 		ASSERT(list_link_active(
2058 		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
2059 		ASSERT(db->db_parent == dn->dn_dbuf ||
2060 		    db->db_parent->db_level > 0);
2061 		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
2062 			ASSERT(*data == db->db_buf);
2063 	}
2064 #endif
2065 	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
2066 	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
2067 	zb.zb_object = db->db.db_object;
2068 	zb.zb_level = db->db_level;
2069 	zb.zb_blkid = db->db_blkid;
2070 
2071 	(void) arc_write(zio, os->os_spa, checksum, compress,
2072 	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
2073 	    db->db_blkptr, *data, dbuf_write_done, db,
2074 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
2075 	/*
2076 	 * We can't access db after arc_write, since it could finish
2077 	 * and be freed, and we have no locks on it.
2078 	 */
2079 }
2080 
/*
 * Argument bundle for the dbuf_do_born() and dbuf_do_kill() taskq
 * callbacks.  dbuf_write_done() allocates one per dispatch with
 * kmem_alloc(); the callback that receives it frees it.
 */
2081 struct dbuf_arg {
2082 	objset_impl_t *os;	/* source of os_dsl_dataset and os_synctx */
2083 	blkptr_t bp;		/* block pointer, copied by value */
2084 };
2085 
/*
 * Taskq callback dispatched by dbuf_write_done().  Passes the block
 * pointer carried in arg (a struct dbuf_arg) to dsl_dataset_block_born()
 * for the objset's dataset, using the objset's os_synctx, and then
 * frees arg.
 */
2086 static void
2087 dbuf_do_born(void *arg)
2088 {
2089 	struct dbuf_arg *born = arg;
2090 	objset_impl_t *os = born->os;
2091 	dsl_dataset_block_born(os->os_dsl_dataset, &born->bp, os->os_synctx);
2092 	kmem_free(born, sizeof (*born));
2093 }
2094 
/*
 * Taskq callback dispatched by dbuf_write_done().  Passes the block
 * pointer carried in arg (a struct dbuf_arg) to dsl_dataset_block_kill()
 * for the objset's dataset, using the objset's os_synctx, and then
 * frees arg.
 */
2095 static void
2096 dbuf_do_kill(void *arg)
2097 {
2098 	struct dbuf_arg *dead = arg;
2099 	objset_impl_t *os = dead->os;
2100 	dsl_dataset_block_kill(os->os_dsl_dataset, &dead->bp, os->os_synctx);
2101 	kmem_free(dead, sizeof (*dead));
2102 }
2103 
/*
 * Write-completion callback that this file hands to arc_write() for a
 * dirty dbuf.
 *
 *	zio	the completed write; io_txg, io_bp, io_bp_orig, io_size
 *		and io_error are read from it
 *	buf	not referenced here
 *	vdb	the dmu_buf_impl_t that was written
 *
 * In order, the callback:
 *   - charges the dnode with the difference in allocated size between
 *     the zio's original and new block pointers;
 *   - under db_mtx, clears this txg's dirty bookkeeping, computes the
 *     fill count, stamps fill/type/level into the dbuf's block pointer
 *     (when it is not a hole), wakes db_changed waiters and drops one
 *     db_dirtycnt;
 *   - if the block's identity changed, dispatches dataset born/kill
 *     accounting to dbuf_tq;
 *   - releases the dbuf reference tagged with the txg.
 */
2104 /* ARGSUSED */
2105 static void
2106 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2107 {
2108 	dmu_buf_impl_t *db = vdb;
2109 	dnode_t *dn = db->db_dnode;
2110 	objset_impl_t *os = dn->dn_objset;
2111 	uint64_t txg = zio->io_txg;
2112 	uint64_t fill = 0;
2113 	int i;
2114 	int old_size, new_size;
2115 
	/* The arc_write() in this file is issued with ZIO_FLAG_MUSTSUCCEED. */
2116 	ASSERT3U(zio->io_error, ==, 0);
2117 
2118 	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
2119 
	/* Charge the dnode with the change in allocated size. */
2120 	old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig);
2121 	new_size = bp_get_dasize(os->os_spa, zio->io_bp);
2122 
2123 	dnode_diduse_space(dn, new_size-old_size);
2124 
2125 	mutex_enter(&db->db_mtx);
2126 
	/*
	 * The sync code in this file handles an overridden level-0 block
	 * itself and returns before calling arc_write(), so no override
	 * is expected for this txg here.
	 */
2127 	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
2128 
2129 	if (db->db_dirtied == txg)
2130 		db->db_dirtied = 0;
2131 
2132 	if (db->db_level == 0) {
		/* Address of this txg's slot in db_data_old. */
2133 		arc_buf_t **old =
2134 		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
2135 
2136 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
2137 
		/*
		 * If the buffer recorded for this txg is not the dbuf's
		 * current buffer, drop the dbuf's reference on it.
		 * NOTE(review): the VERIFY requires a return of 1;
		 * presumably that means the buffer was freed -- confirm
		 * against arc_buf_remove_ref().
		 * Otherwise the current buffer is the one written:
		 * register dbuf_do_evict as its ARC callback unless the
		 * block is a hole, in which case it must still be released.
		 */
2138 		if (*old != db->db_buf)
2139 			VERIFY(arc_buf_remove_ref(*old, db) == 1);
2140 		else if (!BP_IS_HOLE(db->db_blkptr))
2141 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2142 		else
2143 			ASSERT(arc_released(db->db_buf));
2144 		*old = NULL;
2145 		db->db_data_pending = NULL;
2146 
		/* Raise dn_maxblkid only for a block that is not a hole. */
2147 		mutex_enter(&dn->dn_mtx);
2148 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2149 		    !BP_IS_HOLE(db->db_blkptr))
2150 			dn->dn_phys->dn_maxblkid = db->db_blkid;
2151 		mutex_exit(&dn->dn_mtx);
2152 
		/*
		 * Fill count for a data block: for a DMU_OT_DNODE object,
		 * the number of dnode_phys_t slots in the buffer whose
		 * type is not DMU_OT_NONE; for any other object, 1 if the
		 * block is not a hole and 0 if it is.
		 */
2153 		if (dn->dn_type == DMU_OT_DNODE) {
2154 			dnode_phys_t *dnp = db->db.db_data;
2155 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2156 			    i--, dnp++) {
2157 				if (dnp->dn_type != DMU_OT_NONE)
2158 					fill++;
2159 			}
2160 		} else {
2161 			if (!BP_IS_HOLE(db->db_blkptr))
2162 				fill = 1;
2163 		}
2164 	} else {
		/* Indirect block: the buffer is an array of block pointers. */
2165 		blkptr_t *bp = db->db.db_data;
2166 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2167 		if (!BP_IS_HOLE(db->db_blkptr)) {
2168 			int epbs =
2169 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2170 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
2171 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2172 			    db->db.db_size);
2173 			ASSERT3U(dn->dn_phys->dn_maxblkid
2174 			    >> (db->db_level * epbs), >=, db->db_blkid);
2175 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2176 		}
		/* Fill count is the sum of blk_fill over the non-hole children. */
2177 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
2178 			if (BP_IS_HOLE(bp))
2179 				continue;
2180 			ASSERT3U(BP_GET_LSIZE(bp), ==,
2181 			    db->db_level == 1 ? dn->dn_datablksz :
2182 			    (1<<dn->dn_phys->dn_indblkshift));
2183 			fill += bp->blk_fill;
2184 		}
2185 	}
2186 
	/* Stamp fill, type and level into a non-hole block pointer. */
2187 	if (!BP_IS_HOLE(db->db_blkptr)) {
2188 		db->db_blkptr->blk_fill = fill;
2189 		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
2190 		BP_SET_LEVEL(db->db_blkptr, db->db_level);
2191 	} else {
2192 		ASSERT3U(fill, ==, 0);
2193 		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
2194 	}
2195 
2196 	dprintf_dbuf_bp(db, db->db_blkptr,
2197 	    "wrote %llu bytes to blkptr:", zio->io_size);
2198 
2199 	ASSERT(db->db_parent == NULL ||
2200 	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
	/* Wake db_changed waiters and drop this txg's dirty count. */
2201 	cv_broadcast(&db->db_changed);
2202 	ASSERT(db->db_dirtycnt > 0);
2203 	db->db_dirtycnt -= 1;
2204 	mutex_exit(&db->db_mtx);
2205 
	/*
	 * If the write changed the block's identity, hand dataset
	 * accounting to dbuf_tq: a born callback for the new block and,
	 * unless the original was a hole, a kill callback for the old
	 * one.  Each dispatch gets its own heap-allocated dbuf_arg
	 * holding a by-value copy of the block pointer; the callback
	 * frees it.
	 * NOTE(review): the ordering remark below presumably relies on
	 * zio->io_bp referring to db->db_blkptr (the bp passed to
	 * arc_write()) -- confirm.
	 */
2206 	/* We must do this after we've set the bp's type and level */
2207 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
2208 	    BP_IDENTITY(&zio->io_bp_orig))) {
2209 		struct dbuf_arg *da;
2210 		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
2211 		da->os = os;
2212 		da->bp = *zio->io_bp;
2213 		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
2214 		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
2215 			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
2216 			da->os = os;
2217 			da->bp = zio->io_bp_orig;
2218 			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
2219 		}
2220 	}
2221 
	/* Release the dbuf reference tagged with this txg. */
2222 	dbuf_rele(db, (void *)(uintptr_t)txg);
2223 }
2224