xref: /titanic_50/usr/src/uts/common/fs/zfs/dbuf.c (revision b249c65cf0a7400e86a36ddab5c3fce085809859)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dbuf.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/spa.h>
37 #include <sys/zio.h>
38 #include <sys/dmu_zfetch.h>
39 
40 static void dbuf_destroy(dmu_buf_impl_t *db);
41 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
42 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
43 static arc_done_func_t dbuf_write_ready;
44 static arc_done_func_t dbuf_write_done;
45 
46 /*
47  * Global data structures and functions for the dbuf cache.
48  */
49 static kmem_cache_t *dbuf_cache;
50 
51 /* ARGSUSED */
52 static int
53 dbuf_cons(void *vdb, void *unused, int kmflag)
54 {
55 	dmu_buf_impl_t *db = vdb;
56 	bzero(db, sizeof (dmu_buf_impl_t));
57 
58 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
59 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
60 	refcount_create(&db->db_holds);
61 	return (0);
62 }
63 
64 /* ARGSUSED */
65 static void
66 dbuf_dest(void *vdb, void *unused)
67 {
68 	dmu_buf_impl_t *db = vdb;
69 	mutex_destroy(&db->db_mtx);
70 	cv_destroy(&db->db_changed);
71 	refcount_destroy(&db->db_holds);
72 }
73 
74 /*
75  * dbuf hash table routines
76  */
77 static dbuf_hash_table_t dbuf_hash_table;
78 
79 static uint64_t dbuf_hash_count;
80 
81 static uint64_t
82 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
83 {
84 	uintptr_t osv = (uintptr_t)os;
85 	uint64_t crc = -1ULL;
86 
87 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
88 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
89 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
90 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
91 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
92 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
93 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
94 
95 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
96 
97 	return (crc);
98 }
99 
/*
 * Convenience wrapper around dbuf_hash().  Expands to a plain expression
 * (no trailing semicolon) so that it can be used inside larger expressions
 * as well as in "hv = DBUF_HASH(...);" style statements.
 */
#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
101 
/*
 * Evaluates to non-zero when dbuf is the buffer identified by the
 * (os, obj, level, blkid) tuple.
 */
#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)	(	\
	(dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid)			\
)
107 
108 dmu_buf_impl_t *
109 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
110 {
111 	dbuf_hash_table_t *h = &dbuf_hash_table;
112 	objset_impl_t *os = dn->dn_objset;
113 	uint64_t obj = dn->dn_object;
114 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
115 	uint64_t idx = hv & h->hash_table_mask;
116 	dmu_buf_impl_t *db;
117 
118 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
119 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
120 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
121 			mutex_enter(&db->db_mtx);
122 			if (db->db_state != DB_EVICTING) {
123 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
124 				return (db);
125 			}
126 			mutex_exit(&db->db_mtx);
127 		}
128 	}
129 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
130 	return (NULL);
131 }
132 
133 /*
134  * Insert an entry into the hash table.  If there is already an element
135  * equal to elem in the hash table, then the already existing element
136  * will be returned and the new element will not be inserted.
137  * Otherwise returns NULL.
138  */
139 static dmu_buf_impl_t *
140 dbuf_hash_insert(dmu_buf_impl_t *db)
141 {
142 	dbuf_hash_table_t *h = &dbuf_hash_table;
143 	objset_impl_t *os = db->db_objset;
144 	uint64_t obj = db->db.db_object;
145 	int level = db->db_level;
146 	uint64_t blkid = db->db_blkid;
147 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
148 	uint64_t idx = hv & h->hash_table_mask;
149 	dmu_buf_impl_t *dbf;
150 
151 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
152 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
153 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
154 			mutex_enter(&dbf->db_mtx);
155 			if (dbf->db_state != DB_EVICTING) {
156 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
157 				return (dbf);
158 			}
159 			mutex_exit(&dbf->db_mtx);
160 		}
161 	}
162 
163 	mutex_enter(&db->db_mtx);
164 	db->db_hash_next = h->hash_table[idx];
165 	h->hash_table[idx] = db;
166 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
167 	atomic_add_64(&dbuf_hash_count, 1);
168 
169 	return (NULL);
170 }
171 
172 /*
173  * Remove an entry from the hash table.  This operation will
174  * fail if there are any existing holds on the db.
175  */
176 static void
177 dbuf_hash_remove(dmu_buf_impl_t *db)
178 {
179 	dbuf_hash_table_t *h = &dbuf_hash_table;
180 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
181 	    db->db_level, db->db_blkid);
182 	uint64_t idx = hv & h->hash_table_mask;
183 	dmu_buf_impl_t *dbf, **dbp;
184 
185 	/*
186 	 * We musn't hold db_mtx to maintin lock ordering:
187 	 * DBUF_HASH_MUTEX > db_mtx.
188 	 */
189 	ASSERT(refcount_is_zero(&db->db_holds));
190 	ASSERT(db->db_state == DB_EVICTING);
191 	ASSERT(!MUTEX_HELD(&db->db_mtx));
192 
193 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
194 	dbp = &h->hash_table[idx];
195 	while ((dbf = *dbp) != db) {
196 		dbp = &dbf->db_hash_next;
197 		ASSERT(dbf != NULL);
198 	}
199 	*dbp = db->db_hash_next;
200 	db->db_hash_next = NULL;
201 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
202 	atomic_add_64(&dbuf_hash_count, -1);
203 }
204 
205 static arc_evict_func_t dbuf_do_evict;
206 
207 static void
208 dbuf_evict_user(dmu_buf_impl_t *db)
209 {
210 	ASSERT(MUTEX_HELD(&db->db_mtx));
211 
212 	if (db->db_level != 0 || db->db_evict_func == NULL)
213 		return;
214 
215 	if (db->db_user_data_ptr_ptr)
216 		*db->db_user_data_ptr_ptr = db->db.db_data;
217 	db->db_evict_func(&db->db, db->db_user_ptr);
218 	db->db_user_ptr = NULL;
219 	db->db_user_data_ptr_ptr = NULL;
220 	db->db_evict_func = NULL;
221 }
222 
/*
 * Evict a dbuf that has no ARC buffer attached and no pending dirty data:
 * clear it and then destroy it.  The caller must hold db_mtx.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	/* Order matters: clear the dbuf first, then destroy it. */
	dbuf_clear(db);
	dbuf_destroy(db);
}
233 
234 void
235 dbuf_init(void)
236 {
237 	uint64_t hsize = 1ULL << 16;
238 	dbuf_hash_table_t *h = &dbuf_hash_table;
239 	int i;
240 
241 	/*
242 	 * The hash table is big enough to fill all of physical memory
243 	 * with an average 4K block size.  The table will take up
244 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
245 	 */
246 	while (hsize * 4096 < physmem * PAGESIZE)
247 		hsize <<= 1;
248 
249 retry:
250 	h->hash_table_mask = hsize - 1;
251 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
252 	if (h->hash_table == NULL) {
253 		/* XXX - we should really return an error instead of assert */
254 		ASSERT(hsize > (1ULL << 10));
255 		hsize >>= 1;
256 		goto retry;
257 	}
258 
259 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
260 	    sizeof (dmu_buf_impl_t),
261 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
262 
263 	for (i = 0; i < DBUF_MUTEXES; i++)
264 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
265 }
266 
267 void
268 dbuf_fini(void)
269 {
270 	dbuf_hash_table_t *h = &dbuf_hash_table;
271 	int i;
272 
273 	for (i = 0; i < DBUF_MUTEXES; i++)
274 		mutex_destroy(&h->hash_mutexes[i]);
275 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
276 	kmem_cache_destroy(dbuf_cache);
277 }
278 
279 /*
280  * Other stuff.
281  */
282 
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf against its dnode and parent.
 * The caller must hold db_mtx.  The checks only run when the
 * ZFS_DEBUG_DBUF_VERIFY bit is set in zfs_flags.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	/* Identity: the dbuf must agree with the dnode it hangs off. */
	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	/* Offset: bonus buffers use a sentinel, others are blkid * size. */
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			dbuf_dirty_record_t *dr = db->db_data_pending;
			/*
			 * it should only be modified in syncing
			 * context, so make sure we only have
			 * one copy of the data.
			 */
			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
		}
	}

	/* verify db->db_blkptr */
	/*
	 * dn is dereferenced below without a NULL check; the dn == NULL
	 * case above asserts db_blkptr == NULL, so this branch is not
	 * expected to run without a dnode.
	 */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT)
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	/*
	 * NOTE(review): dn->dn_free_txg is read here with no dn != NULL
	 * guard in this function -- confirm a dbuf with a NULL db_dnode
	 * can never still have db_data set.
	 */
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			/* Scan the buffer one 64-bit word at a time. */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif
377 
378 static void
379 dbuf_update_data(dmu_buf_impl_t *db)
380 {
381 	ASSERT(MUTEX_HELD(&db->db_mtx));
382 	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
383 		ASSERT(!refcount_is_zero(&db->db_holds));
384 		*db->db_user_data_ptr_ptr = db->db.db_data;
385 	}
386 }
387 
388 static void
389 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
390 {
391 	ASSERT(MUTEX_HELD(&db->db_mtx));
392 	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
393 	db->db_buf = buf;
394 	if (buf != NULL) {
395 		ASSERT(buf->b_data != NULL);
396 		db->db.db_data = buf->b_data;
397 		if (!arc_released(buf))
398 			arc_set_callback(buf, dbuf_do_evict, db);
399 		dbuf_update_data(db);
400 	} else {
401 		dbuf_evict_user(db);
402 		db->db.db_data = NULL;
403 		db->db_state = DB_UNCACHED;
404 	}
405 }
406 
407 uint64_t
408 dbuf_whichblock(dnode_t *dn, uint64_t offset)
409 {
410 	if (dn->dn_datablkshift) {
411 		return (offset >> dn->dn_datablkshift);
412 	} else {
413 		ASSERT3U(offset, <, dn->dn_datablksz);
414 		return (0);
415 	}
416 }
417 
/*
 * Read-completion callback handed to arc_read() by dbuf_read_impl().
 *
 * zio - the completed I/O, or NULL; a NULL zio is treated as success.
 * buf - the ARC buffer that was read.
 * vdb - the dmu_buf_impl_t the read was issued for.
 *
 * Moves the dbuf from DB_READ to DB_CACHED (or to DB_UNCACHED on error),
 * wakes any waiters on db_changed, and drops the hold that
 * dbuf_read_impl() took with dbuf_add_ref(db, NULL).
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		/* Present the freed block as all zeros. */
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		/* Successful read: attach the buffer. */
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* Failed read: drop the buffer and stay uncached. */
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
	dbuf_rele(db, NULL);
}
452 
/*
 * Start filling an uncached dbuf.
 *
 * db    - the dbuf to read; must be held, in DB_UNCACHED, with db_mtx held
 *         and the dnode's dn_struct_rwlock held by the caller.
 * zio   - parent zio passed through to arc_read().
 * flags - in/out DB_RF_* flags; DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL,
 *         and DB_RF_CACHED is set on return when the data did not require
 *         an I/O (hole/freed block, or ARC reported ARC_CACHED).
 *
 * db_mtx is dropped on every return path.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn = db->db_dnode;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	/* Bonus buffers are copied straight out of the dnode. */
	if (db->db_blkid == DB_BONUS_BLKID) {
		int bonuslen = dn->dn_bonuslen;

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN);
		/* Zero first so the tail beyond bonuslen is defined. */
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
		    bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/* Holes and freed blocks are satisfied with a zero-filled buffer. */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && dnode_block_freed(dn, db->db_blkid))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/* A real read is needed: mark the dbuf and issue it via the ARC. */
	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	/* This hold is dropped by dbuf_read_done(). */
	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
521 
/*
 * Make sure the contents of a held dbuf are available.
 *
 * db    - the dbuf; the caller must have a hold on it.
 * zio   - optional parent zio.  When NULL and a read must be issued, a
 *         root zio is created here and waited on before returning.
 * flags - DB_RF_* flags:
 *         DB_RF_HAVESTRUCT - this function does not take or drop
 *             dn_struct_rwlock itself.
 *         DB_RF_NOPREFETCH - suppress the dmu_zfetch() call.
 *         DB_RF_NEVERWAIT  - do not wait for an in-progress read or fill.
 *
 * Returns 0 on success, the zio_wait() result when this function created
 * and waited on its own zio, or EIO when the dbuf ends up DB_UNCACHED
 * after waiting for another thread's read or fill.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * NOTE(review): db->db_dnode is dereferenced by the rw_enter()
	 * above (when DB_RF_HAVESTRUCT is clear) before the NULL test in
	 * this expression -- confirm whether it can really be NULL here.
	 */
	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Already cached: just feed the prefetcher. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		/* Not cached: issue the read ourselves. */
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		/* Only wait when the zio was created here. */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/* Some other state (e.g. a read or fill is in progress). */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			/* The other thread's read failed. */
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}
592 
593 static void
594 dbuf_noread(dmu_buf_impl_t *db)
595 {
596 	ASSERT(!refcount_is_zero(&db->db_holds));
597 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
598 	mutex_enter(&db->db_mtx);
599 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
600 		cv_wait(&db->db_changed, &db->db_mtx);
601 	if (db->db_state == DB_UNCACHED) {
602 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
603 
604 		ASSERT(db->db_buf == NULL);
605 		ASSERT(db->db.db_data == NULL);
606 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
607 		    db->db.db_size, db, type));
608 		db->db_state = DB_FILL;
609 	} else {
610 		ASSERT3U(db->db_state, ==, DB_CACHED);
611 	}
612 	mutex_exit(&db->db_mtx);
613 }
614 
/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 *
 * db  - level-0 dbuf with data attached; db_mtx must be held.
 * txg - the transaction group about to modify the buffer.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/*
	 * Nothing to do unless the most recent dirty record still points
	 * at the dbuf's live data (db_data for bonus buffers, db_buf
	 * otherwise).
	 */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * 	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		/* More holds than dirty records: give dr a private copy. */
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		/* Otherwise dr keeps the buffer and the dbuf lets go of it. */
		dbuf_set_data(db, NULL);
	}
}
666 
667 void
668 dbuf_unoverride(dbuf_dirty_record_t *dr)
669 {
670 	dmu_buf_impl_t *db = dr->dr_dbuf;
671 	uint64_t txg = dr->dr_txg;
672 
673 	ASSERT(MUTEX_HELD(&db->db_mtx));
674 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
675 	ASSERT(db->db_level == 0);
676 
677 	if (db->db_blkid == DB_BONUS_BLKID ||
678 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
679 		return;
680 
681 	/* free this block */
682 	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
683 		/* XXX can get silent EIO here */
684 		(void) dsl_free(NULL,
685 		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
686 		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
687 	}
688 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
689 	/*
690 	 * Release the already-written buffer, so we leave it in
691 	 * a consistent dirty state.  Note that all callers are
692 	 * modifying the buffer, so they will immediately do
693 	 * another (redundant) arc_release().  Therefore, leave
694 	 * the buf thawed to save the effort of freezing &
695 	 * immediately re-thawing it.
696 	 */
697 	arc_release(dr->dt.dl.dr_data, db);
698 }
699 
/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 *
 * dn    - dnode whose dbuf list is walked (under dn_dbufs_mtx).
 * start - first level-0 block id in the range (inclusive).
 * end   - last level-0 block id in the range (inclusive); clamped to
 *         dn_maxblkid.
 * tx    - transaction in which the range is being freed.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	/* db_next is fetched up front because db may be cleared below. */
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		/* Level-1 dbuf in range, last dirtied in an earlier txg. */
		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			/*
			 * No mutex_exit() on this path; presumably
			 * dbuf_clear() drops db_mtx -- verify.
			 */
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}
804 
805 static int
806 dbuf_block_freeable(dmu_buf_impl_t *db)
807 {
808 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
809 	uint64_t birth_txg = 0;
810 
811 	/*
812 	 * We don't need any locking to protect db_blkptr:
813 	 * If it's syncing, then db_last_dirty will be set
814 	 * so we'll ignore db_blkptr.
815 	 */
816 	ASSERT(MUTEX_HELD(&db->db_mtx));
817 	if (db->db_last_dirty)
818 		birth_txg = db->db_last_dirty->dr_txg;
819 	else if (db->db_blkptr)
820 		birth_txg = db->db_blkptr->blk_birth;
821 
822 	/* If we don't exist or are in a snapshot, we can't be freed */
823 	if (birth_txg)
824 		return (ds == NULL ||
825 		    dsl_dataset_block_freeable(ds, birth_txg));
826 	else
827 		return (FALSE);
828 }
829 
/*
 * Change the size of a non-bonus dbuf to `size` bytes within tx.
 *
 * The dbuf is dirtied, a new ARC buffer of the requested size is
 * allocated, the old contents are copied over (truncated or zero-padded
 * as needed) and the new buffer replaces the old one.  The caller must
 * hold the dnode's dn_struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	/* Swap in the new buffer and drop our reference on the old one. */
	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	/* For level-0 dbufs, point this txg's dirty record at the new buf. */
	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	/* Account for the change in size (may be negative). */
	dnode_willuse_space(db->db_dnode, size-osize, tx);
}
877 
878 dbuf_dirty_record_t *
879 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
880 {
881 	dnode_t *dn = db->db_dnode;
882 	objset_impl_t *os = dn->dn_objset;
883 	dbuf_dirty_record_t **drp, *dr;
884 	int drop_struct_lock = FALSE;
885 	int txgoff = tx->tx_txg & TXG_MASK;
886 
887 	ASSERT(tx->tx_txg != 0);
888 	ASSERT(!refcount_is_zero(&db->db_holds));
889 	DMU_TX_DIRTY_BUF(tx, db);
890 
891 	/*
892 	 * Shouldn't dirty a regular buffer in syncing context.  Private
893 	 * objects may be dirtied in syncing context, but only if they
894 	 * were already pre-dirtied in open context.
895 	 * XXX We may want to prohibit dirtying in syncing context even
896 	 * if they did pre-dirty.
897 	 */
898 	ASSERT(!dmu_tx_is_syncing(tx) ||
899 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
900 	    dn->dn_object == DMU_META_DNODE_OBJECT ||
901 	    dn->dn_objset->os_dsl_dataset == NULL ||
902 	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
903 
904 	/*
905 	 * We make this assert for private objects as well, but after we
906 	 * check if we're already dirty.  They are allowed to re-dirty
907 	 * in syncing context.
908 	 */
909 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
910 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
911 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
912 
913 	mutex_enter(&db->db_mtx);
914 	/*
915 	 * XXX make this true for indirects too?  The problem is that
916 	 * transactions created with dmu_tx_create_assigned() from
917 	 * syncing context don't bother holding ahead.
918 	 */
919 	ASSERT(db->db_level != 0 ||
920 	    db->db_state == DB_CACHED || db->db_state == DB_FILL);
921 
922 	mutex_enter(&dn->dn_mtx);
923 	/*
924 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
925 	 * initialize the objset.
926 	 */
927 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
928 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
929 		dn->dn_dirtyctx =
930 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
931 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
932 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
933 	}
934 	mutex_exit(&dn->dn_mtx);
935 
936 	/*
937 	 * If this buffer is already dirty, we're done.
938 	 */
939 	drp = &db->db_last_dirty;
940 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
941 	    db->db.db_object == DMU_META_DNODE_OBJECT);
942 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
943 		drp = &dr->dr_next;
944 	if (dr && dr->dr_txg == tx->tx_txg) {
945 		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
946 			/*
947 			 * If this buffer has already been written out,
948 			 * we now need to reset its state.
949 			 */
950 			dbuf_unoverride(dr);
951 			if (db->db.db_object != DMU_META_DNODE_OBJECT)
952 				arc_buf_thaw(db->db_buf);
953 		}
954 		mutex_exit(&db->db_mtx);
955 		return (dr);
956 	}
957 
958 	/*
959 	 * Only valid if not already dirty.
960 	 */
961 	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
962 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
963 
964 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
965 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
966 	    dn->dn_phys->dn_nlevels > db->db_level ||
967 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
968 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
969 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
970 
971 	/*
972 	 * We should only be dirtying in syncing context if it's the
973 	 * mos, a spa os, or we're initializing the os.  However, we are
974 	 * allowed to dirty in syncing context provided we already
975 	 * dirtied it in open context.  Hence we must make this
976 	 * assertion only if we're not already dirty.
977 	 */
978 	ASSERT(!dmu_tx_is_syncing(tx) ||
979 	    os->os_dsl_dataset == NULL ||
980 	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
981 	    !BP_IS_HOLE(os->os_rootbp));
982 	ASSERT(db->db.db_size != 0);
983 
984 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
985 
986 	if (db->db_blkid != DB_BONUS_BLKID) {
987 		/*
988 		 * Update the accounting.
989 		 */
990 		if (dbuf_block_freeable(db)) {
991 			blkptr_t *bp = db->db_blkptr;
992 			int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
993 			    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
994 			/*
995 			 * This is only a guess -- if the dbuf is dirty
996 			 * in a previous txg, we don't know how much
997 			 * space it will use on disk yet.  We should
998 			 * really have the struct_rwlock to access
999 			 * db_blkptr, but since this is just a guess,
1000 			 * it's OK if we get an odd answer.
1001 			 */
1002 			dnode_willuse_space(dn, -willfree, tx);
1003 		}
1004 		dnode_willuse_space(dn, db->db.db_size, tx);
1005 	}
1006 
1007 	/*
1008 	 * If this buffer is dirty in an old transaction group we need
1009 	 * to make a copy of it so that the changes we make in this
1010 	 * transaction group won't leak out when we sync the older txg.
1011 	 */
1012 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1013 	if (db->db_level == 0) {
1014 		void *data_old = db->db_buf;
1015 
1016 		if (db->db_blkid == DB_BONUS_BLKID) {
1017 			dbuf_fix_old_data(db, tx->tx_txg);
1018 			data_old = db->db.db_data;
1019 		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1020 			/*
1021 			 * Release the data buffer from the cache so that we
1022 			 * can modify it without impacting possible other users
1023 			 * of this cached data block.  Note that indirect
1024 			 * blocks and private objects are not released until the
1025 			 * syncing state (since they are only modified then).
1026 			 */
1027 			arc_release(db->db_buf, db);
1028 			dbuf_fix_old_data(db, tx->tx_txg);
1029 			data_old = db->db_buf;
1030 		}
1031 		ASSERT(data_old != NULL);
1032 		dr->dt.dl.dr_data = data_old;
1033 	} else {
1034 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1035 		list_create(&dr->dt.di.dr_children,
1036 		    sizeof (dbuf_dirty_record_t),
1037 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1038 	}
1039 	dr->dr_dbuf = db;
1040 	dr->dr_txg = tx->tx_txg;
1041 	dr->dr_next = *drp;
1042 	*drp = dr;
1043 
1044 	/*
1045 	 * We could have been freed_in_flight between the dbuf_noread
1046 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1047 	 * happened after the free.
1048 	 */
1049 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
1050 		mutex_enter(&dn->dn_mtx);
1051 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1052 		mutex_exit(&dn->dn_mtx);
1053 		db->db_freed_in_flight = FALSE;
1054 	}
1055 
1056 	/*
1057 	 * This buffer is now part of this txg
1058 	 */
1059 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1060 	db->db_dirtycnt += 1;
1061 	ASSERT3U(db->db_dirtycnt, <=, 3);
1062 
1063 	mutex_exit(&db->db_mtx);
1064 
1065 	if (db->db_blkid == DB_BONUS_BLKID) {
1066 		mutex_enter(&dn->dn_mtx);
1067 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1068 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1069 		mutex_exit(&dn->dn_mtx);
1070 		dnode_setdirty(dn, tx);
1071 		return (dr);
1072 	}
1073 
1074 	if (db->db_level == 0) {
1075 		dnode_new_blkid(dn, db->db_blkid, tx);
1076 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1077 	}
1078 
1079 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1080 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1081 		drop_struct_lock = TRUE;
1082 	}
1083 
1084 	if (db->db_level+1 < dn->dn_nlevels) {
1085 		dmu_buf_impl_t *parent = db->db_parent;
1086 		dbuf_dirty_record_t *di;
1087 		int parent_held = FALSE;
1088 
1089 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1090 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1091 
1092 			parent = dbuf_hold_level(dn, db->db_level+1,
1093 			    db->db_blkid >> epbs, FTAG);
1094 			parent_held = TRUE;
1095 		}
1096 		if (drop_struct_lock)
1097 			rw_exit(&dn->dn_struct_rwlock);
1098 		ASSERT3U(db->db_level+1, ==, parent->db_level);
1099 		di = dbuf_dirty(parent, tx);
1100 		if (parent_held)
1101 			dbuf_rele(parent, FTAG);
1102 
1103 		mutex_enter(&db->db_mtx);
1104 		/*  possible race with dbuf_undirty() */
1105 		if (db->db_last_dirty == dr ||
1106 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1107 			mutex_enter(&di->dt.di.dr_mtx);
1108 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1109 			ASSERT(!list_link_active(&dr->dr_dirty_node));
1110 			list_insert_tail(&di->dt.di.dr_children, dr);
1111 			mutex_exit(&di->dt.di.dr_mtx);
1112 			dr->dr_parent = di;
1113 		}
1114 		mutex_exit(&db->db_mtx);
1115 	} else {
1116 		ASSERT(db->db_level+1 == dn->dn_nlevels);
1117 		ASSERT(db->db_blkid < dn->dn_nblkptr);
1118 		ASSERT(db->db_parent == NULL ||
1119 		    db->db_parent == db->db_dnode->dn_dbuf);
1120 		mutex_enter(&dn->dn_mtx);
1121 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1122 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1123 		mutex_exit(&dn->dn_mtx);
1124 		if (drop_struct_lock)
1125 			rw_exit(&dn->dn_struct_rwlock);
1126 	}
1127 
1128 	dnode_setdirty(dn, tx);
1129 	return (dr);
1130 }
1131 
/*
 * Throw away the dirty record that db carries for tx's txg, if any.
 *
 * Returns 0 without changing the dirty state when db has no record for
 * this txg, or when db has more holds than dirty records (in which case
 * dnode_clear_range() is still called for the block so that it is not
 * tossed at sync time).
 *
 * Otherwise the record is unlinked from db's dirty list and from the
 * list it was queued on (the parent record's children or the dnode's
 * per-txg dirty records), freed, and the hold tagged with this txg is
 * dropped.  Returns 1 if that was the last hold and db was passed to
 * dbuf_evict(); returns 0 otherwise.
 *
 * Not valid for bonus buffers (asserted below).
 */
1132 static int
1133 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1134 {
1135 	dnode_t *dn = db->db_dnode;
1136 	uint64_t txg = tx->tx_txg;
1137 	dbuf_dirty_record_t *dr, **drp;
1138 
1139 	ASSERT(txg != 0);
1140 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1141 
1142 	mutex_enter(&db->db_mtx);
1143 
1144 	/*
1145 	 * If this buffer is not dirty, we're done.
	 * The walk stops at the first record whose txg is <= ours, which
	 * leaves drp pointing at the link that references that record.
1146 	 */
1147 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1148 		if (dr->dr_txg <= txg)
1149 			break;
1150 	if (dr == NULL || dr->dr_txg < txg) {
1151 		mutex_exit(&db->db_mtx);
1152 		return (0);
1153 	}
1154 	ASSERT(dr->dr_txg == txg);
1155 
1156 	/*
1157 	 * If this buffer is currently held, we cannot undirty
1158 	 * it, since one of the current holders may be in the
1159 	 * middle of an update.  Note that users of dbuf_undirty()
1160 	 * should not place a hold on the dbuf before the call.
1161 	 */
1162 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1163 		mutex_exit(&db->db_mtx);
1164 		/* Make sure we don't toss this buffer at sync phase */
1165 		mutex_enter(&dn->dn_mtx);
1166 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1167 		mutex_exit(&dn->dn_mtx);
1168 		return (0);
1169 	}
1170 
1171 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1172 
1173 	ASSERT(db->db.db_size != 0);
1174 
1175 	/* XXX would be nice to fix up dn_towrite_space[] */
1176 
	/* Unlink the record from db's own dirty list. */
1177 	*drp = dr->dr_next;
1178 
	/*
	 * Take the record off the list it was queued on: the parent
	 * record's child list (under the parent's dr_mtx), or, for a
	 * top-level block, the dnode's dirty list for this txg (under
	 * dn_mtx).
	 */
1179 	if (dr->dr_parent) {
1180 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1181 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1182 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1183 	} else if (db->db_level+1 == dn->dn_nlevels) {
1184 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1185 		mutex_enter(&dn->dn_mtx);
1186 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1187 		mutex_exit(&dn->dn_mtx);
1188 	}
1189 
1190 	if (db->db_level == 0) {
1191 		dbuf_unoverride(dr);
1192 
1193 		ASSERT(db->db_buf != NULL);
1194 		ASSERT(dr->dt.dl.dr_data != NULL);
		/*
		 * When the record references a buffer other than
		 * db_buf, drop that reference here.
		 */
1195 		if (dr->dt.dl.dr_data != db->db_buf)
1196 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
1197 	} else {
		/* Indirect record: it must have no children left. */
1198 		ASSERT(db->db_buf != NULL);
1199 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1200 		mutex_destroy(&dr->dt.di.dr_mtx);
1201 		list_destroy(&dr->dt.di.dr_children);
1202 	}
1203 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1204 
1205 	ASSERT(db->db_dirtycnt > 0);
1206 	db->db_dirtycnt -= 1;
1207 
	/*
	 * Drop the hold tagged with this txg.  A result of 0 means no
	 * holds remain, so the data is detached and the dbuf evicted.
	 */
1208 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1209 		arc_buf_t *buf = db->db_buf;
1210 
1211 		ASSERT(arc_released(buf));
1212 		dbuf_set_data(db, NULL);
1213 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		/*
		 * NOTE(review): db_mtx is still held here and is not
		 * released by this function on this path; presumably
		 * dbuf_evict() drops it -- confirm.
		 */
1214 		dbuf_evict(db);
1215 		return (1);
1216 	}
1217 
1218 	mutex_exit(&db->db_mtx);
1219 	return (0);
1220 }
1221 
1222 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1223 void
1224 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1225 {
1226 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1227 
1228 	ASSERT(tx->tx_txg != 0);
1229 	ASSERT(!refcount_is_zero(&db->db_holds));
1230 
1231 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
1232 		rf |= DB_RF_HAVESTRUCT;
1233 	(void) dbuf_read(db, NULL, rf);
1234 	(void) dbuf_dirty(db, tx);
1235 }
1236 
1237 void
1238 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1239 {
1240 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1241 
1242 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
1243 	ASSERT(tx->tx_txg != 0);
1244 	ASSERT(db->db_level == 0);
1245 	ASSERT(!refcount_is_zero(&db->db_holds));
1246 
1247 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1248 	    dmu_tx_private_ok(tx));
1249 
1250 	dbuf_noread(db);
1251 	(void) dbuf_dirty(db, tx);
1252 }
1253 
1254 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1255 /* ARGSUSED */
1256 void
1257 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1258 {
1259 	mutex_enter(&db->db_mtx);
1260 	DBUF_VERIFY(db);
1261 
1262 	if (db->db_state == DB_FILL) {
1263 		if (db->db_level == 0 && db->db_freed_in_flight) {
1264 			ASSERT(db->db_blkid != DB_BONUS_BLKID);
1265 			/* we were freed while filling */
1266 			/* XXX dbuf_undirty? */
1267 			bzero(db->db.db_data, db->db.db_size);
1268 			db->db_freed_in_flight = FALSE;
1269 		}
1270 		db->db_state = DB_CACHED;
1271 		cv_broadcast(&db->db_changed);
1272 	}
1273 	mutex_exit(&db->db_mtx);
1274 }
1275 
1276 /*
1277  * "Clear" the contents of this dbuf.  This will mark the dbuf
1278  * EVICTING and clear *most* of its references.  Unfortunately,
1279  * when we are not holding the dn_dbufs_mtx, we can't clear the
1280  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1281  * in this case.  For callers from the DMU we will usually see:
1282  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1283  * For the arc callback, we will usually see:
1284  * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1285  * Sometimes, though, we will get a mix of these two:
1286  *	DMU: dbuf_clear()->arc_buf_evict()
1287  *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * The caller must hold db_mtx, and the dbuf must have no holds (both
 * asserted).  db_mtx is released before returning unless
 * arc_buf_evict() returns nonzero, in which case this function does
 * not touch it again.  Finally, the hold on an indirect parent dbuf
 * (one that is not the dnode's own dbuf) is dropped.
1288  */
1289 void
1290 dbuf_clear(dmu_buf_impl_t *db)
1291 {
	/*
	 * These are captured up front: db->db_dnode may be set to NULL
	 * below, and they are still needed after the point where db is
	 * no longer referenced.
	 */
1292 	dnode_t *dn = db->db_dnode;
1293 	dmu_buf_impl_t *parent = db->db_parent;
1294 	dmu_buf_impl_t *dndb = dn->dn_dbuf;
1295 	int dbuf_gone = FALSE;
1296 
1297 	ASSERT(MUTEX_HELD(&db->db_mtx));
1298 	ASSERT(refcount_is_zero(&db->db_holds));
1299 
1300 	dbuf_evict_user(db);
1301 
	/*
	 * Drop the data pointer.  Only a bonus buffer's data is freed
	 * here, with zio_buf_free().
	 */
1302 	if (db->db_state == DB_CACHED) {
1303 		ASSERT(db->db.db_data != NULL);
1304 		if (db->db_blkid == DB_BONUS_BLKID) {
1305 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1306 			arc_space_return(DN_MAX_BONUSLEN);
1307 		}
1308 		db->db.db_data = NULL;
1309 		db->db_state = DB_UNCACHED;
1310 	}
1311 
1312 	ASSERT3U(db->db_state, ==, DB_UNCACHED);
1313 	ASSERT(db->db_data_pending == NULL);
1314 
1315 	db->db_state = DB_EVICTING;
1316 	db->db_blkptr = NULL;
1317 
	/*
	 * The dn_dbufs entry can only be removed when this thread holds
	 * dn_dbufs_mtx; otherwise it is left for dbuf_destroy().
	 */
1318 	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1319 		list_remove(&dn->dn_dbufs, db);
1320 		dnode_rele(dn, db);
1321 		db->db_dnode = NULL;
1322 	}
1323 
1324 	if (db->db_buf)
1325 		dbuf_gone = arc_buf_evict(db->db_buf);
1326 
	/*
	 * NOTE(review): a nonzero result presumably means the dbuf was
	 * already destroyed through the eviction callback, so db_mtx
	 * must not be touched -- confirm against arc_buf_evict().
	 */
1327 	if (!dbuf_gone)
1328 		mutex_exit(&db->db_mtx);
1329 
1330 	/*
1331 	 * If this dbuf is referenced from an indirect dbuf,
1332 	 * decrement the ref count on the indirect dbuf.
1333 	 */
1334 	if (parent && parent != dndb)
1335 		dbuf_rele(parent, db);
1336 }
1337 
/*
 * Locate the block pointer for block `blkid' at `level' of dnode `dn',
 * along with the dbuf (if any) that contains that block pointer.
 *
 * *parentp and *bpp are set to NULL on entry.  The outcome is one of:
 *   - ENOENT: `level' is not below the on-disk level count (taken as 1
 *     when dn_phys->dn_nlevels is 0), or `blkid' lies beyond the on-disk
 *     dn_maxblkid scaled to this level.
 *   - an error from dbuf_hold_impl() or dbuf_read() on the parent
 *     indirect block; *parentp is NULL in that case.
 *   - 0: *bpp points at the block pointer, either inside the parent
 *     indirect block's data or inside dn_phys->dn_blkptr[].  *parentp is
 *     the held parent indirect dbuf, or dn->dn_dbuf with an added
 *     reference when that is non-NULL.  Both holds use a NULL tag.
 *
 * `fail_sparse' is only passed through to dbuf_hold_impl().  The caller
 * must hold dn_struct_rwlock and must not pass the bonus blkid (both
 * asserted).
 */
1338 static int
1339 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1340     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1341 {
1342 	int nlevels, epbs;
1343 
1344 	*parentp = NULL;
1345 	*bpp = NULL;
1346 
1347 	ASSERT(blkid != DB_BONUS_BLKID);
1348 
	/* An on-disk level count of 0 is treated as a single level. */
1349 	if (dn->dn_phys->dn_nlevels == 0)
1350 		nlevels = 1;
1351 	else
1352 		nlevels = dn->dn_phys->dn_nlevels;
1353 
	/* log2 of the number of block pointers per indirect block */
1354 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1355 
1356 	ASSERT3U(level * epbs, <, 64);
1357 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1358 	if (level >= nlevels ||
1359 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1360 		/* the buffer has no parent yet */
1361 		return (ENOENT);
1362 	} else if (level < nlevels-1) {
1363 		/* this block is referenced from an indirect block */
1364 		int err = dbuf_hold_impl(dn, level+1,
1365 		    blkid >> epbs, fail_sparse, NULL, parentp);
1366 		if (err)
1367 			return (err);
		/* The parent's contents are needed to address the blkptr. */
1368 		err = dbuf_read(*parentp, NULL,
1369 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1370 		if (err) {
1371 			dbuf_rele(*parentp, NULL);
1372 			*parentp = NULL;
1373 			return (err);
1374 		}
		/* Index within the parent: the low epbs bits of blkid. */
1375 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1376 		    (blkid & ((1ULL << epbs) - 1));
1377 		return (0);
1378 	} else {
1379 		/* the block is referenced from the dnode */
1380 		ASSERT3U(level, ==, nlevels-1);
1381 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1382 		    blkid < dn->dn_phys->dn_nblkptr);
1383 		if (dn->dn_dbuf) {
1384 			dbuf_add_ref(dn->dn_dbuf, NULL);
1385 			*parentp = dn->dn_dbuf;
1386 		}
1387 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1388 		return (0);
1389 	}
1390 }
1391 
/*
 * Allocate and initialize a dbuf for block `blkid' at `level' of `dn'.
 *
 * `parent' and `blkptr' are stored as given.  A bonus dbuf
 * (blkid == DB_BONUS_BLKID) is sized from DN_MAX_BONUSLEN and
 * dn_nblkptr and returned right away: it is not inserted in the hash
 * table or on dn_dbufs, and takes no hold on the dnode or the parent.
 *
 * Any other dbuf is sized from the indirect or data block size and
 * inserted in the hash table and on dn->dn_dbufs under dn_dbufs_mtx.  If
 * dbuf_hash_insert() returns an existing dbuf, the new one is freed and
 * the existing one is returned instead.  A newly inserted dbuf takes a
 * reference on `parent' (unless the parent is the dnode's own dbuf) and
 * a hold on the dnode, both tagged with the dbuf.
 *
 * The caller must hold dn_struct_rwlock (asserted).
 */
1392 static dmu_buf_impl_t *
1393 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1394     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1395 {
1396 	objset_impl_t *os = dn->dn_objset;
1397 	dmu_buf_impl_t *db, *odb;
1398 
1399 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1400 	ASSERT(dn->dn_type != DMU_OT_NONE);
1401 
1402 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1403 
1404 	db->db_objset = os;
1405 	db->db.db_object = dn->dn_object;
1406 	db->db_level = level;
1407 	db->db_blkid = blkid;
1408 	db->db_last_dirty = NULL;
1409 	db->db_dirtycnt = 0;
1410 	db->db_dnode = dn;
1411 	db->db_parent = parent;
1412 	db->db_blkptr = blkptr;
1413 
1414 	db->db_user_ptr = NULL;
1415 	db->db_user_data_ptr_ptr = NULL;
1416 	db->db_evict_func = NULL;
1417 	db->db_immediate_evict = 0;
1418 	db->db_freed_in_flight = 0;
1419 
1420 	if (blkid == DB_BONUS_BLKID) {
1421 		ASSERT3P(parent, ==, dn->dn_dbuf);
		/*
		 * The bonus size shrinks by one blkptr_t for every block
		 * pointer the dnode has beyond the first.
		 */
1422 		db->db.db_size = DN_MAX_BONUSLEN -
1423 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1424 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1425 		db->db.db_offset = DB_BONUS_BLKID;
1426 		db->db_state = DB_UNCACHED;
1427 		/* the bonus dbuf is not placed in the hash table */
1428 		arc_space_consume(sizeof (dmu_buf_impl_t));
1429 		return (db);
1430 	} else {
		/* Indirect levels use the indirect block size. */
1431 		int blocksize =
1432 		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1433 		db->db.db_size = blocksize;
1434 		db->db.db_offset = db->db_blkid * blocksize;
1435 	}
1436 
1437 	/*
1438 	 * Hold the dn_dbufs_mtx while we get the new dbuf
1439 	 * in the hash table *and* added to the dbufs list.
1440 	 * This prevents a possible deadlock with someone
1441 	 * trying to look up this dbuf before its added to the
1442 	 * dn_dbufs list.
1443 	 */
1444 	mutex_enter(&dn->dn_dbufs_mtx);
	/*
	 * NOTE(review): the dbuf is marked DB_EVICTING until it is on
	 * dn_dbufs, presumably so that a concurrent lookup does not
	 * treat it as usable yet -- confirm against dbuf_find().
	 */
1445 	db->db_state = DB_EVICTING;
1446 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1447 		/* someone else inserted it first */
1448 		kmem_cache_free(dbuf_cache, db);
1449 		mutex_exit(&dn->dn_dbufs_mtx);
1450 		return (odb);
1451 	}
1452 	list_insert_head(&dn->dn_dbufs, db);
1453 	db->db_state = DB_UNCACHED;
1454 	mutex_exit(&dn->dn_dbufs_mtx);
1455 	arc_space_consume(sizeof (dmu_buf_impl_t));
1456 
	/* An indirect parent is kept referenced while this dbuf exists. */
1457 	if (parent && parent != dn->dn_dbuf)
1458 		dbuf_add_ref(parent, db);
1459 
1460 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1461 	    refcount_count(&dn->dn_holds) > 0);
1462 	(void) refcount_add(&dn->dn_holds, db);
1463 
1464 	dprintf_dbuf(db, "db=%p\n", db);
1465 
1466 	return (db);
1467 }
1468 
1469 static int
1470 dbuf_do_evict(void *private)
1471 {
1472 	arc_buf_t *buf = private;
1473 	dmu_buf_impl_t *db = buf->b_private;
1474 
1475 	if (!MUTEX_HELD(&db->db_mtx))
1476 		mutex_enter(&db->db_mtx);
1477 
1478 	ASSERT(refcount_is_zero(&db->db_holds));
1479 
1480 	if (db->db_state != DB_EVICTING) {
1481 		ASSERT(db->db_state == DB_CACHED);
1482 		DBUF_VERIFY(db);
1483 		db->db_buf = NULL;
1484 		dbuf_evict(db);
1485 	} else {
1486 		mutex_exit(&db->db_mtx);
1487 		dbuf_destroy(db);
1488 	}
1489 	return (0);
1490 }
1491 
1492 static void
1493 dbuf_destroy(dmu_buf_impl_t *db)
1494 {
1495 	ASSERT(refcount_is_zero(&db->db_holds));
1496 
1497 	if (db->db_blkid != DB_BONUS_BLKID) {
1498 		/*
1499 		 * If this dbuf is still on the dn_dbufs list,
1500 		 * remove it from that list.
1501 		 */
1502 		if (db->db_dnode) {
1503 			dnode_t *dn = db->db_dnode;
1504 
1505 			mutex_enter(&dn->dn_dbufs_mtx);
1506 			list_remove(&dn->dn_dbufs, db);
1507 			mutex_exit(&dn->dn_dbufs_mtx);
1508 
1509 			dnode_rele(dn, db);
1510 			db->db_dnode = NULL;
1511 		}
1512 		dbuf_hash_remove(db);
1513 	}
1514 	db->db_parent = NULL;
1515 	db->db_buf = NULL;
1516 
1517 	ASSERT(!list_link_active(&db->db_link));
1518 	ASSERT(db->db.db_data == NULL);
1519 	ASSERT(db->db_hash_next == NULL);
1520 	ASSERT(db->db_blkptr == NULL);
1521 	ASSERT(db->db_data_pending == NULL);
1522 
1523 	kmem_cache_free(dbuf_cache, db);
1524 	arc_space_return(sizeof (dmu_buf_impl_t));
1525 }
1526 
/*
 * Start a speculative, asynchronous read of level-0 block `blkid' of
 * dnode `dn'.
 *
 * Nothing is done when dnode_block_freed() reports the block as freed,
 * when a dbuf for the block exists and has holds, or when no non-hole
 * block pointer can be found for it.  Otherwise arc_read() is issued
 * with ARC_NOWAIT | ARC_PREFETCH and no completion callback; its
 * result is ignored.
 *
 * The caller must hold dn_struct_rwlock and must not pass the bonus
 * blkid (both asserted).
 */
1527 void
1528 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1529 {
1530 	dmu_buf_impl_t *db = NULL;
1531 	blkptr_t *bp = NULL;
1532 
1533 	ASSERT(blkid != DB_BONUS_BLKID);
1534 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1535 
1536 	if (dnode_block_freed(dn, blkid))
1537 		return;
1538 
1539 	/* dbuf_find() returns with db_mtx held */
1540 	if (db = dbuf_find(dn, 0, blkid)) {
1541 		if (refcount_count(&db->db_holds) > 0) {
1542 			/*
1543 			 * This dbuf is active.  We assume that it is
1544 			 * already CACHED, or else about to be either
1545 			 * read or filled.
1546 			 */
1547 			mutex_exit(&db->db_mtx);
1548 			return;
1549 		}
1550 		mutex_exit(&db->db_mtx);
		/* db is reused below for the parent found by dbuf_findbp(). */
1551 		db = NULL;
1552 	}
1553 
1554 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1555 		if (bp && !BP_IS_HOLE(bp)) {
1556 			arc_buf_t *pbuf;
1557 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1558 			zbookmark_t zb;
			/* The bookmark uses objset 0 when there is no dataset. */
1559 			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
1560 			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
1561 			zb.zb_object = dn->dn_object;
1562 			zb.zb_level = 0;
1563 			zb.zb_blkid = blkid;
1564 
			/*
			 * The parent dbuf's arc buffer is passed along when
			 * there is a parent; otherwise the objset's buffer.
			 */
1565 			if (db)
1566 				pbuf = db->db_buf;
1567 			else
1568 				pbuf = dn->dn_objset->os_phys_buf;
1569 
1570 			(void) arc_read(NULL, dn->dn_objset->os_spa,
1571 			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
1572 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1573 			    &aflags, &zb);
1574 		}
		/* Release the parent hold taken by dbuf_findbp(). */
1575 		if (db)
1576 			dbuf_rele(db, NULL);
1577 	}
1578 }
1579 
1580 /*
1581  * Returns with db_holds incremented, and db_mtx not held.
1582  * Note: dn_struct_rwlock must be held.
 *
 * Finds the dbuf for block `blkid' at `level' of `dn', creating it if
 * it does not exist yet, and adds a hold tagged with `tag'.  On success
 * 0 is returned and the dbuf is stored in *dbp; on failure an errno
 * value is returned and *dbp is left NULL.
 *
 * When `fail_sparse' is set, no dbuf is created if the block pointer
 * lookup fails or finds a hole (ENOENT in the latter case).  When it
 * is clear, an ENOENT from the lookup is tolerated and the dbuf is
 * created anyway.
1583  */
1584 int
1585 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1586     void *tag, dmu_buf_impl_t **dbp)
1587 {
1588 	dmu_buf_impl_t *db, *parent = NULL;
1589 
1590 	ASSERT(blkid != DB_BONUS_BLKID);
1591 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1592 	ASSERT3U(dn->dn_nlevels, >, level);
1593 
1594 	*dbp = NULL;
1595 top:
1596 	/* dbuf_find() returns with db_mtx held */
1597 	db = dbuf_find(dn, level, blkid);
1598 
1599 	if (db == NULL) {
1600 		blkptr_t *bp = NULL;
1601 		int err;
1602 
1603 		ASSERT3P(parent, ==, NULL);
1604 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1605 		if (fail_sparse) {
1606 			if (err == 0 && bp && BP_IS_HOLE(bp))
1607 				err = ENOENT;
1608 			if (err) {
1609 				if (parent)
1610 					dbuf_rele(parent, NULL);
1611 				return (err);
1612 			}
1613 		}
1614 		if (err && err != ENOENT)
1615 			return (err);
		/*
		 * NOTE(review): the code below unlocks db_mtx
		 * unconditionally, so dbuf_create() presumably returns
		 * with db_mtx held, as dbuf_find() does -- confirm.
		 */
1616 		db = dbuf_create(dn, level, blkid, parent, bp);
1617 	}
1618 
	/*
	 * A dbuf with no holds must re-reference its arc buffer.  If the
	 * buffer no longer has data, the dbuf is cleared and the lookup
	 * is retried from the top.
	 */
1619 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1620 		arc_buf_add_ref(db->db_buf, db);
1621 		if (db->db_buf->b_data == NULL) {
1622 			dbuf_clear(db);
1623 			if (parent) {
1624 				dbuf_rele(parent, NULL);
1625 				parent = NULL;
1626 			}
1627 			goto top;
1628 		}
1629 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1630 	}
1631 
1632 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1633 
1634 	/*
1635 	 * If this buffer is currently syncing out, and we are
1636 	 * still referencing it from db_data, we need to make a copy
1637 	 * of it in case we decide we want to dirty it again in this txg.
1638 	 */
1639 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
1640 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1641 	    db->db_state == DB_CACHED && db->db_data_pending) {
1642 		dbuf_dirty_record_t *dr = db->db_data_pending;
1643 
1644 		if (dr->dt.dl.dr_data == db->db_buf) {
1645 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1646 
			/*
			 * Give the dbuf a fresh buffer and copy the
			 * contents of the one being synced into it.
			 */
1647 			dbuf_set_data(db,
1648 			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
1649 			    db->db.db_size, db, type));
1650 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1651 			    db->db.db_size);
1652 		}
1653 	}
1654 
1655 	(void) refcount_add(&db->db_holds, tag);
1656 	dbuf_update_data(db);
1657 	DBUF_VERIFY(db);
1658 	mutex_exit(&db->db_mtx);
1659 
1660 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1661 	if (parent)
1662 		dbuf_rele(parent, NULL);
1663 
1664 	ASSERT3P(db->db_dnode, ==, dn);
1665 	ASSERT3U(db->db_blkid, ==, blkid);
1666 	ASSERT3U(db->db_level, ==, level);
1667 	*dbp = db;
1668 
1669 	return (0);
1670 }
1671 
1672 dmu_buf_impl_t *
1673 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1674 {
1675 	dmu_buf_impl_t *db;
1676 	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1677 	return (err ? NULL : db);
1678 }
1679 
1680 dmu_buf_impl_t *
1681 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1682 {
1683 	dmu_buf_impl_t *db;
1684 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1685 	return (err ? NULL : db);
1686 }
1687 
1688 void
1689 dbuf_create_bonus(dnode_t *dn)
1690 {
1691 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1692 
1693 	ASSERT(dn->dn_bonus == NULL);
1694 	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
1695 }
1696 
1697 #pragma weak dmu_buf_add_ref = dbuf_add_ref
1698 void
1699 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
1700 {
1701 	int64_t holds = refcount_add(&db->db_holds, tag);
1702 	ASSERT(holds > 1);
1703 }
1704 
/*
 * Release the hold tagged with `tag' on db.  Also exported under the
 * name dmu_buf_rele.
 *
 * db_mtx is taken for the duration.  When the last hold goes away:
 *   - a bonus dbuf releases its hold on the dnode;
 *   - a dbuf that never had an arc buffer is evicted;
 *   - a dbuf whose arc buffer has been released has that buffer
 *     detached and dereferenced, and is then evicted;
 *   - any other dbuf only drops its reference on the arc buffer.
 */
1705 #pragma weak dmu_buf_rele = dbuf_rele
1706 void
1707 dbuf_rele(dmu_buf_impl_t *db, void *tag)
1708 {
1709 	int64_t holds;
1710 
1711 	mutex_enter(&db->db_mtx);
1712 	DBUF_VERIFY(db);
1713 
1714 	holds = refcount_remove(&db->db_holds, tag);
1715 	ASSERT(holds >= 0);
1716 
1717 	/*
1718 	 * We can't freeze indirects if there is a possibility that they
1719 	 * may be modified in the current syncing context.
	 * Hence a level-0 buffer is frozen once only its dirty holds
	 * remain, while an indirect is frozen only at zero holds.
1720 	 */
1721 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
1722 		arc_buf_freeze(db->db_buf);
1723 
	/*
	 * With immediate eviction requested, the user data is evicted as
	 * soon as only the dirty holds remain.
	 */
1724 	if (holds == db->db_dirtycnt &&
1725 	    db->db_level == 0 && db->db_immediate_evict)
1726 		dbuf_evict_user(db);
1727 
1728 	if (holds == 0) {
1729 		if (db->db_blkid == DB_BONUS_BLKID) {
1730 			mutex_exit(&db->db_mtx);
1731 			dnode_rele(db->db_dnode, db);
1732 		} else if (db->db_buf == NULL) {
1733 			/*
1734 			 * This is a special case: we never associated this
1735 			 * dbuf with any data allocated from the ARC.
1736 			 */
1737 			ASSERT3U(db->db_state, ==, DB_UNCACHED);
			/*
			 * NOTE(review): db_mtx is not released on this path
			 * here; presumably dbuf_evict() drops it -- confirm.
			 */
1738 			dbuf_evict(db);
1739 		} else if (arc_released(db->db_buf)) {
1740 			arc_buf_t *buf = db->db_buf;
1741 			/*
1742 			 * This dbuf has anonymous data associated with it.
1743 			 */
1744 			dbuf_set_data(db, NULL);
1745 			VERIFY(arc_buf_remove_ref(buf, db) == 1);
1746 			dbuf_evict(db);
1747 		} else {
			/* Drop our reference on the arc buffer; the dbuf stays. */
1748 			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
1749 			mutex_exit(&db->db_mtx);
1750 		}
1751 	} else {
1752 		mutex_exit(&db->db_mtx);
1753 	}
1754 }
1755 
1756 #pragma weak dmu_buf_refcount = dbuf_refcount
1757 uint64_t
1758 dbuf_refcount(dmu_buf_impl_t *db)
1759 {
1760 	return (refcount_count(&db->db_holds));
1761 }
1762 
1763 void *
1764 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1765     dmu_buf_evict_func_t *evict_func)
1766 {
1767 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1768 	    user_data_ptr_ptr, evict_func));
1769 }
1770 
1771 void *
1772 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
1773     dmu_buf_evict_func_t *evict_func)
1774 {
1775 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1776 
1777 	db->db_immediate_evict = TRUE;
1778 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
1779 	    user_data_ptr_ptr, evict_func));
1780 }
1781 
1782 void *
1783 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
1784     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
1785 {
1786 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1787 	ASSERT(db->db_level == 0);
1788 
1789 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
1790 
1791 	mutex_enter(&db->db_mtx);
1792 
1793 	if (db->db_user_ptr == old_user_ptr) {
1794 		db->db_user_ptr = user_ptr;
1795 		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
1796 		db->db_evict_func = evict_func;
1797 
1798 		dbuf_update_data(db);
1799 	} else {
1800 		old_user_ptr = db->db_user_ptr;
1801 	}
1802 
1803 	mutex_exit(&db->db_mtx);
1804 	return (old_user_ptr);
1805 }
1806 
1807 void *
1808 dmu_buf_get_user(dmu_buf_t *db_fake)
1809 {
1810 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1811 	ASSERT(!refcount_is_zero(&db->db_holds));
1812 
1813 	return (db->db_user_ptr);
1814 }
1815 
/*
 * Make sure db->db_blkptr is set.  Nothing is done if it already is.
 *
 * For a dbuf at the dnode's top on-disk level, the block pointer is the
 * matching slot in dn_phys->dn_blkptr[] and the parent becomes the
 * dnode's dbuf.  For any other dbuf, the block pointer is a slot in the
 * parent indirect block's data; if the dbuf has no parent yet, one is
 * held (tagged with db) and recorded in db->db_parent.
 *
 * The caller must hold db_mtx (asserted).  Note that db_mtx is dropped
 * and re-taken while the parent is being held.
 */
1816 static void
1817 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
1818 {
1819 	/* ASSERT(dmu_tx_is_syncing(tx) */
1820 	ASSERT(MUTEX_HELD(&db->db_mtx));
1821 
1822 	if (db->db_blkptr != NULL)
1823 		return;
1824 
1825 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
1826 		/*
1827 		 * This buffer was allocated at a time when there was
1828 		 * no available blkptrs from the dnode, or it was
1829 		 * inappropriate to hook it in (i.e., nlevels mis-match).
1830 		 */
1831 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
1832 		ASSERT(db->db_parent == NULL);
1833 		db->db_parent = dn->dn_dbuf;
1834 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
1835 		DBUF_VERIFY(db);
1836 	} else {
1837 		dmu_buf_impl_t *parent = db->db_parent;
		/* log2 of the number of block pointers per indirect block */
1838 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
1839 
1840 		ASSERT(dn->dn_phys->dn_nlevels > 1);
1841 		if (parent == NULL) {
			/*
			 * db_mtx is dropped before dn_struct_rwlock is
			 * taken and the parent held, then re-acquired.
			 * NOTE(review): the result of dbuf_hold_impl() is
			 * ignored, yet `parent' is dereferenced below;
			 * confirm the hold cannot fail in this context.
			 */
1842 			mutex_exit(&db->db_mtx);
1843 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
1844 			(void) dbuf_hold_impl(dn, db->db_level+1,
1845 			    db->db_blkid >> epbs, FALSE, db, &parent);
1846 			rw_exit(&dn->dn_struct_rwlock);
1847 			mutex_enter(&db->db_mtx);
1848 			db->db_parent = parent;
1849 		}
		/* Slot within the parent: the low epbs bits of db_blkid. */
1850 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
1851 		    (db->db_blkid & ((1ULL << epbs) - 1));
1852 		DBUF_VERIFY(db);
1853 	}
1854 }
1855 
/*
 * Sync the dirty record `dr' of an indirect (level > 0) dbuf in
 * syncing context.
 *
 * The indirect block is read in first if it has no buffer, its block
 * pointer is resolved, and the record is marked as the dbuf's pending
 * data.  dbuf_write() then sets up the write, the record's children
 * are synced through dbuf_sync_list(), and only after that is the
 * record's own zio issued with zio_nowait().
 */
1856 static void
1857 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1858 {
1859 	dmu_buf_impl_t *db = dr->dr_dbuf;
1860 	dnode_t *dn = db->db_dnode;
1861 	zio_t *zio;
1862 
1863 	ASSERT(dmu_tx_is_syncing(tx));
1864 
1865 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1866 
1867 	mutex_enter(&db->db_mtx);
1868 
1869 	ASSERT(db->db_level > 0);
1870 	DBUF_VERIFY(db);
1871 
	/* db_mtx is dropped around the read and re-taken afterwards. */
1872 	if (db->db_buf == NULL) {
1873 		mutex_exit(&db->db_mtx);
1874 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
1875 		mutex_enter(&db->db_mtx);
1876 	}
1877 	ASSERT3U(db->db_state, ==, DB_CACHED);
1878 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
1879 	ASSERT(db->db_buf != NULL);
1880 
1881 	dbuf_check_blkptr(dn, db);
1882 
1883 	db->db_data_pending = dr;
1884 
1885 	mutex_exit(&db->db_mtx);
1886 	dbuf_write(dr, db->db_buf, tx);
1887 
	/*
	 * The zio pointer is saved before the children are synced, and
	 * the zio is issued only once all of them have been processed.
	 */
1888 	zio = dr->dr_zio;
1889 	mutex_enter(&dr->dt.di.dr_mtx);
1890 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
1891 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1892 	mutex_exit(&dr->dt.di.dr_mtx);
1893 	zio_nowait(zio);
1894 }
1895 
/*
 * Sync the dirty record `dr' of a level-0 dbuf in syncing context.
 *
 * Three cases are handled:
 *   - Bonus buffer: the data is copied into the dnode's bonus area, the
 *     record is unlinked and freed, and the hold tagged with this txg
 *     is released.  No write is issued here.
 *   - Record already written by an immediate write (DR_OVERRIDDEN):
 *     the overriding block pointer is installed and the write
 *     completion functions are called directly with a stack-allocated
 *     zio.
 *   - Otherwise: the data is copied first if the dbuf has other holds
 *     and still shares the buffer with the record (meta-dnode blocks
 *     excepted), and the write is set up with dbuf_write().  A
 *     meta-dnode record is put back on the dnode's dirty list for this
 *     txg; any other record's zio is issued with zio_nowait().
 */
1896 static void
1897 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
1898 {
1899 	arc_buf_t **datap = &dr->dt.dl.dr_data;
1900 	dmu_buf_impl_t *db = dr->dr_dbuf;
1901 	dnode_t *dn = db->db_dnode;
1902 	objset_impl_t *os = dn->dn_objset;
1903 	uint64_t txg = tx->tx_txg;
1904 	int blksz;
1905 
1906 	ASSERT(dmu_tx_is_syncing(tx));
1907 
1908 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
1909 
1910 	mutex_enter(&db->db_mtx);
1911 	/*
1912 	 * To be synced, we must be dirtied.  But we
1913 	 * might have been freed after the dirty.
1914 	 */
1915 	if (db->db_state == DB_UNCACHED) {
1916 		/* This buffer has been freed since it was dirtied */
1917 		ASSERT(db->db.db_data == NULL);
1918 	} else if (db->db_state == DB_FILL) {
1919 		/* This buffer was freed and is now being re-filled */
1920 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
1921 	} else {
1922 		ASSERT3U(db->db_state, ==, DB_CACHED);
1923 	}
1924 	DBUF_VERIFY(db);
1925 
1926 	/*
1927 	 * If this is a bonus buffer, simply copy the bonus data into the
1928 	 * dnode.  It will be written out when the dnode is synced (and it
1929 	 * will be synced, since it must have been dirty for dbuf_sync to
1930 	 * be called).
1931 	 */
1932 	if (db->db_blkid == DB_BONUS_BLKID) {
1933 		dbuf_dirty_record_t **drp;
1934 
1935 		ASSERT(*datap != NULL);
1936 		ASSERT3U(db->db_level, ==, 0);
1937 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
1938 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		/* Free the record's buffer if it is not the live one. */
1939 		if (*datap != db->db.db_data) {
1940 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
1941 			arc_space_return(DN_MAX_BONUSLEN);
1942 		}
1943 		db->db_data_pending = NULL;
		/* Find the link that points at dr and unlink the record. */
1944 		drp = &db->db_last_dirty;
1945 		while (*drp != dr)
1946 			drp = &(*drp)->dr_next;
1947 		ASSERT(dr->dr_next == NULL);
1948 		*drp = dr->dr_next;
1949 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
1950 		ASSERT(db->db_dirtycnt > 0);
1951 		db->db_dirtycnt -= 1;
1952 		mutex_exit(&db->db_mtx);
1953 		dbuf_rele(db, (void *)(uintptr_t)txg);
1954 		return;
1955 	}
1956 
1957 	/*
1958 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
1959 	 * operation to sneak in. As a result, we need to ensure that we
1960 	 * don't check the dr_override_state until we have returned from
1961 	 * dbuf_check_blkptr.
1962 	 */
1963 	dbuf_check_blkptr(dn, db);
1964 
1965 	/*
1966 	 * If this buffer is in the middle of an immediate write,
1967 	 * wait for the synchronous IO to complete.
1968 	 */
1969 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
1970 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1971 		cv_wait(&db->db_changed, &db->db_mtx);
1972 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
1973 	}
1974 
1975 	/*
1976 	 * If this dbuf has already been written out via an immediate write,
1977 	 * just complete the write by copying over the new block pointer and
1978 	 * updating the accounting via the write-completion functions.
1979 	 */
1980 	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * Stack-allocated zio, used only to carry the values set
		 * below into the two callbacks.
		 */
1981 		zio_t zio_fake;
1982 
1983 		zio_fake.io_private = &db;
1984 		zio_fake.io_error = 0;
1985 		zio_fake.io_bp = db->db_blkptr;
		/* Save the old block pointer before it is overwritten. */
1986 		zio_fake.io_bp_orig = *db->db_blkptr;
1987 		zio_fake.io_txg = txg;
1988 
1989 		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
1990 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1991 		db->db_data_pending = dr;
1992 		dr->dr_zio = &zio_fake;
1993 		mutex_exit(&db->db_mtx);
1994 
		/* Release the block being replaced if it is older than txg. */
1995 		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
1996 			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
1997 			    &zio_fake.io_bp_orig, dn->dn_zio, tx);
1998 
1999 		dbuf_write_ready(&zio_fake, db->db_buf, db);
2000 		dbuf_write_done(&zio_fake, db->db_buf, db);
2001 
2002 		return;
2003 	}
2004 
2005 	blksz = arc_buf_size(*datap);
2006 
2007 	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
2008 		/*
2009 		 * If this buffer is currently "in use" (i.e., there are
2010 		 * active holds and db_data still references it), then make
2011 		 * a copy before we start the write so that any modifications
2012 		 * from the open txg will not leak into this write.
2013 		 *
2014 		 * NOTE: this copy does not need to be made for objects only
2015 		 * modified in the syncing context (e.g. DNONE_DNODE blocks).
2016 		 */
2017 		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
2018 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2019 			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2020 			bcopy(db->db.db_data, (*datap)->b_data, blksz);
2021 		}
2022 	}
2023 
2024 	ASSERT(*datap != NULL);
2025 	db->db_data_pending = dr;
2026 
2027 	mutex_exit(&db->db_mtx);
2028 
2029 	dbuf_write(dr, *datap, tx);
2030 
	/*
	 * A meta-dnode record goes back on the dnode's dirty list for
	 * this txg instead of having its zio issued here.
	 */
2031 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2032 	if (dn->dn_object == DMU_META_DNODE_OBJECT)
2033 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2034 	else
2035 		zio_nowait(dr->dr_zio);
2036 }
2037 
2038 void
2039 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2040 {
2041 	dbuf_dirty_record_t *dr;
2042 
2043 	while (dr = list_head(list)) {
2044 		if (dr->dr_zio != NULL) {
2045 			/*
2046 			 * If we find an already initialized zio then we
2047 			 * are processing the meta-dnode, and we have finished.
2048 			 * The dbufs for all dnodes are put back on the list
2049 			 * during processing, so that we can zio_wait()
2050 			 * these IOs after initiating all child IOs.
2051 			 */
2052 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2053 			    DMU_META_DNODE_OBJECT);
2054 			break;
2055 		}
2056 		list_remove(list, dr);
2057 		if (dr->dr_dbuf->db_level > 0)
2058 			dbuf_sync_indirect(dr, tx);
2059 		else
2060 			dbuf_sync_leaf(dr, tx);
2061 	}
2062 }
2063 
2064 static void
2065 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2066 {
2067 	dmu_buf_impl_t *db = dr->dr_dbuf;
2068 	dnode_t *dn = db->db_dnode;
2069 	objset_impl_t *os = dn->dn_objset;
2070 	dmu_buf_impl_t *parent = db->db_parent;
2071 	uint64_t txg = tx->tx_txg;
2072 	zbookmark_t zb;
2073 	writeprops_t wp = { 0 };
2074 	zio_t *zio;
2075 	int zio_flags;
2076 
2077 	if (!BP_IS_HOLE(db->db_blkptr) &&
2078 	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
2079 		/*
2080 		 * Private object buffers are released here rather
2081 		 * than in dbuf_dirty() since they are only modified
2082 		 * in the syncing context and we don't want the
2083 		 * overhead of making multiple copies of the data.
2084 		 */
2085 		arc_release(data, db);
2086 	} else {
2087 		ASSERT(arc_released(data));
2088 		/* XXX why do we need to thaw here? */
2089 		arc_buf_thaw(data);
2090 	}
2091 
2092 	if (parent != dn->dn_dbuf) {
2093 		ASSERT(parent && parent->db_data_pending);
2094 		ASSERT(db->db_level == parent->db_level-1);
2095 		ASSERT(arc_released(parent->db_buf));
2096 		zio = parent->db_data_pending->dr_zio;
2097 	} else {
2098 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
2099 		ASSERT3P(db->db_blkptr, ==,
2100 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2101 		zio = dn->dn_zio;
2102 	}
2103 
2104 	ASSERT(db->db_level == 0 || data == db->db_buf);
2105 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2106 	ASSERT(zio);
2107 
2108 	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
2109 	zb.zb_object = db->db.db_object;
2110 	zb.zb_level = db->db_level;
2111 	zb.zb_blkid = db->db_blkid;
2112 
2113 	zio_flags = ZIO_FLAG_MUSTSUCCEED;
2114 	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
2115 		zio_flags |= ZIO_FLAG_METADATA;
2116 	wp.wp_type = dn->dn_type;
2117 	wp.wp_level = db->db_level;
2118 	wp.wp_copies = os->os_copies;
2119 	wp.wp_dncompress = dn->dn_compress;
2120 	wp.wp_oscompress = os->os_compress;
2121 	wp.wp_dnchecksum = dn->dn_checksum;
2122 	wp.wp_oschecksum = os->os_checksum;
2123 
2124 	if (BP_IS_OLDER(db->db_blkptr, txg))
2125 		(void) dsl_dataset_block_kill(
2126 		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
2127 
2128 	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
2129 	    txg, db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
2130 	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
2131 }
2132 
2133 /* ARGSUSED */
2134 static void
2135 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2136 {
2137 	dmu_buf_impl_t *db = vdb;
2138 	dnode_t *dn = db->db_dnode;
2139 	objset_impl_t *os = dn->dn_objset;
2140 	blkptr_t *bp_orig = &zio->io_bp_orig;
2141 	uint64_t fill = 0;
2142 	int old_size, new_size, i;
2143 
2144 	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
2145 
2146 	old_size = bp_get_dasize(os->os_spa, bp_orig);
2147 	new_size = bp_get_dasize(os->os_spa, zio->io_bp);
2148 
2149 	dnode_diduse_space(dn, new_size-old_size);
2150 
2151 	if (BP_IS_HOLE(zio->io_bp)) {
2152 		dsl_dataset_t *ds = os->os_dsl_dataset;
2153 		dmu_tx_t *tx = os->os_synctx;
2154 
2155 		if (bp_orig->blk_birth == tx->tx_txg)
2156 			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
2157 		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
2158 		return;
2159 	}
2160 
2161 	mutex_enter(&db->db_mtx);
2162 
2163 	if (db->db_level == 0) {
2164 		mutex_enter(&dn->dn_mtx);
2165 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
2166 			dn->dn_phys->dn_maxblkid = db->db_blkid;
2167 		mutex_exit(&dn->dn_mtx);
2168 
2169 		if (dn->dn_type == DMU_OT_DNODE) {
2170 			dnode_phys_t *dnp = db->db.db_data;
2171 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2172 			    i--, dnp++) {
2173 				if (dnp->dn_type != DMU_OT_NONE)
2174 					fill++;
2175 			}
2176 		} else {
2177 			fill = 1;
2178 		}
2179 	} else {
2180 		blkptr_t *bp = db->db.db_data;
2181 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2182 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
2183 			if (BP_IS_HOLE(bp))
2184 				continue;
2185 			ASSERT3U(BP_GET_LSIZE(bp), ==,
2186 			    db->db_level == 1 ? dn->dn_datablksz :
2187 			    (1<<dn->dn_phys->dn_indblkshift));
2188 			fill += bp->blk_fill;
2189 		}
2190 	}
2191 
2192 	db->db_blkptr->blk_fill = fill;
2193 	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
2194 	BP_SET_LEVEL(db->db_blkptr, db->db_level);
2195 
2196 	mutex_exit(&db->db_mtx);
2197 
2198 	/* We must do this after we've set the bp's type and level */
2199 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
2200 		dsl_dataset_t *ds = os->os_dsl_dataset;
2201 		dmu_tx_t *tx = os->os_synctx;
2202 
2203 		if (bp_orig->blk_birth == tx->tx_txg)
2204 			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
2205 		dsl_dataset_block_born(ds, zio->io_bp, tx);
2206 	}
2207 }
2208 
2209 /* ARGSUSED */
2210 static void
2211 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2212 {
2213 	dmu_buf_impl_t *db = vdb;
2214 	uint64_t txg = zio->io_txg;
2215 	dbuf_dirty_record_t **drp, *dr;
2216 
2217 	ASSERT3U(zio->io_error, ==, 0);
2218 
2219 	mutex_enter(&db->db_mtx);
2220 
2221 	drp = &db->db_last_dirty;
2222 	while ((dr = *drp) != db->db_data_pending)
2223 		drp = &dr->dr_next;
2224 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2225 	ASSERT(dr->dr_txg == txg);
2226 	ASSERT(dr->dr_next == NULL);
2227 	*drp = dr->dr_next;
2228 
2229 	if (db->db_level == 0) {
2230 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
2231 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2232 
2233 		if (dr->dt.dl.dr_data != db->db_buf)
2234 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
2235 		else if (!BP_IS_HOLE(db->db_blkptr))
2236 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2237 		else
2238 			ASSERT(arc_released(db->db_buf));
2239 	} else {
2240 		dnode_t *dn = db->db_dnode;
2241 
2242 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2243 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2244 		if (!BP_IS_HOLE(db->db_blkptr)) {
2245 			int epbs =
2246 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2247 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2248 			    db->db.db_size);
2249 			ASSERT3U(dn->dn_phys->dn_maxblkid
2250 			    >> (db->db_level * epbs), >=, db->db_blkid);
2251 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2252 		}
2253 		mutex_destroy(&dr->dt.di.dr_mtx);
2254 		list_destroy(&dr->dt.di.dr_children);
2255 	}
2256 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2257 
2258 	cv_broadcast(&db->db_changed);
2259 	ASSERT(db->db_dirtycnt > 0);
2260 	db->db_dirtycnt -= 1;
2261 	db->db_data_pending = NULL;
2262 	mutex_exit(&db->db_mtx);
2263 
2264 	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
2265 
2266 	dbuf_rele(db, (void *)(uintptr_t)txg);
2267 }
2268