xref: /titanic_44/usr/src/uts/common/fs/zfs/dbuf.c (revision cf7e209d5f01e9f5fe052b444899ba9cba0e9877)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  * Copyright (c) 2013 by Delphix. All rights reserved.
25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_send.h>
31 #include <sys/dmu_impl.h>
32 #include <sys/dbuf.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/dsl_dataset.h>
35 #include <sys/dsl_dir.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/spa.h>
38 #include <sys/zio.h>
39 #include <sys/dmu_zfetch.h>
40 #include <sys/sa.h>
41 #include <sys/sa_impl.h>
42 
43 static void dbuf_destroy(dmu_buf_impl_t *db);
44 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
45 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
46 
47 /*
48  * Global data structures and functions for the dbuf cache.
49  */
50 static kmem_cache_t *dbuf_cache;
51 
52 /* ARGSUSED */
53 static int
54 dbuf_cons(void *vdb, void *unused, int kmflag)
55 {
56 	dmu_buf_impl_t *db = vdb;
57 	bzero(db, sizeof (dmu_buf_impl_t));
58 
59 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
60 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
61 	refcount_create(&db->db_holds);
62 	return (0);
63 }
64 
65 /* ARGSUSED */
66 static void
67 dbuf_dest(void *vdb, void *unused)
68 {
69 	dmu_buf_impl_t *db = vdb;
70 	mutex_destroy(&db->db_mtx);
71 	cv_destroy(&db->db_changed);
72 	refcount_destroy(&db->db_holds);
73 }
74 
75 /*
76  * dbuf hash table routines
77  */
78 static dbuf_hash_table_t dbuf_hash_table;
79 
80 static uint64_t dbuf_hash_count;
81 
82 static uint64_t
83 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
84 {
85 	uintptr_t osv = (uintptr_t)os;
86 	uint64_t crc = -1ULL;
87 
88 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
89 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
90 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
91 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
92 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
93 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
94 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
95 
96 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
97 
98 	return (crc);
99 }
100 
101 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
102 
103 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
104 	((dbuf)->db.db_object == (obj) &&		\
105 	(dbuf)->db_objset == (os) &&			\
106 	(dbuf)->db_level == (level) &&			\
107 	(dbuf)->db_blkid == (blkid))
108 
109 dmu_buf_impl_t *
110 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
111 {
112 	dbuf_hash_table_t *h = &dbuf_hash_table;
113 	objset_t *os = dn->dn_objset;
114 	uint64_t obj = dn->dn_object;
115 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
116 	uint64_t idx = hv & h->hash_table_mask;
117 	dmu_buf_impl_t *db;
118 
119 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
120 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
121 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
122 			mutex_enter(&db->db_mtx);
123 			if (db->db_state != DB_EVICTING) {
124 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
125 				return (db);
126 			}
127 			mutex_exit(&db->db_mtx);
128 		}
129 	}
130 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
131 	return (NULL);
132 }
133 
134 /*
135  * Insert an entry into the hash table.  If there is already an element
136  * equal to elem in the hash table, then the already existing element
137  * will be returned and the new element will not be inserted.
138  * Otherwise returns NULL.
139  */
140 static dmu_buf_impl_t *
141 dbuf_hash_insert(dmu_buf_impl_t *db)
142 {
143 	dbuf_hash_table_t *h = &dbuf_hash_table;
144 	objset_t *os = db->db_objset;
145 	uint64_t obj = db->db.db_object;
146 	int level = db->db_level;
147 	uint64_t blkid = db->db_blkid;
148 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
149 	uint64_t idx = hv & h->hash_table_mask;
150 	dmu_buf_impl_t *dbf;
151 
152 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
153 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
154 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
155 			mutex_enter(&dbf->db_mtx);
156 			if (dbf->db_state != DB_EVICTING) {
157 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
158 				return (dbf);
159 			}
160 			mutex_exit(&dbf->db_mtx);
161 		}
162 	}
163 
164 	mutex_enter(&db->db_mtx);
165 	db->db_hash_next = h->hash_table[idx];
166 	h->hash_table[idx] = db;
167 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
168 	atomic_add_64(&dbuf_hash_count, 1);
169 
170 	return (NULL);
171 }
172 
173 /*
174  * Remove an entry from the hash table.  This operation will
175  * fail if there are any existing holds on the db.
176  */
177 static void
178 dbuf_hash_remove(dmu_buf_impl_t *db)
179 {
180 	dbuf_hash_table_t *h = &dbuf_hash_table;
181 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
182 	    db->db_level, db->db_blkid);
183 	uint64_t idx = hv & h->hash_table_mask;
184 	dmu_buf_impl_t *dbf, **dbp;
185 
186 	/*
187 	 * We musn't hold db_mtx to maintin lock ordering:
188 	 * DBUF_HASH_MUTEX > db_mtx.
189 	 */
190 	ASSERT(refcount_is_zero(&db->db_holds));
191 	ASSERT(db->db_state == DB_EVICTING);
192 	ASSERT(!MUTEX_HELD(&db->db_mtx));
193 
194 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
195 	dbp = &h->hash_table[idx];
196 	while ((dbf = *dbp) != db) {
197 		dbp = &dbf->db_hash_next;
198 		ASSERT(dbf != NULL);
199 	}
200 	*dbp = db->db_hash_next;
201 	db->db_hash_next = NULL;
202 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
203 	atomic_add_64(&dbuf_hash_count, -1);
204 }
205 
206 static arc_evict_func_t dbuf_do_evict;
207 
208 static void
209 dbuf_evict_user(dmu_buf_impl_t *db)
210 {
211 	ASSERT(MUTEX_HELD(&db->db_mtx));
212 
213 	if (db->db_level != 0 || db->db_evict_func == NULL)
214 		return;
215 
216 	if (db->db_user_data_ptr_ptr)
217 		*db->db_user_data_ptr_ptr = db->db.db_data;
218 	db->db_evict_func(&db->db, db->db_user_ptr);
219 	db->db_user_ptr = NULL;
220 	db->db_user_data_ptr_ptr = NULL;
221 	db->db_evict_func = NULL;
222 }
223 
224 boolean_t
225 dbuf_is_metadata(dmu_buf_impl_t *db)
226 {
227 	if (db->db_level > 0) {
228 		return (B_TRUE);
229 	} else {
230 		boolean_t is_metadata;
231 
232 		DB_DNODE_ENTER(db);
233 		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
234 		DB_DNODE_EXIT(db);
235 
236 		return (is_metadata);
237 	}
238 }
239 
240 void
241 dbuf_evict(dmu_buf_impl_t *db)
242 {
243 	ASSERT(MUTEX_HELD(&db->db_mtx));
244 	ASSERT(db->db_buf == NULL);
245 	ASSERT(db->db_data_pending == NULL);
246 
247 	dbuf_clear(db);
248 	dbuf_destroy(db);
249 }
250 
251 void
252 dbuf_init(void)
253 {
254 	uint64_t hsize = 1ULL << 16;
255 	dbuf_hash_table_t *h = &dbuf_hash_table;
256 	int i;
257 
258 	/*
259 	 * The hash table is big enough to fill all of physical memory
260 	 * with an average 4K block size.  The table will take up
261 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
262 	 */
263 	while (hsize * 4096 < physmem * PAGESIZE)
264 		hsize <<= 1;
265 
266 retry:
267 	h->hash_table_mask = hsize - 1;
268 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
269 	if (h->hash_table == NULL) {
270 		/* XXX - we should really return an error instead of assert */
271 		ASSERT(hsize > (1ULL << 10));
272 		hsize >>= 1;
273 		goto retry;
274 	}
275 
276 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
277 	    sizeof (dmu_buf_impl_t),
278 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
279 
280 	for (i = 0; i < DBUF_MUTEXES; i++)
281 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
282 }
283 
284 void
285 dbuf_fini(void)
286 {
287 	dbuf_hash_table_t *h = &dbuf_hash_table;
288 	int i;
289 
290 	for (i = 0; i < DBUF_MUTEXES; i++)
291 		mutex_destroy(&h->hash_mutexes[i]);
292 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
293 	kmem_cache_destroy(dbuf_cache);
294 }
295 
296 /*
297  * Other stuff.
298  */
299 
300 #ifdef ZFS_DEBUG
301 static void
302 dbuf_verify(dmu_buf_impl_t *db)
303 {
304 	dnode_t *dn;
305 	dbuf_dirty_record_t *dr;
306 
307 	ASSERT(MUTEX_HELD(&db->db_mtx));
308 
309 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
310 		return;
311 
312 	ASSERT(db->db_objset != NULL);
313 	DB_DNODE_ENTER(db);
314 	dn = DB_DNODE(db);
315 	if (dn == NULL) {
316 		ASSERT(db->db_parent == NULL);
317 		ASSERT(db->db_blkptr == NULL);
318 	} else {
319 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
320 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
321 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
322 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
323 		    db->db_blkid == DMU_SPILL_BLKID ||
324 		    !list_is_empty(&dn->dn_dbufs));
325 	}
326 	if (db->db_blkid == DMU_BONUS_BLKID) {
327 		ASSERT(dn != NULL);
328 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
329 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
330 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
331 		ASSERT(dn != NULL);
332 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
333 		ASSERT0(db->db.db_offset);
334 	} else {
335 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
336 	}
337 
338 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
339 		ASSERT(dr->dr_dbuf == db);
340 
341 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
342 		ASSERT(dr->dr_dbuf == db);
343 
344 	/*
345 	 * We can't assert that db_size matches dn_datablksz because it
346 	 * can be momentarily different when another thread is doing
347 	 * dnode_set_blksz().
348 	 */
349 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
350 		dr = db->db_data_pending;
351 		/*
352 		 * It should only be modified in syncing context, so
353 		 * make sure we only have one copy of the data.
354 		 */
355 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
356 	}
357 
358 	/* verify db->db_blkptr */
359 	if (db->db_blkptr) {
360 		if (db->db_parent == dn->dn_dbuf) {
361 			/* db is pointed to by the dnode */
362 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
363 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
364 				ASSERT(db->db_parent == NULL);
365 			else
366 				ASSERT(db->db_parent != NULL);
367 			if (db->db_blkid != DMU_SPILL_BLKID)
368 				ASSERT3P(db->db_blkptr, ==,
369 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
370 		} else {
371 			/* db is pointed to by an indirect block */
372 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
373 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
374 			ASSERT3U(db->db_parent->db.db_object, ==,
375 			    db->db.db_object);
376 			/*
377 			 * dnode_grow_indblksz() can make this fail if we don't
378 			 * have the struct_rwlock.  XXX indblksz no longer
379 			 * grows.  safe to do this now?
380 			 */
381 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
382 				ASSERT3P(db->db_blkptr, ==,
383 				    ((blkptr_t *)db->db_parent->db.db_data +
384 				    db->db_blkid % epb));
385 			}
386 		}
387 	}
388 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
389 	    (db->db_buf == NULL || db->db_buf->b_data) &&
390 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
391 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
392 		/*
393 		 * If the blkptr isn't set but they have nonzero data,
394 		 * it had better be dirty, otherwise we'll lose that
395 		 * data when we evict this buffer.
396 		 */
397 		if (db->db_dirtycnt == 0) {
398 			uint64_t *buf = db->db.db_data;
399 			int i;
400 
401 			for (i = 0; i < db->db.db_size >> 3; i++) {
402 				ASSERT(buf[i] == 0);
403 			}
404 		}
405 	}
406 	DB_DNODE_EXIT(db);
407 }
408 #endif
409 
410 static void
411 dbuf_update_data(dmu_buf_impl_t *db)
412 {
413 	ASSERT(MUTEX_HELD(&db->db_mtx));
414 	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
415 		ASSERT(!refcount_is_zero(&db->db_holds));
416 		*db->db_user_data_ptr_ptr = db->db.db_data;
417 	}
418 }
419 
420 static void
421 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
422 {
423 	ASSERT(MUTEX_HELD(&db->db_mtx));
424 	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
425 	db->db_buf = buf;
426 	if (buf != NULL) {
427 		ASSERT(buf->b_data != NULL);
428 		db->db.db_data = buf->b_data;
429 		if (!arc_released(buf))
430 			arc_set_callback(buf, dbuf_do_evict, db);
431 		dbuf_update_data(db);
432 	} else {
433 		dbuf_evict_user(db);
434 		db->db.db_data = NULL;
435 		if (db->db_state != DB_NOFILL)
436 			db->db_state = DB_UNCACHED;
437 	}
438 }
439 
440 /*
441  * Loan out an arc_buf for read.  Return the loaned arc_buf.
442  */
443 arc_buf_t *
444 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
445 {
446 	arc_buf_t *abuf;
447 
448 	mutex_enter(&db->db_mtx);
449 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
450 		int blksz = db->db.db_size;
451 		spa_t *spa;
452 
453 		mutex_exit(&db->db_mtx);
454 		DB_GET_SPA(&spa, db);
455 		abuf = arc_loan_buf(spa, blksz);
456 		bcopy(db->db.db_data, abuf->b_data, blksz);
457 	} else {
458 		abuf = db->db_buf;
459 		arc_loan_inuse_buf(abuf, db);
460 		dbuf_set_data(db, NULL);
461 		mutex_exit(&db->db_mtx);
462 	}
463 	return (abuf);
464 }
465 
466 uint64_t
467 dbuf_whichblock(dnode_t *dn, uint64_t offset)
468 {
469 	if (dn->dn_datablkshift) {
470 		return (offset >> dn->dn_datablkshift);
471 	} else {
472 		ASSERT3U(offset, <, dn->dn_datablksz);
473 		return (0);
474 	}
475 }
476 
477 static void
478 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
479 {
480 	dmu_buf_impl_t *db = vdb;
481 
482 	mutex_enter(&db->db_mtx);
483 	ASSERT3U(db->db_state, ==, DB_READ);
484 	/*
485 	 * All reads are synchronous, so we must have a hold on the dbuf
486 	 */
487 	ASSERT(refcount_count(&db->db_holds) > 0);
488 	ASSERT(db->db_buf == NULL);
489 	ASSERT(db->db.db_data == NULL);
490 	if (db->db_level == 0 && db->db_freed_in_flight) {
491 		/* we were freed in flight; disregard any error */
492 		arc_release(buf, db);
493 		bzero(buf->b_data, db->db.db_size);
494 		arc_buf_freeze(buf);
495 		db->db_freed_in_flight = FALSE;
496 		dbuf_set_data(db, buf);
497 		db->db_state = DB_CACHED;
498 	} else if (zio == NULL || zio->io_error == 0) {
499 		dbuf_set_data(db, buf);
500 		db->db_state = DB_CACHED;
501 	} else {
502 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
503 		ASSERT3P(db->db_buf, ==, NULL);
504 		VERIFY(arc_buf_remove_ref(buf, db));
505 		db->db_state = DB_UNCACHED;
506 	}
507 	cv_broadcast(&db->db_changed);
508 	dbuf_rele_and_unlock(db, NULL);
509 }
510 
511 static void
512 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
513 {
514 	dnode_t *dn;
515 	spa_t *spa;
516 	zbookmark_t zb;
517 	uint32_t aflags = ARC_NOWAIT;
518 
519 	DB_DNODE_ENTER(db);
520 	dn = DB_DNODE(db);
521 	ASSERT(!refcount_is_zero(&db->db_holds));
522 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
523 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
524 	ASSERT(MUTEX_HELD(&db->db_mtx));
525 	ASSERT(db->db_state == DB_UNCACHED);
526 	ASSERT(db->db_buf == NULL);
527 
528 	if (db->db_blkid == DMU_BONUS_BLKID) {
529 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
530 
531 		ASSERT3U(bonuslen, <=, db->db.db_size);
532 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
533 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
534 		if (bonuslen < DN_MAX_BONUSLEN)
535 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
536 		if (bonuslen)
537 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
538 		DB_DNODE_EXIT(db);
539 		dbuf_update_data(db);
540 		db->db_state = DB_CACHED;
541 		mutex_exit(&db->db_mtx);
542 		return;
543 	}
544 
545 	/*
546 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
547 	 * processes the delete record and clears the bp while we are waiting
548 	 * for the dn_mtx (resulting in a "no" from block_freed).
549 	 */
550 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
551 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
552 	    BP_IS_HOLE(db->db_blkptr)))) {
553 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
554 
555 		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
556 		    db->db.db_size, db, type));
557 		DB_DNODE_EXIT(db);
558 		bzero(db->db.db_data, db->db.db_size);
559 		db->db_state = DB_CACHED;
560 		*flags |= DB_RF_CACHED;
561 		mutex_exit(&db->db_mtx);
562 		return;
563 	}
564 
565 	spa = dn->dn_objset->os_spa;
566 	DB_DNODE_EXIT(db);
567 
568 	db->db_state = DB_READ;
569 	mutex_exit(&db->db_mtx);
570 
571 	if (DBUF_IS_L2CACHEABLE(db))
572 		aflags |= ARC_L2CACHE;
573 	if (DBUF_IS_L2COMPRESSIBLE(db))
574 		aflags |= ARC_L2COMPRESS;
575 
576 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
577 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
578 	    db->db.db_object, db->db_level, db->db_blkid);
579 
580 	dbuf_add_ref(db, NULL);
581 
582 	(void) arc_read(zio, spa, db->db_blkptr,
583 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
584 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
585 	    &aflags, &zb);
586 	if (aflags & ARC_CACHED)
587 		*flags |= DB_RF_CACHED;
588 }
589 
590 int
591 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
592 {
593 	int err = 0;
594 	int havepzio = (zio != NULL);
595 	int prefetch;
596 	dnode_t *dn;
597 
598 	/*
599 	 * We don't have to hold the mutex to check db_state because it
600 	 * can't be freed while we have a hold on the buffer.
601 	 */
602 	ASSERT(!refcount_is_zero(&db->db_holds));
603 
604 	if (db->db_state == DB_NOFILL)
605 		return (SET_ERROR(EIO));
606 
607 	DB_DNODE_ENTER(db);
608 	dn = DB_DNODE(db);
609 	if ((flags & DB_RF_HAVESTRUCT) == 0)
610 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
611 
612 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
613 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
614 	    DBUF_IS_CACHEABLE(db);
615 
616 	mutex_enter(&db->db_mtx);
617 	if (db->db_state == DB_CACHED) {
618 		mutex_exit(&db->db_mtx);
619 		if (prefetch)
620 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
621 			    db->db.db_size, TRUE);
622 		if ((flags & DB_RF_HAVESTRUCT) == 0)
623 			rw_exit(&dn->dn_struct_rwlock);
624 		DB_DNODE_EXIT(db);
625 	} else if (db->db_state == DB_UNCACHED) {
626 		spa_t *spa = dn->dn_objset->os_spa;
627 
628 		if (zio == NULL)
629 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
630 		dbuf_read_impl(db, zio, &flags);
631 
632 		/* dbuf_read_impl has dropped db_mtx for us */
633 
634 		if (prefetch)
635 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
636 			    db->db.db_size, flags & DB_RF_CACHED);
637 
638 		if ((flags & DB_RF_HAVESTRUCT) == 0)
639 			rw_exit(&dn->dn_struct_rwlock);
640 		DB_DNODE_EXIT(db);
641 
642 		if (!havepzio)
643 			err = zio_wait(zio);
644 	} else {
645 		/*
646 		 * Another reader came in while the dbuf was in flight
647 		 * between UNCACHED and CACHED.  Either a writer will finish
648 		 * writing the buffer (sending the dbuf to CACHED) or the
649 		 * first reader's request will reach the read_done callback
650 		 * and send the dbuf to CACHED.  Otherwise, a failure
651 		 * occurred and the dbuf went to UNCACHED.
652 		 */
653 		mutex_exit(&db->db_mtx);
654 		if (prefetch)
655 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
656 			    db->db.db_size, TRUE);
657 		if ((flags & DB_RF_HAVESTRUCT) == 0)
658 			rw_exit(&dn->dn_struct_rwlock);
659 		DB_DNODE_EXIT(db);
660 
661 		/* Skip the wait per the caller's request. */
662 		mutex_enter(&db->db_mtx);
663 		if ((flags & DB_RF_NEVERWAIT) == 0) {
664 			while (db->db_state == DB_READ ||
665 			    db->db_state == DB_FILL) {
666 				ASSERT(db->db_state == DB_READ ||
667 				    (flags & DB_RF_HAVESTRUCT) == 0);
668 				cv_wait(&db->db_changed, &db->db_mtx);
669 			}
670 			if (db->db_state == DB_UNCACHED)
671 				err = SET_ERROR(EIO);
672 		}
673 		mutex_exit(&db->db_mtx);
674 	}
675 
676 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
677 	return (err);
678 }
679 
680 static void
681 dbuf_noread(dmu_buf_impl_t *db)
682 {
683 	ASSERT(!refcount_is_zero(&db->db_holds));
684 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
685 	mutex_enter(&db->db_mtx);
686 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
687 		cv_wait(&db->db_changed, &db->db_mtx);
688 	if (db->db_state == DB_UNCACHED) {
689 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
690 		spa_t *spa;
691 
692 		ASSERT(db->db_buf == NULL);
693 		ASSERT(db->db.db_data == NULL);
694 		DB_GET_SPA(&spa, db);
695 		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
696 		db->db_state = DB_FILL;
697 	} else if (db->db_state == DB_NOFILL) {
698 		dbuf_set_data(db, NULL);
699 	} else {
700 		ASSERT3U(db->db_state, ==, DB_CACHED);
701 	}
702 	mutex_exit(&db->db_mtx);
703 }
704 
705 /*
706  * This is our just-in-time copy function.  It makes a copy of
707  * buffers, that have been modified in a previous transaction
708  * group, before we modify them in the current active group.
709  *
710  * This function is used in two places: when we are dirtying a
711  * buffer for the first time in a txg, and when we are freeing
712  * a range in a dnode that includes this buffer.
713  *
714  * Note that when we are called from dbuf_free_range() we do
715  * not put a hold on the buffer, we just traverse the active
716  * dbuf list for the dnode.
717  */
718 static void
719 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
720 {
721 	dbuf_dirty_record_t *dr = db->db_last_dirty;
722 
723 	ASSERT(MUTEX_HELD(&db->db_mtx));
724 	ASSERT(db->db.db_data != NULL);
725 	ASSERT(db->db_level == 0);
726 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
727 
728 	if (dr == NULL ||
729 	    (dr->dt.dl.dr_data !=
730 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
731 		return;
732 
733 	/*
734 	 * If the last dirty record for this dbuf has not yet synced
735 	 * and its referencing the dbuf data, either:
736 	 *	reset the reference to point to a new copy,
737 	 * or (if there a no active holders)
738 	 *	just null out the current db_data pointer.
739 	 */
740 	ASSERT(dr->dr_txg >= txg - 2);
741 	if (db->db_blkid == DMU_BONUS_BLKID) {
742 		/* Note that the data bufs here are zio_bufs */
743 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
744 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
745 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
746 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
747 		int size = db->db.db_size;
748 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
749 		spa_t *spa;
750 
751 		DB_GET_SPA(&spa, db);
752 		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
753 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
754 	} else {
755 		dbuf_set_data(db, NULL);
756 	}
757 }
758 
759 void
760 dbuf_unoverride(dbuf_dirty_record_t *dr)
761 {
762 	dmu_buf_impl_t *db = dr->dr_dbuf;
763 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
764 	uint64_t txg = dr->dr_txg;
765 
766 	ASSERT(MUTEX_HELD(&db->db_mtx));
767 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
768 	ASSERT(db->db_level == 0);
769 
770 	if (db->db_blkid == DMU_BONUS_BLKID ||
771 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
772 		return;
773 
774 	ASSERT(db->db_data_pending != dr);
775 
776 	/* free this block */
777 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
778 		spa_t *spa;
779 
780 		DB_GET_SPA(&spa, db);
781 		zio_free(spa, txg, bp);
782 	}
783 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
784 	dr->dt.dl.dr_nopwrite = B_FALSE;
785 
786 	/*
787 	 * Release the already-written buffer, so we leave it in
788 	 * a consistent dirty state.  Note that all callers are
789 	 * modifying the buffer, so they will immediately do
790 	 * another (redundant) arc_release().  Therefore, leave
791 	 * the buf thawed to save the effort of freezing &
792 	 * immediately re-thawing it.
793 	 */
794 	arc_release(dr->dt.dl.dr_data, db);
795 }
796 
797 /*
798  * Evict (if its unreferenced) or clear (if its referenced) any level-0
799  * data blocks in the free range, so that any future readers will find
800  * empty blocks.  Also, if we happen across any level-1 dbufs in the
801  * range that have not already been marked dirty, mark them dirty so
802  * they stay in memory.
803  *
804  * This is a no-op if the dataset is in the middle of an incremental
805  * receive; see comment below for details.
806  */
807 void
808 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
809 {
810 	dmu_buf_impl_t *db, *db_next;
811 	uint64_t txg = tx->tx_txg;
812 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
813 	uint64_t first_l1 = start >> epbs;
814 	uint64_t last_l1 = end >> epbs;
815 
816 	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
817 		end = dn->dn_maxblkid;
818 		last_l1 = end >> epbs;
819 	}
820 	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
821 
822 	if (dmu_objset_is_receiving(dn->dn_objset)) {
823 		/*
824 		 * When processing a free record from a zfs receive,
825 		 * there should have been no previous modifications to the
826 		 * data in this range.  Therefore there should be no dbufs
827 		 * in the range.  Searching dn_dbufs for these non-existent
828 		 * dbufs can be very expensive, so simply ignore this.
829 		 */
830 		VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
831 		VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
832 		return;
833 	}
834 
835 	mutex_enter(&dn->dn_dbufs_mtx);
836 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
837 		db_next = list_next(&dn->dn_dbufs, db);
838 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
839 
840 		if (db->db_level == 1 &&
841 		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
842 			mutex_enter(&db->db_mtx);
843 			if (db->db_last_dirty &&
844 			    db->db_last_dirty->dr_txg < txg) {
845 				dbuf_add_ref(db, FTAG);
846 				mutex_exit(&db->db_mtx);
847 				dbuf_will_dirty(db, tx);
848 				dbuf_rele(db, FTAG);
849 			} else {
850 				mutex_exit(&db->db_mtx);
851 			}
852 		}
853 
854 		if (db->db_level != 0)
855 			continue;
856 		dprintf_dbuf(db, "found buf %s\n", "");
857 		if (db->db_blkid < start || db->db_blkid > end)
858 			continue;
859 
860 		/* found a level 0 buffer in the range */
861 		mutex_enter(&db->db_mtx);
862 		if (dbuf_undirty(db, tx)) {
863 			/* mutex has been dropped and dbuf destroyed */
864 			continue;
865 		}
866 
867 		if (db->db_state == DB_UNCACHED ||
868 		    db->db_state == DB_NOFILL ||
869 		    db->db_state == DB_EVICTING) {
870 			ASSERT(db->db.db_data == NULL);
871 			mutex_exit(&db->db_mtx);
872 			continue;
873 		}
874 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
875 			/* will be handled in dbuf_read_done or dbuf_rele */
876 			db->db_freed_in_flight = TRUE;
877 			mutex_exit(&db->db_mtx);
878 			continue;
879 		}
880 		if (refcount_count(&db->db_holds) == 0) {
881 			ASSERT(db->db_buf);
882 			dbuf_clear(db);
883 			continue;
884 		}
885 		/* The dbuf is referenced */
886 
887 		if (db->db_last_dirty != NULL) {
888 			dbuf_dirty_record_t *dr = db->db_last_dirty;
889 
890 			if (dr->dr_txg == txg) {
891 				/*
892 				 * This buffer is "in-use", re-adjust the file
893 				 * size to reflect that this buffer may
894 				 * contain new data when we sync.
895 				 */
896 				if (db->db_blkid != DMU_SPILL_BLKID &&
897 				    db->db_blkid > dn->dn_maxblkid)
898 					dn->dn_maxblkid = db->db_blkid;
899 				dbuf_unoverride(dr);
900 			} else {
901 				/*
902 				 * This dbuf is not dirty in the open context.
903 				 * Either uncache it (if its not referenced in
904 				 * the open context) or reset its contents to
905 				 * empty.
906 				 */
907 				dbuf_fix_old_data(db, txg);
908 			}
909 		}
910 		/* clear the contents if its cached */
911 		if (db->db_state == DB_CACHED) {
912 			ASSERT(db->db.db_data != NULL);
913 			arc_release(db->db_buf, db);
914 			bzero(db->db.db_data, db->db.db_size);
915 			arc_buf_freeze(db->db_buf);
916 		}
917 
918 		mutex_exit(&db->db_mtx);
919 	}
920 	mutex_exit(&dn->dn_dbufs_mtx);
921 }
922 
923 static int
924 dbuf_block_freeable(dmu_buf_impl_t *db)
925 {
926 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
927 	uint64_t birth_txg = 0;
928 
929 	/*
930 	 * We don't need any locking to protect db_blkptr:
931 	 * If it's syncing, then db_last_dirty will be set
932 	 * so we'll ignore db_blkptr.
933 	 */
934 	ASSERT(MUTEX_HELD(&db->db_mtx));
935 	if (db->db_last_dirty)
936 		birth_txg = db->db_last_dirty->dr_txg;
937 	else if (db->db_blkptr)
938 		birth_txg = db->db_blkptr->blk_birth;
939 
940 	/*
941 	 * If we don't exist or are in a snapshot, we can't be freed.
942 	 * Don't pass the bp to dsl_dataset_block_freeable() since we
943 	 * are holding the db_mtx lock and might deadlock if we are
944 	 * prefetching a dedup-ed block.
945 	 */
946 	if (birth_txg)
947 		return (ds == NULL ||
948 		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
949 	else
950 		return (FALSE);
951 }
952 
953 void
954 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
955 {
956 	arc_buf_t *buf, *obuf;
957 	int osize = db->db.db_size;
958 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
959 	dnode_t *dn;
960 
961 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
962 
963 	DB_DNODE_ENTER(db);
964 	dn = DB_DNODE(db);
965 
966 	/* XXX does *this* func really need the lock? */
967 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
968 
969 	/*
970 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
971 	 * is OK, because there can be no other references to the db
972 	 * when we are changing its size, so no concurrent DB_FILL can
973 	 * be happening.
974 	 */
975 	/*
976 	 * XXX we should be doing a dbuf_read, checking the return
977 	 * value and returning that up to our callers
978 	 */
979 	dbuf_will_dirty(db, tx);
980 
981 	/* create the data buffer for the new block */
982 	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
983 
984 	/* copy old block data to the new block */
985 	obuf = db->db_buf;
986 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
987 	/* zero the remainder */
988 	if (size > osize)
989 		bzero((uint8_t *)buf->b_data + osize, size - osize);
990 
991 	mutex_enter(&db->db_mtx);
992 	dbuf_set_data(db, buf);
993 	VERIFY(arc_buf_remove_ref(obuf, db));
994 	db->db.db_size = size;
995 
996 	if (db->db_level == 0) {
997 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
998 		db->db_last_dirty->dt.dl.dr_data = buf;
999 	}
1000 	mutex_exit(&db->db_mtx);
1001 
1002 	dnode_willuse_space(dn, size-osize, tx);
1003 	DB_DNODE_EXIT(db);
1004 }
1005 
1006 void
1007 dbuf_release_bp(dmu_buf_impl_t *db)
1008 {
1009 	objset_t *os;
1010 
1011 	DB_GET_OBJSET(&os, db);
1012 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1013 	ASSERT(arc_released(os->os_phys_buf) ||
1014 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
1015 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1016 
1017 	(void) arc_release(db->db_buf, db);
1018 }
1019 
1020 dbuf_dirty_record_t *
1021 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1022 {
1023 	dnode_t *dn;
1024 	objset_t *os;
1025 	dbuf_dirty_record_t **drp, *dr;
1026 	int drop_struct_lock = FALSE;
1027 	boolean_t do_free_accounting = B_FALSE;
1028 	int txgoff = tx->tx_txg & TXG_MASK;
1029 
1030 	ASSERT(tx->tx_txg != 0);
1031 	ASSERT(!refcount_is_zero(&db->db_holds));
1032 	DMU_TX_DIRTY_BUF(tx, db);
1033 
1034 	DB_DNODE_ENTER(db);
1035 	dn = DB_DNODE(db);
1036 	/*
1037 	 * Shouldn't dirty a regular buffer in syncing context.  Private
1038 	 * objects may be dirtied in syncing context, but only if they
1039 	 * were already pre-dirtied in open context.
1040 	 */
1041 	ASSERT(!dmu_tx_is_syncing(tx) ||
1042 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1043 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1044 	    dn->dn_objset->os_dsl_dataset == NULL);
1045 	/*
1046 	 * We make this assert for private objects as well, but after we
1047 	 * check if we're already dirty.  They are allowed to re-dirty
1048 	 * in syncing context.
1049 	 */
1050 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1051 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1052 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1053 
1054 	mutex_enter(&db->db_mtx);
1055 	/*
1056 	 * XXX make this true for indirects too?  The problem is that
1057 	 * transactions created with dmu_tx_create_assigned() from
1058 	 * syncing context don't bother holding ahead.
1059 	 */
1060 	ASSERT(db->db_level != 0 ||
1061 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1062 	    db->db_state == DB_NOFILL);
1063 
1064 	mutex_enter(&dn->dn_mtx);
1065 	/*
1066 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
1067 	 * initialize the objset.
1068 	 */
1069 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1070 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1071 		dn->dn_dirtyctx =
1072 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1073 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
1074 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1075 	}
1076 	mutex_exit(&dn->dn_mtx);
1077 
1078 	if (db->db_blkid == DMU_SPILL_BLKID)
1079 		dn->dn_have_spill = B_TRUE;
1080 
1081 	/*
1082 	 * If this buffer is already dirty, we're done.
1083 	 */
1084 	drp = &db->db_last_dirty;
1085 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1086 	    db->db.db_object == DMU_META_DNODE_OBJECT);
1087 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1088 		drp = &dr->dr_next;
1089 	if (dr && dr->dr_txg == tx->tx_txg) {
1090 		DB_DNODE_EXIT(db);
1091 
1092 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1093 			/*
1094 			 * If this buffer has already been written out,
1095 			 * we now need to reset its state.
1096 			 */
1097 			dbuf_unoverride(dr);
1098 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1099 			    db->db_state != DB_NOFILL)
1100 				arc_buf_thaw(db->db_buf);
1101 		}
1102 		mutex_exit(&db->db_mtx);
1103 		return (dr);
1104 	}
1105 
1106 	/*
1107 	 * Only valid if not already dirty.
1108 	 */
1109 	ASSERT(dn->dn_object == 0 ||
1110 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1111 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1112 
1113 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
1114 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1115 	    dn->dn_phys->dn_nlevels > db->db_level ||
1116 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
1117 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1118 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1119 
1120 	/*
1121 	 * We should only be dirtying in syncing context if it's the
1122 	 * mos or we're initializing the os or it's a special object.
1123 	 * However, we are allowed to dirty in syncing context provided
1124 	 * we already dirtied it in open context.  Hence we must make
1125 	 * this assertion only if we're not already dirty.
1126 	 */
1127 	os = dn->dn_objset;
1128 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1129 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1130 	ASSERT(db->db.db_size != 0);
1131 
1132 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1133 
1134 	if (db->db_blkid != DMU_BONUS_BLKID) {
1135 		/*
1136 		 * Update the accounting.
1137 		 * Note: we delay "free accounting" until after we drop
1138 		 * the db_mtx.  This keeps us from grabbing other locks
1139 		 * (and possibly deadlocking) in bp_get_dsize() while
1140 		 * also holding the db_mtx.
1141 		 */
1142 		dnode_willuse_space(dn, db->db.db_size, tx);
1143 		do_free_accounting = dbuf_block_freeable(db);
1144 	}
1145 
1146 	/*
1147 	 * If this buffer is dirty in an old transaction group we need
1148 	 * to make a copy of it so that the changes we make in this
1149 	 * transaction group won't leak out when we sync the older txg.
1150 	 */
1151 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1152 	if (db->db_level == 0) {
1153 		void *data_old = db->db_buf;
1154 
1155 		if (db->db_state != DB_NOFILL) {
1156 			if (db->db_blkid == DMU_BONUS_BLKID) {
1157 				dbuf_fix_old_data(db, tx->tx_txg);
1158 				data_old = db->db.db_data;
1159 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1160 				/*
1161 				 * Release the data buffer from the cache so
1162 				 * that we can modify it without impacting
1163 				 * possible other users of this cached data
1164 				 * block.  Note that indirect blocks and
1165 				 * private objects are not released until the
1166 				 * syncing state (since they are only modified
1167 				 * then).
1168 				 */
1169 				arc_release(db->db_buf, db);
1170 				dbuf_fix_old_data(db, tx->tx_txg);
1171 				data_old = db->db_buf;
1172 			}
1173 			ASSERT(data_old != NULL);
1174 		}
1175 		dr->dt.dl.dr_data = data_old;
1176 	} else {
1177 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1178 		list_create(&dr->dt.di.dr_children,
1179 		    sizeof (dbuf_dirty_record_t),
1180 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
1181 	}
1182 	dr->dr_dbuf = db;
1183 	dr->dr_txg = tx->tx_txg;
1184 	dr->dr_next = *drp;
1185 	*drp = dr;
1186 
1187 	/*
1188 	 * We could have been freed_in_flight between the dbuf_noread
1189 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
1190 	 * happened after the free.
1191 	 */
1192 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1193 	    db->db_blkid != DMU_SPILL_BLKID) {
1194 		mutex_enter(&dn->dn_mtx);
1195 		dnode_clear_range(dn, db->db_blkid, 1, tx);
1196 		mutex_exit(&dn->dn_mtx);
1197 		db->db_freed_in_flight = FALSE;
1198 	}
1199 
1200 	/*
1201 	 * This buffer is now part of this txg
1202 	 */
1203 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1204 	db->db_dirtycnt += 1;
1205 	ASSERT3U(db->db_dirtycnt, <=, 3);
1206 
1207 	mutex_exit(&db->db_mtx);
1208 
1209 	if (db->db_blkid == DMU_BONUS_BLKID ||
1210 	    db->db_blkid == DMU_SPILL_BLKID) {
1211 		mutex_enter(&dn->dn_mtx);
1212 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1213 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1214 		mutex_exit(&dn->dn_mtx);
1215 		dnode_setdirty(dn, tx);
1216 		DB_DNODE_EXIT(db);
1217 		return (dr);
1218 	} else if (do_free_accounting) {
1219 		blkptr_t *bp = db->db_blkptr;
1220 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1221 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1222 		/*
1223 		 * This is only a guess -- if the dbuf is dirty
1224 		 * in a previous txg, we don't know how much
1225 		 * space it will use on disk yet.  We should
1226 		 * really have the struct_rwlock to access
1227 		 * db_blkptr, but since this is just a guess,
1228 		 * it's OK if we get an odd answer.
1229 		 */
1230 		ddt_prefetch(os->os_spa, bp);
1231 		dnode_willuse_space(dn, -willfree, tx);
1232 	}
1233 
1234 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1235 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
1236 		drop_struct_lock = TRUE;
1237 	}
1238 
1239 	if (db->db_level == 0) {
1240 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1241 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
1242 	}
1243 
1244 	if (db->db_level+1 < dn->dn_nlevels) {
1245 		dmu_buf_impl_t *parent = db->db_parent;
1246 		dbuf_dirty_record_t *di;
1247 		int parent_held = FALSE;
1248 
1249 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1250 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1251 
1252 			parent = dbuf_hold_level(dn, db->db_level+1,
1253 			    db->db_blkid >> epbs, FTAG);
1254 			ASSERT(parent != NULL);
1255 			parent_held = TRUE;
1256 		}
1257 		if (drop_struct_lock)
1258 			rw_exit(&dn->dn_struct_rwlock);
1259 		ASSERT3U(db->db_level+1, ==, parent->db_level);
1260 		di = dbuf_dirty(parent, tx);
1261 		if (parent_held)
1262 			dbuf_rele(parent, FTAG);
1263 
1264 		mutex_enter(&db->db_mtx);
1265 		/*  possible race with dbuf_undirty() */
1266 		if (db->db_last_dirty == dr ||
1267 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
1268 			mutex_enter(&di->dt.di.dr_mtx);
1269 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1270 			ASSERT(!list_link_active(&dr->dr_dirty_node));
1271 			list_insert_tail(&di->dt.di.dr_children, dr);
1272 			mutex_exit(&di->dt.di.dr_mtx);
1273 			dr->dr_parent = di;
1274 		}
1275 		mutex_exit(&db->db_mtx);
1276 	} else {
1277 		ASSERT(db->db_level+1 == dn->dn_nlevels);
1278 		ASSERT(db->db_blkid < dn->dn_nblkptr);
1279 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1280 		mutex_enter(&dn->dn_mtx);
1281 		ASSERT(!list_link_active(&dr->dr_dirty_node));
1282 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1283 		mutex_exit(&dn->dn_mtx);
1284 		if (drop_struct_lock)
1285 			rw_exit(&dn->dn_struct_rwlock);
1286 	}
1287 
1288 	dnode_setdirty(dn, tx);
1289 	DB_DNODE_EXIT(db);
1290 	return (dr);
1291 }
1292 
1293 /*
1294  * Undirty a buffer in the transaction group referenced by the given
1295  * transaction.  Return whether this evicted the dbuf.
1296  */
1297 static boolean_t
1298 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1299 {
1300 	dnode_t *dn;
1301 	uint64_t txg = tx->tx_txg;
1302 	dbuf_dirty_record_t *dr, **drp;
1303 
1304 	ASSERT(txg != 0);
1305 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1306 	ASSERT0(db->db_level);
1307 	ASSERT(MUTEX_HELD(&db->db_mtx));
1308 
1309 	/*
1310 	 * If this buffer is not dirty, we're done.
1311 	 */
1312 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1313 		if (dr->dr_txg <= txg)
1314 			break;
1315 	if (dr == NULL || dr->dr_txg < txg)
1316 		return (B_FALSE);
1317 	ASSERT(dr->dr_txg == txg);
1318 	ASSERT(dr->dr_dbuf == db);
1319 
1320 	DB_DNODE_ENTER(db);
1321 	dn = DB_DNODE(db);
1322 
1323 	/*
1324 	 * Note:  This code will probably work even if there are concurrent
1325 	 * holders, but it is untested in that scenerio, as the ZPL and
1326 	 * ztest have additional locking (the range locks) that prevents
1327 	 * that type of concurrent access.
1328 	 */
1329 	ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1330 
1331 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1332 
1333 	ASSERT(db->db.db_size != 0);
1334 
1335 	/* XXX would be nice to fix up dn_towrite_space[] */
1336 
1337 	*drp = dr->dr_next;
1338 
1339 	/*
1340 	 * Note that there are three places in dbuf_dirty()
1341 	 * where this dirty record may be put on a list.
1342 	 * Make sure to do a list_remove corresponding to
1343 	 * every one of those list_insert calls.
1344 	 */
1345 	if (dr->dr_parent) {
1346 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1347 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1348 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1349 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
1350 	    db->db_level+1 == dn->dn_nlevels) {
1351 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1352 		mutex_enter(&dn->dn_mtx);
1353 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1354 		mutex_exit(&dn->dn_mtx);
1355 	}
1356 	DB_DNODE_EXIT(db);
1357 
1358 	if (db->db_state != DB_NOFILL) {
1359 		dbuf_unoverride(dr);
1360 
1361 		ASSERT(db->db_buf != NULL);
1362 		ASSERT(dr->dt.dl.dr_data != NULL);
1363 		if (dr->dt.dl.dr_data != db->db_buf)
1364 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1365 	}
1366 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
1367 
1368 	ASSERT(db->db_dirtycnt > 0);
1369 	db->db_dirtycnt -= 1;
1370 
1371 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1372 		arc_buf_t *buf = db->db_buf;
1373 
1374 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1375 		dbuf_set_data(db, NULL);
1376 		VERIFY(arc_buf_remove_ref(buf, db));
1377 		dbuf_evict(db);
1378 		return (B_TRUE);
1379 	}
1380 
1381 	return (B_FALSE);
1382 }
1383 
1384 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1385 void
1386 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1387 {
1388 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1389 
1390 	ASSERT(tx->tx_txg != 0);
1391 	ASSERT(!refcount_is_zero(&db->db_holds));
1392 
1393 	DB_DNODE_ENTER(db);
1394 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1395 		rf |= DB_RF_HAVESTRUCT;
1396 	DB_DNODE_EXIT(db);
1397 	(void) dbuf_read(db, NULL, rf);
1398 	(void) dbuf_dirty(db, tx);
1399 }
1400 
1401 void
1402 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1403 {
1404 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1405 
1406 	db->db_state = DB_NOFILL;
1407 
1408 	dmu_buf_will_fill(db_fake, tx);
1409 }
1410 
1411 void
1412 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1413 {
1414 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1415 
1416 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1417 	ASSERT(tx->tx_txg != 0);
1418 	ASSERT(db->db_level == 0);
1419 	ASSERT(!refcount_is_zero(&db->db_holds));
1420 
1421 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1422 	    dmu_tx_private_ok(tx));
1423 
1424 	dbuf_noread(db);
1425 	(void) dbuf_dirty(db, tx);
1426 }
1427 
1428 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1429 /* ARGSUSED */
1430 void
1431 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1432 {
1433 	mutex_enter(&db->db_mtx);
1434 	DBUF_VERIFY(db);
1435 
1436 	if (db->db_state == DB_FILL) {
1437 		if (db->db_level == 0 && db->db_freed_in_flight) {
1438 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1439 			/* we were freed while filling */
1440 			/* XXX dbuf_undirty? */
1441 			bzero(db->db.db_data, db->db.db_size);
1442 			db->db_freed_in_flight = FALSE;
1443 		}
1444 		db->db_state = DB_CACHED;
1445 		cv_broadcast(&db->db_changed);
1446 	}
1447 	mutex_exit(&db->db_mtx);
1448 }
1449 
1450 /*
1451  * Directly assign a provided arc buf to a given dbuf if it's not referenced
1452  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1453  */
1454 void
1455 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1456 {
1457 	ASSERT(!refcount_is_zero(&db->db_holds));
1458 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1459 	ASSERT(db->db_level == 0);
1460 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1461 	ASSERT(buf != NULL);
1462 	ASSERT(arc_buf_size(buf) == db->db.db_size);
1463 	ASSERT(tx->tx_txg != 0);
1464 
1465 	arc_return_buf(buf, db);
1466 	ASSERT(arc_released(buf));
1467 
1468 	mutex_enter(&db->db_mtx);
1469 
1470 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
1471 		cv_wait(&db->db_changed, &db->db_mtx);
1472 
1473 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1474 
1475 	if (db->db_state == DB_CACHED &&
1476 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1477 		mutex_exit(&db->db_mtx);
1478 		(void) dbuf_dirty(db, tx);
1479 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1480 		VERIFY(arc_buf_remove_ref(buf, db));
1481 		xuio_stat_wbuf_copied();
1482 		return;
1483 	}
1484 
1485 	xuio_stat_wbuf_nocopy();
1486 	if (db->db_state == DB_CACHED) {
1487 		dbuf_dirty_record_t *dr = db->db_last_dirty;
1488 
1489 		ASSERT(db->db_buf != NULL);
1490 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1491 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
1492 			if (!arc_released(db->db_buf)) {
1493 				ASSERT(dr->dt.dl.dr_override_state ==
1494 				    DR_OVERRIDDEN);
1495 				arc_release(db->db_buf, db);
1496 			}
1497 			dr->dt.dl.dr_data = buf;
1498 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1499 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1500 			arc_release(db->db_buf, db);
1501 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
1502 		}
1503 		db->db_buf = NULL;
1504 	}
1505 	ASSERT(db->db_buf == NULL);
1506 	dbuf_set_data(db, buf);
1507 	db->db_state = DB_FILL;
1508 	mutex_exit(&db->db_mtx);
1509 	(void) dbuf_dirty(db, tx);
1510 	dbuf_fill_done(db, tx);
1511 }
1512 
1513 /*
1514  * "Clear" the contents of this dbuf.  This will mark the dbuf
1515  * EVICTING and clear *most* of its references.  Unfortunetely,
1516  * when we are not holding the dn_dbufs_mtx, we can't clear the
1517  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
1518  * in this case.  For callers from the DMU we will usually see:
1519  *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1520  * For the arc callback, we will usually see:
1521  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1522  * Sometimes, though, we will get a mix of these two:
1523  *	DMU: dbuf_clear()->arc_buf_evict()
1524  *	ARC: dbuf_do_evict()->dbuf_destroy()
1525  */
1526 void
1527 dbuf_clear(dmu_buf_impl_t *db)
1528 {
1529 	dnode_t *dn;
1530 	dmu_buf_impl_t *parent = db->db_parent;
1531 	dmu_buf_impl_t *dndb;
1532 	int dbuf_gone = FALSE;
1533 
1534 	ASSERT(MUTEX_HELD(&db->db_mtx));
1535 	ASSERT(refcount_is_zero(&db->db_holds));
1536 
1537 	dbuf_evict_user(db);
1538 
1539 	if (db->db_state == DB_CACHED) {
1540 		ASSERT(db->db.db_data != NULL);
1541 		if (db->db_blkid == DMU_BONUS_BLKID) {
1542 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1543 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1544 		}
1545 		db->db.db_data = NULL;
1546 		db->db_state = DB_UNCACHED;
1547 	}
1548 
1549 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1550 	ASSERT(db->db_data_pending == NULL);
1551 
1552 	db->db_state = DB_EVICTING;
1553 	db->db_blkptr = NULL;
1554 
1555 	DB_DNODE_ENTER(db);
1556 	dn = DB_DNODE(db);
1557 	dndb = dn->dn_dbuf;
1558 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1559 		list_remove(&dn->dn_dbufs, db);
1560 		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1561 		membar_producer();
1562 		DB_DNODE_EXIT(db);
1563 		/*
1564 		 * Decrementing the dbuf count means that the hold corresponding
1565 		 * to the removed dbuf is no longer discounted in dnode_move(),
1566 		 * so the dnode cannot be moved until after we release the hold.
1567 		 * The membar_producer() ensures visibility of the decremented
1568 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1569 		 * release any lock.
1570 		 */
1571 		dnode_rele(dn, db);
1572 		db->db_dnode_handle = NULL;
1573 	} else {
1574 		DB_DNODE_EXIT(db);
1575 	}
1576 
1577 	if (db->db_buf)
1578 		dbuf_gone = arc_buf_evict(db->db_buf);
1579 
1580 	if (!dbuf_gone)
1581 		mutex_exit(&db->db_mtx);
1582 
1583 	/*
1584 	 * If this dbuf is referenced from an indirect dbuf,
1585 	 * decrement the ref count on the indirect dbuf.
1586 	 */
1587 	if (parent && parent != dndb)
1588 		dbuf_rele(parent, db);
1589 }
1590 
1591 static int
1592 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1593     dmu_buf_impl_t **parentp, blkptr_t **bpp)
1594 {
1595 	int nlevels, epbs;
1596 
1597 	*parentp = NULL;
1598 	*bpp = NULL;
1599 
1600 	ASSERT(blkid != DMU_BONUS_BLKID);
1601 
1602 	if (blkid == DMU_SPILL_BLKID) {
1603 		mutex_enter(&dn->dn_mtx);
1604 		if (dn->dn_have_spill &&
1605 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1606 			*bpp = &dn->dn_phys->dn_spill;
1607 		else
1608 			*bpp = NULL;
1609 		dbuf_add_ref(dn->dn_dbuf, NULL);
1610 		*parentp = dn->dn_dbuf;
1611 		mutex_exit(&dn->dn_mtx);
1612 		return (0);
1613 	}
1614 
1615 	if (dn->dn_phys->dn_nlevels == 0)
1616 		nlevels = 1;
1617 	else
1618 		nlevels = dn->dn_phys->dn_nlevels;
1619 
1620 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1621 
1622 	ASSERT3U(level * epbs, <, 64);
1623 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1624 	if (level >= nlevels ||
1625 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1626 		/* the buffer has no parent yet */
1627 		return (SET_ERROR(ENOENT));
1628 	} else if (level < nlevels-1) {
1629 		/* this block is referenced from an indirect block */
1630 		int err = dbuf_hold_impl(dn, level+1,
1631 		    blkid >> epbs, fail_sparse, NULL, parentp);
1632 		if (err)
1633 			return (err);
1634 		err = dbuf_read(*parentp, NULL,
1635 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1636 		if (err) {
1637 			dbuf_rele(*parentp, NULL);
1638 			*parentp = NULL;
1639 			return (err);
1640 		}
1641 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1642 		    (blkid & ((1ULL << epbs) - 1));
1643 		return (0);
1644 	} else {
1645 		/* the block is referenced from the dnode */
1646 		ASSERT3U(level, ==, nlevels-1);
1647 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1648 		    blkid < dn->dn_phys->dn_nblkptr);
1649 		if (dn->dn_dbuf) {
1650 			dbuf_add_ref(dn->dn_dbuf, NULL);
1651 			*parentp = dn->dn_dbuf;
1652 		}
1653 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
1654 		return (0);
1655 	}
1656 }
1657 
1658 static dmu_buf_impl_t *
1659 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1660     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1661 {
1662 	objset_t *os = dn->dn_objset;
1663 	dmu_buf_impl_t *db, *odb;
1664 
1665 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1666 	ASSERT(dn->dn_type != DMU_OT_NONE);
1667 
1668 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1669 
1670 	db->db_objset = os;
1671 	db->db.db_object = dn->dn_object;
1672 	db->db_level = level;
1673 	db->db_blkid = blkid;
1674 	db->db_last_dirty = NULL;
1675 	db->db_dirtycnt = 0;
1676 	db->db_dnode_handle = dn->dn_handle;
1677 	db->db_parent = parent;
1678 	db->db_blkptr = blkptr;
1679 
1680 	db->db_user_ptr = NULL;
1681 	db->db_user_data_ptr_ptr = NULL;
1682 	db->db_evict_func = NULL;
1683 	db->db_immediate_evict = 0;
1684 	db->db_freed_in_flight = 0;
1685 
1686 	if (blkid == DMU_BONUS_BLKID) {
1687 		ASSERT3P(parent, ==, dn->dn_dbuf);
1688 		db->db.db_size = DN_MAX_BONUSLEN -
1689 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1690 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1691 		db->db.db_offset = DMU_BONUS_BLKID;
1692 		db->db_state = DB_UNCACHED;
1693 		/* the bonus dbuf is not placed in the hash table */
1694 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1695 		return (db);
1696 	} else if (blkid == DMU_SPILL_BLKID) {
1697 		db->db.db_size = (blkptr != NULL) ?
1698 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1699 		db->db.db_offset = 0;
1700 	} else {
1701 		int blocksize =
1702 		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
1703 		db->db.db_size = blocksize;
1704 		db->db.db_offset = db->db_blkid * blocksize;
1705 	}
1706 
1707 	/*
1708 	 * Hold the dn_dbufs_mtx while we get the new dbuf
1709 	 * in the hash table *and* added to the dbufs list.
1710 	 * This prevents a possible deadlock with someone
1711 	 * trying to look up this dbuf before its added to the
1712 	 * dn_dbufs list.
1713 	 */
1714 	mutex_enter(&dn->dn_dbufs_mtx);
1715 	db->db_state = DB_EVICTING;
1716 	if ((odb = dbuf_hash_insert(db)) != NULL) {
1717 		/* someone else inserted it first */
1718 		kmem_cache_free(dbuf_cache, db);
1719 		mutex_exit(&dn->dn_dbufs_mtx);
1720 		return (odb);
1721 	}
1722 	list_insert_head(&dn->dn_dbufs, db);
1723 	db->db_state = DB_UNCACHED;
1724 	mutex_exit(&dn->dn_dbufs_mtx);
1725 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1726 
1727 	if (parent && parent != dn->dn_dbuf)
1728 		dbuf_add_ref(parent, db);
1729 
1730 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1731 	    refcount_count(&dn->dn_holds) > 0);
1732 	(void) refcount_add(&dn->dn_holds, db);
1733 	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1734 
1735 	dprintf_dbuf(db, "db=%p\n", db);
1736 
1737 	return (db);
1738 }
1739 
1740 static int
1741 dbuf_do_evict(void *private)
1742 {
1743 	arc_buf_t *buf = private;
1744 	dmu_buf_impl_t *db = buf->b_private;
1745 
1746 	if (!MUTEX_HELD(&db->db_mtx))
1747 		mutex_enter(&db->db_mtx);
1748 
1749 	ASSERT(refcount_is_zero(&db->db_holds));
1750 
1751 	if (db->db_state != DB_EVICTING) {
1752 		ASSERT(db->db_state == DB_CACHED);
1753 		DBUF_VERIFY(db);
1754 		db->db_buf = NULL;
1755 		dbuf_evict(db);
1756 	} else {
1757 		mutex_exit(&db->db_mtx);
1758 		dbuf_destroy(db);
1759 	}
1760 	return (0);
1761 }
1762 
1763 static void
1764 dbuf_destroy(dmu_buf_impl_t *db)
1765 {
1766 	ASSERT(refcount_is_zero(&db->db_holds));
1767 
1768 	if (db->db_blkid != DMU_BONUS_BLKID) {
1769 		/*
1770 		 * If this dbuf is still on the dn_dbufs list,
1771 		 * remove it from that list.
1772 		 */
1773 		if (db->db_dnode_handle != NULL) {
1774 			dnode_t *dn;
1775 
1776 			DB_DNODE_ENTER(db);
1777 			dn = DB_DNODE(db);
1778 			mutex_enter(&dn->dn_dbufs_mtx);
1779 			list_remove(&dn->dn_dbufs, db);
1780 			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1781 			mutex_exit(&dn->dn_dbufs_mtx);
1782 			DB_DNODE_EXIT(db);
1783 			/*
1784 			 * Decrementing the dbuf count means that the hold
1785 			 * corresponding to the removed dbuf is no longer
1786 			 * discounted in dnode_move(), so the dnode cannot be
1787 			 * moved until after we release the hold.
1788 			 */
1789 			dnode_rele(dn, db);
1790 			db->db_dnode_handle = NULL;
1791 		}
1792 		dbuf_hash_remove(db);
1793 	}
1794 	db->db_parent = NULL;
1795 	db->db_buf = NULL;
1796 
1797 	ASSERT(!list_link_active(&db->db_link));
1798 	ASSERT(db->db.db_data == NULL);
1799 	ASSERT(db->db_hash_next == NULL);
1800 	ASSERT(db->db_blkptr == NULL);
1801 	ASSERT(db->db_data_pending == NULL);
1802 
1803 	kmem_cache_free(dbuf_cache, db);
1804 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1805 }
1806 
1807 void
1808 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1809 {
1810 	dmu_buf_impl_t *db = NULL;
1811 	blkptr_t *bp = NULL;
1812 
1813 	ASSERT(blkid != DMU_BONUS_BLKID);
1814 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1815 
1816 	if (dnode_block_freed(dn, blkid))
1817 		return;
1818 
1819 	/* dbuf_find() returns with db_mtx held */
1820 	if (db = dbuf_find(dn, 0, blkid)) {
1821 		/*
1822 		 * This dbuf is already in the cache.  We assume that
1823 		 * it is already CACHED, or else about to be either
1824 		 * read or filled.
1825 		 */
1826 		mutex_exit(&db->db_mtx);
1827 		return;
1828 	}
1829 
1830 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1831 		if (bp && !BP_IS_HOLE(bp)) {
1832 			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1833 			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1834 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1835 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1836 			zbookmark_t zb;
1837 
1838 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1839 			    dn->dn_object, 0, blkid);
1840 
1841 			(void) arc_read(NULL, dn->dn_objset->os_spa,
1842 			    bp, NULL, NULL, priority,
1843 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1844 			    &aflags, &zb);
1845 		}
1846 		if (db)
1847 			dbuf_rele(db, NULL);
1848 	}
1849 }
1850 
1851 /*
1852  * Returns with db_holds incremented, and db_mtx not held.
1853  * Note: dn_struct_rwlock must be held.
1854  */
1855 int
1856 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1857     void *tag, dmu_buf_impl_t **dbp)
1858 {
1859 	dmu_buf_impl_t *db, *parent = NULL;
1860 
1861 	ASSERT(blkid != DMU_BONUS_BLKID);
1862 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1863 	ASSERT3U(dn->dn_nlevels, >, level);
1864 
1865 	*dbp = NULL;
1866 top:
1867 	/* dbuf_find() returns with db_mtx held */
1868 	db = dbuf_find(dn, level, blkid);
1869 
1870 	if (db == NULL) {
1871 		blkptr_t *bp = NULL;
1872 		int err;
1873 
1874 		ASSERT3P(parent, ==, NULL);
1875 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1876 		if (fail_sparse) {
1877 			if (err == 0 && bp && BP_IS_HOLE(bp))
1878 				err = SET_ERROR(ENOENT);
1879 			if (err) {
1880 				if (parent)
1881 					dbuf_rele(parent, NULL);
1882 				return (err);
1883 			}
1884 		}
1885 		if (err && err != ENOENT)
1886 			return (err);
1887 		db = dbuf_create(dn, level, blkid, parent, bp);
1888 	}
1889 
1890 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1891 		arc_buf_add_ref(db->db_buf, db);
1892 		if (db->db_buf->b_data == NULL) {
1893 			dbuf_clear(db);
1894 			if (parent) {
1895 				dbuf_rele(parent, NULL);
1896 				parent = NULL;
1897 			}
1898 			goto top;
1899 		}
1900 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1901 	}
1902 
1903 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1904 
1905 	/*
1906 	 * If this buffer is currently syncing out, and we are are
1907 	 * still referencing it from db_data, we need to make a copy
1908 	 * of it in case we decide we want to dirty it again in this txg.
1909 	 */
1910 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1911 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
1912 	    db->db_state == DB_CACHED && db->db_data_pending) {
1913 		dbuf_dirty_record_t *dr = db->db_data_pending;
1914 
1915 		if (dr->dt.dl.dr_data == db->db_buf) {
1916 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1917 
1918 			dbuf_set_data(db,
1919 			    arc_buf_alloc(dn->dn_objset->os_spa,
1920 			    db->db.db_size, db, type));
1921 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1922 			    db->db.db_size);
1923 		}
1924 	}
1925 
1926 	(void) refcount_add(&db->db_holds, tag);
1927 	dbuf_update_data(db);
1928 	DBUF_VERIFY(db);
1929 	mutex_exit(&db->db_mtx);
1930 
1931 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
1932 	if (parent)
1933 		dbuf_rele(parent, NULL);
1934 
1935 	ASSERT3P(DB_DNODE(db), ==, dn);
1936 	ASSERT3U(db->db_blkid, ==, blkid);
1937 	ASSERT3U(db->db_level, ==, level);
1938 	*dbp = db;
1939 
1940 	return (0);
1941 }
1942 
1943 dmu_buf_impl_t *
1944 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1945 {
1946 	dmu_buf_impl_t *db;
1947 	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1948 	return (err ? NULL : db);
1949 }
1950 
1951 dmu_buf_impl_t *
1952 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1953 {
1954 	dmu_buf_impl_t *db;
1955 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1956 	return (err ? NULL : db);
1957 }
1958 
1959 void
1960 dbuf_create_bonus(dnode_t *dn)
1961 {
1962 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1963 
1964 	ASSERT(dn->dn_bonus == NULL);
1965 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1966 }
1967 
1968 int
1969 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1970 {
1971 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1972 	dnode_t *dn;
1973 
1974 	if (db->db_blkid != DMU_SPILL_BLKID)
1975 		return (SET_ERROR(ENOTSUP));
1976 	if (blksz == 0)
1977 		blksz = SPA_MINBLOCKSIZE;
1978 	if (blksz > SPA_MAXBLOCKSIZE)
1979 		blksz = SPA_MAXBLOCKSIZE;
1980 	else
1981 		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1982 
1983 	DB_DNODE_ENTER(db);
1984 	dn = DB_DNODE(db);
1985 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1986 	dbuf_new_size(db, blksz, tx);
1987 	rw_exit(&dn->dn_struct_rwlock);
1988 	DB_DNODE_EXIT(db);
1989 
1990 	return (0);
1991 }
1992 
1993 void
1994 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1995 {
1996 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
1997 }
1998 
1999 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2000 void
2001 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2002 {
2003 	int64_t holds = refcount_add(&db->db_holds, tag);
2004 	ASSERT(holds > 1);
2005 }
2006 
2007 /*
2008  * If you call dbuf_rele() you had better not be referencing the dnode handle
2009  * unless you have some other direct or indirect hold on the dnode. (An indirect
2010  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2011  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2012  * dnode's parent dbuf evicting its dnode handles.
2013  */
2014 #pragma weak dmu_buf_rele = dbuf_rele
2015 void
2016 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2017 {
2018 	mutex_enter(&db->db_mtx);
2019 	dbuf_rele_and_unlock(db, tag);
2020 }
2021 
2022 /*
2023  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
2024  * db_dirtycnt and db_holds to be updated atomically.
2025  */
2026 void
2027 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2028 {
2029 	int64_t holds;
2030 
2031 	ASSERT(MUTEX_HELD(&db->db_mtx));
2032 	DBUF_VERIFY(db);
2033 
2034 	/*
2035 	 * Remove the reference to the dbuf before removing its hold on the
2036 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2037 	 * buffer has a corresponding dnode hold.
2038 	 */
2039 	holds = refcount_remove(&db->db_holds, tag);
2040 	ASSERT(holds >= 0);
2041 
2042 	/*
2043 	 * We can't freeze indirects if there is a possibility that they
2044 	 * may be modified in the current syncing context.
2045 	 */
2046 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2047 		arc_buf_freeze(db->db_buf);
2048 
2049 	if (holds == db->db_dirtycnt &&
2050 	    db->db_level == 0 && db->db_immediate_evict)
2051 		dbuf_evict_user(db);
2052 
2053 	if (holds == 0) {
2054 		if (db->db_blkid == DMU_BONUS_BLKID) {
2055 			mutex_exit(&db->db_mtx);
2056 
2057 			/*
2058 			 * If the dnode moves here, we cannot cross this barrier
2059 			 * until the move completes.
2060 			 */
2061 			DB_DNODE_ENTER(db);
2062 			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2063 			DB_DNODE_EXIT(db);
2064 			/*
2065 			 * The bonus buffer's dnode hold is no longer discounted
2066 			 * in dnode_move(). The dnode cannot move until after
2067 			 * the dnode_rele().
2068 			 */
2069 			dnode_rele(DB_DNODE(db), db);
2070 		} else if (db->db_buf == NULL) {
2071 			/*
2072 			 * This is a special case: we never associated this
2073 			 * dbuf with any data allocated from the ARC.
2074 			 */
2075 			ASSERT(db->db_state == DB_UNCACHED ||
2076 			    db->db_state == DB_NOFILL);
2077 			dbuf_evict(db);
2078 		} else if (arc_released(db->db_buf)) {
2079 			arc_buf_t *buf = db->db_buf;
2080 			/*
2081 			 * This dbuf has anonymous data associated with it.
2082 			 */
2083 			dbuf_set_data(db, NULL);
2084 			VERIFY(arc_buf_remove_ref(buf, db));
2085 			dbuf_evict(db);
2086 		} else {
2087 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2088 
2089 			/*
2090 			 * A dbuf will be eligible for eviction if either the
2091 			 * 'primarycache' property is set or a duplicate
2092 			 * copy of this buffer is already cached in the arc.
2093 			 *
2094 			 * In the case of the 'primarycache' a buffer
2095 			 * is considered for eviction if it matches the
2096 			 * criteria set in the property.
2097 			 *
2098 			 * To decide if our buffer is considered a
2099 			 * duplicate, we must call into the arc to determine
2100 			 * if multiple buffers are referencing the same
2101 			 * block on-disk. If so, then we simply evict
2102 			 * ourselves.
2103 			 */
2104 			if (!DBUF_IS_CACHEABLE(db) ||
2105 			    arc_buf_eviction_needed(db->db_buf))
2106 				dbuf_clear(db);
2107 			else
2108 				mutex_exit(&db->db_mtx);
2109 		}
2110 	} else {
2111 		mutex_exit(&db->db_mtx);
2112 	}
2113 }
2114 
2115 #pragma weak dmu_buf_refcount = dbuf_refcount
2116 uint64_t
2117 dbuf_refcount(dmu_buf_impl_t *db)
2118 {
2119 	return (refcount_count(&db->db_holds));
2120 }
2121 
2122 void *
2123 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2124     dmu_buf_evict_func_t *evict_func)
2125 {
2126 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2127 	    user_data_ptr_ptr, evict_func));
2128 }
2129 
2130 void *
2131 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2132     dmu_buf_evict_func_t *evict_func)
2133 {
2134 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2135 
2136 	db->db_immediate_evict = TRUE;
2137 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2138 	    user_data_ptr_ptr, evict_func));
2139 }
2140 
2141 void *
2142 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2143     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2144 {
2145 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2146 	ASSERT(db->db_level == 0);
2147 
2148 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2149 
2150 	mutex_enter(&db->db_mtx);
2151 
2152 	if (db->db_user_ptr == old_user_ptr) {
2153 		db->db_user_ptr = user_ptr;
2154 		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2155 		db->db_evict_func = evict_func;
2156 
2157 		dbuf_update_data(db);
2158 	} else {
2159 		old_user_ptr = db->db_user_ptr;
2160 	}
2161 
2162 	mutex_exit(&db->db_mtx);
2163 	return (old_user_ptr);
2164 }
2165 
2166 void *
2167 dmu_buf_get_user(dmu_buf_t *db_fake)
2168 {
2169 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2170 	ASSERT(!refcount_is_zero(&db->db_holds));
2171 
2172 	return (db->db_user_ptr);
2173 }
2174 
2175 boolean_t
2176 dmu_buf_freeable(dmu_buf_t *dbuf)
2177 {
2178 	boolean_t res = B_FALSE;
2179 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2180 
2181 	if (db->db_blkptr)
2182 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2183 		    db->db_blkptr, db->db_blkptr->blk_birth);
2184 
2185 	return (res);
2186 }
2187 
2188 blkptr_t *
2189 dmu_buf_get_blkptr(dmu_buf_t *db)
2190 {
2191 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2192 	return (dbi->db_blkptr);
2193 }
2194 
2195 static void
2196 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2197 {
2198 	/* ASSERT(dmu_tx_is_syncing(tx) */
2199 	ASSERT(MUTEX_HELD(&db->db_mtx));
2200 
2201 	if (db->db_blkptr != NULL)
2202 		return;
2203 
2204 	if (db->db_blkid == DMU_SPILL_BLKID) {
2205 		db->db_blkptr = &dn->dn_phys->dn_spill;
2206 		BP_ZERO(db->db_blkptr);
2207 		return;
2208 	}
2209 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2210 		/*
2211 		 * This buffer was allocated at a time when there was
2212 		 * no available blkptrs from the dnode, or it was
2213 		 * inappropriate to hook it in (i.e., nlevels mis-match).
2214 		 */
2215 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2216 		ASSERT(db->db_parent == NULL);
2217 		db->db_parent = dn->dn_dbuf;
2218 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2219 		DBUF_VERIFY(db);
2220 	} else {
2221 		dmu_buf_impl_t *parent = db->db_parent;
2222 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2223 
2224 		ASSERT(dn->dn_phys->dn_nlevels > 1);
2225 		if (parent == NULL) {
2226 			mutex_exit(&db->db_mtx);
2227 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2228 			(void) dbuf_hold_impl(dn, db->db_level+1,
2229 			    db->db_blkid >> epbs, FALSE, db, &parent);
2230 			rw_exit(&dn->dn_struct_rwlock);
2231 			mutex_enter(&db->db_mtx);
2232 			db->db_parent = parent;
2233 		}
2234 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2235 		    (db->db_blkid & ((1ULL << epbs) - 1));
2236 		DBUF_VERIFY(db);
2237 	}
2238 }
2239 
2240 static void
2241 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2242 {
2243 	dmu_buf_impl_t *db = dr->dr_dbuf;
2244 	dnode_t *dn;
2245 	zio_t *zio;
2246 
2247 	ASSERT(dmu_tx_is_syncing(tx));
2248 
2249 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2250 
2251 	mutex_enter(&db->db_mtx);
2252 
2253 	ASSERT(db->db_level > 0);
2254 	DBUF_VERIFY(db);
2255 
2256 	/* Read the block if it hasn't been read yet. */
2257 	if (db->db_buf == NULL) {
2258 		mutex_exit(&db->db_mtx);
2259 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2260 		mutex_enter(&db->db_mtx);
2261 	}
2262 	ASSERT3U(db->db_state, ==, DB_CACHED);
2263 	ASSERT(db->db_buf != NULL);
2264 
2265 	DB_DNODE_ENTER(db);
2266 	dn = DB_DNODE(db);
2267 	/* Indirect block size must match what the dnode thinks it is. */
2268 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2269 	dbuf_check_blkptr(dn, db);
2270 	DB_DNODE_EXIT(db);
2271 
2272 	/* Provide the pending dirty record to child dbufs */
2273 	db->db_data_pending = dr;
2274 
2275 	mutex_exit(&db->db_mtx);
2276 	dbuf_write(dr, db->db_buf, tx);
2277 
2278 	zio = dr->dr_zio;
2279 	mutex_enter(&dr->dt.di.dr_mtx);
2280 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
2281 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2282 	mutex_exit(&dr->dt.di.dr_mtx);
2283 	zio_nowait(zio);
2284 }
2285 
2286 static void
2287 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2288 {
2289 	arc_buf_t **datap = &dr->dt.dl.dr_data;
2290 	dmu_buf_impl_t *db = dr->dr_dbuf;
2291 	dnode_t *dn;
2292 	objset_t *os;
2293 	uint64_t txg = tx->tx_txg;
2294 
2295 	ASSERT(dmu_tx_is_syncing(tx));
2296 
2297 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2298 
2299 	mutex_enter(&db->db_mtx);
2300 	/*
2301 	 * To be synced, we must be dirtied.  But we
2302 	 * might have been freed after the dirty.
2303 	 */
2304 	if (db->db_state == DB_UNCACHED) {
2305 		/* This buffer has been freed since it was dirtied */
2306 		ASSERT(db->db.db_data == NULL);
2307 	} else if (db->db_state == DB_FILL) {
2308 		/* This buffer was freed and is now being re-filled */
2309 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2310 	} else {
2311 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2312 	}
2313 	DBUF_VERIFY(db);
2314 
2315 	DB_DNODE_ENTER(db);
2316 	dn = DB_DNODE(db);
2317 
2318 	if (db->db_blkid == DMU_SPILL_BLKID) {
2319 		mutex_enter(&dn->dn_mtx);
2320 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2321 		mutex_exit(&dn->dn_mtx);
2322 	}
2323 
2324 	/*
2325 	 * If this is a bonus buffer, simply copy the bonus data into the
2326 	 * dnode.  It will be written out when the dnode is synced (and it
2327 	 * will be synced, since it must have been dirty for dbuf_sync to
2328 	 * be called).
2329 	 */
2330 	if (db->db_blkid == DMU_BONUS_BLKID) {
2331 		dbuf_dirty_record_t **drp;
2332 
2333 		ASSERT(*datap != NULL);
2334 		ASSERT0(db->db_level);
2335 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2336 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2337 		DB_DNODE_EXIT(db);
2338 
2339 		if (*datap != db->db.db_data) {
2340 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2341 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2342 		}
2343 		db->db_data_pending = NULL;
2344 		drp = &db->db_last_dirty;
2345 		while (*drp != dr)
2346 			drp = &(*drp)->dr_next;
2347 		ASSERT(dr->dr_next == NULL);
2348 		ASSERT(dr->dr_dbuf == db);
2349 		*drp = dr->dr_next;
2350 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2351 		ASSERT(db->db_dirtycnt > 0);
2352 		db->db_dirtycnt -= 1;
2353 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2354 		return;
2355 	}
2356 
2357 	os = dn->dn_objset;
2358 
2359 	/*
2360 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
2361 	 * operation to sneak in. As a result, we need to ensure that we
2362 	 * don't check the dr_override_state until we have returned from
2363 	 * dbuf_check_blkptr.
2364 	 */
2365 	dbuf_check_blkptr(dn, db);
2366 
2367 	/*
2368 	 * If this buffer is in the middle of an immediate write,
2369 	 * wait for the synchronous IO to complete.
2370 	 */
2371 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2372 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2373 		cv_wait(&db->db_changed, &db->db_mtx);
2374 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2375 	}
2376 
2377 	if (db->db_state != DB_NOFILL &&
2378 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2379 	    refcount_count(&db->db_holds) > 1 &&
2380 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2381 	    *datap == db->db_buf) {
2382 		/*
2383 		 * If this buffer is currently "in use" (i.e., there
2384 		 * are active holds and db_data still references it),
2385 		 * then make a copy before we start the write so that
2386 		 * any modifications from the open txg will not leak
2387 		 * into this write.
2388 		 *
2389 		 * NOTE: this copy does not need to be made for
2390 		 * objects only modified in the syncing context (e.g.
2391 		 * DNONE_DNODE blocks).
2392 		 */
2393 		int blksz = arc_buf_size(*datap);
2394 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2395 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2396 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2397 	}
2398 	db->db_data_pending = dr;
2399 
2400 	mutex_exit(&db->db_mtx);
2401 
2402 	dbuf_write(dr, *datap, tx);
2403 
2404 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2405 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2406 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2407 		DB_DNODE_EXIT(db);
2408 	} else {
2409 		/*
2410 		 * Although zio_nowait() does not "wait for an IO", it does
2411 		 * initiate the IO. If this is an empty write it seems plausible
2412 		 * that the IO could actually be completed before the nowait
2413 		 * returns. We need to DB_DNODE_EXIT() first in case
2414 		 * zio_nowait() invalidates the dbuf.
2415 		 */
2416 		DB_DNODE_EXIT(db);
2417 		zio_nowait(dr->dr_zio);
2418 	}
2419 }
2420 
2421 void
2422 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2423 {
2424 	dbuf_dirty_record_t *dr;
2425 
2426 	while (dr = list_head(list)) {
2427 		if (dr->dr_zio != NULL) {
2428 			/*
2429 			 * If we find an already initialized zio then we
2430 			 * are processing the meta-dnode, and we have finished.
2431 			 * The dbufs for all dnodes are put back on the list
2432 			 * during processing, so that we can zio_wait()
2433 			 * these IOs after initiating all child IOs.
2434 			 */
2435 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2436 			    DMU_META_DNODE_OBJECT);
2437 			break;
2438 		}
2439 		list_remove(list, dr);
2440 		if (dr->dr_dbuf->db_level > 0)
2441 			dbuf_sync_indirect(dr, tx);
2442 		else
2443 			dbuf_sync_leaf(dr, tx);
2444 	}
2445 }
2446 
2447 /* ARGSUSED */
2448 static void
2449 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2450 {
2451 	dmu_buf_impl_t *db = vdb;
2452 	dnode_t *dn;
2453 	blkptr_t *bp = zio->io_bp;
2454 	blkptr_t *bp_orig = &zio->io_bp_orig;
2455 	spa_t *spa = zio->io_spa;
2456 	int64_t delta;
2457 	uint64_t fill = 0;
2458 	int i;
2459 
2460 	ASSERT(db->db_blkptr == bp);
2461 
2462 	DB_DNODE_ENTER(db);
2463 	dn = DB_DNODE(db);
2464 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2465 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2466 	zio->io_prev_space_delta = delta;
2467 
2468 	if (BP_IS_HOLE(bp)) {
2469 		ASSERT(bp->blk_fill == 0);
2470 		DB_DNODE_EXIT(db);
2471 		return;
2472 	}
2473 
2474 	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2475 	    BP_GET_TYPE(bp) == dn->dn_type) ||
2476 	    (db->db_blkid == DMU_SPILL_BLKID &&
2477 	    BP_GET_TYPE(bp) == dn->dn_bonustype));
2478 	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2479 
2480 	mutex_enter(&db->db_mtx);
2481 
2482 #ifdef ZFS_DEBUG
2483 	if (db->db_blkid == DMU_SPILL_BLKID) {
2484 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2485 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2486 		    db->db_blkptr == &dn->dn_phys->dn_spill);
2487 	}
2488 #endif
2489 
2490 	if (db->db_level == 0) {
2491 		mutex_enter(&dn->dn_mtx);
2492 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2493 		    db->db_blkid != DMU_SPILL_BLKID)
2494 			dn->dn_phys->dn_maxblkid = db->db_blkid;
2495 		mutex_exit(&dn->dn_mtx);
2496 
2497 		if (dn->dn_type == DMU_OT_DNODE) {
2498 			dnode_phys_t *dnp = db->db.db_data;
2499 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2500 			    i--, dnp++) {
2501 				if (dnp->dn_type != DMU_OT_NONE)
2502 					fill++;
2503 			}
2504 		} else {
2505 			fill = 1;
2506 		}
2507 	} else {
2508 		blkptr_t *ibp = db->db.db_data;
2509 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2510 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2511 			if (BP_IS_HOLE(ibp))
2512 				continue;
2513 			fill += ibp->blk_fill;
2514 		}
2515 	}
2516 	DB_DNODE_EXIT(db);
2517 
2518 	bp->blk_fill = fill;
2519 
2520 	mutex_exit(&db->db_mtx);
2521 }
2522 
2523 /* ARGSUSED */
2524 static void
2525 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2526 {
2527 	dmu_buf_impl_t *db = vdb;
2528 	blkptr_t *bp = zio->io_bp;
2529 	blkptr_t *bp_orig = &zio->io_bp_orig;
2530 	uint64_t txg = zio->io_txg;
2531 	dbuf_dirty_record_t **drp, *dr;
2532 
2533 	ASSERT0(zio->io_error);
2534 	ASSERT(db->db_blkptr == bp);
2535 
2536 	/*
2537 	 * For nopwrites and rewrites we ensure that the bp matches our
2538 	 * original and bypass all the accounting.
2539 	 */
2540 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2541 		ASSERT(BP_EQUAL(bp, bp_orig));
2542 	} else {
2543 		objset_t *os;
2544 		dsl_dataset_t *ds;
2545 		dmu_tx_t *tx;
2546 
2547 		DB_GET_OBJSET(&os, db);
2548 		ds = os->os_dsl_dataset;
2549 		tx = os->os_synctx;
2550 
2551 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2552 		dsl_dataset_block_born(ds, bp, tx);
2553 	}
2554 
2555 	mutex_enter(&db->db_mtx);
2556 
2557 	DBUF_VERIFY(db);
2558 
2559 	drp = &db->db_last_dirty;
2560 	while ((dr = *drp) != db->db_data_pending)
2561 		drp = &dr->dr_next;
2562 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2563 	ASSERT(dr->dr_txg == txg);
2564 	ASSERT(dr->dr_dbuf == db);
2565 	ASSERT(dr->dr_next == NULL);
2566 	*drp = dr->dr_next;
2567 
2568 #ifdef ZFS_DEBUG
2569 	if (db->db_blkid == DMU_SPILL_BLKID) {
2570 		dnode_t *dn;
2571 
2572 		DB_DNODE_ENTER(db);
2573 		dn = DB_DNODE(db);
2574 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2575 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2576 		    db->db_blkptr == &dn->dn_phys->dn_spill);
2577 		DB_DNODE_EXIT(db);
2578 	}
2579 #endif
2580 
2581 	if (db->db_level == 0) {
2582 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2583 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2584 		if (db->db_state != DB_NOFILL) {
2585 			if (dr->dt.dl.dr_data != db->db_buf)
2586 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2587 				    db));
2588 			else if (!arc_released(db->db_buf))
2589 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
2590 		}
2591 	} else {
2592 		dnode_t *dn;
2593 
2594 		DB_DNODE_ENTER(db);
2595 		dn = DB_DNODE(db);
2596 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2597 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2598 		if (!BP_IS_HOLE(db->db_blkptr)) {
2599 			int epbs =
2600 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2601 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2602 			    db->db.db_size);
2603 			ASSERT3U(dn->dn_phys->dn_maxblkid
2604 			    >> (db->db_level * epbs), >=, db->db_blkid);
2605 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
2606 		}
2607 		DB_DNODE_EXIT(db);
2608 		mutex_destroy(&dr->dt.di.dr_mtx);
2609 		list_destroy(&dr->dt.di.dr_children);
2610 	}
2611 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
2612 
2613 	cv_broadcast(&db->db_changed);
2614 	ASSERT(db->db_dirtycnt > 0);
2615 	db->db_dirtycnt -= 1;
2616 	db->db_data_pending = NULL;
2617 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2618 }
2619 
2620 static void
2621 dbuf_write_nofill_ready(zio_t *zio)
2622 {
2623 	dbuf_write_ready(zio, NULL, zio->io_private);
2624 }
2625 
2626 static void
2627 dbuf_write_nofill_done(zio_t *zio)
2628 {
2629 	dbuf_write_done(zio, NULL, zio->io_private);
2630 }
2631 
2632 static void
2633 dbuf_write_override_ready(zio_t *zio)
2634 {
2635 	dbuf_dirty_record_t *dr = zio->io_private;
2636 	dmu_buf_impl_t *db = dr->dr_dbuf;
2637 
2638 	dbuf_write_ready(zio, NULL, db);
2639 }
2640 
2641 static void
2642 dbuf_write_override_done(zio_t *zio)
2643 {
2644 	dbuf_dirty_record_t *dr = zio->io_private;
2645 	dmu_buf_impl_t *db = dr->dr_dbuf;
2646 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2647 
2648 	mutex_enter(&db->db_mtx);
2649 	if (!BP_EQUAL(zio->io_bp, obp)) {
2650 		if (!BP_IS_HOLE(obp))
2651 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2652 		arc_release(dr->dt.dl.dr_data, db);
2653 	}
2654 	mutex_exit(&db->db_mtx);
2655 
2656 	dbuf_write_done(zio, NULL, db);
2657 }
2658 
2659 /* Issue I/O to commit a dirty buffer to disk. */
2660 static void
2661 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2662 {
2663 	dmu_buf_impl_t *db = dr->dr_dbuf;
2664 	dnode_t *dn;
2665 	objset_t *os;
2666 	dmu_buf_impl_t *parent = db->db_parent;
2667 	uint64_t txg = tx->tx_txg;
2668 	zbookmark_t zb;
2669 	zio_prop_t zp;
2670 	zio_t *zio;
2671 	int wp_flag = 0;
2672 
2673 	DB_DNODE_ENTER(db);
2674 	dn = DB_DNODE(db);
2675 	os = dn->dn_objset;
2676 
2677 	if (db->db_state != DB_NOFILL) {
2678 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2679 			/*
2680 			 * Private object buffers are released here rather
2681 			 * than in dbuf_dirty() since they are only modified
2682 			 * in the syncing context and we don't want the
2683 			 * overhead of making multiple copies of the data.
2684 			 */
2685 			if (BP_IS_HOLE(db->db_blkptr)) {
2686 				arc_buf_thaw(data);
2687 			} else {
2688 				dbuf_release_bp(db);
2689 			}
2690 		}
2691 	}
2692 
2693 	if (parent != dn->dn_dbuf) {
2694 		/* Our parent is an indirect block. */
2695 		/* We have a dirty parent that has been scheduled for write. */
2696 		ASSERT(parent && parent->db_data_pending);
2697 		/* Our parent's buffer is one level closer to the dnode. */
2698 		ASSERT(db->db_level == parent->db_level-1);
2699 		/*
2700 		 * We're about to modify our parent's db_data by modifying
2701 		 * our block pointer, so the parent must be released.
2702 		 */
2703 		ASSERT(arc_released(parent->db_buf));
2704 		zio = parent->db_data_pending->dr_zio;
2705 	} else {
2706 		/* Our parent is the dnode itself. */
2707 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2708 		    db->db_blkid != DMU_SPILL_BLKID) ||
2709 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2710 		if (db->db_blkid != DMU_SPILL_BLKID)
2711 			ASSERT3P(db->db_blkptr, ==,
2712 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
2713 		zio = dn->dn_zio;
2714 	}
2715 
2716 	ASSERT(db->db_level == 0 || data == db->db_buf);
2717 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2718 	ASSERT(zio);
2719 
2720 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2721 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2722 	    db->db.db_object, db->db_level, db->db_blkid);
2723 
2724 	if (db->db_blkid == DMU_SPILL_BLKID)
2725 		wp_flag = WP_SPILL;
2726 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2727 
2728 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2729 	DB_DNODE_EXIT(db);
2730 
2731 	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2732 		ASSERT(db->db_state != DB_NOFILL);
2733 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2734 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2735 		    dbuf_write_override_ready, dbuf_write_override_done, dr,
2736 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2737 		mutex_enter(&db->db_mtx);
2738 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2739 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2740 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2741 		mutex_exit(&db->db_mtx);
2742 	} else if (db->db_state == DB_NOFILL) {
2743 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2744 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
2745 		    db->db_blkptr, NULL, db->db.db_size, &zp,
2746 		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2747 		    ZIO_PRIORITY_ASYNC_WRITE,
2748 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2749 	} else {
2750 		ASSERT(arc_released(data));
2751 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
2752 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2753 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2754 		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
2755 		    ZIO_FLAG_MUSTSUCCEED, &zb);
2756 	}
2757 }
2758