1fa9e4066Sahrens /*
2fa9e4066Sahrens * CDDL HEADER START
3fa9e4066Sahrens *
4fa9e4066Sahrens * The contents of this file are subject to the terms of the
5f65e61c0Sahrens * Common Development and Distribution License (the "License").
6f65e61c0Sahrens * You may not use this file except in compliance with the License.
7fa9e4066Sahrens *
8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens * See the License for the specific language governing permissions
11fa9e4066Sahrens * and limitations under the License.
12fa9e4066Sahrens *
13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens *
19fa9e4066Sahrens * CDDL HEADER END
20fa9e4066Sahrens */
21fa9e4066Sahrens /*
2206e0070dSMark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
233f2366c2SGordon Ross * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2446e1baa6SMatthew Ahrens * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25aad02571SSaso Kiselkov * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26810e43b2SBill Pijewski * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27bc9014e6SJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
29fa9e4066Sahrens */
30fa9e4066Sahrens
31fa9e4066Sahrens #include <sys/zfs_context.h>
32fa9e4066Sahrens #include <sys/dmu.h>
332f3d8780SMatthew Ahrens #include <sys/dmu_send.h>
34fa9e4066Sahrens #include <sys/dmu_impl.h>
35fa9e4066Sahrens #include <sys/dbuf.h>
36fa9e4066Sahrens #include <sys/dmu_objset.h>
37fa9e4066Sahrens #include <sys/dsl_dataset.h>
38fa9e4066Sahrens #include <sys/dsl_dir.h>
39fa9e4066Sahrens #include <sys/dmu_tx.h>
40fa9e4066Sahrens #include <sys/spa.h>
41fa9e4066Sahrens #include <sys/zio.h>
42fa9e4066Sahrens #include <sys/dmu_zfetch.h>
430a586ceaSMark Shellenbaum #include <sys/sa.h>
440a586ceaSMark Shellenbaum #include <sys/sa_impl.h>
455d7b4d43SMatthew Ahrens #include <sys/zfeature.h>
465d7b4d43SMatthew Ahrens #include <sys/blkptr.h>
47bf16b11eSMatthew Ahrens #include <sys/range_tree.h>
48fa9e4066Sahrens
49713d6c20SMatthew Ahrens /*
50713d6c20SMatthew Ahrens * Number of times that zfs_free_range() took the slow path while doing
51713d6c20SMatthew Ahrens * a zfs receive. A nonzero value indicates a potential performance problem.
52713d6c20SMatthew Ahrens */
53713d6c20SMatthew Ahrens uint64_t zfs_free_range_recv_miss;
54713d6c20SMatthew Ahrens
55fa9e4066Sahrens static void dbuf_destroy(dmu_buf_impl_t *db);
563b2aab18SMatthew Ahrens static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
57088f3894Sahrens static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
58fa9e4066Sahrens
59bc9014e6SJustin Gibbs #ifndef __lint
60bc9014e6SJustin Gibbs extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
61*e6546372SJosef 'Jeff' Sipek dmu_buf_evict_func_t *evict_func_sync,
62*e6546372SJosef 'Jeff' Sipek dmu_buf_evict_func_t *evict_func_async,
63*e6546372SJosef 'Jeff' Sipek dmu_buf_t **clear_on_evict_dbufp);
64bc9014e6SJustin Gibbs #endif /* ! __lint */
65bc9014e6SJustin Gibbs
66fa9e4066Sahrens /*
67fa9e4066Sahrens * Global data structures and functions for the dbuf cache.
68fa9e4066Sahrens */
69fa9e4066Sahrens static kmem_cache_t *dbuf_cache;
70bc9014e6SJustin Gibbs static taskq_t *dbu_evict_taskq;
71fa9e4066Sahrens
72fa9e4066Sahrens /* ARGSUSED */
73fa9e4066Sahrens static int
dbuf_cons(void * vdb,void * unused,int kmflag)74fa9e4066Sahrens dbuf_cons(void *vdb, void *unused, int kmflag)
75fa9e4066Sahrens {
76fa9e4066Sahrens dmu_buf_impl_t *db = vdb;
77fa9e4066Sahrens bzero(db, sizeof (dmu_buf_impl_t));
78fa9e4066Sahrens
79fa9e4066Sahrens mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
80fa9e4066Sahrens cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
81fa9e4066Sahrens refcount_create(&db->db_holds);
820f6d88adSAlex Reece
83fa9e4066Sahrens return (0);
84fa9e4066Sahrens }
85fa9e4066Sahrens
86fa9e4066Sahrens /* ARGSUSED */
87fa9e4066Sahrens static void
dbuf_dest(void * vdb,void * unused)88fa9e4066Sahrens dbuf_dest(void *vdb, void *unused)
89fa9e4066Sahrens {
90fa9e4066Sahrens dmu_buf_impl_t *db = vdb;
91fa9e4066Sahrens mutex_destroy(&db->db_mtx);
92fa9e4066Sahrens cv_destroy(&db->db_changed);
93fa9e4066Sahrens refcount_destroy(&db->db_holds);
94fa9e4066Sahrens }
95fa9e4066Sahrens
96fa9e4066Sahrens /*
97fa9e4066Sahrens * dbuf hash table routines
98fa9e4066Sahrens */
99fa9e4066Sahrens static dbuf_hash_table_t dbuf_hash_table;
100fa9e4066Sahrens
101fa9e4066Sahrens static uint64_t dbuf_hash_count;
102fa9e4066Sahrens
103fa9e4066Sahrens static uint64_t
dbuf_hash(void * os,uint64_t obj,uint8_t lvl,uint64_t blkid)104fa9e4066Sahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
105fa9e4066Sahrens {
106fa9e4066Sahrens uintptr_t osv = (uintptr_t)os;
107fa9e4066Sahrens uint64_t crc = -1ULL;
108fa9e4066Sahrens
109fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
110fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
111fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
112fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
113fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
114fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
115fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
116fa9e4066Sahrens
117fa9e4066Sahrens crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
118fa9e4066Sahrens
119fa9e4066Sahrens return (crc);
120fa9e4066Sahrens }
121fa9e4066Sahrens
122fa9e4066Sahrens #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
123fa9e4066Sahrens
124fa9e4066Sahrens #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
125fa9e4066Sahrens ((dbuf)->db.db_object == (obj) && \
126fa9e4066Sahrens (dbuf)->db_objset == (os) && \
127fa9e4066Sahrens (dbuf)->db_level == (level) && \
128fa9e4066Sahrens (dbuf)->db_blkid == (blkid))
129fa9e4066Sahrens
/*
 * Look up a dbuf in the global hash table by identity.  On a hit the
 * dbuf is returned with its db_mtx HELD (caller must drop it); dbufs in
 * the DB_EVICTING state are treated as absent.  Returns NULL if no
 * matching, non-evicting dbuf exists.
 */
dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	/* Walk the bucket's singly-linked collision chain. */
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				/* Return with db_mtx held. */
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}
152fa9e4066Sahrens
/*
 * Return the bonus dbuf of the given object, or NULL if the object
 * cannot be held or has no bonus buffer.  On success the bonus dbuf is
 * returned with its db_mtx HELD; the caller is responsible for dropping
 * it.  The dnode hold is released before returning -- the dn_bonus
 * pointer is read under dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}
170e57a022bSJustin T. Gibbs
171fa9e4066Sahrens /*
172fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element
173fa9e4066Sahrens * equal to elem in the hash table, then the already existing element
174fa9e4066Sahrens * will be returned and the new element will not be inserted.
175fa9e4066Sahrens * Otherwise returns NULL.
176fa9e4066Sahrens */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	/*
	 * First scan the bucket for an existing, non-evicting dbuf with
	 * the same identity.  If found it is returned with its db_mtx
	 * held and db is NOT inserted.
	 */
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	/*
	 * No collision: link db at the head of the bucket chain.  db_mtx
	 * is acquired before publishing so db cannot be used until the
	 * caller releases it.
	 */
	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
}
209fa9e4066Sahrens
210fa9e4066Sahrens /*
211bbfa8ea8SMatthew Ahrens * Remove an entry from the hash table. It must be in the EVICTING state.
212fa9e4066Sahrens */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	/*
	 * Walk the bucket chain via pointer-to-pointer so unlinking is a
	 * single store; db must be present (asserted via dbf != NULL).
	 */
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_dec_64(&dbuf_hash_count);
}
241fa9e4066Sahrens
242ea8dc4b6Seschrock static arc_evict_func_t dbuf_do_evict;
243fa9e4066Sahrens
244bc9014e6SJustin Gibbs typedef enum {
245bc9014e6SJustin Gibbs DBVU_EVICTING,
246bc9014e6SJustin Gibbs DBVU_NOT_EVICTING
247bc9014e6SJustin Gibbs } dbvu_verify_type_t;
248bc9014e6SJustin Gibbs
/*
 * Debug-only sanity checks on a dbuf's attached user data.  A no-op
 * when no user is attached or in non-DEBUG builds.  verify_type selects
 * the hold-count invariant appropriate to eviction vs. normal paths.
 */
static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data().  However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_user_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}
285bc9014e6SJustin Gibbs
/*
 * Detach and invoke the eviction callback(s) of the dbuf's attached
 * user, if any.  Called with db_mtx held.  Clears db->db_user before
 * running the callbacks so the user cannot be evicted twice.
 */
static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	/* Debug aid: null out the client's back-pointer to this dbuf. */
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	/*
	 * There are two eviction callbacks - one that we call synchronously
	 * and one that we invoke via a taskq.  The async one is useful for
	 * avoiding lock order reversals and limiting stack depth.
	 *
	 * Note that if we have a sync callback but no async callback,
	 * it's likely that the sync callback will free the structure
	 * containing the dbu.  In that case we need to take care to not
	 * dereference dbu after calling the sync evict func.
	 */
	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);

	if (dbu->dbu_evict_func_sync != NULL)
		dbu->dbu_evict_func_sync(dbu);

	if (has_async) {
		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
		    dbu, 0, &dbu->dbu_tqent);
	}
}
324fa9e4066Sahrens
325744947dcSTom Erickson boolean_t
dbuf_is_metadata(dmu_buf_impl_t * db)326744947dcSTom Erickson dbuf_is_metadata(dmu_buf_impl_t *db)
327744947dcSTom Erickson {
328744947dcSTom Erickson if (db->db_level > 0) {
329744947dcSTom Erickson return (B_TRUE);
330744947dcSTom Erickson } else {
331744947dcSTom Erickson boolean_t is_metadata;
332744947dcSTom Erickson
333744947dcSTom Erickson DB_DNODE_ENTER(db);
334ad135b5dSChristopher Siden is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
335744947dcSTom Erickson DB_DNODE_EXIT(db);
336744947dcSTom Erickson
337744947dcSTom Erickson return (is_metadata);
338744947dcSTom Erickson }
339744947dcSTom Erickson }
340744947dcSTom Erickson
/*
 * Evict and free a dbuf that has no ARC buffer and no pending write.
 * Called with db_mtx held; dbuf_clear() detaches it before
 * dbuf_destroy() frees it.
 */
void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}
351ea8dc4b6Seschrock
/*
 * One-time initialization of the dbuf subsystem: sizes and allocates
 * the global dbuf hash table, creates the dmu_buf_impl_t kmem cache,
 * initializes the per-bucket-group mutexes, and creates the user-data
 * eviction taskq.
 */
void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	/* KM_NOSLEEP: on failure, halve the table size and retry. */
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
}
390fa9e4066Sahrens
391fa9e4066Sahrens void
dbuf_fini(void)392fa9e4066Sahrens dbuf_fini(void)
393fa9e4066Sahrens {
394fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table;
395fa9e4066Sahrens int i;
396fa9e4066Sahrens
397fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++)
398fa9e4066Sahrens mutex_destroy(&h->hash_mutexes[i]);
399fa9e4066Sahrens kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
400fa9e4066Sahrens kmem_cache_destroy(dbuf_cache);
401bc9014e6SJustin Gibbs taskq_destroy(dbu_evict_taskq);
402fa9e4066Sahrens }
403fa9e4066Sahrens
404fa9e4066Sahrens /*
405fa9e4066Sahrens * Other stuff.
406fa9e4066Sahrens */
407fa9e4066Sahrens
4089c9dc39aSek110237 #ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of a dbuf against its dnode, parent, and
 * dirty records.  Called with db_mtx held; gated on the
 * ZFS_DEBUG_DBUF_VERIFY flag.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		/* Dnode-less dbufs must be completely detached. */
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	/* Size/offset invariants differ for bonus, spill, and data blocks. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record on either list must point back at this dbuf. */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
5169c9dc39aSek110237 #endif
517fa9e4066Sahrens
518fa9e4066Sahrens static void
dbuf_clear_data(dmu_buf_impl_t * db)519bc9014e6SJustin Gibbs dbuf_clear_data(dmu_buf_impl_t *db)
520fa9e4066Sahrens {
521fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx));
522ea8dc4b6Seschrock dbuf_evict_user(db);
523bc9014e6SJustin Gibbs db->db_buf = NULL;
524ea8dc4b6Seschrock db->db.db_data = NULL;
52582c9918fSTim Haley if (db->db_state != DB_NOFILL)
526ea8dc4b6Seschrock db->db_state = DB_UNCACHED;
527ea8dc4b6Seschrock }
528bc9014e6SJustin Gibbs
/*
 * Attach an ARC buffer to the dbuf as its current data.  Called with
 * db_mtx held; buf must be non-NULL with valid b_data.  If the buffer
 * has not been released from ARC, register dbuf_do_evict as its
 * eviction callback.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
	if (!arc_released(buf))
		arc_set_callback(buf, dbuf_do_evict, db);
}
541fa9e4066Sahrens
542c242f9a0Schunli zhang - Sun Microsystems - Irvine United States /*
543c242f9a0Schunli zhang - Sun Microsystems - Irvine United States * Loan out an arc_buf for read. Return the loaned arc_buf.
544c242f9a0Schunli zhang - Sun Microsystems - Irvine United States */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		/*
		 * The buffer is shared (released or multiply held), so
		 * loan a fresh ARC buffer containing a copy.  db_mtx is
		 * dropped before the copy; the buffer may change
		 * underneath us, which is tolerated for a loan.
		 */
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		/*
		 * Sole holder: loan out the dbuf's own buffer and detach
		 * it from the dbuf (which becomes DB_UNCACHED).
		 */
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}
566c242f9a0Schunli zhang - Sun Microsystems - Irvine United States
567a2cdcdd2SPaul Dagnelie /*
568a2cdcdd2SPaul Dagnelie * Calculate which level n block references the data at the level 0 offset
569a2cdcdd2SPaul Dagnelie * provided.
570a2cdcdd2SPaul Dagnelie */
571fa9e4066Sahrens uint64_t
dbuf_whichblock(dnode_t * dn,int64_t level,uint64_t offset)572a2cdcdd2SPaul Dagnelie dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
573fa9e4066Sahrens {
574a2cdcdd2SPaul Dagnelie if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
575a2cdcdd2SPaul Dagnelie /*
576a2cdcdd2SPaul Dagnelie * The level n blkid is equal to the level 0 blkid divided by
577a2cdcdd2SPaul Dagnelie * the number of level 0s in a level n block.
578a2cdcdd2SPaul Dagnelie *
579a2cdcdd2SPaul Dagnelie * The level 0 blkid is offset >> datablkshift =
580a2cdcdd2SPaul Dagnelie * offset / 2^datablkshift.
581a2cdcdd2SPaul Dagnelie *
582a2cdcdd2SPaul Dagnelie * The number of level 0s in a level n is the number of block
583a2cdcdd2SPaul Dagnelie * pointers in an indirect block, raised to the power of level.
584a2cdcdd2SPaul Dagnelie * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
585a2cdcdd2SPaul Dagnelie * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
586a2cdcdd2SPaul Dagnelie *
587a2cdcdd2SPaul Dagnelie * Thus, the level n blkid is: offset /
588a2cdcdd2SPaul Dagnelie * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
589a2cdcdd2SPaul Dagnelie * = offset / 2^(datablkshift + level *
590a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT))
591a2cdcdd2SPaul Dagnelie * = offset >> (datablkshift + level *
592a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT))
593a2cdcdd2SPaul Dagnelie */
594a2cdcdd2SPaul Dagnelie return (offset >> (dn->dn_datablkshift + level *
595a2cdcdd2SPaul Dagnelie (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
596fa9e4066Sahrens } else {
597fa9e4066Sahrens ASSERT3U(offset, <, dn->dn_datablksz);
598fa9e4066Sahrens return (0);
599fa9e4066Sahrens }
600fa9e4066Sahrens }
601fa9e4066Sahrens
/*
 * ARC read completion callback.  Installs the buffer into the dbuf and
 * transitions it from DB_READ to DB_CACHED on success (or to
 * DB_UNCACHED on error), then wakes waiters and drops the read hold.
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		/* Present a zeroed buffer in place of the stale read. */
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		/* Successful read: cache the buffer. */
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* Read failed: drop the ARC buffer and stay uncached. */
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	/* Drops db_mtx along with the hold taken for the read. */
	dbuf_rele_and_unlock(db, NULL);
}
635fa9e4066Sahrens
/*
 * Fill in the contents of an UNCACHED dbuf.
 *
 * Caller must hold db_mtx and the dnode's dn_struct_rwlock, and must have
 * a hold on the dbuf; db_mtx is always dropped before returning.  Bonus
 * buffers and holes/freed blocks are satisfied immediately without I/O
 * (state -> DB_CACHED).  Otherwise the dbuf moves to DB_READ and an
 * asynchronous arc_read() is issued, completing in dbuf_read_done().
 * If DB_RF_CANFAIL is set in 'flags', the read zio may fail (EIO is
 * then surfaced via the parent zio / dbuf_read()).
 */
636ea8dc4b6Seschrock static void
dbuf_read_impl(dmu_buf_impl_t * db,zio_t * zio,uint32_t flags)637cf6106c8SMatthew Ahrens dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
638fa9e4066Sahrens {
639744947dcSTom Erickson dnode_t *dn;
6407802d7bfSMatthew Ahrens zbookmark_phys_t zb;
6417adb730bSGeorge Wilson arc_flags_t aflags = ARC_FLAG_NOWAIT;
642fa9e4066Sahrens
643744947dcSTom Erickson DB_DNODE_ENTER(db);
644744947dcSTom Erickson dn = DB_DNODE(db);
645fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
646fa9e4066Sahrens /* We need the struct_rwlock to prevent db_blkptr from changing. */
647088f3894Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
648ea8dc4b6Seschrock ASSERT(MUTEX_HELD(&db->db_mtx));
649ea8dc4b6Seschrock ASSERT(db->db_state == DB_UNCACHED);
650ea8dc4b6Seschrock ASSERT(db->db_buf == NULL);
651fa9e4066Sahrens
	/* Bonus buffers live inside the dnode phys itself; copy, no I/O. */
6520a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) {
653cf04dda1SMark Maybee int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
6541934e92fSmaybee
6551934e92fSmaybee ASSERT3U(bonuslen, <=, db->db.db_size);
656ea8dc4b6Seschrock db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
6575a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
6581934e92fSmaybee if (bonuslen < DN_MAX_BONUSLEN)
659ea8dc4b6Seschrock bzero(db->db.db_data, DN_MAX_BONUSLEN);
660cf04dda1SMark Maybee if (bonuslen)
661cf04dda1SMark Maybee bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
662744947dcSTom Erickson DB_DNODE_EXIT(db);
663fa9e4066Sahrens db->db_state = DB_CACHED;
664fa9e4066Sahrens mutex_exit(&db->db_mtx);
665fa9e4066Sahrens return;
666fa9e4066Sahrens }
667fa9e4066Sahrens
6681c8564a7SMark Maybee /*
6691c8564a7SMark Maybee * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
6701c8564a7SMark Maybee * processes the delete record and clears the bp while we are waiting
6711c8564a7SMark Maybee * for the dn_mtx (resulting in a "no" from block_freed).
6721c8564a7SMark Maybee */
	/* Hole or freed block: satisfy the read with a zero-filled buffer. */
673088f3894Sahrens if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
6741c8564a7SMark Maybee (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
6751c8564a7SMark Maybee BP_IS_HOLE(db->db_blkptr)))) {
676ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
677ad23a2dbSjohansen
678744947dcSTom Erickson DB_DNODE_EXIT(db);
67943466aaeSMax Grossman dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
68043466aaeSMax Grossman db->db.db_size, db, type));
681fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size);
682fa9e4066Sahrens db->db_state = DB_CACHED;
683fa9e4066Sahrens mutex_exit(&db->db_mtx);
684fa9e4066Sahrens return;
685fa9e4066Sahrens }
686fa9e4066Sahrens
687744947dcSTom Erickson DB_DNODE_EXIT(db);
688744947dcSTom Erickson
689fa9e4066Sahrens db->db_state = DB_READ;
690fa9e4066Sahrens mutex_exit(&db->db_mtx);
691fa9e4066Sahrens
	/* Tell the ARC whether this buffer is eligible for L2ARC caching. */
6923baa08fcSek110237 if (DBUF_IS_L2CACHEABLE(db))
6937adb730bSGeorge Wilson aflags |= ARC_FLAG_L2CACHE;
694aad02571SSaso Kiselkov if (DBUF_IS_L2COMPRESSIBLE(db))
6957adb730bSGeorge Wilson aflags |= ARC_FLAG_L2COMPRESS;
6963baa08fcSek110237
697b24ab676SJeff Bonwick SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
698b24ab676SJeff Bonwick db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
699b24ab676SJeff Bonwick db->db.db_object, db->db_level, db->db_blkid);
700ea8dc4b6Seschrock
	/* This hold is dropped in dbuf_read_done(). */
701ea8dc4b6Seschrock dbuf_add_ref(db, NULL);
702088f3894Sahrens
70343466aaeSMax Grossman (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
704fa9e4066Sahrens dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
705cf6106c8SMatthew Ahrens (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
70613506d1eSmaybee &aflags, &zb);
707fa9e4066Sahrens }
708fa9e4066Sahrens
/*
 * Read the contents of a dbuf, waiting until it is cached unless
 * DB_RF_NEVERWAIT is set in 'flags'.
 *
 * If 'zio' is non-NULL any needed read is attached to it and the caller
 * is responsible for waiting on it; otherwise a root zio is created and
 * waited on here.  Returns 0 on success, EIO if the dbuf is DB_NOFILL or
 * the read failed.  'flags' is a mask of DB_RF_* values (HAVESTRUCT,
 * NOPREFETCH, NEVERWAIT, CANFAIL, ...).
 */
709ea8dc4b6Seschrock int
dbuf_read(dmu_buf_impl_t * db,zio_t * zio,uint32_t flags)710ea8dc4b6Seschrock dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
711fa9e4066Sahrens {
712ea8dc4b6Seschrock int err = 0;
71343466aaeSMax Grossman boolean_t havepzio = (zio != NULL);
71443466aaeSMax Grossman boolean_t prefetch;
715744947dcSTom Erickson dnode_t *dn;
716fa9e4066Sahrens
717fa9e4066Sahrens /*
718fa9e4066Sahrens * We don't have to hold the mutex to check db_state because it
719fa9e4066Sahrens * can't be freed while we have a hold on the buffer.
720fa9e4066Sahrens */
721fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
722fa9e4066Sahrens
72382c9918fSTim Haley if (db->db_state == DB_NOFILL)
724be6fd75aSMatthew Ahrens return (SET_ERROR(EIO));
72582c9918fSTim Haley
726744947dcSTom Erickson DB_DNODE_ENTER(db);
727744947dcSTom Erickson dn = DB_DNODE(db);
728fa9e4066Sahrens if ((flags & DB_RF_HAVESTRUCT) == 0)
729744947dcSTom Erickson rw_enter(&dn->dn_struct_rwlock, RW_READER);
730fa9e4066Sahrens
	/* Prefetch only level-0, non-bonus, cacheable dbufs unless asked not to. */
7310a586ceaSMark Shellenbaum prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
732744947dcSTom Erickson (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
7333baa08fcSek110237 DBUF_IS_CACHEABLE(db);
73413506d1eSmaybee
735fa9e4066Sahrens mutex_enter(&db->db_mtx);
	/* Case 1: already cached -- nothing to read. */
736ea8dc4b6Seschrock if (db->db_state == DB_CACHED) {
737ea8dc4b6Seschrock mutex_exit(&db->db_mtx);
73813506d1eSmaybee if (prefetch)
739cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
740ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0)
741744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock);
742744947dcSTom Erickson DB_DNODE_EXIT(db);
743ea8dc4b6Seschrock } else if (db->db_state == DB_UNCACHED) {
	/* Case 2: uncached -- issue the read, waiting below if we own the zio. */
744744947dcSTom Erickson spa_t *spa = dn->dn_objset->os_spa;
745744947dcSTom Erickson
746744947dcSTom Erickson if (zio == NULL)
747744947dcSTom Erickson zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
748cf6106c8SMatthew Ahrens dbuf_read_impl(db, zio, flags);
74913506d1eSmaybee
750ea8dc4b6Seschrock /* dbuf_read_impl has dropped db_mtx for us */
751ea8dc4b6Seschrock
75213506d1eSmaybee if (prefetch)
753cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
754ea8dc4b6Seschrock
755ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0)
756744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock);
757744947dcSTom Erickson DB_DNODE_EXIT(db);
758ea8dc4b6Seschrock
759ea8dc4b6Seschrock if (!havepzio)
760ea8dc4b6Seschrock err = zio_wait(zio);
761ea8dc4b6Seschrock } else {
7623e30c24aSWill Andrews /*
7633e30c24aSWill Andrews * Another reader came in while the dbuf was in flight
7643e30c24aSWill Andrews * between UNCACHED and CACHED. Either a writer will finish
7653e30c24aSWill Andrews * writing the buffer (sending the dbuf to CACHED) or the
7663e30c24aSWill Andrews * first reader's request will reach the read_done callback
7673e30c24aSWill Andrews * and send the dbuf to CACHED. Otherwise, a failure
7683e30c24aSWill Andrews * occurred and the dbuf went to UNCACHED.
7693e30c24aSWill Andrews */
77013506d1eSmaybee mutex_exit(&db->db_mtx);
77113506d1eSmaybee if (prefetch)
772cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
773ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0)
774744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock);
775744947dcSTom Erickson DB_DNODE_EXIT(db);
77613506d1eSmaybee
7773e30c24aSWill Andrews /* Skip the wait per the caller's request. */
77813506d1eSmaybee mutex_enter(&db->db_mtx);
779ea8dc4b6Seschrock if ((flags & DB_RF_NEVERWAIT) == 0) {
780ea8dc4b6Seschrock while (db->db_state == DB_READ ||
781ea8dc4b6Seschrock db->db_state == DB_FILL) {
782fa9e4066Sahrens ASSERT(db->db_state == DB_READ ||
783fa9e4066Sahrens (flags & DB_RF_HAVESTRUCT) == 0);
784f6164ad6SAdam H. Leventhal DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
785f6164ad6SAdam H. Leventhal db, zio_t *, zio);
786fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx);
787fa9e4066Sahrens }
788ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED)
789be6fd75aSMatthew Ahrens err = SET_ERROR(EIO);
790ea8dc4b6Seschrock }
791fa9e4066Sahrens mutex_exit(&db->db_mtx);
792fa9e4066Sahrens }
793fa9e4066Sahrens
794ea8dc4b6Seschrock ASSERT(err || havepzio || db->db_state == DB_CACHED);
795ea8dc4b6Seschrock return (err);
796fa9e4066Sahrens }
797fa9e4066Sahrens
/*
 * Prepare a dbuf to be completely overwritten without reading its old
 * contents from disk.  Waits out any read or fill in progress; an
 * UNCACHED dbuf gets a fresh (uninitialized) ARC buffer and moves to
 * DB_FILL, a NOFILL dbuf just has its data pointer cleared, and a
 * CACHED dbuf is left as-is.  Not valid for bonus buffers.
 */
798fa9e4066Sahrens static void
dbuf_noread(dmu_buf_impl_t * db)799fa9e4066Sahrens dbuf_noread(dmu_buf_impl_t *db)
800fa9e4066Sahrens {
801fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
8020a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
803fa9e4066Sahrens mutex_enter(&db->db_mtx);
804fa9e4066Sahrens while (db->db_state == DB_READ || db->db_state == DB_FILL)
805fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx);
806fa9e4066Sahrens if (db->db_state == DB_UNCACHED) {
807ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
80843466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa;
809ad23a2dbSjohansen
810ea8dc4b6Seschrock ASSERT(db->db_buf == NULL);
811fa9e4066Sahrens ASSERT(db->db.db_data == NULL);
812744947dcSTom Erickson dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
813fa9e4066Sahrens db->db_state = DB_FILL;
81482c9918fSTim Haley } else if (db->db_state == DB_NOFILL) {
815bc9014e6SJustin Gibbs dbuf_clear_data(db);
816fa9e4066Sahrens } else {
817fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_CACHED);
818fa9e4066Sahrens }
819fa9e4066Sahrens mutex_exit(&db->db_mtx);
820fa9e4066Sahrens }
821fa9e4066Sahrens
822fa9e4066Sahrens /*
823fa9e4066Sahrens * This is our just-in-time copy function. It makes a copy of
824fa9e4066Sahrens * buffers, that have been modified in a previous transaction
825fa9e4066Sahrens * group, before we modify them in the current active group.
826fa9e4066Sahrens *
827fa9e4066Sahrens * This function is used in two places: when we are dirtying a
828fa9e4066Sahrens * buffer for the first time in a txg, and when we are freeing
829fa9e4066Sahrens * a range in a dnode that includes this buffer.
830fa9e4066Sahrens *
831fa9e4066Sahrens * Note that when we are called from dbuf_free_range() we do
832fa9e4066Sahrens * not put a hold on the buffer, we just traverse the active
833fa9e4066Sahrens * dbuf list for the dnode.
834fa9e4066Sahrens */
835fa9e4066Sahrens static void
dbuf_fix_old_data(dmu_buf_impl_t * db,uint64_t txg)836fa9e4066Sahrens dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
837fa9e4066Sahrens {
838c717a561Smaybee dbuf_dirty_record_t *dr = db->db_last_dirty;
839fa9e4066Sahrens
840fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx));
841fa9e4066Sahrens ASSERT(db->db.db_data != NULL);
842c717a561Smaybee ASSERT(db->db_level == 0);
843c717a561Smaybee ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
844fa9e4066Sahrens
	/* Nothing to do unless the latest dirty record references our data. */
8454d31c452Smaybee if (dr == NULL ||
8464d31c452Smaybee (dr->dt.dl.dr_data !=
8470a586ceaSMark Shellenbaum ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
848fa9e4066Sahrens return;
849fa9e4066Sahrens
850fa9e4066Sahrens /*
851c717a561Smaybee * If the last dirty record for this dbuf has not yet synced
852c717a561Smaybee * and its referencing the dbuf data, either:
853c717a561Smaybee * reset the reference to point to a new copy,
854c717a561Smaybee * or (if there a no active holders)
855c717a561Smaybee * just null out the current db_data pointer.
856fa9e4066Sahrens */
857c717a561Smaybee ASSERT(dr->dr_txg >= txg - 2);
8580a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) {
859c717a561Smaybee /* Note that the data bufs here are zio_bufs */
860c717a561Smaybee dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
8615a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
862c717a561Smaybee bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
863c717a561Smaybee } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
	/* Active holders beyond the dirty records: give dr its own copy. */
864ea8dc4b6Seschrock int size = db->db.db_size;
865c717a561Smaybee arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
86643466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa;
867744947dcSTom Erickson
868744947dcSTom Erickson dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
869c717a561Smaybee bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
870fa9e4066Sahrens } else {
	/* No other holders; just drop the dbuf's reference to the data. */
871bc9014e6SJustin Gibbs dbuf_clear_data(db);
872fa9e4066Sahrens }
873fa9e4066Sahrens }
874ea8dc4b6Seschrock
/*
 * Undo a dmu_sync()-style override on a dirty record: free the
 * already-written block (unless it is a hole or a nopwrite) and return
 * the record to DR_NOT_OVERRIDDEN so the data will be written normally
 * at sync time.  No-op for bonus buffers or records that are not
 * overridden.  Caller must hold db_mtx.
 */
875fa9e4066Sahrens void
dbuf_unoverride(dbuf_dirty_record_t * dr)876c717a561Smaybee dbuf_unoverride(dbuf_dirty_record_t *dr)
877fa9e4066Sahrens {
878c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf;
879b24ab676SJeff Bonwick blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
880c717a561Smaybee uint64_t txg = dr->dr_txg;
881c5c6ffa0Smaybee
882c717a561Smaybee ASSERT(MUTEX_HELD(&db->db_mtx));
883c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
884c717a561Smaybee ASSERT(db->db_level == 0);
885c717a561Smaybee
8860a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID ||
887c717a561Smaybee dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
888c717a561Smaybee return;
889c717a561Smaybee
890b24ab676SJeff Bonwick ASSERT(db->db_data_pending != dr);
891b24ab676SJeff Bonwick
892fa9e4066Sahrens /* free this block */
89343466aaeSMax Grossman if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
89443466aaeSMax Grossman zio_free(db->db_objset->os_spa, txg, bp);
895b24ab676SJeff Bonwick
896c717a561Smaybee dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
89780901aeaSGeorge Wilson dr->dt.dl.dr_nopwrite = B_FALSE;
89880901aeaSGeorge Wilson
8996b4acc8bSahrens /*
9006b4acc8bSahrens * Release the already-written buffer, so we leave it in
9016b4acc8bSahrens * a consistent dirty state. Note that all callers are
9026b4acc8bSahrens * modifying the buffer, so they will immediately do
9036b4acc8bSahrens * another (redundant) arc_release(). Therefore, leave
9046b4acc8bSahrens * the buf thawed to save the effort of freezing &
9056b4acc8bSahrens * immediately re-thawing it.
9066b4acc8bSahrens */
907c717a561Smaybee arc_release(dr->dt.dl.dr_data, db);
908fa9e4066Sahrens }
909fa9e4066Sahrens
910cdb0ab79Smaybee /*
911cdb0ab79Smaybee * Evict (if its unreferenced) or clear (if its referenced) any level-0
912cdb0ab79Smaybee * data blocks in the free range, so that any future readers will find
91343466aaeSMax Grossman * empty blocks.
9142f3d8780SMatthew Ahrens *
9152f3d8780SMatthew Ahrens * This is a no-op if the dataset is in the middle of an incremental
9162f3d8780SMatthew Ahrens * receive; see comment below for details.
917cdb0ab79Smaybee */
918fa9e4066Sahrens void
dbuf_free_range(dnode_t * dn,uint64_t start_blkid,uint64_t end_blkid,dmu_tx_t * tx)9190f6d88adSAlex Reece dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
9200f6d88adSAlex Reece dmu_tx_t *tx)
921fa9e4066Sahrens {
922bc9014e6SJustin Gibbs dmu_buf_impl_t db_search;
923bc9014e6SJustin Gibbs dmu_buf_impl_t *db, *db_next;
924fa9e4066Sahrens uint64_t txg = tx->tx_txg;
9250f6d88adSAlex Reece avl_index_t where;
926fa9e4066Sahrens
9270f6d88adSAlex Reece if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
9280f6d88adSAlex Reece end_blkid = dn->dn_maxblkid;
9290f6d88adSAlex Reece dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
9300f6d88adSAlex Reece
	/*
	 * db_search is a stack-allocated sentinel used only as an AVL
	 * search key; DB_SEARCH marks it as such.
	 */
9310f6d88adSAlex Reece db_search.db_level = 0;
9320f6d88adSAlex Reece db_search.db_blkid = start_blkid;
93386bb58aeSAlex Reece db_search.db_state = DB_SEARCH;
9342f3d8780SMatthew Ahrens
935713d6c20SMatthew Ahrens mutex_enter(&dn->dn_dbufs_mtx);
9360f6d88adSAlex Reece if (start_blkid >= dn->dn_unlisted_l0_blkid) {
937713d6c20SMatthew Ahrens /* There can't be any dbufs in this range; no need to search. */
9380f6d88adSAlex Reece #ifdef DEBUG
9390f6d88adSAlex Reece db = avl_find(&dn->dn_dbufs, &db_search, &where);
9400f6d88adSAlex Reece ASSERT3P(db, ==, NULL);
9410f6d88adSAlex Reece db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
9420f6d88adSAlex Reece ASSERT(db == NULL || db->db_level > 0);
9430f6d88adSAlex Reece #endif
944713d6c20SMatthew Ahrens mutex_exit(&dn->dn_dbufs_mtx);
9452f3d8780SMatthew Ahrens return;
946713d6c20SMatthew Ahrens } else if (dmu_objset_is_receiving(dn->dn_objset)) {
947713d6c20SMatthew Ahrens /*
948713d6c20SMatthew Ahrens * If we are receiving, we expect there to be no dbufs in
949713d6c20SMatthew Ahrens * the range to be freed, because receive modifies each
950713d6c20SMatthew Ahrens * block at most once, and in offset order. If this is
951713d6c20SMatthew Ahrens * not the case, it can lead to performance problems,
952713d6c20SMatthew Ahrens * so note that we unexpectedly took the slow path.
953713d6c20SMatthew Ahrens */
954713d6c20SMatthew Ahrens atomic_inc_64(&zfs_free_range_recv_miss);
9552f3d8780SMatthew Ahrens }
9562f3d8780SMatthew Ahrens
	/* Walk every level-0 dbuf that overlaps [start_blkid, end_blkid]. */
9570f6d88adSAlex Reece db = avl_find(&dn->dn_dbufs, &db_search, &where);
9580f6d88adSAlex Reece ASSERT3P(db, ==, NULL);
9590f6d88adSAlex Reece db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
9600f6d88adSAlex Reece
9610f6d88adSAlex Reece for (; db != NULL; db = db_next) {
9620f6d88adSAlex Reece db_next = AVL_NEXT(&dn->dn_dbufs, db);
9630a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
964cdb0ab79Smaybee
9650f6d88adSAlex Reece if (db->db_level != 0 || db->db_blkid > end_blkid) {
9660f6d88adSAlex Reece break;
9670f6d88adSAlex Reece }
9680f6d88adSAlex Reece ASSERT3U(db->db_blkid, >=, start_blkid);
969fa9e4066Sahrens
970fa9e4066Sahrens /* found a level 0 buffer in the range */
971fa9e4066Sahrens mutex_enter(&db->db_mtx);
9723b2aab18SMatthew Ahrens if (dbuf_undirty(db, tx)) {
9733b2aab18SMatthew Ahrens /* mutex has been dropped and dbuf destroyed */
9743b2aab18SMatthew Ahrens continue;
9753b2aab18SMatthew Ahrens }
9763b2aab18SMatthew Ahrens
977ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED ||
97882c9918fSTim Haley db->db_state == DB_NOFILL ||
979ea8dc4b6Seschrock db->db_state == DB_EVICTING) {
980fa9e4066Sahrens ASSERT(db->db.db_data == NULL);
981fa9e4066Sahrens mutex_exit(&db->db_mtx);
982fa9e4066Sahrens continue;
983fa9e4066Sahrens }
984c543ec06Sahrens if (db->db_state == DB_READ || db->db_state == DB_FILL) {
985c543ec06Sahrens /* will be handled in dbuf_read_done or dbuf_rele */
986c717a561Smaybee db->db_freed_in_flight = TRUE;
987fa9e4066Sahrens mutex_exit(&db->db_mtx);
988fa9e4066Sahrens continue;
989fa9e4066Sahrens }
	/* Unreferenced: evict the dbuf outright. */
990ea8dc4b6Seschrock if (refcount_count(&db->db_holds) == 0) {
991ea8dc4b6Seschrock ASSERT(db->db_buf);
992ea8dc4b6Seschrock dbuf_clear(db);
993ea8dc4b6Seschrock continue;
994ea8dc4b6Seschrock }
995c717a561Smaybee /* The dbuf is referenced */
996fa9e4066Sahrens
997c717a561Smaybee if (db->db_last_dirty != NULL) {
998c717a561Smaybee dbuf_dirty_record_t *dr = db->db_last_dirty;
999c717a561Smaybee
1000c717a561Smaybee if (dr->dr_txg == txg) {
1001ea8dc4b6Seschrock /*
1002c717a561Smaybee * This buffer is "in-use", re-adjust the file
1003c717a561Smaybee * size to reflect that this buffer may
1004c717a561Smaybee * contain new data when we sync.
1005ea8dc4b6Seschrock */
100606e0070dSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID &&
100706e0070dSMark Shellenbaum db->db_blkid > dn->dn_maxblkid)
100844eda4d7Smaybee dn->dn_maxblkid = db->db_blkid;
1009c717a561Smaybee dbuf_unoverride(dr);
1010c717a561Smaybee } else {
1011c717a561Smaybee /*
1012c717a561Smaybee * This dbuf is not dirty in the open context.
1013c717a561Smaybee * Either uncache it (if its not referenced in
1014c717a561Smaybee * the open context) or reset its contents to
1015c717a561Smaybee * empty.
1016c717a561Smaybee */
1017c717a561Smaybee dbuf_fix_old_data(db, txg);
101844eda4d7Smaybee }
1019c717a561Smaybee }
1020c717a561Smaybee /* clear the contents if its cached */
1021ea8dc4b6Seschrock if (db->db_state == DB_CACHED) {
1022ea8dc4b6Seschrock ASSERT(db->db.db_data != NULL);
1023fa9e4066Sahrens arc_release(db->db_buf, db);
1024fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size);
10256b4acc8bSahrens arc_buf_freeze(db->db_buf);
1026fa9e4066Sahrens }
1027ea8dc4b6Seschrock
1028fa9e4066Sahrens mutex_exit(&db->db_mtx);
1029fa9e4066Sahrens }
1030fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx);
1031fa9e4066Sahrens }
1032fa9e4066Sahrens
/*
 * Returns nonzero if the block backing this dbuf could be freed (it was
 * born after the most recent snapshot), based on the birth txg of either
 * the pending dirty record or the on-disk block pointer.
 * Caller must hold db_mtx.
 */
1033fa9e4066Sahrens static int
dbuf_block_freeable(dmu_buf_impl_t * db)10341934e92fSmaybee dbuf_block_freeable(dmu_buf_impl_t *db)
1035fa9e4066Sahrens {
1036fa9e4066Sahrens dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
1037fa9e4066Sahrens uint64_t birth_txg = 0;
1038fa9e4066Sahrens
1039fa9e4066Sahrens /*
1040fa9e4066Sahrens * We don't need any locking to protect db_blkptr:
1041c717a561Smaybee * If it's syncing, then db_last_dirty will be set
1042c717a561Smaybee * so we'll ignore db_blkptr.
104343466aaeSMax Grossman *
104443466aaeSMax Grossman * This logic ensures that only block births for
104543466aaeSMax Grossman * filled blocks are considered.
1046fa9e4066Sahrens */
1047c717a561Smaybee ASSERT(MUTEX_HELD(&db->db_mtx));
104843466aaeSMax Grossman if (db->db_last_dirty && (db->db_blkptr == NULL ||
104943466aaeSMax Grossman !BP_IS_HOLE(db->db_blkptr))) {
1050c717a561Smaybee birth_txg = db->db_last_dirty->dr_txg;
105143466aaeSMax Grossman } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
1052fa9e4066Sahrens birth_txg = db->db_blkptr->blk_birth;
105343466aaeSMax Grossman }
1054fa9e4066Sahrens
1055837b568bSGeorge Wilson /*
105643466aaeSMax Grossman * If this block doesn't exist or is in a snapshot, it can't be freed.
1057837b568bSGeorge Wilson * Don't pass the bp to dsl_dataset_block_freeable() since we
1058837b568bSGeorge Wilson * are holding the db_mtx lock and might deadlock if we are
1059837b568bSGeorge Wilson * prefetching a dedup-ed block.
1060837b568bSGeorge Wilson */
106143466aaeSMax Grossman if (birth_txg != 0)
10621934e92fSmaybee return (ds == NULL ||
1063837b568bSGeorge Wilson dsl_dataset_block_freeable(ds, NULL, birth_txg));
1064fa9e4066Sahrens else
106543466aaeSMax Grossman return (B_FALSE);
1066fa9e4066Sahrens }
1067fa9e4066Sahrens
/*
 * Resize a (non-bonus) dbuf to 'size' bytes in transaction 'tx':
 * dirty the dbuf, allocate a new ARC buffer, copy over the old contents
 * (zero-filling any growth), and swap the new buffer in.
 * Caller must hold the dnode's struct_rwlock for writer.
 */
1068fa9e4066Sahrens void
dbuf_new_size(dmu_buf_impl_t * db,int size,dmu_tx_t * tx)1069fa9e4066Sahrens dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1070fa9e4066Sahrens {
1071fa9e4066Sahrens arc_buf_t *buf, *obuf;
1072fa9e4066Sahrens int osize = db->db.db_size;
1073ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1074744947dcSTom Erickson dnode_t *dn;
1075fa9e4066Sahrens
10760a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1077ea8dc4b6Seschrock
1078744947dcSTom Erickson DB_DNODE_ENTER(db);
1079744947dcSTom Erickson dn = DB_DNODE(db);
1080744947dcSTom Erickson
1081fa9e4066Sahrens /* XXX does *this* func really need the lock? */
1082744947dcSTom Erickson ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1083fa9e4066Sahrens
1084fa9e4066Sahrens /*
108543466aaeSMax Grossman * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
1086fa9e4066Sahrens * is OK, because there can be no other references to the db
1087fa9e4066Sahrens * when we are changing its size, so no concurrent DB_FILL can
1088fa9e4066Sahrens * be happening.
1089fa9e4066Sahrens */
1090ea8dc4b6Seschrock /*
1091ea8dc4b6Seschrock * XXX we should be doing a dbuf_read, checking the return
1092ea8dc4b6Seschrock * value and returning that up to our callers
1093ea8dc4b6Seschrock */
109443466aaeSMax Grossman dmu_buf_will_dirty(&db->db, tx);
1095fa9e4066Sahrens
1096fa9e4066Sahrens /* create the data buffer for the new block */
1097744947dcSTom Erickson buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1098fa9e4066Sahrens
1099fa9e4066Sahrens /* copy old block data to the new block */
1100fa9e4066Sahrens obuf = db->db_buf;
1101f65e61c0Sahrens bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1102fa9e4066Sahrens /* zero the remainder */
1103f65e61c0Sahrens if (size > osize)
1104fa9e4066Sahrens bzero((uint8_t *)buf->b_data + osize, size - osize);
1105fa9e4066Sahrens
1106fa9e4066Sahrens mutex_enter(&db->db_mtx);
1107fa9e4066Sahrens dbuf_set_data(db, buf);
11083b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(obuf, db));
1109fa9e4066Sahrens db->db.db_size = size;
1110fa9e4066Sahrens
	/* Point the current txg's dirty record at the new buffer. */
1111c717a561Smaybee if (db->db_level == 0) {
1112c717a561Smaybee ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1113c717a561Smaybee db->db_last_dirty->dt.dl.dr_data = buf;
1114c717a561Smaybee }
1115fa9e4066Sahrens mutex_exit(&db->db_mtx);
1116fa9e4066Sahrens
1117744947dcSTom Erickson dnode_willuse_space(dn, size-osize, tx);
1118744947dcSTom Erickson DB_DNODE_EXIT(db);
1119fa9e4066Sahrens }
1120fa9e4066Sahrens
11213f9d6ad7SLin Ling void
dbuf_release_bp(dmu_buf_impl_t * db)11223f9d6ad7SLin Ling dbuf_release_bp(dmu_buf_impl_t *db)
11233f9d6ad7SLin Ling {
112443466aaeSMax Grossman objset_t *os = db->db_objset;
11253f9d6ad7SLin Ling
11263f9d6ad7SLin Ling ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
11273f9d6ad7SLin Ling ASSERT(arc_released(os->os_phys_buf) ||
11283f9d6ad7SLin Ling list_link_active(&os->os_dsl_dataset->ds_synced_link));
11293f9d6ad7SLin Ling ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
11303f9d6ad7SLin Ling
11311b912ec7SGeorge Wilson (void) arc_release(db->db_buf, db);
11323f9d6ad7SLin Ling }
11333f9d6ad7SLin Ling
11340f2e7d03SMatthew Ahrens /*
11350f2e7d03SMatthew Ahrens * We already have a dirty record for this TXG, and we are being
11360f2e7d03SMatthew Ahrens * dirtied again.
11370f2e7d03SMatthew Ahrens */
11380f2e7d03SMatthew Ahrens static void
dbuf_redirty(dbuf_dirty_record_t * dr)11390f2e7d03SMatthew Ahrens dbuf_redirty(dbuf_dirty_record_t *dr)
11400f2e7d03SMatthew Ahrens {
11410f2e7d03SMatthew Ahrens dmu_buf_impl_t *db = dr->dr_dbuf;
11420f2e7d03SMatthew Ahrens
11430f2e7d03SMatthew Ahrens ASSERT(MUTEX_HELD(&db->db_mtx));
11440f2e7d03SMatthew Ahrens
11450f2e7d03SMatthew Ahrens if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
11460f2e7d03SMatthew Ahrens /*
11470f2e7d03SMatthew Ahrens * If this buffer has already been written out,
11480f2e7d03SMatthew Ahrens * we now need to reset its state.
11490f2e7d03SMatthew Ahrens */
11500f2e7d03SMatthew Ahrens dbuf_unoverride(dr);
11510f2e7d03SMatthew Ahrens if (db->db.db_object != DMU_META_DNODE_OBJECT &&
11520f2e7d03SMatthew Ahrens db->db_state != DB_NOFILL) {
11530f2e7d03SMatthew Ahrens /* Already released on initial dirty, so just thaw. */
11540f2e7d03SMatthew Ahrens ASSERT(arc_released(db->db_buf));
11550f2e7d03SMatthew Ahrens arc_buf_thaw(db->db_buf);
11560f2e7d03SMatthew Ahrens }
11570f2e7d03SMatthew Ahrens }
11580f2e7d03SMatthew Ahrens }
11590f2e7d03SMatthew Ahrens
1160c717a561Smaybee dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t * db,dmu_tx_t * tx)1161fa9e4066Sahrens dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1162fa9e4066Sahrens {
1163744947dcSTom Erickson dnode_t *dn;
1164744947dcSTom Erickson objset_t *os;
1165c717a561Smaybee dbuf_dirty_record_t **drp, *dr;
1166fa9e4066Sahrens int drop_struct_lock = FALSE;
1167d3469faaSMark Maybee boolean_t do_free_accounting = B_FALSE;
1168fa9e4066Sahrens int txgoff = tx->tx_txg & TXG_MASK;
1169fa9e4066Sahrens
1170fa9e4066Sahrens ASSERT(tx->tx_txg != 0);
1171fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
11729c9dc39aSek110237 DMU_TX_DIRTY_BUF(tx, db);
1173fa9e4066Sahrens
1174744947dcSTom Erickson DB_DNODE_ENTER(db);
1175744947dcSTom Erickson dn = DB_DNODE(db);
1176fa9e4066Sahrens /*
1177fa9e4066Sahrens * Shouldn't dirty a regular buffer in syncing context. Private
1178fa9e4066Sahrens * objects may be dirtied in syncing context, but only if they
1179fa9e4066Sahrens * were already pre-dirtied in open context.
1180fa9e4066Sahrens */
1181c717a561Smaybee ASSERT(!dmu_tx_is_syncing(tx) ||
1182c717a561Smaybee BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
118314843421SMatthew Ahrens DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
118414843421SMatthew Ahrens dn->dn_objset->os_dsl_dataset == NULL);
1185fa9e4066Sahrens /*
1186fa9e4066Sahrens * We make this assert for private objects as well, but after we
1187fa9e4066Sahrens * check if we're already dirty. They are allowed to re-dirty
1188fa9e4066Sahrens * in syncing context.
1189fa9e4066Sahrens */
1190ea8dc4b6Seschrock ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1191c717a561Smaybee dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1192fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1193fa9e4066Sahrens
1194fa9e4066Sahrens mutex_enter(&db->db_mtx);
1195fa9e4066Sahrens /*
1196c717a561Smaybee * XXX make this true for indirects too? The problem is that
1197c717a561Smaybee * transactions created with dmu_tx_create_assigned() from
1198c717a561Smaybee * syncing context don't bother holding ahead.
1199fa9e4066Sahrens */
1200c717a561Smaybee ASSERT(db->db_level != 0 ||
120182c9918fSTim Haley db->db_state == DB_CACHED || db->db_state == DB_FILL ||
120282c9918fSTim Haley db->db_state == DB_NOFILL);
1203fa9e4066Sahrens
1204fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
1205fa9e4066Sahrens /*
1206fa9e4066Sahrens * Don't set dirtyctx to SYNC if we're just modifying this as we
1207fa9e4066Sahrens * initialize the objset.
1208fa9e4066Sahrens */
1209fa9e4066Sahrens if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1210c717a561Smaybee !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1211fa9e4066Sahrens dn->dn_dirtyctx =
1212fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1213fa9e4066Sahrens ASSERT(dn->dn_dirtyctx_firstset == NULL);
1214fa9e4066Sahrens dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1215fa9e4066Sahrens }
1216fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
1217fa9e4066Sahrens
12180a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID)
12190a586ceaSMark Shellenbaum dn->dn_have_spill = B_TRUE;
12200a586ceaSMark Shellenbaum
1221fa9e4066Sahrens /*
1222fa9e4066Sahrens * If this buffer is already dirty, we're done.
1223fa9e4066Sahrens */
1224c717a561Smaybee drp = &db->db_last_dirty;
1225c717a561Smaybee ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1226c717a561Smaybee db->db.db_object == DMU_META_DNODE_OBJECT);
12277e2186e3Sbonwick while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
12287e2186e3Sbonwick drp = &dr->dr_next;
12297e2186e3Sbonwick if (dr && dr->dr_txg == tx->tx_txg) {
1230744947dcSTom Erickson DB_DNODE_EXIT(db);
1231744947dcSTom Erickson
12320f2e7d03SMatthew Ahrens dbuf_redirty(dr);
1233fa9e4066Sahrens mutex_exit(&db->db_mtx);
12347e2186e3Sbonwick return (dr);
1235fa9e4066Sahrens }
1236fa9e4066Sahrens
1237fa9e4066Sahrens /*
1238fa9e4066Sahrens * Only valid if not already dirty.
1239fa9e4066Sahrens */
124014843421SMatthew Ahrens ASSERT(dn->dn_object == 0 ||
124114843421SMatthew Ahrens dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1242fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1243fa9e4066Sahrens
1244fa9e4066Sahrens ASSERT3U(dn->dn_nlevels, >, db->db_level);
1245fa9e4066Sahrens ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1246fa9e4066Sahrens dn->dn_phys->dn_nlevels > db->db_level ||
1247fa9e4066Sahrens dn->dn_next_nlevels[txgoff] > db->db_level ||
1248fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1249fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1250fa9e4066Sahrens
1251fa9e4066Sahrens /*
1252fa9e4066Sahrens * We should only be dirtying in syncing context if it's the
125314843421SMatthew Ahrens * mos or we're initializing the os or it's a special object.
125414843421SMatthew Ahrens * However, we are allowed to dirty in syncing context provided
125514843421SMatthew Ahrens * we already dirtied it in open context. Hence we must make
125614843421SMatthew Ahrens * this assertion only if we're not already dirty.
1257fa9e4066Sahrens */
1258744947dcSTom Erickson os = dn->dn_objset;
125914843421SMatthew Ahrens ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
126014843421SMatthew Ahrens os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1261fa9e4066Sahrens ASSERT(db->db.db_size != 0);
1262fa9e4066Sahrens
1263fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1264fa9e4066Sahrens
12650a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID) {
12661934e92fSmaybee /*
12671934e92fSmaybee * Update the accounting.
1268d3469faaSMark Maybee * Note: we delay "free accounting" until after we drop
1269d3469faaSMark Maybee * the db_mtx. This keeps us from grabbing other locks
1270b24ab676SJeff Bonwick * (and possibly deadlocking) in bp_get_dsize() while
1271d3469faaSMark Maybee * also holding the db_mtx.
12721934e92fSmaybee */
12731934e92fSmaybee dnode_willuse_space(dn, db->db.db_size, tx);
1274d3469faaSMark Maybee do_free_accounting = dbuf_block_freeable(db);
12751934e92fSmaybee }
12761934e92fSmaybee
1277ea8dc4b6Seschrock /*
1278ea8dc4b6Seschrock * If this buffer is dirty in an old transaction group we need
1279ea8dc4b6Seschrock * to make a copy of it so that the changes we make in this
1280ea8dc4b6Seschrock * transaction group won't leak out when we sync the older txg.
1281ea8dc4b6Seschrock */
1282c717a561Smaybee dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1283c717a561Smaybee if (db->db_level == 0) {
1284c717a561Smaybee void *data_old = db->db_buf;
1285c717a561Smaybee
128682c9918fSTim Haley if (db->db_state != DB_NOFILL) {
12870a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) {
1288c717a561Smaybee dbuf_fix_old_data(db, tx->tx_txg);
1289c717a561Smaybee data_old = db->db.db_data;
1290c717a561Smaybee } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1291fa9e4066Sahrens /*
129282c9918fSTim Haley * Release the data buffer from the cache so
129382c9918fSTim Haley * that we can modify it without impacting
129482c9918fSTim Haley * possible other users of this cached data
129582c9918fSTim Haley * block. Note that indirect blocks and
129682c9918fSTim Haley * private objects are not released until the
129782c9918fSTim Haley * syncing state (since they are only modified
129882c9918fSTim Haley * then).
1299fa9e4066Sahrens */
1300fa9e4066Sahrens arc_release(db->db_buf, db);
1301fa9e4066Sahrens dbuf_fix_old_data(db, tx->tx_txg);
1302c717a561Smaybee data_old = db->db_buf;
1303fa9e4066Sahrens }
1304c717a561Smaybee ASSERT(data_old != NULL);
130582c9918fSTim Haley }
1306c717a561Smaybee dr->dt.dl.dr_data = data_old;
1307c717a561Smaybee } else {
1308c717a561Smaybee mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1309c717a561Smaybee list_create(&dr->dt.di.dr_children,
1310c717a561Smaybee sizeof (dbuf_dirty_record_t),
1311c717a561Smaybee offsetof(dbuf_dirty_record_t, dr_dirty_node));
1312fa9e4066Sahrens }
131369962b56SMatthew Ahrens if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
131469962b56SMatthew Ahrens dr->dr_accounted = db->db.db_size;
1315c717a561Smaybee dr->dr_dbuf = db;
1316c717a561Smaybee dr->dr_txg = tx->tx_txg;
1317c717a561Smaybee dr->dr_next = *drp;
1318c717a561Smaybee *drp = dr;
1319fa9e4066Sahrens
1320fa9e4066Sahrens /*
1321fa9e4066Sahrens * We could have been freed_in_flight between the dbuf_noread
1322fa9e4066Sahrens * and dbuf_dirty. We win, as though the dbuf_noread() had
1323fa9e4066Sahrens * happened after the free.
1324fa9e4066Sahrens */
13250a586ceaSMark Shellenbaum if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
13260a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) {
1327c717a561Smaybee mutex_enter(&dn->dn_mtx);
1328bf16b11eSMatthew Ahrens if (dn->dn_free_ranges[txgoff] != NULL) {
1329bf16b11eSMatthew Ahrens range_tree_clear(dn->dn_free_ranges[txgoff],
1330bf16b11eSMatthew Ahrens db->db_blkid, 1);
1331bf16b11eSMatthew Ahrens }
1332fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
1333c717a561Smaybee db->db_freed_in_flight = FALSE;
1334c717a561Smaybee }
1335fa9e4066Sahrens
1336fa9e4066Sahrens /*
1337fa9e4066Sahrens * This buffer is now part of this txg
1338fa9e4066Sahrens */
1339fa9e4066Sahrens dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1340fa9e4066Sahrens db->db_dirtycnt += 1;
1341fa9e4066Sahrens ASSERT3U(db->db_dirtycnt, <=, 3);
1342fa9e4066Sahrens
1343fa9e4066Sahrens mutex_exit(&db->db_mtx);
1344fa9e4066Sahrens
13450a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID ||
13460a586ceaSMark Shellenbaum db->db_blkid == DMU_SPILL_BLKID) {
1347c717a561Smaybee mutex_enter(&dn->dn_mtx);
1348c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node));
1349c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1350c717a561Smaybee mutex_exit(&dn->dn_mtx);
1351fa9e4066Sahrens dnode_setdirty(dn, tx);
1352744947dcSTom Erickson DB_DNODE_EXIT(db);
1353c717a561Smaybee return (dr);
1354d3469faaSMark Maybee } else if (do_free_accounting) {
1355d3469faaSMark Maybee blkptr_t *bp = db->db_blkptr;
1356d3469faaSMark Maybee int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1357b24ab676SJeff Bonwick bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1358d3469faaSMark Maybee /*
1359d3469faaSMark Maybee * This is only a guess -- if the dbuf is dirty
1360d3469faaSMark Maybee * in a previous txg, we don't know how much
1361d3469faaSMark Maybee * space it will use on disk yet. We should
1362d3469faaSMark Maybee * really have the struct_rwlock to access
1363d3469faaSMark Maybee * db_blkptr, but since this is just a guess,
1364d3469faaSMark Maybee * it's OK if we get an odd answer.
1365d3469faaSMark Maybee */
1366837b568bSGeorge Wilson ddt_prefetch(os->os_spa, bp);
1367d3469faaSMark Maybee dnode_willuse_space(dn, -willfree, tx);
1368fa9e4066Sahrens }
1369fa9e4066Sahrens
1370fa9e4066Sahrens if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1371fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER);
1372fa9e4066Sahrens drop_struct_lock = TRUE;
1373fa9e4066Sahrens }
1374fa9e4066Sahrens
13758346f03fSJonathan W Adams if (db->db_level == 0) {
13768346f03fSJonathan W Adams dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
13778346f03fSJonathan W Adams ASSERT(dn->dn_maxblkid >= db->db_blkid);
13788346f03fSJonathan W Adams }
13798346f03fSJonathan W Adams
138044eda4d7Smaybee if (db->db_level+1 < dn->dn_nlevels) {
1381c717a561Smaybee dmu_buf_impl_t *parent = db->db_parent;
1382c717a561Smaybee dbuf_dirty_record_t *di;
1383c717a561Smaybee int parent_held = FALSE;
1384c717a561Smaybee
1385c717a561Smaybee if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1386fa9e4066Sahrens int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1387c717a561Smaybee
1388fa9e4066Sahrens parent = dbuf_hold_level(dn, db->db_level+1,
1389fa9e4066Sahrens db->db_blkid >> epbs, FTAG);
139001025c89SJohn Harres ASSERT(parent != NULL);
1391c717a561Smaybee parent_held = TRUE;
1392c717a561Smaybee }
1393fa9e4066Sahrens if (drop_struct_lock)
1394fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock);
1395c717a561Smaybee ASSERT3U(db->db_level+1, ==, parent->db_level);
1396c717a561Smaybee di = dbuf_dirty(parent, tx);
1397c717a561Smaybee if (parent_held)
1398ea8dc4b6Seschrock dbuf_rele(parent, FTAG);
1399c717a561Smaybee
1400c717a561Smaybee mutex_enter(&db->db_mtx);
140169962b56SMatthew Ahrens /*
140269962b56SMatthew Ahrens * Since we've dropped the mutex, it's possible that
140369962b56SMatthew Ahrens * dbuf_undirty() might have changed this out from under us.
140469962b56SMatthew Ahrens */
1405c717a561Smaybee if (db->db_last_dirty == dr ||
1406c717a561Smaybee dn->dn_object == DMU_META_DNODE_OBJECT) {
1407c717a561Smaybee mutex_enter(&di->dt.di.dr_mtx);
1408c717a561Smaybee ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1409c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node));
1410c717a561Smaybee list_insert_tail(&di->dt.di.dr_children, dr);
1411c717a561Smaybee mutex_exit(&di->dt.di.dr_mtx);
1412c717a561Smaybee dr->dr_parent = di;
1413c717a561Smaybee }
1414c717a561Smaybee mutex_exit(&db->db_mtx);
1415fa9e4066Sahrens } else {
1416c717a561Smaybee ASSERT(db->db_level+1 == dn->dn_nlevels);
1417c717a561Smaybee ASSERT(db->db_blkid < dn->dn_nblkptr);
1418744947dcSTom Erickson ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1419c717a561Smaybee mutex_enter(&dn->dn_mtx);
1420c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node));
1421c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1422c717a561Smaybee mutex_exit(&dn->dn_mtx);
1423fa9e4066Sahrens if (drop_struct_lock)
1424fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock);
1425fa9e4066Sahrens }
1426fa9e4066Sahrens
1427fa9e4066Sahrens dnode_setdirty(dn, tx);
1428744947dcSTom Erickson DB_DNODE_EXIT(db);
1429c717a561Smaybee return (dr);
1430fa9e4066Sahrens }
1431fa9e4066Sahrens
14323b2aab18SMatthew Ahrens /*
14333e30c24aSWill Andrews * Undirty a buffer in the transaction group referenced by the given
14343e30c24aSWill Andrews * transaction. Return whether this evicted the dbuf.
14353b2aab18SMatthew Ahrens */
14363b2aab18SMatthew Ahrens static boolean_t
dbuf_undirty(dmu_buf_impl_t * db,dmu_tx_t * tx)1437fa9e4066Sahrens dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1438fa9e4066Sahrens {
1439744947dcSTom Erickson dnode_t *dn;
1440c717a561Smaybee uint64_t txg = tx->tx_txg;
144117f17c2dSbonwick dbuf_dirty_record_t *dr, **drp;
1442fa9e4066Sahrens
1443c717a561Smaybee ASSERT(txg != 0);
144446e1baa6SMatthew Ahrens
144546e1baa6SMatthew Ahrens /*
144646e1baa6SMatthew Ahrens * Due to our use of dn_nlevels below, this can only be called
144746e1baa6SMatthew Ahrens * in open context, unless we are operating on the MOS.
144846e1baa6SMatthew Ahrens * From syncing context, dn_nlevels may be different from the
144946e1baa6SMatthew Ahrens * dn_nlevels used when dbuf was dirtied.
145046e1baa6SMatthew Ahrens */
145146e1baa6SMatthew Ahrens ASSERT(db->db_objset ==
145246e1baa6SMatthew Ahrens dmu_objset_pool(db->db_objset)->dp_meta_objset ||
145346e1baa6SMatthew Ahrens txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
14540a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
14553b2aab18SMatthew Ahrens ASSERT0(db->db_level);
14563b2aab18SMatthew Ahrens ASSERT(MUTEX_HELD(&db->db_mtx));
1457fa9e4066Sahrens
1458fa9e4066Sahrens /*
1459fa9e4066Sahrens * If this buffer is not dirty, we're done.
1460fa9e4066Sahrens */
146117f17c2dSbonwick for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1462c717a561Smaybee if (dr->dr_txg <= txg)
1463c717a561Smaybee break;
/*
 * The dirty-record list is ordered by descending txg, so if the first
 * record at or below txg is strictly older than txg, there is no
 * record for txg itself and there is nothing to undirty.
 */
14643b2aab18SMatthew Ahrens if (dr == NULL || dr->dr_txg < txg)
14653b2aab18SMatthew Ahrens return (B_FALSE);
1466c717a561Smaybee ASSERT(dr->dr_txg == txg);
1467b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db);
1468fa9e4066Sahrens
1469744947dcSTom Erickson DB_DNODE_ENTER(db);
1470744947dcSTom Erickson dn = DB_DNODE(db);
1471744947dcSTom Erickson
1472fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1473fa9e4066Sahrens
1474fa9e4066Sahrens ASSERT(db->db.db_size != 0);
1475fa9e4066Sahrens
/* Undo the dirty-space accounting (dr_accounted) charged at dirty time. */
147646e1baa6SMatthew Ahrens dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
147746e1baa6SMatthew Ahrens dr->dr_accounted, txg);
1478fa9e4066Sahrens
/* Unlink this record from the dbuf's list of dirty records. */
147917f17c2dSbonwick *drp = dr->dr_next;
1480c717a561Smaybee
14813f2366c2SGordon Ross /*
14823f2366c2SGordon Ross * Note that there are three places in dbuf_dirty()
14833f2366c2SGordon Ross * where this dirty record may be put on a list.
14843f2366c2SGordon Ross * Make sure to do a list_remove corresponding to
14853f2366c2SGordon Ross * every one of those list_insert calls.
14863f2366c2SGordon Ross */
1487c717a561Smaybee if (dr->dr_parent) {
1488c717a561Smaybee mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1489c717a561Smaybee list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1490c717a561Smaybee mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
14913f2366c2SGordon Ross } else if (db->db_blkid == DMU_SPILL_BLKID ||
14923f2366c2SGordon Ross db->db_level + 1 == dn->dn_nlevels) {
1493cdb0ab79Smaybee ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1494fa9e4066Sahrens mutex_enter(&dn->dn_mtx);
1495c717a561Smaybee list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1496fa9e4066Sahrens mutex_exit(&dn->dn_mtx);
1497c717a561Smaybee }
1498744947dcSTom Erickson DB_DNODE_EXIT(db);
1499c717a561Smaybee
/*
 * For a normal (non-NOFILL) buffer, undo any write override and
 * release the record's private data copy if it differs from db_buf.
 */
150082c9918fSTim Haley if (db->db_state != DB_NOFILL) {
1501c717a561Smaybee dbuf_unoverride(dr);
1502c717a561Smaybee
1503c717a561Smaybee ASSERT(db->db_buf != NULL);
1504c717a561Smaybee ASSERT(dr->dt.dl.dr_data != NULL);
1505c717a561Smaybee if (dr->dt.dl.dr_data != db->db_buf)
15063b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1507c717a561Smaybee }
1508d2b3cbbdSJorgen Lundman
1509c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t));
1510fa9e4066Sahrens
1511fa9e4066Sahrens ASSERT(db->db_dirtycnt > 0);
1512fa9e4066Sahrens db->db_dirtycnt -= 1;
1513fa9e4066Sahrens
/*
 * Drop the hold that dbuf_dirty() took for this txg.  If that was the
 * last hold, evict the dbuf and tell the caller it is gone.
 */
1514c717a561Smaybee if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1515ea8dc4b6Seschrock arc_buf_t *buf = db->db_buf;
1516fa9e4066Sahrens
1517b24ab676SJeff Bonwick ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1518bc9014e6SJustin Gibbs dbuf_clear_data(db);
15193b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db));
1520fa9e4066Sahrens dbuf_evict(db);
15213b2aab18SMatthew Ahrens return (B_TRUE);
1522fa9e4066Sahrens }
1523fa9e4066Sahrens
15243b2aab18SMatthew Ahrens return (B_FALSE);
1525fa9e4066Sahrens }
1526fa9e4066Sahrens
/*
 * Public entry point to dirty a dbuf: ensure its current contents are
 * cached (reading them in if necessary) and then mark it dirty in tx.
 */
1527fa9e4066Sahrens void
dmu_buf_will_dirty(dmu_buf_t * db_fake,dmu_tx_t * tx)152843466aaeSMax Grossman dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1529fa9e4066Sahrens {
153043466aaeSMax Grossman dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
15311ab7f2deSmaybee int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1532fa9e4066Sahrens
1533fa9e4066Sahrens ASSERT(tx->tx_txg != 0);
1534fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
1535fa9e4066Sahrens
15360f2e7d03SMatthew Ahrens /*
15370f2e7d03SMatthew Ahrens * Quick check for dirtyness. For already dirty blocks, this
15380f2e7d03SMatthew Ahrens * reduces runtime of this function by >90%, and overall performance
15390f2e7d03SMatthew Ahrens * by 50% for some workloads (e.g. file deletion with indirect blocks
15400f2e7d03SMatthew Ahrens * cached).
15410f2e7d03SMatthew Ahrens */
15420f2e7d03SMatthew Ahrens mutex_enter(&db->db_mtx);
15430f2e7d03SMatthew Ahrens dbuf_dirty_record_t *dr;
15440f2e7d03SMatthew Ahrens for (dr = db->db_last_dirty;
15450f2e7d03SMatthew Ahrens dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
15460f2e7d03SMatthew Ahrens /*
15470f2e7d03SMatthew Ahrens * It's possible that it is already dirty but not cached,
15480f2e7d03SMatthew Ahrens * because there are some calls to dbuf_dirty() that don't
15490f2e7d03SMatthew Ahrens * go through dmu_buf_will_dirty().
15500f2e7d03SMatthew Ahrens */
15510f2e7d03SMatthew Ahrens if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
15520f2e7d03SMatthew Ahrens /* This dbuf is already dirty and cached. */
15530f2e7d03SMatthew Ahrens dbuf_redirty(dr);
15540f2e7d03SMatthew Ahrens mutex_exit(&db->db_mtx);
15550f2e7d03SMatthew Ahrens return;
15560f2e7d03SMatthew Ahrens }
15570f2e7d03SMatthew Ahrens }
15580f2e7d03SMatthew Ahrens mutex_exit(&db->db_mtx);
15590f2e7d03SMatthew Ahrens
/*
 * If this thread already write-holds dn_struct_rwlock, tell
 * dbuf_read() not to reacquire it.
 */
1560744947dcSTom Erickson DB_DNODE_ENTER(db);
1561744947dcSTom Erickson if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1562fa9e4066Sahrens rf |= DB_RF_HAVESTRUCT;
1563744947dcSTom Erickson DB_DNODE_EXIT(db);
1564ea8dc4b6Seschrock (void) dbuf_read(db, NULL, rf);
1565c717a561Smaybee (void) dbuf_dirty(db, tx);
1566fa9e4066Sahrens }
1567fa9e4066Sahrens
1568fa9e4066Sahrens void
dmu_buf_will_not_fill(dmu_buf_t * db_fake,dmu_tx_t * tx)156982c9918fSTim Haley dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
157082c9918fSTim Haley {
157182c9918fSTim Haley dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
157282c9918fSTim Haley
157382c9918fSTim Haley db->db_state = DB_NOFILL;
157482c9918fSTim Haley
157582c9918fSTim Haley dmu_buf_will_fill(db_fake, tx);
157682c9918fSTim Haley }
157782c9918fSTim Haley
/*
 * Prepare this dbuf to be entirely overwritten by the caller: the old
 * contents are not read in (dbuf_noread) before the buffer is dirtied.
 */
157882c9918fSTim Haley void
dmu_buf_will_fill(dmu_buf_t * db_fake,dmu_tx_t * tx)1579ea8dc4b6Seschrock dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1580fa9e4066Sahrens {
1581ea8dc4b6Seschrock dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1582ea8dc4b6Seschrock
15830a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1584fa9e4066Sahrens ASSERT(tx->tx_txg != 0);
1585fa9e4066Sahrens ASSERT(db->db_level == 0);
1586fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds));
1587fa9e4066Sahrens
1588ea8dc4b6Seschrock ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1589fa9e4066Sahrens dmu_tx_private_ok(tx));
1590fa9e4066Sahrens
1591fa9e4066Sahrens dbuf_noread(db);
1592c717a561Smaybee (void) dbuf_dirty(db, tx);
1593fa9e4066Sahrens }
1594fa9e4066Sahrens
1595fa9e4066Sahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done
1596fa9e4066Sahrens /* ARGSUSED */
/*
 * Finish a fill started by dmu_buf_will_fill()/dbuf_noread(): if the
 * block was freed while being filled, zero its contents; then mark the
 * dbuf cached and wake any threads waiting on db_changed.
 */
1597fa9e4066Sahrens void
dbuf_fill_done(dmu_buf_impl_t * db,dmu_tx_t * tx)1598fa9e4066Sahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1599fa9e4066Sahrens {
1600fa9e4066Sahrens mutex_enter(&db->db_mtx);
16019c9dc39aSek110237 DBUF_VERIFY(db);
1602fa9e4066Sahrens
1603fa9e4066Sahrens if (db->db_state == DB_FILL) {
1604c717a561Smaybee if (db->db_level == 0 && db->db_freed_in_flight) {
16050a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1606fa9e4066Sahrens /* we were freed while filling */
1607fa9e4066Sahrens /* XXX dbuf_undirty? */
1608fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size);
1609c717a561Smaybee db->db_freed_in_flight = FALSE;
1610fa9e4066Sahrens }
1611fa9e4066Sahrens db->db_state = DB_CACHED;
1612fa9e4066Sahrens cv_broadcast(&db->db_changed);
1613fa9e4066Sahrens }
1614fa9e4066Sahrens mutex_exit(&db->db_mtx);
1615fa9e4066Sahrens }
1616fa9e4066Sahrens
/*
 * Overwrite this dbuf with a block pointer that embeds the provided
 * (already-compressed) data directly, rather than pointing at an
 * allocated block.  The dbuf is marked NOFILL and its dirty record is
 * placed in the DR_OVERRIDDEN state so the embedded BP is used at sync
 * time instead of writing out the buffer contents.
 */
16175d7b4d43SMatthew Ahrens void
dmu_buf_write_embedded(dmu_buf_t * dbuf,void * data,bp_embedded_type_t etype,enum zio_compress comp,int uncompressed_size,int compressed_size,int byteorder,dmu_tx_t * tx)16185d7b4d43SMatthew Ahrens dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
16195d7b4d43SMatthew Ahrens bp_embedded_type_t etype, enum zio_compress comp,
16205d7b4d43SMatthew Ahrens int uncompressed_size, int compressed_size, int byteorder,
16215d7b4d43SMatthew Ahrens dmu_tx_t *tx)
16225d7b4d43SMatthew Ahrens {
16235d7b4d43SMatthew Ahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
16245d7b4d43SMatthew Ahrens struct dirty_leaf *dl;
16255d7b4d43SMatthew Ahrens dmu_object_type_t type;
16265d7b4d43SMatthew Ahrens
/* Embedded data BPs require the pool feature to be active. */
1627ca0cc391SMatthew Ahrens if (etype == BP_EMBEDDED_TYPE_DATA) {
1628ca0cc391SMatthew Ahrens ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
1629ca0cc391SMatthew Ahrens SPA_FEATURE_EMBEDDED_DATA));
1630ca0cc391SMatthew Ahrens }
1631ca0cc391SMatthew Ahrens
16325d7b4d43SMatthew Ahrens DB_DNODE_ENTER(db);
16335d7b4d43SMatthew Ahrens type = DB_DNODE(db)->dn_type;
16345d7b4d43SMatthew Ahrens DB_DNODE_EXIT(db);
16355d7b4d43SMatthew Ahrens
16365d7b4d43SMatthew Ahrens ASSERT0(db->db_level);
16375d7b4d43SMatthew Ahrens ASSERT(db->db_blkid != DMU_BONUS_BLKID);
16385d7b4d43SMatthew Ahrens
/* NOFILL + dirty: creates the dirty record we override below. */
16395d7b4d43SMatthew Ahrens dmu_buf_will_not_fill(dbuf, tx);
16405d7b4d43SMatthew Ahrens
16415d7b4d43SMatthew Ahrens ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
16425d7b4d43SMatthew Ahrens dl = &db->db_last_dirty->dt.dl;
16435d7b4d43SMatthew Ahrens encode_embedded_bp_compressed(&dl->dr_overridden_by,
16445d7b4d43SMatthew Ahrens data, comp, uncompressed_size, compressed_size);
16455d7b4d43SMatthew Ahrens BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
16465d7b4d43SMatthew Ahrens BP_SET_TYPE(&dl->dr_overridden_by, type);
16475d7b4d43SMatthew Ahrens BP_SET_LEVEL(&dl->dr_overridden_by, 0);
16485d7b4d43SMatthew Ahrens BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
16495d7b4d43SMatthew Ahrens
16505d7b4d43SMatthew Ahrens dl->dr_override_state = DR_OVERRIDDEN;
16515d7b4d43SMatthew Ahrens dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
16525d7b4d43SMatthew Ahrens }
16535d7b4d43SMatthew Ahrens
1654ea8dc4b6Seschrock /*
16552fdbea25SAleksandr Guzovskiy * Directly assign a provided arc buf to a given dbuf if it's not referenced
16562fdbea25SAleksandr Guzovskiy * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
16572fdbea25SAleksandr Guzovskiy */
16582fdbea25SAleksandr Guzovskiy void
dbuf_assign_arcbuf(dmu_buf_impl_t * db,arc_buf_t * buf,dmu_tx_t * tx)16592fdbea25SAleksandr Guzovskiy dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
16602fdbea25SAleksandr Guzovskiy {
16612fdbea25SAleksandr Guzovskiy ASSERT(!refcount_is_zero(&db->db_holds));
16620a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID);
16632fdbea25SAleksandr Guzovskiy ASSERT(db->db_level == 0);
16642fdbea25SAleksandr Guzovskiy ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
16652fdbea25SAleksandr Guzovskiy ASSERT(buf != NULL);
16662fdbea25SAleksandr Guzovskiy ASSERT(arc_buf_size(buf) == db->db.db_size);
16672fdbea25SAleksandr Guzovskiy ASSERT(tx->tx_txg != 0);
16682fdbea25SAleksandr Guzovskiy
16692fdbea25SAleksandr Guzovskiy arc_return_buf(buf, db);
16702fdbea25SAleksandr Guzovskiy ASSERT(arc_released(buf));
16712fdbea25SAleksandr Guzovskiy
16722fdbea25SAleksandr Guzovskiy mutex_enter(&db->db_mtx);
16732fdbea25SAleksandr Guzovskiy
/* Wait for any in-flight read or fill of this dbuf to finish. */
16742fdbea25SAleksandr Guzovskiy while (db->db_state == DB_READ || db->db_state == DB_FILL)
16752fdbea25SAleksandr Guzovskiy cv_wait(&db->db_changed, &db->db_mtx);
16762fdbea25SAleksandr Guzovskiy
16772fdbea25SAleksandr Guzovskiy ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
16782fdbea25SAleksandr Guzovskiy
/*
 * Copy path: other holders besides our caller exist, so we cannot
 * swap in the caller's buffer; dirty the dbuf and copy the data in.
 */
16792fdbea25SAleksandr Guzovskiy if (db->db_state == DB_CACHED &&
16802fdbea25SAleksandr Guzovskiy refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
16812fdbea25SAleksandr Guzovskiy mutex_exit(&db->db_mtx);
16822fdbea25SAleksandr Guzovskiy (void) dbuf_dirty(db, tx);
16832fdbea25SAleksandr Guzovskiy bcopy(buf->b_data, db->db.db_data, db->db.db_size);
16843b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db));
1685c242f9a0Schunli zhang - Sun Microsystems - Irvine United States xuio_stat_wbuf_copied();
16862fdbea25SAleksandr Guzovskiy return;
16872fdbea25SAleksandr Guzovskiy }
16882fdbea25SAleksandr Guzovskiy
1689c242f9a0Schunli zhang - Sun Microsystems - Irvine United States xuio_stat_wbuf_nocopy();
/*
 * Assign path: release the existing cached buffer (or hand it to the
 * current txg's dirty record) and install the caller's buffer.
 */
16902fdbea25SAleksandr Guzovskiy if (db->db_state == DB_CACHED) {
16912fdbea25SAleksandr Guzovskiy dbuf_dirty_record_t *dr = db->db_last_dirty;
16922fdbea25SAleksandr Guzovskiy
16932fdbea25SAleksandr Guzovskiy ASSERT(db->db_buf != NULL);
16942fdbea25SAleksandr Guzovskiy if (dr != NULL && dr->dr_txg == tx->tx_txg) {
16952fdbea25SAleksandr Guzovskiy ASSERT(dr->dt.dl.dr_data == db->db_buf);
16962fdbea25SAleksandr Guzovskiy if (!arc_released(db->db_buf)) {
16972fdbea25SAleksandr Guzovskiy ASSERT(dr->dt.dl.dr_override_state ==
16982fdbea25SAleksandr Guzovskiy DR_OVERRIDDEN);
16992fdbea25SAleksandr Guzovskiy arc_release(db->db_buf, db);
17002fdbea25SAleksandr Guzovskiy }
17012fdbea25SAleksandr Guzovskiy dr->dt.dl.dr_data = buf;
17023b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(db->db_buf, db));
17032fdbea25SAleksandr Guzovskiy } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
17042fdbea25SAleksandr Guzovskiy arc_release(db->db_buf, db);
17053b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(db->db_buf, db));
17062fdbea25SAleksandr Guzovskiy }
17072fdbea25SAleksandr Guzovskiy db->db_buf = NULL;
17082fdbea25SAleksandr Guzovskiy }
17092fdbea25SAleksandr Guzovskiy ASSERT(db->db_buf == NULL);
17102fdbea25SAleksandr Guzovskiy dbuf_set_data(db, buf);
17112fdbea25SAleksandr Guzovskiy db->db_state = DB_FILL;
17122fdbea25SAleksandr Guzovskiy mutex_exit(&db->db_mtx);
17132fdbea25SAleksandr Guzovskiy (void) dbuf_dirty(db, tx);
171443466aaeSMax Grossman dmu_buf_fill_done(&db->db, tx);
17152fdbea25SAleksandr Guzovskiy }
17162fdbea25SAleksandr Guzovskiy
17172fdbea25SAleksandr Guzovskiy /*
1718ea8dc4b6Seschrock * "Clear" the contents of this dbuf. This will mark the dbuf
171969962b56SMatthew Ahrens * EVICTING and clear *most* of its references. Unfortunately,
1720ea8dc4b6Seschrock * when we are not holding the dn_dbufs_mtx, we can't clear the
1721ea8dc4b6Seschrock * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1722ea8dc4b6Seschrock * in this case. For callers from the DMU we will usually see:
1723bbfa8ea8SMatthew Ahrens * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
1724ea8dc4b6Seschrock * For the arc callback, we will usually see:
1725ea8dc4b6Seschrock * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1726ea8dc4b6Seschrock * Sometimes, though, we will get a mix of these two:
1727bbfa8ea8SMatthew Ahrens * DMU: dbuf_clear()->arc_clear_callback()
1728ea8dc4b6Seschrock * ARC: dbuf_do_evict()->dbuf_destroy()
1729bbfa8ea8SMatthew Ahrens *
1730bbfa8ea8SMatthew Ahrens * This routine will dissociate the dbuf from the arc, by calling
1731bbfa8ea8SMatthew Ahrens * arc_clear_callback(), but will not evict the data from the ARC.
1732ea8dc4b6Seschrock */
1733ea8dc4b6Seschrock void
dbuf_clear(dmu_buf_impl_t * db)1734fa9e4066Sahrens dbuf_clear(dmu_buf_impl_t *db)
1735fa9e4066Sahrens {
1736744947dcSTom Erickson dnode_t *dn;
1737ea8dc4b6Seschrock dmu_buf_impl_t *parent = db->db_parent;
1738744947dcSTom Erickson dmu_buf_impl_t *dndb;
1739bbfa8ea8SMatthew Ahrens boolean_t dbuf_gone = B_FALSE;
1740fa9e4066Sahrens
1741fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx));
1742fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds));
1743fa9e4066Sahrens
1744ea8dc4b6Seschrock dbuf_evict_user(db);
1745ea8dc4b6Seschrock
1746fa9e4066Sahrens if (db->db_state == DB_CACHED) {
1747ea8dc4b6Seschrock ASSERT(db->db.db_data != NULL);
/* The bonus buffer's data is a separate allocation; free it here. */
17480a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) {
1749ea8dc4b6Seschrock zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
17505a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
17510e8c6158Smaybee }
1752fa9e4066Sahrens db->db.db_data = NULL;
1753fa9e4066Sahrens db->db_state = DB_UNCACHED;
1754fa9e4066Sahrens }
1755fa9e4066Sahrens
175682c9918fSTim Haley ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1757fa9e4066Sahrens ASSERT(db->db_data_pending == NULL);
1758fa9e4066Sahrens
1759ea8dc4b6Seschrock db->db_state = DB_EVICTING;
1760ea8dc4b6Seschrock db->db_blkptr = NULL;
1761ea8dc4b6Seschrock
1762744947dcSTom Erickson DB_DNODE_ENTER(db);
1763744947dcSTom Erickson dn = DB_DNODE(db);
1764744947dcSTom Erickson dndb = dn->dn_dbuf;
/*
 * If we hold dn_dbufs_mtx we can unlink from the dnode's dbuf tree
 * now; otherwise that is deferred to dbuf_destroy() (see above).
 */
17650a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
17660f6d88adSAlex Reece avl_remove(&dn->dn_dbufs, db);
1767640c1670SJosef 'Jeff' Sipek atomic_dec_32(&dn->dn_dbufs_count);
1768744947dcSTom Erickson membar_producer();
1769744947dcSTom Erickson DB_DNODE_EXIT(db);
1770744947dcSTom Erickson /*
1771744947dcSTom Erickson * Decrementing the dbuf count means that the hold corresponding
1772744947dcSTom Erickson * to the removed dbuf is no longer discounted in dnode_move(),
1773744947dcSTom Erickson * so the dnode cannot be moved until after we release the hold.
1774744947dcSTom Erickson * The membar_producer() ensures visibility of the decremented
1775744947dcSTom Erickson * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1776744947dcSTom Erickson * release any lock.
1777744947dcSTom Erickson */
1778ea8dc4b6Seschrock dnode_rele(dn, db);
1779744947dcSTom Erickson db->db_dnode_handle = NULL;
1780744947dcSTom Erickson } else {
1781744947dcSTom Erickson DB_DNODE_EXIT(db);
1782ea8dc4b6Seschrock }
1783ea8dc4b6Seschrock
1784ea8dc4b6Seschrock if (db->db_buf)
1785bbfa8ea8SMatthew Ahrens dbuf_gone = arc_clear_callback(db->db_buf);
1786ea8dc4b6Seschrock
/* If the arc callback did not already destroy the dbuf, drop db_mtx. */
1787ea8dc4b6Seschrock if (!dbuf_gone)
1788fa9e4066Sahrens mutex_exit(&db->db_mtx);
1789fa9e4066Sahrens
1790fa9e4066Sahrens /*
1791744947dcSTom Erickson * If this dbuf is referenced from an indirect dbuf,
1792fa9e4066Sahrens * decrement the ref count on the indirect dbuf.
1793fa9e4066Sahrens */
1794c543ec06Sahrens if (parent && parent != dndb)
1795ea8dc4b6Seschrock dbuf_rele(parent, db);
1796fa9e4066Sahrens }
1797fa9e4066Sahrens
1798a2cdcdd2SPaul Dagnelie /*
1799a2cdcdd2SPaul Dagnelie * Note: While bpp will always be updated if the function returns success,
1800a2cdcdd2SPaul Dagnelie * parentp will not be updated if the dnode does not have dn_dbuf filled in;
1801a2cdcdd2SPaul Dagnelie * this happens when the dnode is the meta-dnode, or a userused or groupused
1802a2cdcdd2SPaul Dagnelie * object.
1803a2cdcdd2SPaul Dagnelie */
1804fa9e4066Sahrens static int
dbuf_findbp(dnode_t * dn,int level,uint64_t blkid,int fail_sparse,dmu_buf_impl_t ** parentp,blkptr_t ** bpp)1805fa9e4066Sahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1806fa9e4066Sahrens dmu_buf_impl_t **parentp, blkptr_t **bpp)
1807fa9e4066Sahrens {
1808fa9e4066Sahrens int nlevels, epbs;
1809fa9e4066Sahrens
18100b69c2f0Sahrens *parentp = NULL;
18110b69c2f0Sahrens *bpp = NULL;
18120b69c2f0Sahrens
18130a586ceaSMark Shellenbaum ASSERT(blkid != DMU_BONUS_BLKID);
18140a586ceaSMark Shellenbaum
/* The spill block's BP lives in the dnode phys, protected by dn_mtx. */
18150a586ceaSMark Shellenbaum if (blkid == DMU_SPILL_BLKID) {
18160a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx);
181706e0070dSMark Shellenbaum if (dn->dn_have_spill &&
181806e0070dSMark Shellenbaum (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
18190a586ceaSMark Shellenbaum *bpp = &dn->dn_phys->dn_spill;
18200a586ceaSMark Shellenbaum else
18210a586ceaSMark Shellenbaum *bpp = NULL;
18220a586ceaSMark Shellenbaum dbuf_add_ref(dn->dn_dbuf, NULL);
18230a586ceaSMark Shellenbaum *parentp = dn->dn_dbuf;
18240a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx);
18250a586ceaSMark Shellenbaum return (0);
18260a586ceaSMark Shellenbaum }
1827ea8dc4b6Seschrock
/* A freshly-allocated dnode may have dn_nlevels == 0; treat it as 1. */
1828fa9e4066Sahrens if (dn->dn_phys->dn_nlevels == 0)
1829fa9e4066Sahrens nlevels = 1;
1830fa9e4066Sahrens else
1831fa9e4066Sahrens nlevels = dn->dn_phys->dn_nlevels;
1832fa9e4066Sahrens
1833fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1834fa9e4066Sahrens
1835fa9e4066Sahrens ASSERT3U(level * epbs, <, 64);
1836fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1837ea8dc4b6Seschrock if (level >= nlevels ||
1838fa9e4066Sahrens (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1839fa9e4066Sahrens /* the buffer has no parent yet */
1840be6fd75aSMatthew Ahrens return (SET_ERROR(ENOENT));
1841fa9e4066Sahrens } else if (level < nlevels-1) {
1842fa9e4066Sahrens /* this block is referenced from an indirect block */
/* Recurse: hold and read the parent indirect, then index into it. */
1843fa9e4066Sahrens int err = dbuf_hold_impl(dn, level+1,
1844a2cdcdd2SPaul Dagnelie blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
1845fa9e4066Sahrens if (err)
1846fa9e4066Sahrens return (err);
1847ea8dc4b6Seschrock err = dbuf_read(*parentp, NULL,
1848ea8dc4b6Seschrock (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1849c543ec06Sahrens if (err) {
1850c543ec06Sahrens dbuf_rele(*parentp, NULL);
1851c543ec06Sahrens *parentp = NULL;
1852c543ec06Sahrens return (err);
1853c543ec06Sahrens }
1854fa9e4066Sahrens *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1855fa9e4066Sahrens (blkid & ((1ULL << epbs) - 1));
1856c543ec06Sahrens return (0);
1857fa9e4066Sahrens } else {
1858fa9e4066Sahrens /* the block is referenced from the dnode */
1859fa9e4066Sahrens ASSERT3U(level, ==, nlevels-1);
1860fa9e4066Sahrens ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1861fa9e4066Sahrens blkid < dn->dn_phys->dn_nblkptr);
1862c543ec06Sahrens if (dn->dn_dbuf) {
1863c543ec06Sahrens dbuf_add_ref(dn->dn_dbuf, NULL);
1864fa9e4066Sahrens *parentp = dn->dn_dbuf;
1865c543ec06Sahrens }
1866fa9e4066Sahrens *bpp = &dn->dn_phys->dn_blkptr[blkid];
1867fa9e4066Sahrens return (0);
1868fa9e4066Sahrens }
1869fa9e4066Sahrens }
1870fa9e4066Sahrens
/*
 * Allocate and initialize a new dbuf for the given block of the given
 * dnode, insert it into the dbuf hash table and the dnode's dbuf list,
 * and take a hold on the dnode (and on the parent dbuf, if it is not the
 * dnode's own dbuf).  The bonus dbuf is special: it is sized from the
 * dnode tail and is never entered into the hash table or dn_dbufs.
 *
 * If another thread races us and inserts a dbuf for the same block first,
 * the freshly allocated dbuf is discarded and the winner's dbuf returned.
 *
 * Caller must hold dn_struct_rwlock.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* bonus space is the dnode tail minus the block pointers */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		/* spill size comes from the bp if one exists yet */
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
1954fa9e4066Sahrens
/*
 * Eviction callback for a dbuf.  If the dbuf is still CACHED, detach its
 * ARC buffer and run the normal eviction path; if it is already marked
 * DB_EVICTING, just free the dbuf structure itself.  Always returns 0.
 */
static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	/* the caller may or may not already hold db_mtx */
	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		/* drop the ARC buffer reference; dbuf_evict frees the rest */
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		/* dbuf_evict() drops db_mtx; match that here */
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}
1976fa9e4066Sahrens
/*
 * Free a dbuf that has no remaining holds.  For non-bonus dbufs this also
 * removes it from the dnode's dbuf list and the hash table, and releases
 * the hold on the dnode taken in dbuf_create().
 */
static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	/* by now all data and linkage must already be torn down */
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
2019fa9e4066Sahrens
/*
 * State carried down the asynchronous prefetch chain.  Allocated by
 * dbuf_prefetch() and freed by dbuf_prefetch_indirect_done() once the
 * final prefetch has been issued (or the chain hits a hole or I/O error).
 */
typedef struct dbuf_prefetch_arg {
	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
	int dpa_curlevel; /* The current level that we're reading */
	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
} dbuf_prefetch_arg_t;
2029a2cdcdd2SPaul Dagnelie
2030a2cdcdd2SPaul Dagnelie /*
2031a2cdcdd2SPaul Dagnelie * Actually issue the prefetch read for the block given.
2032a2cdcdd2SPaul Dagnelie */
2033a2cdcdd2SPaul Dagnelie static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t * dpa,blkptr_t * bp)2034a2cdcdd2SPaul Dagnelie dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2035fa9e4066Sahrens {
2036a2cdcdd2SPaul Dagnelie if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2037a2cdcdd2SPaul Dagnelie return;
2038a2cdcdd2SPaul Dagnelie
2039a2cdcdd2SPaul Dagnelie arc_flags_t aflags =
2040a2cdcdd2SPaul Dagnelie dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2041a2cdcdd2SPaul Dagnelie
2042a2cdcdd2SPaul Dagnelie ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2043a2cdcdd2SPaul Dagnelie ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2044a2cdcdd2SPaul Dagnelie ASSERT(dpa->dpa_zio != NULL);
2045a2cdcdd2SPaul Dagnelie (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2046a2cdcdd2SPaul Dagnelie dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2047a2cdcdd2SPaul Dagnelie &aflags, &dpa->dpa_zb);
2048a2cdcdd2SPaul Dagnelie }
2049a2cdcdd2SPaul Dagnelie
/*
 * Called when an indirect block above our prefetch target is read in.  This
 * will either read in the next indirect block down the tree or issue the
 * actual prefetch if the next block down is our target.
 *
 * The dpa argument is freed here when the chain terminates (target issued,
 * hole encountered, or read error); otherwise ownership passes to the next
 * arc_read() callback in the chain.
 */
static void
dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
	dbuf_prefetch_arg_t *dpa = private;

	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
	ASSERT3S(dpa->dpa_curlevel, >, 0);
	/* zio is NULL when invoked directly on an ARC cache hit */
	if (zio != NULL) {
		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
	}

	dpa->dpa_curlevel--;

	/* locate the bp for the next level down within this indirect block */
	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
		/* nothing below us to prefetch; abandon the chain */
		kmem_free(dpa, sizeof (*dpa));
	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
		/* the next block down is the prefetch target itself */
		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
		dbuf_issue_final_prefetch(dpa, bp);
		kmem_free(dpa, sizeof (*dpa));
	} else {
		/* continue the chain one level further down */
		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
		zbookmark_phys_t zb;

		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));

		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);

		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}
	(void) arc_buf_remove_ref(abuf, private);
}
2096a2cdcdd2SPaul Dagnelie
/*
 * Issue prefetch reads for the given block on the given level.  If the
 * indirect blocks above that block are not in memory, we will read them in
 * asynchronously.  As a result, this call never blocks waiting for a read to
 * complete.
 *
 * Caller must hold dn_struct_rwlock.  Blocks past the end of the object,
 * freed blocks, not-yet-synced dnodes, and blocks whose dbuf already exists
 * are all silently skipped.
 */
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
    arc_flags_t aflags)
{
	blkptr_t bp;
	int epbs, nlevels, curlevel;
	uint64_t curblkid;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (blkid > dn->dn_maxblkid)
		return;

	if (dnode_block_freed(dn, blkid))
		return;

	/*
	 * This dnode hasn't been written to disk yet, so there's nothing to
	 * prefetch.
	 */
	nlevels = dn->dn_phys->dn_nlevels;
	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
		return;

	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
		return;

	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
	    level, blkid);
	if (db != NULL) {
		mutex_exit(&db->db_mtx);
		/*
		 * This dbuf already exists.  It is either CACHED, or
		 * (we assume) about to be read or filled.
		 */
		return;
	}

	/*
	 * Find the closest ancestor (indirect block) of the target block
	 * that is present in the cache.  In this indirect block, we will
	 * find the bp that is at curlevel, curblkid.
	 */
	curlevel = level;
	curblkid = blkid;
	while (curlevel < nlevels - 1) {
		int parent_level = curlevel + 1;
		uint64_t parent_blkid = curblkid >> epbs;
		dmu_buf_impl_t *db;

		/* fail_uncached=TRUE: only succeeds for cached indirects */
		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
		    FALSE, TRUE, FTAG, &db) == 0) {
			blkptr_t *bpp = db->db_buf->b_data;
			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
			dbuf_rele(db, FTAG);
			break;
		}

		curlevel = parent_level;
		curblkid = parent_blkid;
	}

	if (curlevel == nlevels - 1) {
		/* No cached indirect blocks found. */
		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
		bp = dn->dn_phys->dn_blkptr[curblkid];
	}
	if (BP_IS_HOLE(&bp))
		return;

	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));

	/* parent zio that ties the whole prefetch chain together */
	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
	    dn->dn_object, level, blkid);
	dpa->dpa_curlevel = curlevel;
	dpa->dpa_prio = prio;
	dpa->dpa_aflags = aflags;
	dpa->dpa_spa = dn->dn_objset->os_spa;
	dpa->dpa_epbs = epbs;
	dpa->dpa_zio = pio;

	/*
	 * If we have the indirect just above us, no need to do the asynchronous
	 * prefetch chain; we'll just run the last step ourselves.  If we're at
	 * a higher level, though, we want to issue the prefetches for all the
	 * indirect blocks asynchronously, so we can go on with whatever we were
	 * doing.
	 */
	if (curlevel == level) {
		ASSERT3U(curblkid, ==, blkid);
		dbuf_issue_final_prefetch(dpa, &bp);
		kmem_free(dpa, sizeof (*dpa));
	} else {
		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
		    dn->dn_object, curlevel, curblkid);
		/* dpa ownership passes to the chain of callbacks */
		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
		    &iter_aflags, &zb);
	}
	/*
	 * We use pio here instead of dpa_zio since it's possible that
	 * dpa may have already been freed.
	 */
	zio_nowait(pio);
}
2219fa9e4066Sahrens
/*
 * Look up (creating if necessary) the dbuf for the given block and take a
 * hold on it.  Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 *
 * fail_sparse:   return ENOENT instead of creating a dbuf over a hole.
 * fail_uncached: return ENOENT unless the dbuf already exists and is CACHED.
 * On success *dbp holds the dbuf; on failure *dbp is NULL.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
    boolean_t fail_sparse, boolean_t fail_uncached,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		if (fail_uncached)
			return (SET_ERROR(ENOENT));

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* treat a hole as if the block did not exist */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		/* ENOENT from dbuf_findbp just means "no bp yet"; create */
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (fail_uncached && db->db_state != DB_CACHED) {
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			/* the ARC evicted our data; clear and retry */
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
2319fa9e4066Sahrens
2320fa9e4066Sahrens dmu_buf_impl_t *
dbuf_hold(dnode_t * dn,uint64_t blkid,void * tag)2321ea8dc4b6Seschrock dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2322fa9e4066Sahrens {
2323a2cdcdd2SPaul Dagnelie return (dbuf_hold_level(dn, 0, blkid, tag));
2324fa9e4066Sahrens }
2325fa9e4066Sahrens
2326fa9e4066Sahrens dmu_buf_impl_t *
dbuf_hold_level(dnode_t * dn,int level,uint64_t blkid,void * tag)2327fa9e4066Sahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2328fa9e4066Sahrens {
2329fa9e4066Sahrens dmu_buf_impl_t *db;
2330a2cdcdd2SPaul Dagnelie int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2331ea8dc4b6Seschrock return (err ? NULL : db);
2332fa9e4066Sahrens }
2333fa9e4066Sahrens
/*
 * Create the bonus dbuf for dnode dn and attach it as dn->dn_bonus.
 * Caller must hold dn_struct_rwlock as writer; the dnode must not
 * already have a bonus dbuf.
 */
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	/* the bonus dbuf is parented by the dnode's own dbuf, with no bp */
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}
23420a586ceaSMark Shellenbaum
23430a586ceaSMark Shellenbaum int
dbuf_spill_set_blksz(dmu_buf_t * db_fake,uint64_t blksz,dmu_tx_t * tx)23440a586ceaSMark Shellenbaum dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
23450a586ceaSMark Shellenbaum {
23460a586ceaSMark Shellenbaum dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2347744947dcSTom Erickson dnode_t *dn;
2348744947dcSTom Erickson
23490a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID)
2350be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP));
23510a586ceaSMark Shellenbaum if (blksz == 0)
23520a586ceaSMark Shellenbaum blksz = SPA_MINBLOCKSIZE;
2353b5152584SMatthew Ahrens ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
23540a586ceaSMark Shellenbaum blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
23550a586ceaSMark Shellenbaum
2356744947dcSTom Erickson DB_DNODE_ENTER(db);
2357744947dcSTom Erickson dn = DB_DNODE(db);
2358744947dcSTom Erickson rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
23590a586ceaSMark Shellenbaum dbuf_new_size(db, blksz, tx);
2360744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock);
2361744947dcSTom Erickson DB_DNODE_EXIT(db);
23620a586ceaSMark Shellenbaum
23630a586ceaSMark Shellenbaum return (0);
23640a586ceaSMark Shellenbaum }
23650a586ceaSMark Shellenbaum
23660a586ceaSMark Shellenbaum void
dbuf_rm_spill(dnode_t * dn,dmu_tx_t * tx)23670a586ceaSMark Shellenbaum dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
23680a586ceaSMark Shellenbaum {
23690a586ceaSMark Shellenbaum dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2370fa9e4066Sahrens }
2371fa9e4066Sahrens
#pragma weak dmu_buf_add_ref = dbuf_add_ref
/*
 * Add a hold to an already-held dbuf.  The caller must already have at
 * least one hold (otherwise the dbuf could be evicted concurrently),
 * which is what the assertion below checks.
 */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	/* refcount_add must run unconditionally; ASSERT is debug-only */
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}
2379fa9e4066Sahrens
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
/*
 * Attempt to add a hold to the given dbuf, but only if it is still the
 * dbuf currently in the hash table for (os, obj, blkid) and has at least
 * one hold beyond its dirty records.  Returns B_TRUE if the hold was
 * taken, B_FALSE otherwise.
 */
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
    void *tag)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dmu_buf_impl_t *found_db;
	boolean_t result = B_FALSE;

	/* dbuf_find()/dbuf_find_bonus() return with db_mtx held */
	if (db->db_blkid == DMU_BONUS_BLKID)
		found_db = dbuf_find_bonus(os, obj);
	else
		found_db = dbuf_find(os, obj, 0, blkid);

	if (found_db != NULL) {
		/* only take the hold if db is still the live dbuf */
		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
			(void) refcount_add(&db->db_holds, tag);
			result = B_TRUE;
		}
		mutex_exit(&db->db_mtx);
	}
	return (result);
}
2403e57a022bSJustin T. Gibbs
/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	/* dbuf_rele_and_unlock() drops db_mtx */
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}
2417b24ab676SJeff Bonwick
241843466aaeSMax Grossman void
dmu_buf_rele(dmu_buf_t * db,void * tag)241943466aaeSMax Grossman dmu_buf_rele(dmu_buf_t *db, void *tag)
242043466aaeSMax Grossman {
242143466aaeSMax Grossman dbuf_rele((dmu_buf_impl_t *)db, tag);
242243466aaeSMax Grossman }
242343466aaeSMax Grossman
/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.  Always exits with
 * db_mtx released; when the last hold is dropped the dbuf may be evicted
 * or destroyed, so the caller must not touch db afterwards.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_user_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			dnode_t *dn;
			boolean_t evict_dbuf = db->db_pending_evict;

			/*
			 * If the dnode moves here, we cannot cross this
			 * barrier until the move completes.
			 */
			DB_DNODE_ENTER(db);

			dn = DB_DNODE(db);
			atomic_dec_32(&dn->dn_dbufs_count);

			/*
			 * Decrementing the dbuf count means that the bonus
			 * buffer's dnode hold is no longer discounted in
			 * dnode_move(). The dnode cannot move until after
			 * the dnode_rele() below.
			 */
			DB_DNODE_EXIT(db);

			/*
			 * Do not reference db after its lock is dropped.
			 * Another thread may evict it.
			 */
			mutex_exit(&db->db_mtx);

			if (evict_dbuf)
				dnode_evict_bonus(dn);

			dnode_rele(dn, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_clear_data(db);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db)) {
				if (db->db_blkptr != NULL &&
				    !BP_IS_HOLE(db->db_blkptr) &&
				    !BP_IS_EMBEDDED(db->db_blkptr)) {
					/*
					 * Snapshot the bp; dbuf_clear() may
					 * free db, so tell the ARC about the
					 * freed block afterwards from a copy.
					 */
					spa_t *spa =
					    dmu_objset_spa(db->db_objset);
					blkptr_t bp = *db->db_blkptr;
					dbuf_clear(db);
					arc_freed(spa, &bp);
				} else {
					dbuf_clear(db);
				}
			} else if (db->db_pending_evict ||
			    arc_buf_eviction_needed(db->db_buf)) {
				dbuf_clear(db);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}
2544fa9e4066Sahrens
2545fa9e4066Sahrens #pragma weak dmu_buf_refcount = dbuf_refcount
2546fa9e4066Sahrens uint64_t
dbuf_refcount(dmu_buf_impl_t * db)2547fa9e4066Sahrens dbuf_refcount(dmu_buf_impl_t *db)
2548fa9e4066Sahrens {
2549fa9e4066Sahrens return (refcount_count(&db->db_holds));
2550fa9e4066Sahrens }
2551fa9e4066Sahrens
2552fa9e4066Sahrens void *
dmu_buf_replace_user(dmu_buf_t * db_fake,dmu_buf_user_t * old_user,dmu_buf_user_t * new_user)2553bc9014e6SJustin Gibbs dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2554bc9014e6SJustin Gibbs dmu_buf_user_t *new_user)
2555fa9e4066Sahrens {
2556bc9014e6SJustin Gibbs dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2557bc9014e6SJustin Gibbs
2558bc9014e6SJustin Gibbs mutex_enter(&db->db_mtx);
2559bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_NOT_EVICTING);
2560bc9014e6SJustin Gibbs if (db->db_user == old_user)
2561bc9014e6SJustin Gibbs db->db_user = new_user;
2562bc9014e6SJustin Gibbs else
2563bc9014e6SJustin Gibbs old_user = db->db_user;
2564bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_NOT_EVICTING);
2565bc9014e6SJustin Gibbs mutex_exit(&db->db_mtx);
2566bc9014e6SJustin Gibbs
2567bc9014e6SJustin Gibbs return (old_user);
2568fa9e4066Sahrens }
2569fa9e4066Sahrens
2570fa9e4066Sahrens void *
dmu_buf_set_user(dmu_buf_t * db_fake,dmu_buf_user_t * user)2571bc9014e6SJustin Gibbs dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2572bc9014e6SJustin Gibbs {
2573bc9014e6SJustin Gibbs return (dmu_buf_replace_user(db_fake, NULL, user));
2574bc9014e6SJustin Gibbs }
2575bc9014e6SJustin Gibbs
2576bc9014e6SJustin Gibbs void *
dmu_buf_set_user_ie(dmu_buf_t * db_fake,dmu_buf_user_t * user)2577bc9014e6SJustin Gibbs dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2578fa9e4066Sahrens {
2579fa9e4066Sahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2580fa9e4066Sahrens
2581d2058105SJustin T. Gibbs db->db_user_immediate_evict = TRUE;
2582bc9014e6SJustin Gibbs return (dmu_buf_set_user(db_fake, user));
2583fa9e4066Sahrens }
2584fa9e4066Sahrens
2585fa9e4066Sahrens void *
dmu_buf_remove_user(dmu_buf_t * db_fake,dmu_buf_user_t * user)2586bc9014e6SJustin Gibbs dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2587fa9e4066Sahrens {
2588bc9014e6SJustin Gibbs return (dmu_buf_replace_user(db_fake, user, NULL));
2589fa9e4066Sahrens }
2590fa9e4066Sahrens
2591fa9e4066Sahrens void *
dmu_buf_get_user(dmu_buf_t * db_fake)2592fa9e4066Sahrens dmu_buf_get_user(dmu_buf_t *db_fake)
2593fa9e4066Sahrens {
2594fa9e4066Sahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2595fa9e4066Sahrens
2596bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_NOT_EVICTING);
2597bc9014e6SJustin Gibbs return (db->db_user);
2598bc9014e6SJustin Gibbs }
2599bc9014e6SJustin Gibbs
2600bc9014e6SJustin Gibbs void
dmu_buf_user_evict_wait()2601bc9014e6SJustin Gibbs dmu_buf_user_evict_wait()
2602bc9014e6SJustin Gibbs {
2603bc9014e6SJustin Gibbs taskq_wait(dbu_evict_taskq);
2604fa9e4066Sahrens }
2605fa9e4066Sahrens
26063d692628SSanjeev Bagewadi boolean_t
dmu_buf_freeable(dmu_buf_t * dbuf)26073d692628SSanjeev Bagewadi dmu_buf_freeable(dmu_buf_t *dbuf)
26083d692628SSanjeev Bagewadi {
26093d692628SSanjeev Bagewadi boolean_t res = B_FALSE;
26103d692628SSanjeev Bagewadi dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
26113d692628SSanjeev Bagewadi
26123d692628SSanjeev Bagewadi if (db->db_blkptr)
26133d692628SSanjeev Bagewadi res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2614c7cd2421SGeorge Wilson db->db_blkptr, db->db_blkptr->blk_birth);
26153d692628SSanjeev Bagewadi
26163d692628SSanjeev Bagewadi return (res);
26173d692628SSanjeev Bagewadi }
26183d692628SSanjeev Bagewadi
261980901aeaSGeorge Wilson blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t * db)262080901aeaSGeorge Wilson dmu_buf_get_blkptr(dmu_buf_t *db)
262180901aeaSGeorge Wilson {
262280901aeaSGeorge Wilson dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
262380901aeaSGeorge Wilson return (dbi->db_blkptr);
262480901aeaSGeorge Wilson }
262580901aeaSGeorge Wilson
/*
 * Ensure db->db_blkptr points at its slot in the parent: the dnode's
 * spill blkptr, the dnode's blkptr array (for top-level blocks), or an
 * entry inside a parent indirect block.  db_parent is hooked up along
 * the way.  Called with db_mtx held; the lock may be dropped and
 * re-acquired while looking up a parent indirect buffer.
 */
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		/* Spill blocks have a dedicated slot in the dnode phys. */
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			/*
			 * Look up (and hold) the parent indirect block;
			 * db_mtx must be dropped across dbuf_hold_level().
			 */
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			parent = dbuf_hold_level(dn, db->db_level + 1,
			    db->db_blkid >> epbs, db);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		/* Our slot within the parent's array of block pointers. */
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}
2670c717a561Smaybee
/*
 * Sync an indirect (level > 0) dirty record: make sure the block is in
 * memory, hook up its block pointer, issue the write for this block,
 * then sync all dirty children into the child zio before initiating it.
 */
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	/*
	 * Sync our dirty children into dr_zio before initiating it, so
	 * their writes become children of this block's write.
	 */
	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
2716c717a561Smaybee
/*
 * Sync a level-0 (leaf) dirty record.  Handles the bonus-buffer special
 * case (data is copied straight into the dnode phys), waits out any
 * in-flight dmu_sync() override, makes a private copy of the ARC buffer
 * when the open txg could still modify it, and finally issues the write.
 */
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		/* Record in the dnode that it now carries a spill block. */
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		/* Free the dirty record's private copy, if one was made. */
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		/* Unlink this record from the dbuf's dirty list and free it. */
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		/* Drop the hold taken when the dbuf was dirtied. */
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNONE_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		/*
		 * Meta-dnode records are re-queued so their zios can be
		 * zio_wait()ed after all child IOs have been initiated
		 * (see dbuf_sync_list()).
		 */
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
2851c717a561Smaybee
2852c717a561Smaybee void
dbuf_sync_list(list_t * list,int level,dmu_tx_t * tx)285346e1baa6SMatthew Ahrens dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
2854c717a561Smaybee {
2855c717a561Smaybee dbuf_dirty_record_t *dr;
2856c717a561Smaybee
2857c717a561Smaybee while (dr = list_head(list)) {
2858c717a561Smaybee if (dr->dr_zio != NULL) {
2859c717a561Smaybee /*
2860c717a561Smaybee * If we find an already initialized zio then we
2861c717a561Smaybee * are processing the meta-dnode, and we have finished.
2862c717a561Smaybee * The dbufs for all dnodes are put back on the list
2863c717a561Smaybee * during processing, so that we can zio_wait()
2864c717a561Smaybee * these IOs after initiating all child IOs.
2865c717a561Smaybee */
2866c717a561Smaybee ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2867c717a561Smaybee DMU_META_DNODE_OBJECT);
2868c717a561Smaybee break;
2869fa9e4066Sahrens }
287046e1baa6SMatthew Ahrens if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
287146e1baa6SMatthew Ahrens dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
287246e1baa6SMatthew Ahrens VERIFY3U(dr->dr_dbuf->db_level, ==, level);
287346e1baa6SMatthew Ahrens }
2874c717a561Smaybee list_remove(list, dr);
2875c717a561Smaybee if (dr->dr_dbuf->db_level > 0)
2876c717a561Smaybee dbuf_sync_indirect(dr, tx);
2877c717a561Smaybee else
2878c717a561Smaybee dbuf_sync_leaf(dr, tx);
2879c717a561Smaybee }
2880c717a561Smaybee }
2881c717a561Smaybee
/* ARGSUSED */
/*
 * zio "ready" callback for a dbuf write: runs once the block pointer
 * for the write has been filled in.  Updates dnode space accounting,
 * maintains dn_maxblkid, and computes the block pointer's fill count.
 */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT3P(db->db_blkptr, ==, bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Charge the dnode for the change in on-disk size. */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		/* A non-hole bp must match the dnode's type and our level. */
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
		    BP_IS_EMBEDDED(bp));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		/* Grow dn_maxblkid if this write extends the object. */
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Fill = number of allocated dnodes in this block. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		/* Indirect block: fill is the sum of the children's fills. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += BP_GET_FILL(ibp);
		}
	}
	DB_DNODE_EXIT(db);

	if (!BP_IS_EMBEDDED(bp))
		bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
2959fa9e4066Sahrens
296069962b56SMatthew Ahrens /*
296169962b56SMatthew Ahrens * The SPA will call this callback several times for each zio - once
296269962b56SMatthew Ahrens * for every physical child i/o (zio->io_phys_children times). This
296369962b56SMatthew Ahrens * allows the DMU to monitor the progress of each logical i/o. For example,
296469962b56SMatthew Ahrens * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
296569962b56SMatthew Ahrens * block. There may be a long delay before all copies/fragments are completed,
296669962b56SMatthew Ahrens * so this callback allows us to retire dirty space gradually, as the physical
296769962b56SMatthew Ahrens * i/os complete.
296869962b56SMatthew Ahrens */
296969962b56SMatthew Ahrens /* ARGSUSED */
297069962b56SMatthew Ahrens static void
dbuf_write_physdone(zio_t * zio,arc_buf_t * buf,void * arg)297169962b56SMatthew Ahrens dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
297269962b56SMatthew Ahrens {
297369962b56SMatthew Ahrens dmu_buf_impl_t *db = arg;
297469962b56SMatthew Ahrens objset_t *os = db->db_objset;
297569962b56SMatthew Ahrens dsl_pool_t *dp = dmu_objset_pool(os);
297669962b56SMatthew Ahrens dbuf_dirty_record_t *dr;
297769962b56SMatthew Ahrens int delta = 0;
297869962b56SMatthew Ahrens
297969962b56SMatthew Ahrens dr = db->db_data_pending;
298069962b56SMatthew Ahrens ASSERT3U(dr->dr_txg, ==, zio->io_txg);
298169962b56SMatthew Ahrens
298269962b56SMatthew Ahrens /*
298369962b56SMatthew Ahrens * The callback will be called io_phys_children times. Retire one
298469962b56SMatthew Ahrens * portion of our dirty space each time we are called. Any rounding
298569962b56SMatthew Ahrens * error will be cleaned up by dsl_pool_sync()'s call to
298669962b56SMatthew Ahrens * dsl_pool_undirty_space().
298769962b56SMatthew Ahrens */
298869962b56SMatthew Ahrens delta = dr->dr_accounted / zio->io_phys_children;
298969962b56SMatthew Ahrens dsl_pool_undirty_space(dp, delta, zio->io_txg);
299069962b56SMatthew Ahrens }
299169962b56SMatthew Ahrens
/* ARGSUSED */
/*
 * zio "done" callback for a dbuf write: runs after the write has fully
 * completed.  Updates dataset block birth/kill accounting, unlinks and
 * frees the dirty record, and drops the hold taken at dirty time.
 */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Find and unlink the dirty record that was just written. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			/* Release the private copy made in dbuf_sync_leaf. */
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	/* Wake any waiters, retire the dirty count, drop the dirty hold. */
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
3082b24ab676SJeff Bonwick
3083b24ab676SJeff Bonwick static void
dbuf_write_nofill_ready(zio_t * zio)3084b24ab676SJeff Bonwick dbuf_write_nofill_ready(zio_t *zio)
3085b24ab676SJeff Bonwick {
3086b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, zio->io_private);
3087b24ab676SJeff Bonwick }
3088b24ab676SJeff Bonwick
3089b24ab676SJeff Bonwick static void
dbuf_write_nofill_done(zio_t * zio)3090b24ab676SJeff Bonwick dbuf_write_nofill_done(zio_t *zio)
3091b24ab676SJeff Bonwick {
3092b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, zio->io_private);
3093b24ab676SJeff Bonwick }
3094b24ab676SJeff Bonwick
3095b24ab676SJeff Bonwick static void
dbuf_write_override_ready(zio_t * zio)3096b24ab676SJeff Bonwick dbuf_write_override_ready(zio_t *zio)
3097b24ab676SJeff Bonwick {
3098b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private;
3099b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf;
3100b24ab676SJeff Bonwick
3101b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, db);
3102b24ab676SJeff Bonwick }
3103b24ab676SJeff Bonwick
3104b24ab676SJeff Bonwick static void
dbuf_write_override_done(zio_t * zio)3105b24ab676SJeff Bonwick dbuf_write_override_done(zio_t *zio)
3106b24ab676SJeff Bonwick {
3107b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private;
3108b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf;
3109b24ab676SJeff Bonwick blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3110b24ab676SJeff Bonwick
3111b24ab676SJeff Bonwick mutex_enter(&db->db_mtx);
3112b24ab676SJeff Bonwick if (!BP_EQUAL(zio->io_bp, obp)) {
3113b24ab676SJeff Bonwick if (!BP_IS_HOLE(obp))
3114b24ab676SJeff Bonwick dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3115b24ab676SJeff Bonwick arc_release(dr->dt.dl.dr_data, db);
3116b24ab676SJeff Bonwick }
3117c717a561Smaybee mutex_exit(&db->db_mtx);
3118c717a561Smaybee
3119b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, db);
3120b24ab676SJeff Bonwick }
3121c717a561Smaybee
31223e30c24aSWill Andrews /* Issue I/O to commit a dirty buffer to disk. */
3123b24ab676SJeff Bonwick static void
dbuf_write(dbuf_dirty_record_t * dr,arc_buf_t * data,dmu_tx_t * tx)3124b24ab676SJeff Bonwick dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3125b24ab676SJeff Bonwick {
3126b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf;
3127744947dcSTom Erickson dnode_t *dn;
3128744947dcSTom Erickson objset_t *os;
3129b24ab676SJeff Bonwick dmu_buf_impl_t *parent = db->db_parent;
3130b24ab676SJeff Bonwick uint64_t txg = tx->tx_txg;
31317802d7bfSMatthew Ahrens zbookmark_phys_t zb;
3132b24ab676SJeff Bonwick zio_prop_t zp;
3133b24ab676SJeff Bonwick zio_t *zio;
31340a586ceaSMark Shellenbaum int wp_flag = 0;
3135b24ab676SJeff Bonwick
3136744947dcSTom Erickson DB_DNODE_ENTER(db);
3137744947dcSTom Erickson dn = DB_DNODE(db);
3138744947dcSTom Erickson os = dn->dn_objset;
3139744947dcSTom Erickson
3140b24ab676SJeff Bonwick if (db->db_state != DB_NOFILL) {
3141b24ab676SJeff Bonwick if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3142b24ab676SJeff Bonwick /*
3143b24ab676SJeff Bonwick * Private object buffers are released here rather
3144b24ab676SJeff Bonwick * than in dbuf_dirty() since they are only modified
3145b24ab676SJeff Bonwick * in the syncing context and we don't want the
3146b24ab676SJeff Bonwick * overhead of making multiple copies of the data.
3147b24ab676SJeff Bonwick */
3148b24ab676SJeff Bonwick if (BP_IS_HOLE(db->db_blkptr)) {
3149b24ab676SJeff Bonwick arc_buf_thaw(data);
3150b24ab676SJeff Bonwick } else {
31513f9d6ad7SLin Ling dbuf_release_bp(db);
3152b24ab676SJeff Bonwick }
3153b24ab676SJeff Bonwick }
3154b24ab676SJeff Bonwick }
3155b24ab676SJeff Bonwick
3156b24ab676SJeff Bonwick if (parent != dn->dn_dbuf) {
31573e30c24aSWill Andrews /* Our parent is an indirect block. */
31583e30c24aSWill Andrews /* We have a dirty parent that has been scheduled for write. */
3159b24ab676SJeff Bonwick ASSERT(parent && parent->db_data_pending);
31603e30c24aSWill Andrews /* Our parent's buffer is one level closer to the dnode. */
3161b24ab676SJeff Bonwick ASSERT(db->db_level == parent->db_level-1);
31623e30c24aSWill Andrews /*
31633e30c24aSWill Andrews * We're about to modify our parent's db_data by modifying
31643e30c24aSWill Andrews * our block pointer, so the parent must be released.
31653e30c24aSWill Andrews */
3166b24ab676SJeff Bonwick ASSERT(arc_released(parent->db_buf));
3167b24ab676SJeff Bonwick zio = parent->db_data_pending->dr_zio;
3168b24ab676SJeff Bonwick } else {
31693e30c24aSWill Andrews /* Our parent is the dnode itself. */
31700a586ceaSMark Shellenbaum ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
31710a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) ||
31720a586ceaSMark Shellenbaum (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
31730a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID)
3174b24ab676SJeff Bonwick ASSERT3P(db->db_blkptr, ==,
3175b24ab676SJeff Bonwick &dn->dn_phys->dn_blkptr[db->db_blkid]);
3176b24ab676SJeff Bonwick zio = dn->dn_zio;
3177b24ab676SJeff Bonwick }
3178b24ab676SJeff Bonwick
3179b24ab676SJeff Bonwick ASSERT(db->db_level == 0 || data == db->db_buf);
3180b24ab676SJeff Bonwick ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3181b24ab676SJeff Bonwick ASSERT(zio);
3182b24ab676SJeff Bonwick
3183b24ab676SJeff Bonwick SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3184b24ab676SJeff Bonwick os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3185b24ab676SJeff Bonwick db->db.db_object, db->db_level, db->db_blkid);
3186b24ab676SJeff Bonwick
31870a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID)
31880a586ceaSMark Shellenbaum wp_flag = WP_SPILL;
31890a586ceaSMark Shellenbaum wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
31900a586ceaSMark Shellenbaum
31910a586ceaSMark Shellenbaum dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3192744947dcSTom Erickson DB_DNODE_EXIT(db);
3193b24ab676SJeff Bonwick
31945d7b4d43SMatthew Ahrens if (db->db_level == 0 &&
31955d7b4d43SMatthew Ahrens dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
31965d7b4d43SMatthew Ahrens /*
31975d7b4d43SMatthew Ahrens * The BP for this block has been provided by open context
31985d7b4d43SMatthew Ahrens * (by dmu_sync() or dmu_buf_write_embedded()).
31995d7b4d43SMatthew Ahrens */
32005d7b4d43SMatthew Ahrens void *contents = (data != NULL) ? data->b_data : NULL;
32015d7b4d43SMatthew Ahrens
3202b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg,
32035d7b4d43SMatthew Ahrens db->db_blkptr, contents, db->db.db_size, &zp,
320469962b56SMatthew Ahrens dbuf_write_override_ready, NULL, dbuf_write_override_done,
320569962b56SMatthew Ahrens dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3206b24ab676SJeff Bonwick mutex_enter(&db->db_mtx);
3207b24ab676SJeff Bonwick dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3208b24ab676SJeff Bonwick zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
320980901aeaSGeorge Wilson dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3210b24ab676SJeff Bonwick mutex_exit(&db->db_mtx);
3211b24ab676SJeff Bonwick } else if (db->db_state == DB_NOFILL) {
3212810e43b2SBill Pijewski ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3213810e43b2SBill Pijewski zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3214b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg,
3215b24ab676SJeff Bonwick db->db_blkptr, NULL, db->db.db_size, &zp,
321669962b56SMatthew Ahrens dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
3217b24ab676SJeff Bonwick ZIO_PRIORITY_ASYNC_WRITE,
3218b24ab676SJeff Bonwick ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3219b24ab676SJeff Bonwick } else {
3220b24ab676SJeff Bonwick ASSERT(arc_released(data));
3221b24ab676SJeff Bonwick dr->dr_zio = arc_write(zio, os->os_spa, txg,
3222aad02571SSaso Kiselkov db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
3223aad02571SSaso Kiselkov DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
322469962b56SMatthew Ahrens dbuf_write_physdone, dbuf_write_done, db,
322569962b56SMatthew Ahrens ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3226b24ab676SJeff Bonwick }
3227fa9e4066Sahrens }
3228