xref: /freebsd/sys/contrib/openzfs/module/zfs/dmu_direct.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 
24 #include <sys/dmu.h>
25 #include <sys/dmu_impl.h>
26 #include <sys/dbuf.h>
27 #include <sys/dnode.h>
28 #include <sys/zfs_context.h>
29 #include <sys/zfs_racct.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dmu_objset.h>
32 
33 static abd_t *
make_abd_for_dbuf(dmu_buf_impl_t * db,abd_t * data,uint64_t offset,uint64_t size)34 make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
35     uint64_t size)
36 {
37 	size_t buf_size = db->db.db_size;
38 	abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
39 	size_t buf_off = 0;
40 
41 	ASSERT(MUTEX_HELD(&db->db_mtx));
42 
43 	if (offset > db->db.db_offset) {
44 		size_t pre_size = offset - db->db.db_offset;
45 		pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
46 		buf_size -= pre_size;
47 		buf_off = 0;
48 	} else {
49 		buf_off = db->db.db_offset - offset;
50 		size -= buf_off;
51 	}
52 
53 	if (size < buf_size) {
54 		size_t post_size = buf_size - size;
55 		post_buf = abd_alloc_for_io(post_size, B_TRUE);
56 		buf_size -= post_size;
57 	}
58 
59 	ASSERT3U(buf_size, >, 0);
60 	abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
61 
62 	if (pre_buf || post_buf) {
63 		mbuf = abd_alloc_gang();
64 		if (pre_buf)
65 			abd_gang_add(mbuf, pre_buf, B_TRUE);
66 		abd_gang_add(mbuf, buf, B_TRUE);
67 		if (post_buf)
68 			abd_gang_add(mbuf, post_buf, B_TRUE);
69 	} else {
70 		mbuf = buf;
71 	}
72 
73 	return (mbuf);
74 }
75 
76 static void
dmu_read_abd_done(zio_t * zio)77 dmu_read_abd_done(zio_t *zio)
78 {
79 	abd_free(zio->io_abd);
80 }
81 
82 static void
dmu_write_direct_ready(zio_t * zio)83 dmu_write_direct_ready(zio_t *zio)
84 {
85 	dmu_sync_ready(zio, NULL, zio->io_private);
86 }
87 
88 static void
dmu_write_direct_done(zio_t * zio)89 dmu_write_direct_done(zio_t *zio)
90 {
91 	dmu_sync_arg_t *dsa = zio->io_private;
92 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
93 	dmu_buf_impl_t *db = dr->dr_dbuf;
94 
95 	abd_free(zio->io_abd);
96 
97 	mutex_enter(&db->db_mtx);
98 	ASSERT3P(db->db_buf, ==, NULL);
99 	ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
100 	ASSERT3P(db->db.db_data, ==, NULL);
101 	db->db_state = DB_UNCACHED;
102 	mutex_exit(&db->db_mtx);
103 
104 	dmu_sync_done(zio, NULL, zio->io_private);
105 
106 	if (zio->io_error != 0) {
107 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
108 			ASSERT3U(zio->io_error, ==, EIO);
109 
110 		/*
111 		 * In the event of an I/O error this block has been freed in
112 		 * zio_done() through zio_dva_unallocate(). Calling
113 		 * dmu_sync_done() above set dr_override_state to
114 		 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
115 		 * dbuf_unoverride(), it will skip doing zio_free() to free
116 		 * this block as that was already taken care of.
117 		 *
118 		 * Since we are undirtying the record in open-context, we must
119 		 * have a hold on the db, so it should never be evicted after
120 		 * calling dbuf_undirty().
121 		 */
122 		mutex_enter(&db->db_mtx);
123 		VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
124 		mutex_exit(&db->db_mtx);
125 	}
126 
127 	kmem_free(zio->io_bp, sizeof (blkptr_t));
128 	zio->io_bp = NULL;
129 }
130 
131 int
dmu_write_direct(zio_t * pio,dmu_buf_impl_t * db,abd_t * data,dmu_tx_t * tx)132 dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
133 {
134 	objset_t *os = db->db_objset;
135 	dsl_dataset_t *ds = dmu_objset_ds(os);
136 	zbookmark_phys_t zb;
137 	dbuf_dirty_record_t *dr_head;
138 
139 	SET_BOOKMARK(&zb, ds->ds_object,
140 	    db->db.db_object, db->db_level, db->db_blkid);
141 
142 	DB_DNODE_ENTER(db);
143 	zio_prop_t zp;
144 	dmu_write_policy(os, DB_DNODE(db), db->db_level,
145 	    WP_DMU_SYNC | WP_DIRECT_WR, &zp);
146 	DB_DNODE_EXIT(db);
147 
148 	/*
149 	 * Dirty this dbuf with DB_NOFILL since we will not have any data
150 	 * associated with the dbuf.
151 	 */
152 	dmu_buf_will_clone_or_dio(&db->db, tx);
153 
154 	mutex_enter(&db->db_mtx);
155 
156 	uint64_t txg = dmu_tx_get_txg(tx);
157 	ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
158 	ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
159 
160 	dr_head = list_head(&db->db_dirty_records);
161 	ASSERT3U(dr_head->dr_txg, ==, txg);
162 	dr_head->dt.dl.dr_diowrite = B_TRUE;
163 	dr_head->dr_accounted = db->db.db_size;
164 
165 	blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
166 	if (db->db_blkptr != NULL) {
167 		/*
168 		 * Fill in bp with the current block pointer so that
169 		 * the nopwrite code can check if we're writing the same
170 		 * data that's already on disk.
171 		 */
172 		*bp = *db->db_blkptr;
173 	} else {
174 		memset(bp, 0, sizeof (blkptr_t));
175 	}
176 
177 	/*
178 	 * Disable nopwrite if the current block pointer could change
179 	 * before this TXG syncs.
180 	 */
181 	if (list_next(&db->db_dirty_records, dr_head) != NULL)
182 		zp.zp_nopwrite = B_FALSE;
183 
184 	ASSERT0(dr_head->dt.dl.dr_has_raw_params);
185 	ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
186 	dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
187 
188 	mutex_exit(&db->db_mtx);
189 
190 	dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
191 
192 	dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
193 	dsa->dsa_dr = dr_head;
194 	dsa->dsa_tx = tx;
195 
196 	zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
197 	    db->db.db_size, db->db.db_size, &zp,
198 	    dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
199 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
200 
201 	if (pio == NULL)
202 		return (zio_wait(zio));
203 
204 	zio_nowait(zio);
205 
206 	return (0);
207 }
208 
209 int
dmu_write_abd(dnode_t * dn,uint64_t offset,uint64_t size,abd_t * data,uint32_t flags,dmu_tx_t * tx)210 dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
211     abd_t *data, uint32_t flags, dmu_tx_t *tx)
212 {
213 	dmu_buf_t **dbp;
214 	spa_t *spa = dn->dn_objset->os_spa;
215 	int numbufs, err;
216 
217 	ASSERT(flags & DMU_DIRECTIO);
218 
219 	err = dmu_buf_hold_array_by_dnode(dn, offset,
220 	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
221 	if (err)
222 		return (err);
223 
224 	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
225 
226 	for (int i = 0; i < numbufs && err == 0; i++) {
227 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
228 
229 		abd_t *abd = abd_get_offset_size(data,
230 		    db->db.db_offset - offset, dn->dn_datablksz);
231 
232 		zfs_racct_write(spa, db->db.db_size, 1, flags);
233 		err = dmu_write_direct(pio, db, abd, tx);
234 		ASSERT0(err);
235 	}
236 
237 	err = zio_wait(pio);
238 
239 	/*
240 	 * The dbuf must be held until the Direct I/O write has completed in
241 	 * the event there was any errors and dbuf_undirty() was called.
242 	 */
243 	dmu_buf_rele_array(dbp, numbufs, FTAG);
244 
245 	return (err);
246 }
247 
248 int
dmu_read_abd(dnode_t * dn,uint64_t offset,uint64_t size,abd_t * data,uint32_t flags)249 dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
250     abd_t *data, uint32_t flags)
251 {
252 	objset_t *os = dn->dn_objset;
253 	spa_t *spa = os->os_spa;
254 	dmu_buf_t **dbp;
255 	int numbufs, err;
256 
257 	ASSERT(flags & DMU_DIRECTIO);
258 
259 	err = dmu_buf_hold_array_by_dnode(dn, offset,
260 	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
261 	if (err)
262 		return (err);
263 
264 	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
265 
266 	for (int i = 0; i < numbufs; i++) {
267 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
268 		abd_t *mbuf;
269 		zbookmark_phys_t zb;
270 		blkptr_t *bp;
271 
272 		mutex_enter(&db->db_mtx);
273 
274 		SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
275 		    db->db.db_object, db->db_level, db->db_blkid);
276 
277 		/*
278 		 * If there is another read for this dbuf, we will wait for
279 		 * that to complete first before checking the db_state below.
280 		 */
281 		while (db->db_state == DB_READ)
282 			cv_wait(&db->db_changed, &db->db_mtx);
283 
284 		err = dmu_buf_get_bp_from_dbuf(db, &bp);
285 		if (err) {
286 			mutex_exit(&db->db_mtx);
287 			goto error;
288 		}
289 
290 		/*
291 		 * There is no need to read if this is a hole or the data is
292 		 * cached. This will not be considered a direct read for IO
293 		 * accounting in the same way that an ARC hit is not counted.
294 		 */
295 		if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
296 			size_t aoff = offset < db->db.db_offset ?
297 			    db->db.db_offset - offset : 0;
298 			size_t boff = offset > db->db.db_offset ?
299 			    offset - db->db.db_offset : 0;
300 			size_t len = MIN(size - aoff, db->db.db_size - boff);
301 
302 			if (db->db_state == DB_CACHED) {
303 				/*
304 				 * We need to untransformed the ARC buf data
305 				 * before we copy it over.
306 				 */
307 				err = dmu_buf_untransform_direct(db, spa);
308 				ASSERT0(err);
309 				abd_copy_from_buf_off(data,
310 				    (char *)db->db.db_data + boff, aoff, len);
311 			} else {
312 				abd_zero_off(data, aoff, len);
313 			}
314 
315 			mutex_exit(&db->db_mtx);
316 			continue;
317 		}
318 
319 		mbuf = make_abd_for_dbuf(db, data, offset, size);
320 		ASSERT3P(mbuf, !=, NULL);
321 
322 		/*
323 		 * The dbuf mutex (db_mtx) must be held when creating the ZIO
324 		 * for the read. The BP returned from
325 		 * dmu_buf_get_bp_from_dbuf() could be from a pending block
326 		 * clone or a yet to be synced Direct I/O write that is in the
327 		 * dbuf's dirty record. When zio_read() is called, zio_create()
328 		 * will make a copy of the BP. However, if zio_read() is called
329 		 * without the mutex being held then the dirty record from the
330 		 * dbuf could be freed in dbuf_write_done() resulting in garbage
331 		 * being set for the zio BP.
332 		 */
333 		zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
334 		    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
335 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
336 		mutex_exit(&db->db_mtx);
337 
338 		zfs_racct_read(spa, db->db.db_size, 1, flags);
339 		zio_nowait(cio);
340 	}
341 
342 	dmu_buf_rele_array(dbp, numbufs, FTAG);
343 
344 	return (zio_wait(rio));
345 
346 error:
347 	dmu_buf_rele_array(dbp, numbufs, FTAG);
348 	(void) zio_wait(rio);
349 	return (err);
350 }
351 
352 #ifdef _KERNEL
353 int
dmu_read_uio_direct(dnode_t * dn,zfs_uio_t * uio,uint64_t size)354 dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
355 {
356 	offset_t offset = zfs_uio_offset(uio);
357 	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
358 	int err;
359 
360 	ASSERT(uio->uio_extflg & UIO_DIRECT);
361 	ASSERT3U(page_index, <, uio->uio_dio.npages);
362 
363 	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
364 	    offset & (PAGESIZE - 1), size);
365 	err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
366 	abd_free(data);
367 
368 	if (err == 0)
369 		zfs_uioskip(uio, size);
370 
371 	return (err);
372 }
373 
374 int
dmu_write_uio_direct(dnode_t * dn,zfs_uio_t * uio,uint64_t size,dmu_tx_t * tx)375 dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
376 {
377 	offset_t offset = zfs_uio_offset(uio);
378 	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
379 	int err;
380 
381 	ASSERT(uio->uio_extflg & UIO_DIRECT);
382 	ASSERT3U(page_index, <, uio->uio_dio.npages);
383 
384 	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
385 	    offset & (PAGESIZE - 1), size);
386 	err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
387 	abd_free(data);
388 
389 	if (err == 0)
390 		zfs_uioskip(uio, size);
391 
392 	return (err);
393 }
394 #endif /* _KERNEL */
395 
396 EXPORT_SYMBOL(dmu_read_uio_direct);
397 EXPORT_SYMBOL(dmu_write_uio_direct);
398