xref: /linux/fs/btrfs/direct-io.c (revision 7696286034ac72cf9b46499be1715ac62fd302c3)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/fsverity.h>
4 #include <linux/iomap.h>
5 #include "ctree.h"
6 #include "delalloc-space.h"
7 #include "direct-io.h"
8 #include "extent-tree.h"
9 #include "file.h"
10 #include "fs.h"
11 #include "transaction.h"
12 #include "volumes.h"
13 #include "bio.h"
14 #include "ordered-data.h"
15 
/*
 * State shared between the iomap callbacks of a direct IO operation. It is
 * reached through iomap_iter::private and is zeroed at the start of every
 * btrfs_dio_iomap_begin() call.
 */
struct btrfs_dio_data {
	/* Bytes of bios handed to btrfs_dio_submit_io() for the current mapping. */
	ssize_t submitted;
	/* Changeset passed to btrfs_check_data_free_space() for writes. */
	struct extent_changeset *data_reserved;
	/* Ordered extent created for the current write mapping. */
	struct btrfs_ordered_extent *ordered;
	/* Set when data space was reserved before locking the file range. */
	bool data_space_reserved;
	/* Set by btrfs_get_blocks_direct_write() when the write is done NOCOW. */
	bool nocow_done;
};
23 
/*
 * Per-bio data for direct IO. The btrfs_bio is embedded, and the containing
 * structure is recovered from it with container_of().
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};
32 
/* Bio set handed to the iomap direct IO code through btrfs_dio_ops.bio_set. */
static struct bio_set btrfs_dio_bioset;
34 
35 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
36 			      struct extent_state **cached_state,
37 			      unsigned int iomap_flags)
38 {
39 	const bool writing = (iomap_flags & IOMAP_WRITE);
40 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
41 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
42 	struct btrfs_ordered_extent *ordered;
43 	int ret = 0;
44 
45 	/* Direct lock must be taken before the extent lock. */
46 	if (nowait) {
47 		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
48 			return -EAGAIN;
49 	} else {
50 		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
51 	}
52 
53 	while (1) {
54 		if (nowait) {
55 			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
56 						   cached_state)) {
57 				ret = -EAGAIN;
58 				break;
59 			}
60 		} else {
61 			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
62 		}
63 		/*
64 		 * We're concerned with the entire range that we're going to be
65 		 * doing DIO to, so we need to make sure there's no ordered
66 		 * extents in this range.
67 		 */
68 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
69 						     lockend - lockstart + 1);
70 
71 		/*
72 		 * We need to make sure there are no buffered pages in this
73 		 * range either, we could have raced between the invalidate in
74 		 * generic_file_direct_write and locking the extent.  The
75 		 * invalidate needs to happen so that reads after a write do not
76 		 * get stale data.
77 		 */
78 		if (!ordered &&
79 		    (!writing || !filemap_range_has_page(inode->i_mapping,
80 							 lockstart, lockend)))
81 			break;
82 
83 		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);
84 
85 		if (ordered) {
86 			if (nowait) {
87 				btrfs_put_ordered_extent(ordered);
88 				ret = -EAGAIN;
89 				break;
90 			}
91 			/*
92 			 * If we are doing a DIO read and the ordered extent we
93 			 * found is for a buffered write, we can not wait for it
94 			 * to complete and retry, because if we do so we can
95 			 * deadlock with concurrent buffered writes on page
96 			 * locks. This happens only if our DIO read covers more
97 			 * than one extent map, if at this point has already
98 			 * created an ordered extent for a previous extent map
99 			 * and locked its range in the inode's io tree, and a
100 			 * concurrent write against that previous extent map's
101 			 * range and this range started (we unlock the ranges
102 			 * in the io tree only when the bios complete and
103 			 * buffered writes always lock pages before attempting
104 			 * to lock range in the io tree).
105 			 */
106 			if (writing ||
107 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
108 				btrfs_start_ordered_extent(ordered);
109 			else
110 				ret = nowait ? -EAGAIN : -ENOTBLK;
111 			btrfs_put_ordered_extent(ordered);
112 		} else {
113 			/*
114 			 * We could trigger writeback for this range (and wait
115 			 * for it to complete) and then invalidate the pages for
116 			 * this range (through invalidate_inode_pages2_range()),
117 			 * but that can lead us to a deadlock with a concurrent
118 			 * call to readahead (a buffered read or a defrag call
119 			 * triggered a readahead) on a page lock due to an
120 			 * ordered dio extent we created before but did not have
121 			 * yet a corresponding bio submitted (whence it can not
122 			 * complete), which makes readahead wait for that
123 			 * ordered extent to complete while holding a lock on
124 			 * that page.
125 			 */
126 			ret = nowait ? -EAGAIN : -ENOTBLK;
127 		}
128 
129 		if (ret)
130 			break;
131 
132 		cond_resched();
133 	}
134 
135 	if (ret)
136 		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
137 	return ret;
138 }
139 
140 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
141 						  struct btrfs_dio_data *dio_data,
142 						  const u64 start,
143 						  const struct btrfs_file_extent *file_extent,
144 						  const int type)
145 {
146 	struct extent_map *em = NULL;
147 	struct btrfs_ordered_extent *ordered;
148 
149 	if (type != BTRFS_ORDERED_NOCOW) {
150 		em = btrfs_create_io_em(inode, start, file_extent, type);
151 		if (IS_ERR(em))
152 			goto out;
153 	}
154 
155 	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
156 					     (1U << type) |
157 					     (1U << BTRFS_ORDERED_DIRECT));
158 	if (IS_ERR(ordered)) {
159 		if (em) {
160 			btrfs_free_extent_map(em);
161 			btrfs_drop_extent_map_range(inode, start,
162 					start + file_extent->num_bytes - 1, false);
163 		}
164 		em = ERR_CAST(ordered);
165 	} else {
166 		ASSERT(!dio_data->ordered);
167 		dio_data->ordered = ordered;
168 	}
169  out:
170 
171 	return em;
172 }
173 
174 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
175 						  struct btrfs_dio_data *dio_data,
176 						  u64 start, u64 len)
177 {
178 	struct btrfs_root *root = inode->root;
179 	struct btrfs_fs_info *fs_info = root->fs_info;
180 	struct btrfs_file_extent file_extent;
181 	struct extent_map *em;
182 	struct btrfs_key ins;
183 	u64 alloc_hint;
184 	int ret;
185 
186 	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
187 again:
188 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
189 				   0, alloc_hint, &ins, true, true);
190 	if (ret == -EAGAIN) {
191 		ASSERT(btrfs_is_zoned(fs_info));
192 		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
193 			       TASK_UNINTERRUPTIBLE);
194 		goto again;
195 	}
196 	if (ret)
197 		return ERR_PTR(ret);
198 
199 	file_extent.disk_bytenr = ins.objectid;
200 	file_extent.disk_num_bytes = ins.offset;
201 	file_extent.num_bytes = ins.offset;
202 	file_extent.ram_bytes = ins.offset;
203 	file_extent.offset = 0;
204 	file_extent.compression = BTRFS_COMPRESS_NONE;
205 	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
206 				     BTRFS_ORDERED_REGULAR);
207 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
208 	if (IS_ERR(em))
209 		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
210 
211 	return em;
212 }
213 
214 static int btrfs_get_blocks_direct_write(struct extent_map **map,
215 					 struct inode *inode,
216 					 struct btrfs_dio_data *dio_data,
217 					 u64 start, u64 *lenp,
218 					 unsigned int iomap_flags)
219 {
220 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
221 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
222 	struct btrfs_file_extent file_extent;
223 	struct extent_map *em = *map;
224 	int type;
225 	u64 block_start;
226 	struct btrfs_block_group *bg;
227 	bool can_nocow = false;
228 	bool space_reserved = false;
229 	u64 len = *lenp;
230 	u64 prev_len;
231 	int ret = 0;
232 
233 	/*
234 	 * We don't allocate a new extent in the following cases
235 	 *
236 	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
237 	 * existing extent.
238 	 * 2) The extent is marked as PREALLOC. We're good to go here and can
239 	 * just use the extent.
240 	 *
241 	 */
242 	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
243 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
244 	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
245 		if (em->flags & EXTENT_FLAG_PREALLOC)
246 			type = BTRFS_ORDERED_PREALLOC;
247 		else
248 			type = BTRFS_ORDERED_NOCOW;
249 		len = min(len, em->len - (start - em->start));
250 		block_start = btrfs_extent_map_block_start(em) + (start - em->start);
251 
252 		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
253 				     false) == 1) {
254 			bg = btrfs_inc_nocow_writers(fs_info, block_start);
255 			if (bg)
256 				can_nocow = true;
257 		}
258 	}
259 
260 	prev_len = len;
261 	if (can_nocow) {
262 		struct extent_map *em2;
263 
264 		/* We can NOCOW, so only need to reserve metadata space. */
265 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
266 						      nowait);
267 		if (ret < 0) {
268 			/* Our caller expects us to free the input extent map. */
269 			btrfs_free_extent_map(em);
270 			*map = NULL;
271 			btrfs_dec_nocow_writers(bg);
272 			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
273 				ret = -EAGAIN;
274 			goto out;
275 		}
276 		space_reserved = true;
277 
278 		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
279 					      &file_extent, type);
280 		btrfs_dec_nocow_writers(bg);
281 		if (type == BTRFS_ORDERED_PREALLOC) {
282 			btrfs_free_extent_map(em);
283 			*map = em2;
284 			em = em2;
285 		}
286 
287 		if (IS_ERR(em2)) {
288 			ret = PTR_ERR(em2);
289 			goto out;
290 		}
291 
292 		dio_data->nocow_done = true;
293 	} else {
294 		/* Our caller expects us to free the input extent map. */
295 		btrfs_free_extent_map(em);
296 		*map = NULL;
297 
298 		if (nowait) {
299 			ret = -EAGAIN;
300 			goto out;
301 		}
302 
303 		/*
304 		 * If we could not allocate data space before locking the file
305 		 * range and we can't do a NOCOW write, then we have to fail.
306 		 */
307 		if (!dio_data->data_space_reserved) {
308 			ret = -ENOSPC;
309 			goto out;
310 		}
311 
312 		/*
313 		 * We have to COW and we have already reserved data space before,
314 		 * so now we reserve only metadata.
315 		 */
316 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
317 						      false);
318 		if (ret < 0)
319 			goto out;
320 		space_reserved = true;
321 
322 		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
323 		if (IS_ERR(em)) {
324 			ret = PTR_ERR(em);
325 			goto out;
326 		}
327 		*map = em;
328 		len = min(len, em->len - (start - em->start));
329 		if (len < prev_len)
330 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
331 							prev_len - len, true);
332 	}
333 
334 	/*
335 	 * We have created our ordered extent, so we can now release our reservation
336 	 * for an outstanding extent.
337 	 */
338 	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
339 
340 	/*
341 	 * Need to update the i_size under the extent lock so buffered
342 	 * readers will get the updated i_size when we unlock.
343 	 */
344 	if (start + len > i_size_read(inode))
345 		i_size_write(inode, start + len);
346 out:
347 	if (ret && space_reserved) {
348 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
349 		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
350 	}
351 	*lenp = len;
352 	return ret;
353 }
354 
355 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
356 		loff_t length, unsigned int flags, struct iomap *iomap,
357 		struct iomap *srcmap)
358 {
359 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
360 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
361 	struct extent_map *em;
362 	struct extent_state *cached_state = NULL;
363 	struct btrfs_dio_data *dio_data = iter->private;
364 	u64 lockstart, lockend;
365 	const bool write = !!(flags & IOMAP_WRITE);
366 	int ret = 0;
367 	u64 len = length;
368 	const u64 data_alloc_len = length;
369 	u32 unlock_bits = EXTENT_LOCKED;
370 
371 	/*
372 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
373 	 * we're NOWAIT we may submit a bio for a partial range and return
374 	 * EIOCBQUEUED, which would result in an errant short read.
375 	 *
376 	 * The best way to handle this would be to allow for partial completions
377 	 * of iocb's, so we could submit the partial bio, return and fault in
378 	 * the rest of the pages, and then submit the io for the rest of the
379 	 * range.  However we don't have that currently, so simply return
380 	 * -EAGAIN at this point so that the normal path is used.
381 	 */
382 	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
383 		return -EAGAIN;
384 
385 	/*
386 	 * Cap the size of reads to that usually seen in buffered I/O as we need
387 	 * to allocate a contiguous array for the checksums.
388 	 */
389 	if (!write)
390 		len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);
391 
392 	lockstart = start;
393 	lockend = start + len - 1;
394 
395 	/*
396 	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
397 	 * enough if we've written compressed pages to this area, so we need to
398 	 * flush the dirty pages again to make absolutely sure that any
399 	 * outstanding dirty pages are on disk - the first flush only starts
400 	 * compression on the data, while keeping the pages locked, so by the
401 	 * time the second flush returns we know bios for the compressed pages
402 	 * were submitted and finished, and the pages no longer under writeback.
403 	 *
404 	 * If we have a NOWAIT request and we have any pages in the range that
405 	 * are locked, likely due to compression still in progress, we don't want
406 	 * to block on page locks. We also don't want to block on pages marked as
407 	 * dirty or under writeback (same as for the non-compression case).
408 	 * iomap_dio_rw() did the same check, but after that and before we got
409 	 * here, mmap'ed writes may have happened or buffered reads started
410 	 * (readpage() and readahead(), which lock pages), as we haven't locked
411 	 * the file range yet.
412 	 */
413 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
414 		     &BTRFS_I(inode)->runtime_flags)) {
415 		if (flags & IOMAP_NOWAIT) {
416 			if (filemap_range_needs_writeback(inode->i_mapping,
417 							  lockstart, lockend))
418 				return -EAGAIN;
419 		} else {
420 			ret = filemap_fdatawrite_range(inode->i_mapping, start,
421 						       start + length - 1);
422 			if (ret)
423 				return ret;
424 		}
425 	}
426 
427 	memset(dio_data, 0, sizeof(*dio_data));
428 
429 	/*
430 	 * We always try to allocate data space and must do it before locking
431 	 * the file range, to avoid deadlocks with concurrent writes to the same
432 	 * range if the range has several extents and the writes don't expand the
433 	 * current i_size (the inode lock is taken in shared mode). If we fail to
434 	 * allocate data space here we continue and later, after locking the
435 	 * file range, we fail with ENOSPC only if we figure out we can not do a
436 	 * NOCOW write.
437 	 */
438 	if (write && !(flags & IOMAP_NOWAIT)) {
439 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
440 						  &dio_data->data_reserved,
441 						  start, data_alloc_len, false);
442 		if (!ret)
443 			dio_data->data_space_reserved = true;
444 		else if (!(BTRFS_I(inode)->flags &
445 			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
446 			goto err;
447 	}
448 
449 	/*
450 	 * If this errors out it's because we couldn't invalidate pagecache for
451 	 * this range and we need to fallback to buffered IO, or we are doing a
452 	 * NOWAIT read/write and we need to block.
453 	 */
454 	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
455 	if (ret < 0)
456 		goto err;
457 
458 	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
459 	if (IS_ERR(em)) {
460 		ret = PTR_ERR(em);
461 		goto unlock_err;
462 	}
463 
464 	/*
465 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
466 	 * io.  INLINE is special, and we could probably kludge it in here, but
467 	 * it's still buffered so for safety lets just fall back to the generic
468 	 * buffered path.
469 	 *
470 	 * For COMPRESSED we _have_ to read the entire extent in so we can
471 	 * decompress it, so there will be buffering required no matter what we
472 	 * do, so go ahead and fallback to buffered.
473 	 *
474 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
475 	 * to buffered IO.  Don't blame me, this is the price we pay for using
476 	 * the generic code.
477 	 */
478 	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
479 		btrfs_free_extent_map(em);
480 		/*
481 		 * If we are in a NOWAIT context, return -EAGAIN in order to
482 		 * fallback to buffered IO. This is not only because we can
483 		 * block with buffered IO (no support for NOWAIT semantics at
484 		 * the moment) but also to avoid returning short reads to user
485 		 * space - this happens if we were able to read some data from
486 		 * previous non-compressed extents and then when we fallback to
487 		 * buffered IO, at btrfs_file_read_iter() by calling
488 		 * filemap_read(), we fail to fault in pages for the read buffer,
489 		 * in which case filemap_read() returns a short read (the number
490 		 * of bytes previously read is > 0, so it does not return -EFAULT).
491 		 */
492 		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
493 		goto unlock_err;
494 	}
495 
496 	len = min(len, em->len - (start - em->start));
497 
498 	/*
499 	 * If we have a NOWAIT request and the range contains multiple extents
500 	 * (or a mix of extents and holes), then we return -EAGAIN to make the
501 	 * caller fallback to a context where it can do a blocking (without
502 	 * NOWAIT) request. This way we avoid doing partial IO and returning
503 	 * success to the caller, which is not optimal for writes and for reads
504 	 * it can result in unexpected behaviour for an application.
505 	 *
506 	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
507 	 * iomap_dio_rw(), we can end up returning less data then what the caller
508 	 * asked for, resulting in an unexpected, and incorrect, short read.
509 	 * That is, the caller asked to read N bytes and we return less than that,
510 	 * which is wrong unless we are crossing EOF. This happens if we get a
511 	 * page fault error when trying to fault in pages for the buffer that is
512 	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
513 	 * have previously submitted bios for other extents in the range, in
514 	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
515 	 * those bios have completed by the time we get the page fault error,
516 	 * which we return back to our caller - we should only return EIOCBQUEUED
517 	 * after we have submitted bios for all the extents in the range.
518 	 */
519 	if ((flags & IOMAP_NOWAIT) && len < length) {
520 		btrfs_free_extent_map(em);
521 		ret = -EAGAIN;
522 		goto unlock_err;
523 	}
524 
525 	if (write) {
526 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
527 						    start, &len, flags);
528 		if (ret < 0)
529 			goto unlock_err;
530 		/* Recalc len in case the new em is smaller than requested */
531 		len = min(len, em->len - (start - em->start));
532 		if (dio_data->data_space_reserved) {
533 			u64 release_offset;
534 			u64 release_len = 0;
535 
536 			if (dio_data->nocow_done) {
537 				release_offset = start;
538 				release_len = data_alloc_len;
539 			} else if (len < data_alloc_len) {
540 				release_offset = start + len;
541 				release_len = data_alloc_len - len;
542 			}
543 
544 			if (release_len > 0)
545 				btrfs_free_reserved_data_space(BTRFS_I(inode),
546 							       dio_data->data_reserved,
547 							       release_offset,
548 							       release_len);
549 		}
550 	}
551 
552 	/*
553 	 * Translate extent map information to iomap.
554 	 * We trim the extents (and move the addr) even though iomap code does
555 	 * that, since we have locked only the parts we are performing I/O in.
556 	 */
557 	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
558 	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
559 		iomap->addr = IOMAP_NULL_ADDR;
560 		iomap->type = IOMAP_HOLE;
561 	} else {
562 		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
563 		iomap->type = IOMAP_MAPPED;
564 	}
565 	iomap->offset = start;
566 	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
567 	iomap->length = len;
568 	btrfs_free_extent_map(em);
569 
570 	/*
571 	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
572 	 * writes only hold it for this part.  We hold the extent lock until
573 	 * we're completely done with the extent map to make sure it remains
574 	 * valid.
575 	 */
576 	if (write)
577 		unlock_bits |= EXTENT_DIO_LOCKED;
578 
579 	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
580 			       unlock_bits, &cached_state);
581 
582 	/* We didn't use everything, unlock the dio extent for the remainder. */
583 	if (!write && (start + len) < lockend)
584 		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
585 					lockend, NULL);
586 
587 	return 0;
588 
589 unlock_err:
590 	/*
591 	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
592 	 * to update this, be explicit that we expect EXTENT_LOCKED and
593 	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
594 	 */
595 	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
596 			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
597 err:
598 	if (dio_data->data_space_reserved) {
599 		btrfs_free_reserved_data_space(BTRFS_I(inode),
600 					       dio_data->data_reserved,
601 					       start, data_alloc_len);
602 		extent_changeset_free(dio_data->data_reserved);
603 	}
604 
605 	return ret;
606 }
607 
608 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
609 		ssize_t written, unsigned int flags, struct iomap *iomap)
610 {
611 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
612 	struct btrfs_dio_data *dio_data = iter->private;
613 	size_t submitted = dio_data->submitted;
614 	const bool write = !!(flags & IOMAP_WRITE);
615 	int ret = 0;
616 
617 	if (!write && (iomap->type == IOMAP_HOLE)) {
618 		/* If reading from a hole, unlock and return */
619 		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
620 					pos + length - 1, NULL);
621 		return 0;
622 	}
623 
624 	if (submitted < length) {
625 		pos += submitted;
626 		length -= submitted;
627 		if (write)
628 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
629 						    pos, length, false);
630 		else
631 			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
632 						pos + length - 1, NULL);
633 		ret = -ENOTBLK;
634 	}
635 	if (write) {
636 		btrfs_put_ordered_extent(dio_data->ordered);
637 		dio_data->ordered = NULL;
638 	}
639 
640 	if (write)
641 		extent_changeset_free(dio_data->data_reserved);
642 	return ret;
643 }
644 
645 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
646 {
647 	struct btrfs_dio_private *dip =
648 		container_of(bbio, struct btrfs_dio_private, bbio);
649 	struct btrfs_inode *inode = bbio->inode;
650 	struct bio *bio = &bbio->bio;
651 
652 	if (bio->bi_status) {
653 		btrfs_warn(inode->root->fs_info,
654 		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
655 			   btrfs_ino(inode), bio->bi_opf,
656 			   dip->file_offset, dip->bytes, bio->bi_status);
657 	}
658 
659 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
660 		btrfs_finish_ordered_extent(bbio->ordered, NULL,
661 					    dip->file_offset, dip->bytes,
662 					    !bio->bi_status);
663 	} else {
664 		btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
665 					dip->file_offset + dip->bytes - 1, NULL);
666 	}
667 
668 	bbio->bio.bi_private = bbio->private;
669 	iomap_dio_bio_end_io(bio);
670 }
671 
672 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
673 					struct btrfs_ordered_extent *ordered)
674 {
675 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
676 	u64 len = bbio->bio.bi_iter.bi_size;
677 	struct btrfs_ordered_extent *new;
678 	int ret;
679 
680 	/* Must always be called for the beginning of an ordered extent. */
681 	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
682 		return -EINVAL;
683 
684 	/* No need to split if the ordered extent covers the entire bio. */
685 	if (ordered->disk_num_bytes == len) {
686 		refcount_inc(&ordered->refs);
687 		bbio->ordered = ordered;
688 		return 0;
689 	}
690 
691 	/*
692 	 * Don't split the extent_map for NOCOW extents, as we're writing into
693 	 * a pre-existing one.
694 	 */
695 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
696 		ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
697 					     ordered->num_bytes, len,
698 					     ordered->disk_bytenr);
699 		if (ret)
700 			return ret;
701 	}
702 
703 	new = btrfs_split_ordered_extent(ordered, len);
704 	if (IS_ERR(new))
705 		return PTR_ERR(new);
706 	bbio->ordered = new;
707 	return 0;
708 }
709 
710 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
711 				loff_t file_offset)
712 {
713 	struct btrfs_bio *bbio = btrfs_bio(bio);
714 	struct btrfs_dio_private *dip =
715 		container_of(bbio, struct btrfs_dio_private, bbio);
716 	struct btrfs_dio_data *dio_data = iter->private;
717 
718 	btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
719 		       btrfs_dio_end_io, bio->bi_private);
720 
721 	dip->file_offset = file_offset;
722 	dip->bytes = bio->bi_iter.bi_size;
723 
724 	dio_data->submitted += bio->bi_iter.bi_size;
725 
726 	/*
727 	 * Check if we are doing a partial write.  If we are, we need to split
728 	 * the ordered extent to match the submitted bio.  Hang on to the
729 	 * remaining unfinishable ordered_extent in dio_data so that it can be
730 	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
731 	 * remaining pages is blocked on the outstanding ordered extent.
732 	 */
733 	if (iter->flags & IOMAP_WRITE) {
734 		int ret;
735 
736 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
737 		if (ret) {
738 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
739 						    file_offset, dip->bytes,
740 						    !ret);
741 			bio->bi_status = errno_to_blk_status(ret);
742 			iomap_dio_bio_end_io(bio);
743 			return;
744 		}
745 	}
746 
747 	btrfs_submit_bbio(bbio, 0);
748 }
749 
/* Mapping callbacks passed to iomap_dio_rw() and __iomap_dio_rw(). */
static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin            = btrfs_dio_iomap_begin,
	.iomap_end              = btrfs_dio_iomap_end,
};
754 
/*
 * Direct IO hooks: bios come from btrfs_dio_bioset and are submitted through
 * btrfs_dio_submit_io().
 */
static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io		= btrfs_dio_submit_io,
	.bio_set		= &btrfs_dio_bioset,
};
759 
760 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
761 			      size_t done_before)
762 {
763 	struct btrfs_dio_data data = { 0 };
764 
765 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
766 			    IOMAP_DIO_PARTIAL, &data, done_before);
767 }
768 
769 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
770 					 size_t done_before)
771 {
772 	struct btrfs_dio_data data = { 0 };
773 
774 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
775 			    IOMAP_DIO_PARTIAL, &data, done_before);
776 }
777 
778 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
779 			       const struct iov_iter *iter, loff_t offset)
780 {
781 	const u32 blocksize_mask = fs_info->sectorsize - 1;
782 
783 	if (offset & blocksize_mask)
784 		return -EINVAL;
785 
786 	if (iov_iter_alignment(iter) & blocksize_mask)
787 		return -EINVAL;
788 
789 	/*
790 	 * For bs > ps support, we heavily rely on large folios to make sure no
791 	 * block will cross large folio boundaries.
792 	 *
793 	 * But memory provided by direct IO is only virtually contiguous, not
794 	 * physically contiguous, and will break the btrfs' large folio requirement.
795 	 *
796 	 * So for bs > ps support, all direct IOs should fallback to buffered ones.
797 	 */
798 	if (fs_info->sectorsize > PAGE_SIZE)
799 		return -EINVAL;
800 
801 	return 0;
802 }
803 
804 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
805 {
806 	struct file *file = iocb->ki_filp;
807 	struct inode *inode = file_inode(file);
808 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
809 	loff_t pos;
810 	ssize_t written = 0;
811 	ssize_t written_buffered;
812 	size_t prev_left = 0;
813 	loff_t endbyte;
814 	ssize_t ret;
815 	unsigned int ilock_flags = 0;
816 	struct iomap_dio *dio;
817 
818 	if (iocb->ki_flags & IOCB_NOWAIT)
819 		ilock_flags |= BTRFS_ILOCK_TRY;
820 
821 	/*
822 	 * If the write DIO is within EOF, use a shared lock and also only if
823 	 * security bits will likely not be dropped by file_remove_privs() called
824 	 * from btrfs_write_check(). Either will need to be rechecked after the
825 	 * lock was acquired.
826 	 */
827 	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
828 		ilock_flags |= BTRFS_ILOCK_SHARED;
829 
830 relock:
831 	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
832 	if (ret < 0)
833 		return ret;
834 
835 	/* Shared lock cannot be used with security bits set. */
836 	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
837 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
838 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
839 		goto relock;
840 	}
841 
842 	ret = generic_write_checks(iocb, from);
843 	if (ret <= 0) {
844 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
845 		return ret;
846 	}
847 
848 	ret = btrfs_write_check(iocb, ret);
849 	if (ret < 0) {
850 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
851 		goto out;
852 	}
853 
854 	pos = iocb->ki_pos;
855 	/*
856 	 * Re-check since file size may have changed just before taking the
857 	 * lock or pos may have changed because of O_APPEND in generic_write_check()
858 	 */
859 	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
860 	    pos + iov_iter_count(from) > i_size_read(inode)) {
861 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
862 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
863 		goto relock;
864 	}
865 
866 	if (check_direct_IO(fs_info, from, pos)) {
867 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
868 		goto buffered;
869 	}
870 	/*
871 	 * We can't control the folios being passed in, applications can write
872 	 * to them while a direct IO write is in progress.  This means the
873 	 * content might change after we calculated the data checksum.
874 	 * Therefore we can end up storing a checksum that doesn't match the
875 	 * persisted data.
876 	 *
877 	 * To be extra safe and avoid false data checksum mismatch, if the
878 	 * inode requires data checksum, just fallback to buffered IO.
879 	 * For buffered IO we have full control of page cache and can ensure
880 	 * no one is modifying the content during writeback.
881 	 */
882 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
883 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
884 		goto buffered;
885 	}
886 
887 	/*
888 	 * The iov_iter can be mapped to the same file range we are writing to.
889 	 * If that's the case, then we will deadlock in the iomap code, because
890 	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
891 	 * an ordered extent, and after that it will fault in the pages that the
892 	 * iov_iter refers to. During the fault in we end up in the readahead
893 	 * pages code (starting at btrfs_readahead()), which will lock the range,
894 	 * find that ordered extent and then wait for it to complete (at
895 	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
896 	 * obviously the ordered extent can never complete as we didn't submit
897 	 * yet the respective bio(s). This always happens when the buffer is
898 	 * memory mapped to the same file range, since the iomap DIO code always
899 	 * invalidates pages in the target file range (after starting and waiting
900 	 * for any writeback).
901 	 *
902 	 * So here we disable page faults in the iov_iter and then retry if we
903 	 * got -EFAULT, faulting in the pages before the retry.
904 	 */
905 again:
906 	from->nofault = true;
907 	dio = btrfs_dio_write(iocb, from, written);
908 	from->nofault = false;
909 
910 	if (IS_ERR_OR_NULL(dio)) {
911 		ret = PTR_ERR_OR_ZERO(dio);
912 	} else {
913 		/*
914 		 * If we have a synchronous write, we must make sure the fsync
915 		 * triggered by the iomap_dio_complete() call below doesn't
916 		 * deadlock on the inode lock - we are already holding it and we
917 		 * can't call it after unlocking because we may need to complete
918 		 * partial writes due to the input buffer (or parts of it) not
919 		 * being already faulted in.
920 		 */
921 		ASSERT(current->journal_info == NULL);
922 		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
923 		ret = iomap_dio_complete(dio);
924 		current->journal_info = NULL;
925 	}
926 
927 	/* No increment (+=) because iomap returns a cumulative value. */
928 	if (ret > 0)
929 		written = ret;
930 
931 	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
932 		const size_t left = iov_iter_count(from);
933 		/*
934 		 * We have more data left to write. Try to fault in as many as
935 		 * possible of the remainder pages and retry. We do this without
936 		 * releasing and locking again the inode, to prevent races with
937 		 * truncate.
938 		 *
939 		 * Also, in case the iov refers to pages in the file range of the
940 		 * file we want to write to (due to a mmap), we could enter an
941 		 * infinite loop if we retry after faulting the pages in, since
942 		 * iomap will invalidate any pages in the range early on, before
943 		 * it tries to fault in the pages of the iov. So we keep track of
944 		 * how much was left of iov in the previous EFAULT and fallback
945 		 * to buffered IO in case we haven't made any progress.
946 		 */
947 		if (left == prev_left) {
948 			ret = -ENOTBLK;
949 		} else {
950 			fault_in_iov_iter_readable(from, left);
951 			prev_left = left;
952 			goto again;
953 		}
954 	}
955 
956 	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
957 
958 	/*
959 	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
960 	 * we must fallback to buffered IO.
961 	 */
962 	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
963 		goto out;
964 
965 buffered:
966 	/*
967 	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
968 	 * it must retry the operation in a context where blocking is acceptable,
969 	 * because even if we end up not blocking during the buffered IO attempt
970 	 * below, we will block when flushing and waiting for the IO.
971 	 */
972 	if (iocb->ki_flags & IOCB_NOWAIT) {
973 		ret = -EAGAIN;
974 		goto out;
975 	}
976 
977 	pos = iocb->ki_pos;
978 	written_buffered = btrfs_buffered_write(iocb, from);
979 	if (written_buffered < 0) {
980 		ret = written_buffered;
981 		goto out;
982 	}
983 	/*
984 	 * Ensure all data is persisted. We want the next direct IO read to be
985 	 * able to read what was just written.
986 	 */
987 	endbyte = pos + written_buffered - 1;
988 	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
989 	if (ret)
990 		goto out;
991 	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
992 	if (ret)
993 		goto out;
994 	written += written_buffered;
995 	iocb->ki_pos = pos + written_buffered;
996 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
997 				 endbyte >> PAGE_SHIFT);
998 out:
999 	return ret < 0 ? ret : written;
1000 }
1001 
1002 static int check_direct_read(struct btrfs_fs_info *fs_info,
1003 			     const struct iov_iter *iter, loff_t offset)
1004 {
1005 	int ret;
1006 	int i, seg;
1007 
1008 	ret = check_direct_IO(fs_info, iter, offset);
1009 	if (ret < 0)
1010 		return ret;
1011 
1012 	if (!iter_is_iovec(iter))
1013 		return 0;
1014 
1015 	for (seg = 0; seg < iter->nr_segs; seg++) {
1016 		for (i = seg + 1; i < iter->nr_segs; i++) {
1017 			const struct iovec *iov1 = iter_iov(iter) + seg;
1018 			const struct iovec *iov2 = iter_iov(iter) + i;
1019 
1020 			if (iov1->iov_base == iov2->iov_base)
1021 				return -EINVAL;
1022 		}
1023 	}
1024 	return 0;
1025 }
1026 
1027 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1028 {
1029 	struct inode *inode = file_inode(iocb->ki_filp);
1030 	size_t prev_left = 0;
1031 	ssize_t read = 0;
1032 	ssize_t ret;
1033 
1034 	if (fsverity_active(inode))
1035 		return 0;
1036 
1037 	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1038 		return 0;
1039 
1040 	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1041 again:
1042 	/*
1043 	 * This is similar to what we do for direct IO writes, see the comment
1044 	 * at btrfs_direct_write(), but we also disable page faults in addition
1045 	 * to disabling them only at the iov_iter level. This is because when
1046 	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1047 	 * which can still trigger page fault ins despite having set ->nofault
1048 	 * to true of our 'to' iov_iter.
1049 	 *
1050 	 * The difference to direct IO writes is that we deadlock when trying
1051 	 * to lock the extent range in the inode's tree during he page reads
1052 	 * triggered by the fault in (while for writes it is due to waiting for
1053 	 * our own ordered extent). This is because for direct IO reads,
1054 	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
1055 	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
1056 	 */
1057 	pagefault_disable();
1058 	to->nofault = true;
1059 	ret = btrfs_dio_read(iocb, to, read);
1060 	to->nofault = false;
1061 	pagefault_enable();
1062 
1063 	/* No increment (+=) because iomap returns a cumulative value. */
1064 	if (ret > 0)
1065 		read = ret;
1066 
1067 	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1068 		const size_t left = iov_iter_count(to);
1069 
1070 		if (left == prev_left) {
1071 			/*
1072 			 * We didn't make any progress since the last attempt,
1073 			 * fallback to a buffered read for the remainder of the
1074 			 * range. This is just to avoid any possibility of looping
1075 			 * for too long.
1076 			 */
1077 			ret = read;
1078 		} else {
1079 			/*
1080 			 * We made some progress since the last retry or this is
1081 			 * the first time we are retrying. Fault in as many pages
1082 			 * as possible and retry.
1083 			 */
1084 			fault_in_iov_iter_writeable(to, left);
1085 			prev_left = left;
1086 			goto again;
1087 		}
1088 	}
1089 	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1090 	return ret < 0 ? ret : read;
1091 }
1092 
1093 int __init btrfs_init_dio(void)
1094 {
1095 	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1096 			offsetof(struct btrfs_dio_private, bbio.bio),
1097 			BIOSET_NEED_BVECS))
1098 		return -ENOMEM;
1099 
1100 	return 0;
1101 }
1102 
1103 void __cold btrfs_destroy_dio(void)
1104 {
1105 	bioset_exit(&btrfs_dio_bioset);
1106 }
1107