xref: /linux/fs/btrfs/direct-io.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/fsverity.h>
4 #include <linux/iomap.h>
5 #include "ctree.h"
6 #include "delalloc-space.h"
7 #include "direct-io.h"
8 #include "extent-tree.h"
9 #include "file.h"
10 #include "fs.h"
11 #include "transaction.h"
12 #include "volumes.h"
13 
14 struct btrfs_dio_data {
15 	ssize_t submitted;
16 	struct extent_changeset *data_reserved;
17 	struct btrfs_ordered_extent *ordered;
18 	bool data_space_reserved;
19 	bool nocow_done;
20 };
21 
22 struct btrfs_dio_private {
23 	/* Range of I/O */
24 	u64 file_offset;
25 	u32 bytes;
26 
27 	/* This must be last */
28 	struct btrfs_bio bbio;
29 };
30 
/* Bio set referenced by btrfs_dio_ops.bio_set for direct IO bios. */
31 static struct bio_set btrfs_dio_bioset;
32 
33 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
34 			      struct extent_state **cached_state,
35 			      unsigned int iomap_flags)
36 {
37 	const bool writing = (iomap_flags & IOMAP_WRITE);
38 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
39 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
40 	struct btrfs_ordered_extent *ordered;
41 	int ret = 0;
42 
43 	/* Direct lock must be taken before the extent lock. */
44 	if (nowait) {
45 		if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
46 			return -EAGAIN;
47 	} else {
48 		lock_dio_extent(io_tree, lockstart, lockend, cached_state);
49 	}
50 
51 	while (1) {
52 		if (nowait) {
53 			if (!try_lock_extent(io_tree, lockstart, lockend,
54 					     cached_state)) {
55 				ret = -EAGAIN;
56 				break;
57 			}
58 		} else {
59 			lock_extent(io_tree, lockstart, lockend, cached_state);
60 		}
61 		/*
62 		 * We're concerned with the entire range that we're going to be
63 		 * doing DIO to, so we need to make sure there's no ordered
64 		 * extents in this range.
65 		 */
66 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
67 						     lockend - lockstart + 1);
68 
69 		/*
70 		 * We need to make sure there are no buffered pages in this
71 		 * range either, we could have raced between the invalidate in
72 		 * generic_file_direct_write and locking the extent.  The
73 		 * invalidate needs to happen so that reads after a write do not
74 		 * get stale data.
75 		 */
76 		if (!ordered &&
77 		    (!writing || !filemap_range_has_page(inode->i_mapping,
78 							 lockstart, lockend)))
79 			break;
80 
81 		unlock_extent(io_tree, lockstart, lockend, cached_state);
82 
83 		if (ordered) {
84 			if (nowait) {
85 				btrfs_put_ordered_extent(ordered);
86 				ret = -EAGAIN;
87 				break;
88 			}
89 			/*
90 			 * If we are doing a DIO read and the ordered extent we
91 			 * found is for a buffered write, we can not wait for it
92 			 * to complete and retry, because if we do so we can
93 			 * deadlock with concurrent buffered writes on page
94 			 * locks. This happens only if our DIO read covers more
95 			 * than one extent map, if at this point has already
96 			 * created an ordered extent for a previous extent map
97 			 * and locked its range in the inode's io tree, and a
98 			 * concurrent write against that previous extent map's
99 			 * range and this range started (we unlock the ranges
100 			 * in the io tree only when the bios complete and
101 			 * buffered writes always lock pages before attempting
102 			 * to lock range in the io tree).
103 			 */
104 			if (writing ||
105 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
106 				btrfs_start_ordered_extent(ordered);
107 			else
108 				ret = nowait ? -EAGAIN : -ENOTBLK;
109 			btrfs_put_ordered_extent(ordered);
110 		} else {
111 			/*
112 			 * We could trigger writeback for this range (and wait
113 			 * for it to complete) and then invalidate the pages for
114 			 * this range (through invalidate_inode_pages2_range()),
115 			 * but that can lead us to a deadlock with a concurrent
116 			 * call to readahead (a buffered read or a defrag call
117 			 * triggered a readahead) on a page lock due to an
118 			 * ordered dio extent we created before but did not have
119 			 * yet a corresponding bio submitted (whence it can not
120 			 * complete), which makes readahead wait for that
121 			 * ordered extent to complete while holding a lock on
122 			 * that page.
123 			 */
124 			ret = nowait ? -EAGAIN : -ENOTBLK;
125 		}
126 
127 		if (ret)
128 			break;
129 
130 		cond_resched();
131 	}
132 
133 	if (ret)
134 		unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
135 	return ret;
136 }
137 
138 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
139 						  struct btrfs_dio_data *dio_data,
140 						  const u64 start,
141 						  const struct btrfs_file_extent *file_extent,
142 						  const int type)
143 {
144 	struct extent_map *em = NULL;
145 	struct btrfs_ordered_extent *ordered;
146 
147 	if (type != BTRFS_ORDERED_NOCOW) {
148 		em = btrfs_create_io_em(inode, start, file_extent, type);
149 		if (IS_ERR(em))
150 			goto out;
151 	}
152 
153 	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
154 					     (1 << type) |
155 					     (1 << BTRFS_ORDERED_DIRECT));
156 	if (IS_ERR(ordered)) {
157 		if (em) {
158 			free_extent_map(em);
159 			btrfs_drop_extent_map_range(inode, start,
160 					start + file_extent->num_bytes - 1, false);
161 		}
162 		em = ERR_CAST(ordered);
163 	} else {
164 		ASSERT(!dio_data->ordered);
165 		dio_data->ordered = ordered;
166 	}
167  out:
168 
169 	return em;
170 }
171 
172 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
173 						  struct btrfs_dio_data *dio_data,
174 						  u64 start, u64 len)
175 {
176 	struct btrfs_root *root = inode->root;
177 	struct btrfs_fs_info *fs_info = root->fs_info;
178 	struct btrfs_file_extent file_extent;
179 	struct extent_map *em;
180 	struct btrfs_key ins;
181 	u64 alloc_hint;
182 	int ret;
183 
184 	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
185 again:
186 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
187 				   0, alloc_hint, &ins, 1, 1);
188 	if (ret == -EAGAIN) {
189 		ASSERT(btrfs_is_zoned(fs_info));
190 		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
191 			       TASK_UNINTERRUPTIBLE);
192 		goto again;
193 	}
194 	if (ret)
195 		return ERR_PTR(ret);
196 
197 	file_extent.disk_bytenr = ins.objectid;
198 	file_extent.disk_num_bytes = ins.offset;
199 	file_extent.num_bytes = ins.offset;
200 	file_extent.ram_bytes = ins.offset;
201 	file_extent.offset = 0;
202 	file_extent.compression = BTRFS_COMPRESS_NONE;
203 	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
204 				     BTRFS_ORDERED_REGULAR);
205 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
206 	if (IS_ERR(em))
207 		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
208 					   1);
209 
210 	return em;
211 }
212 
213 static int btrfs_get_blocks_direct_write(struct extent_map **map,
214 					 struct inode *inode,
215 					 struct btrfs_dio_data *dio_data,
216 					 u64 start, u64 *lenp,
217 					 unsigned int iomap_flags)
218 {
219 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
220 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
221 	struct btrfs_file_extent file_extent;
222 	struct extent_map *em = *map;
223 	int type;
224 	u64 block_start;
225 	struct btrfs_block_group *bg;
226 	bool can_nocow = false;
227 	bool space_reserved = false;
228 	u64 len = *lenp;
229 	u64 prev_len;
230 	int ret = 0;
231 
232 	/*
233 	 * We don't allocate a new extent in the following cases
234 	 *
235 	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
236 	 * existing extent.
237 	 * 2) The extent is marked as PREALLOC. We're good to go here and can
238 	 * just use the extent.
239 	 *
240 	 */
241 	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
242 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
243 	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
244 		if (em->flags & EXTENT_FLAG_PREALLOC)
245 			type = BTRFS_ORDERED_PREALLOC;
246 		else
247 			type = BTRFS_ORDERED_NOCOW;
248 		len = min(len, em->len - (start - em->start));
249 		block_start = extent_map_block_start(em) + (start - em->start);
250 
251 		if (can_nocow_extent(inode, start, &len,
252 				     &file_extent, false, false) == 1) {
253 			bg = btrfs_inc_nocow_writers(fs_info, block_start);
254 			if (bg)
255 				can_nocow = true;
256 		}
257 	}
258 
259 	prev_len = len;
260 	if (can_nocow) {
261 		struct extent_map *em2;
262 
263 		/* We can NOCOW, so only need to reserve metadata space. */
264 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
265 						      nowait);
266 		if (ret < 0) {
267 			/* Our caller expects us to free the input extent map. */
268 			free_extent_map(em);
269 			*map = NULL;
270 			btrfs_dec_nocow_writers(bg);
271 			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
272 				ret = -EAGAIN;
273 			goto out;
274 		}
275 		space_reserved = true;
276 
277 		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
278 					      &file_extent, type);
279 		btrfs_dec_nocow_writers(bg);
280 		if (type == BTRFS_ORDERED_PREALLOC) {
281 			free_extent_map(em);
282 			*map = em2;
283 			em = em2;
284 		}
285 
286 		if (IS_ERR(em2)) {
287 			ret = PTR_ERR(em2);
288 			goto out;
289 		}
290 
291 		dio_data->nocow_done = true;
292 	} else {
293 		/* Our caller expects us to free the input extent map. */
294 		free_extent_map(em);
295 		*map = NULL;
296 
297 		if (nowait) {
298 			ret = -EAGAIN;
299 			goto out;
300 		}
301 
302 		/*
303 		 * If we could not allocate data space before locking the file
304 		 * range and we can't do a NOCOW write, then we have to fail.
305 		 */
306 		if (!dio_data->data_space_reserved) {
307 			ret = -ENOSPC;
308 			goto out;
309 		}
310 
311 		/*
312 		 * We have to COW and we have already reserved data space before,
313 		 * so now we reserve only metadata.
314 		 */
315 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
316 						      false);
317 		if (ret < 0)
318 			goto out;
319 		space_reserved = true;
320 
321 		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
322 		if (IS_ERR(em)) {
323 			ret = PTR_ERR(em);
324 			goto out;
325 		}
326 		*map = em;
327 		len = min(len, em->len - (start - em->start));
328 		if (len < prev_len)
329 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
330 							prev_len - len, true);
331 	}
332 
333 	/*
334 	 * We have created our ordered extent, so we can now release our reservation
335 	 * for an outstanding extent.
336 	 */
337 	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
338 
339 	/*
340 	 * Need to update the i_size under the extent lock so buffered
341 	 * readers will get the updated i_size when we unlock.
342 	 */
343 	if (start + len > i_size_read(inode))
344 		i_size_write(inode, start + len);
345 out:
346 	if (ret && space_reserved) {
347 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
348 		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
349 	}
350 	*lenp = len;
351 	return ret;
352 }
353 
354 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
355 		loff_t length, unsigned int flags, struct iomap *iomap,
356 		struct iomap *srcmap)
357 {
358 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
359 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
360 	struct extent_map *em;
361 	struct extent_state *cached_state = NULL;
362 	struct btrfs_dio_data *dio_data = iter->private;
363 	u64 lockstart, lockend;
364 	const bool write = !!(flags & IOMAP_WRITE);
365 	int ret = 0;
366 	u64 len = length;
367 	const u64 data_alloc_len = length;
368 	u32 unlock_bits = EXTENT_LOCKED;
369 
370 	/*
371 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
372 	 * we're NOWAIT we may submit a bio for a partial range and return
373 	 * EIOCBQUEUED, which would result in an errant short read.
374 	 *
375 	 * The best way to handle this would be to allow for partial completions
376 	 * of iocb's, so we could submit the partial bio, return and fault in
377 	 * the rest of the pages, and then submit the io for the rest of the
378 	 * range.  However we don't have that currently, so simply return
379 	 * -EAGAIN at this point so that the normal path is used.
380 	 */
381 	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
382 		return -EAGAIN;
383 
384 	/*
385 	 * Cap the size of reads to that usually seen in buffered I/O as we need
386 	 * to allocate a contiguous array for the checksums.
387 	 */
388 	if (!write)
389 		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
390 
391 	lockstart = start;
392 	lockend = start + len - 1;
393 
394 	/*
395 	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
396 	 * enough if we've written compressed pages to this area, so we need to
397 	 * flush the dirty pages again to make absolutely sure that any
398 	 * outstanding dirty pages are on disk - the first flush only starts
399 	 * compression on the data, while keeping the pages locked, so by the
400 	 * time the second flush returns we know bios for the compressed pages
401 	 * were submitted and finished, and the pages no longer under writeback.
402 	 *
403 	 * If we have a NOWAIT request and we have any pages in the range that
404 	 * are locked, likely due to compression still in progress, we don't want
405 	 * to block on page locks. We also don't want to block on pages marked as
406 	 * dirty or under writeback (same as for the non-compression case).
407 	 * iomap_dio_rw() did the same check, but after that and before we got
408 	 * here, mmap'ed writes may have happened or buffered reads started
409 	 * (readpage() and readahead(), which lock pages), as we haven't locked
410 	 * the file range yet.
411 	 */
412 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
413 		     &BTRFS_I(inode)->runtime_flags)) {
414 		if (flags & IOMAP_NOWAIT) {
415 			if (filemap_range_needs_writeback(inode->i_mapping,
416 							  lockstart, lockend))
417 				return -EAGAIN;
418 		} else {
419 			ret = filemap_fdatawrite_range(inode->i_mapping, start,
420 						       start + length - 1);
421 			if (ret)
422 				return ret;
423 		}
424 	}
425 
426 	memset(dio_data, 0, sizeof(*dio_data));
427 
428 	/*
429 	 * We always try to allocate data space and must do it before locking
430 	 * the file range, to avoid deadlocks with concurrent writes to the same
431 	 * range if the range has several extents and the writes don't expand the
432 	 * current i_size (the inode lock is taken in shared mode). If we fail to
433 	 * allocate data space here we continue and later, after locking the
434 	 * file range, we fail with ENOSPC only if we figure out we can not do a
435 	 * NOCOW write.
436 	 */
437 	if (write && !(flags & IOMAP_NOWAIT)) {
438 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
439 						  &dio_data->data_reserved,
440 						  start, data_alloc_len, false);
441 		if (!ret)
442 			dio_data->data_space_reserved = true;
443 		else if (ret && !(BTRFS_I(inode)->flags &
444 				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
445 			goto err;
446 	}
447 
448 	/*
449 	 * If this errors out it's because we couldn't invalidate pagecache for
450 	 * this range and we need to fallback to buffered IO, or we are doing a
451 	 * NOWAIT read/write and we need to block.
452 	 */
453 	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
454 	if (ret < 0)
455 		goto err;
456 
457 	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
458 	if (IS_ERR(em)) {
459 		ret = PTR_ERR(em);
460 		goto unlock_err;
461 	}
462 
463 	/*
464 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
465 	 * io.  INLINE is special, and we could probably kludge it in here, but
466 	 * it's still buffered so for safety lets just fall back to the generic
467 	 * buffered path.
468 	 *
469 	 * For COMPRESSED we _have_ to read the entire extent in so we can
470 	 * decompress it, so there will be buffering required no matter what we
471 	 * do, so go ahead and fallback to buffered.
472 	 *
473 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
474 	 * to buffered IO.  Don't blame me, this is the price we pay for using
475 	 * the generic code.
476 	 */
477 	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
478 		free_extent_map(em);
479 		/*
480 		 * If we are in a NOWAIT context, return -EAGAIN in order to
481 		 * fallback to buffered IO. This is not only because we can
482 		 * block with buffered IO (no support for NOWAIT semantics at
483 		 * the moment) but also to avoid returning short reads to user
484 		 * space - this happens if we were able to read some data from
485 		 * previous non-compressed extents and then when we fallback to
486 		 * buffered IO, at btrfs_file_read_iter() by calling
487 		 * filemap_read(), we fail to fault in pages for the read buffer,
488 		 * in which case filemap_read() returns a short read (the number
489 		 * of bytes previously read is > 0, so it does not return -EFAULT).
490 		 */
491 		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
492 		goto unlock_err;
493 	}
494 
495 	len = min(len, em->len - (start - em->start));
496 
497 	/*
498 	 * If we have a NOWAIT request and the range contains multiple extents
499 	 * (or a mix of extents and holes), then we return -EAGAIN to make the
500 	 * caller fallback to a context where it can do a blocking (without
501 	 * NOWAIT) request. This way we avoid doing partial IO and returning
502 	 * success to the caller, which is not optimal for writes and for reads
503 	 * it can result in unexpected behaviour for an application.
504 	 *
505 	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
506 	 * iomap_dio_rw(), we can end up returning less data then what the caller
507 	 * asked for, resulting in an unexpected, and incorrect, short read.
508 	 * That is, the caller asked to read N bytes and we return less than that,
509 	 * which is wrong unless we are crossing EOF. This happens if we get a
510 	 * page fault error when trying to fault in pages for the buffer that is
511 	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
512 	 * have previously submitted bios for other extents in the range, in
513 	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
514 	 * those bios have completed by the time we get the page fault error,
515 	 * which we return back to our caller - we should only return EIOCBQUEUED
516 	 * after we have submitted bios for all the extents in the range.
517 	 */
518 	if ((flags & IOMAP_NOWAIT) && len < length) {
519 		free_extent_map(em);
520 		ret = -EAGAIN;
521 		goto unlock_err;
522 	}
523 
524 	if (write) {
525 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
526 						    start, &len, flags);
527 		if (ret < 0)
528 			goto unlock_err;
529 		/* Recalc len in case the new em is smaller than requested */
530 		len = min(len, em->len - (start - em->start));
531 		if (dio_data->data_space_reserved) {
532 			u64 release_offset;
533 			u64 release_len = 0;
534 
535 			if (dio_data->nocow_done) {
536 				release_offset = start;
537 				release_len = data_alloc_len;
538 			} else if (len < data_alloc_len) {
539 				release_offset = start + len;
540 				release_len = data_alloc_len - len;
541 			}
542 
543 			if (release_len > 0)
544 				btrfs_free_reserved_data_space(BTRFS_I(inode),
545 							       dio_data->data_reserved,
546 							       release_offset,
547 							       release_len);
548 		}
549 	}
550 
551 	/*
552 	 * Translate extent map information to iomap.
553 	 * We trim the extents (and move the addr) even though iomap code does
554 	 * that, since we have locked only the parts we are performing I/O in.
555 	 */
556 	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
557 	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
558 		iomap->addr = IOMAP_NULL_ADDR;
559 		iomap->type = IOMAP_HOLE;
560 	} else {
561 		iomap->addr = extent_map_block_start(em) + (start - em->start);
562 		iomap->type = IOMAP_MAPPED;
563 	}
564 	iomap->offset = start;
565 	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
566 	iomap->length = len;
567 	free_extent_map(em);
568 
569 	/*
570 	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
571 	 * writes only hold it for this part.  We hold the extent lock until
572 	 * we're completely done with the extent map to make sure it remains
573 	 * valid.
574 	 */
575 	if (write)
576 		unlock_bits |= EXTENT_DIO_LOCKED;
577 
578 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
579 			 unlock_bits, &cached_state);
580 
581 	/* We didn't use everything, unlock the dio extent for the remainder. */
582 	if (!write && (start + len) < lockend)
583 		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
584 				  lockend, NULL);
585 
586 	return 0;
587 
588 unlock_err:
589 	/*
590 	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
591 	 * to update this, be explicit that we expect EXTENT_LOCKED and
592 	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
593 	 */
594 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
595 			 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
596 err:
597 	if (dio_data->data_space_reserved) {
598 		btrfs_free_reserved_data_space(BTRFS_I(inode),
599 					       dio_data->data_reserved,
600 					       start, data_alloc_len);
601 		extent_changeset_free(dio_data->data_reserved);
602 	}
603 
604 	return ret;
605 }
606 
607 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
608 		ssize_t written, unsigned int flags, struct iomap *iomap)
609 {
610 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
611 	struct btrfs_dio_data *dio_data = iter->private;
612 	size_t submitted = dio_data->submitted;
613 	const bool write = !!(flags & IOMAP_WRITE);
614 	int ret = 0;
615 
616 	if (!write && (iomap->type == IOMAP_HOLE)) {
617 		/* If reading from a hole, unlock and return */
618 		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
619 				  pos + length - 1, NULL);
620 		return 0;
621 	}
622 
623 	if (submitted < length) {
624 		pos += submitted;
625 		length -= submitted;
626 		if (write)
627 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
628 						    pos, length, false);
629 		else
630 			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
631 					  pos + length - 1, NULL);
632 		ret = -ENOTBLK;
633 	}
634 	if (write) {
635 		btrfs_put_ordered_extent(dio_data->ordered);
636 		dio_data->ordered = NULL;
637 	}
638 
639 	if (write)
640 		extent_changeset_free(dio_data->data_reserved);
641 	return ret;
642 }
643 
644 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
645 {
646 	struct btrfs_dio_private *dip =
647 		container_of(bbio, struct btrfs_dio_private, bbio);
648 	struct btrfs_inode *inode = bbio->inode;
649 	struct bio *bio = &bbio->bio;
650 
651 	if (bio->bi_status) {
652 		btrfs_warn(inode->root->fs_info,
653 		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
654 			   btrfs_ino(inode), bio->bi_opf,
655 			   dip->file_offset, dip->bytes, bio->bi_status);
656 	}
657 
658 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
659 		btrfs_finish_ordered_extent(bbio->ordered, NULL,
660 					    dip->file_offset, dip->bytes,
661 					    !bio->bi_status);
662 	} else {
663 		unlock_dio_extent(&inode->io_tree, dip->file_offset,
664 				  dip->file_offset + dip->bytes - 1, NULL);
665 	}
666 
667 	bbio->bio.bi_private = bbio->private;
668 	iomap_dio_bio_end_io(bio);
669 }
670 
671 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
672 					struct btrfs_ordered_extent *ordered)
673 {
674 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
675 	u64 len = bbio->bio.bi_iter.bi_size;
676 	struct btrfs_ordered_extent *new;
677 	int ret;
678 
679 	/* Must always be called for the beginning of an ordered extent. */
680 	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
681 		return -EINVAL;
682 
683 	/* No need to split if the ordered extent covers the entire bio. */
684 	if (ordered->disk_num_bytes == len) {
685 		refcount_inc(&ordered->refs);
686 		bbio->ordered = ordered;
687 		return 0;
688 	}
689 
690 	/*
691 	 * Don't split the extent_map for NOCOW extents, as we're writing into
692 	 * a pre-existing one.
693 	 */
694 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
695 		ret = split_extent_map(bbio->inode, bbio->file_offset,
696 				       ordered->num_bytes, len,
697 				       ordered->disk_bytenr);
698 		if (ret)
699 			return ret;
700 	}
701 
702 	new = btrfs_split_ordered_extent(ordered, len);
703 	if (IS_ERR(new))
704 		return PTR_ERR(new);
705 	bbio->ordered = new;
706 	return 0;
707 }
708 
709 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
710 				loff_t file_offset)
711 {
712 	struct btrfs_bio *bbio = btrfs_bio(bio);
713 	struct btrfs_dio_private *dip =
714 		container_of(bbio, struct btrfs_dio_private, bbio);
715 	struct btrfs_dio_data *dio_data = iter->private;
716 
717 	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
718 		       btrfs_dio_end_io, bio->bi_private);
719 	bbio->inode = BTRFS_I(iter->inode);
720 	bbio->file_offset = file_offset;
721 
722 	dip->file_offset = file_offset;
723 	dip->bytes = bio->bi_iter.bi_size;
724 
725 	dio_data->submitted += bio->bi_iter.bi_size;
726 
727 	/*
728 	 * Check if we are doing a partial write.  If we are, we need to split
729 	 * the ordered extent to match the submitted bio.  Hang on to the
730 	 * remaining unfinishable ordered_extent in dio_data so that it can be
731 	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
732 	 * remaining pages is blocked on the outstanding ordered extent.
733 	 */
734 	if (iter->flags & IOMAP_WRITE) {
735 		int ret;
736 
737 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
738 		if (ret) {
739 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
740 						    file_offset, dip->bytes,
741 						    !ret);
742 			bio->bi_status = errno_to_blk_status(ret);
743 			iomap_dio_bio_end_io(bio);
744 			return;
745 		}
746 	}
747 
748 	btrfs_submit_bbio(bbio, 0);
749 }
750 
751 static const struct iomap_ops btrfs_dio_iomap_ops = {
752 	.iomap_begin            = btrfs_dio_iomap_begin,
753 	.iomap_end              = btrfs_dio_iomap_end,
754 };
755 
756 static const struct iomap_dio_ops btrfs_dio_ops = {
757 	.submit_io		= btrfs_dio_submit_io,
758 	.bio_set		= &btrfs_dio_bioset,
759 };
760 
761 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
762 			      size_t done_before)
763 {
764 	struct btrfs_dio_data data = { 0 };
765 
766 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
767 			    IOMAP_DIO_PARTIAL, &data, done_before);
768 }
769 
770 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
771 					 size_t done_before)
772 {
773 	struct btrfs_dio_data data = { 0 };
774 
775 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
776 			    IOMAP_DIO_PARTIAL, &data, done_before);
777 }
778 
779 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
780 			       const struct iov_iter *iter, loff_t offset)
781 {
782 	const u32 blocksize_mask = fs_info->sectorsize - 1;
783 
784 	if (offset & blocksize_mask)
785 		return -EINVAL;
786 
787 	if (iov_iter_alignment(iter) & blocksize_mask)
788 		return -EINVAL;
789 
790 	return 0;
791 }
792 
793 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
794 {
795 	struct file *file = iocb->ki_filp;
796 	struct inode *inode = file_inode(file);
797 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
798 	loff_t pos;
799 	ssize_t written = 0;
800 	ssize_t written_buffered;
801 	size_t prev_left = 0;
802 	loff_t endbyte;
803 	ssize_t ret;
804 	unsigned int ilock_flags = 0;
805 	struct iomap_dio *dio;
806 
807 	if (iocb->ki_flags & IOCB_NOWAIT)
808 		ilock_flags |= BTRFS_ILOCK_TRY;
809 
810 	/*
811 	 * If the write DIO is within EOF, use a shared lock and also only if
812 	 * security bits will likely not be dropped by file_remove_privs() called
813 	 * from btrfs_write_check(). Either will need to be rechecked after the
814 	 * lock was acquired.
815 	 */
816 	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
817 		ilock_flags |= BTRFS_ILOCK_SHARED;
818 
819 relock:
820 	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
821 	if (ret < 0)
822 		return ret;
823 
824 	/* Shared lock cannot be used with security bits set. */
825 	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
826 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
827 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
828 		goto relock;
829 	}
830 
831 	ret = generic_write_checks(iocb, from);
832 	if (ret <= 0) {
833 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
834 		return ret;
835 	}
836 
837 	ret = btrfs_write_check(iocb, from, ret);
838 	if (ret < 0) {
839 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
840 		goto out;
841 	}
842 
843 	pos = iocb->ki_pos;
844 	/*
845 	 * Re-check since file size may have changed just before taking the
846 	 * lock or pos may have changed because of O_APPEND in generic_write_check()
847 	 */
848 	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
849 	    pos + iov_iter_count(from) > i_size_read(inode)) {
850 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
851 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
852 		goto relock;
853 	}
854 
855 	if (check_direct_IO(fs_info, from, pos)) {
856 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
857 		goto buffered;
858 	}
859 
860 	/*
861 	 * The iov_iter can be mapped to the same file range we are writing to.
862 	 * If that's the case, then we will deadlock in the iomap code, because
863 	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
864 	 * an ordered extent, and after that it will fault in the pages that the
865 	 * iov_iter refers to. During the fault in we end up in the readahead
866 	 * pages code (starting at btrfs_readahead()), which will lock the range,
867 	 * find that ordered extent and then wait for it to complete (at
868 	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
869 	 * obviously the ordered extent can never complete as we didn't submit
870 	 * yet the respective bio(s). This always happens when the buffer is
871 	 * memory mapped to the same file range, since the iomap DIO code always
872 	 * invalidates pages in the target file range (after starting and waiting
873 	 * for any writeback).
874 	 *
875 	 * So here we disable page faults in the iov_iter and then retry if we
876 	 * got -EFAULT, faulting in the pages before the retry.
877 	 */
878 again:
879 	from->nofault = true;
880 	dio = btrfs_dio_write(iocb, from, written);
881 	from->nofault = false;
882 
883 	if (IS_ERR_OR_NULL(dio)) {
884 		ret = PTR_ERR_OR_ZERO(dio);
885 	} else {
886 		/*
887 		 * If we have a synchronous write, we must make sure the fsync
888 		 * triggered by the iomap_dio_complete() call below doesn't
889 		 * deadlock on the inode lock - we are already holding it and we
890 		 * can't call it after unlocking because we may need to complete
891 		 * partial writes due to the input buffer (or parts of it) not
892 		 * being already faulted in.
893 		 */
894 		ASSERT(current->journal_info == NULL);
895 		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
896 		ret = iomap_dio_complete(dio);
897 		current->journal_info = NULL;
898 	}
899 
900 	/* No increment (+=) because iomap returns a cumulative value. */
901 	if (ret > 0)
902 		written = ret;
903 
904 	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
905 		const size_t left = iov_iter_count(from);
906 		/*
907 		 * We have more data left to write. Try to fault in as many as
908 		 * possible of the remainder pages and retry. We do this without
909 		 * releasing and locking again the inode, to prevent races with
910 		 * truncate.
911 		 *
912 		 * Also, in case the iov refers to pages in the file range of the
913 		 * file we want to write to (due to a mmap), we could enter an
914 		 * infinite loop if we retry after faulting the pages in, since
915 		 * iomap will invalidate any pages in the range early on, before
916 		 * it tries to fault in the pages of the iov. So we keep track of
917 		 * how much was left of iov in the previous EFAULT and fallback
918 		 * to buffered IO in case we haven't made any progress.
919 		 */
920 		if (left == prev_left) {
921 			ret = -ENOTBLK;
922 		} else {
923 			fault_in_iov_iter_readable(from, left);
924 			prev_left = left;
925 			goto again;
926 		}
927 	}
928 
929 	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
930 
931 	/*
932 	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
933 	 * we must fallback to buffered IO.
934 	 */
935 	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
936 		goto out;
937 
938 buffered:
939 	/*
940 	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
941 	 * it must retry the operation in a context where blocking is acceptable,
942 	 * because even if we end up not blocking during the buffered IO attempt
943 	 * below, we will block when flushing and waiting for the IO.
944 	 */
945 	if (iocb->ki_flags & IOCB_NOWAIT) {
946 		ret = -EAGAIN;
947 		goto out;
948 	}
949 
950 	pos = iocb->ki_pos;
951 	written_buffered = btrfs_buffered_write(iocb, from);
952 	if (written_buffered < 0) {
953 		ret = written_buffered;
954 		goto out;
955 	}
956 	/*
957 	 * Ensure all data is persisted. We want the next direct IO read to be
958 	 * able to read what was just written.
959 	 */
960 	endbyte = pos + written_buffered - 1;
961 	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
962 	if (ret)
963 		goto out;
964 	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
965 	if (ret)
966 		goto out;
967 	written += written_buffered;
968 	iocb->ki_pos = pos + written_buffered;
969 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
970 				 endbyte >> PAGE_SHIFT);
971 out:
972 	return ret < 0 ? ret : written;
973 }
974 
975 static int check_direct_read(struct btrfs_fs_info *fs_info,
976 			     const struct iov_iter *iter, loff_t offset)
977 {
978 	int ret;
979 	int i, seg;
980 
981 	ret = check_direct_IO(fs_info, iter, offset);
982 	if (ret < 0)
983 		return ret;
984 
985 	if (!iter_is_iovec(iter))
986 		return 0;
987 
988 	for (seg = 0; seg < iter->nr_segs; seg++) {
989 		for (i = seg + 1; i < iter->nr_segs; i++) {
990 			const struct iovec *iov1 = iter_iov(iter) + seg;
991 			const struct iovec *iov2 = iter_iov(iter) + i;
992 
993 			if (iov1->iov_base == iov2->iov_base)
994 				return -EINVAL;
995 		}
996 	}
997 	return 0;
998 }
999 
1000 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1001 {
1002 	struct inode *inode = file_inode(iocb->ki_filp);
1003 	size_t prev_left = 0;
1004 	ssize_t read = 0;
1005 	ssize_t ret;
1006 
1007 	if (fsverity_active(inode))
1008 		return 0;
1009 
1010 	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1011 		return 0;
1012 
1013 	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1014 again:
1015 	/*
1016 	 * This is similar to what we do for direct IO writes, see the comment
1017 	 * at btrfs_direct_write(), but we also disable page faults in addition
1018 	 * to disabling them only at the iov_iter level. This is because when
1019 	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1020 	 * which can still trigger page fault ins despite having set ->nofault
1021 	 * to true of our 'to' iov_iter.
1022 	 *
1023 	 * The difference to direct IO writes is that we deadlock when trying
1024 	 * to lock the extent range in the inode's tree during he page reads
1025 	 * triggered by the fault in (while for writes it is due to waiting for
1026 	 * our own ordered extent). This is because for direct IO reads,
1027 	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
1028 	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
1029 	 */
1030 	pagefault_disable();
1031 	to->nofault = true;
1032 	ret = btrfs_dio_read(iocb, to, read);
1033 	to->nofault = false;
1034 	pagefault_enable();
1035 
1036 	/* No increment (+=) because iomap returns a cumulative value. */
1037 	if (ret > 0)
1038 		read = ret;
1039 
1040 	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1041 		const size_t left = iov_iter_count(to);
1042 
1043 		if (left == prev_left) {
1044 			/*
1045 			 * We didn't make any progress since the last attempt,
1046 			 * fallback to a buffered read for the remainder of the
1047 			 * range. This is just to avoid any possibility of looping
1048 			 * for too long.
1049 			 */
1050 			ret = read;
1051 		} else {
1052 			/*
1053 			 * We made some progress since the last retry or this is
1054 			 * the first time we are retrying. Fault in as many pages
1055 			 * as possible and retry.
1056 			 */
1057 			fault_in_iov_iter_writeable(to, left);
1058 			prev_left = left;
1059 			goto again;
1060 		}
1061 	}
1062 	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1063 	return ret < 0 ? ret : read;
1064 }
1065 
1066 int __init btrfs_init_dio(void)
1067 {
1068 	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1069 			offsetof(struct btrfs_dio_private, bbio.bio),
1070 			BIOSET_NEED_BVECS))
1071 		return -ENOMEM;
1072 
1073 	return 0;
1074 }
1075 
1076 void __cold btrfs_destroy_dio(void)
1077 {
1078 	bioset_exit(&btrfs_dio_bioset);
1079 }
1080