xref: /linux/fs/btrfs/direct-io.c (revision d8b762070c3fde224f8b9ea3cf59bc41a5a3eb57)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/fsverity.h>
4 #include <linux/iomap.h>
5 #include "ctree.h"
6 #include "delalloc-space.h"
7 #include "direct-io.h"
8 #include "extent-tree.h"
9 #include "file.h"
10 #include "fs.h"
11 #include "transaction.h"
12 #include "volumes.h"
13 
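/*
 * State kept for the duration of one direct IO request, passed to the iomap
 * callbacks through iomap_iter::private.  Tracks how many bytes have been
 * submitted in bios, the data space reservation and, for writes, the ordered
 * extent covering the range being written.
 */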
14 struct btrfs_dio_data {
15 	ssize_t submitted;
16 	struct extent_changeset *data_reserved;
17 	struct btrfs_ordered_extent *ordered;
18 	bool data_space_reserved;
19 	bool nocow_done;
20 };
21 
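/*
 * Allocated in front of every direct IO bio (from btrfs_dio_bioset) to
 * remember the file range the bio covers for the end IO handler.
 */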
22 struct btrfs_dio_private {
23 	/* Range of I/O */
24 	u64 file_offset;
25 	u32 bytes;
26 
27 	/* This must be last */
28 	struct btrfs_bio bbio;
29 };
30 
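/*
 * Bioset used to allocate direct IO bios with a struct btrfs_dio_private
 * (and the embedded btrfs_bio) placed in front of the struct bio, set up in
 * btrfs_init_dio().
 */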
31 static struct bio_set btrfs_dio_bioset;
32 
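/*
 * Lock the extent range [lockstart, lockend] for direct IO, making sure there
 * are no ordered extents and, for writes, no pages in the range.  Returns 0 on
 * success, -EAGAIN if IOMAP_NOWAIT was given and we would have to block, or
 * -ENOTBLK if the caller must fall back to buffered IO.
 */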
33 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
34 			      struct extent_state **cached_state,
35 			      unsigned int iomap_flags)
36 {
37 	const bool writing = (iomap_flags & IOMAP_WRITE);
38 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
39 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
40 	struct btrfs_ordered_extent *ordered;
41 	int ret = 0;
42 
43 	while (1) {
44 		if (nowait) {
45 			if (!try_lock_extent(io_tree, lockstart, lockend,
46 					     cached_state))
47 				return -EAGAIN;
48 		} else {
49 			lock_extent(io_tree, lockstart, lockend, cached_state);
50 		}
51 		/*
52 		 * We're concerned with the entire range that we're going to be
53 		 * doing DIO to, so we need to make sure there are no ordered
54 		 * extents in this range.
55 		 */
56 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
57 						     lockend - lockstart + 1);
58 
59 		/*
60 		 * We need to make sure there are no buffered pages in this
61 		 * range either, we could have raced between the invalidate in
62 		 * generic_file_direct_write and locking the extent.  The
63 		 * invalidate needs to happen so that reads after a write do not
64 		 * get stale data.
65 		 */
66 		if (!ordered &&
67 		    (!writing || !filemap_range_has_page(inode->i_mapping,
68 							 lockstart, lockend)))
69 			break;
70 
71 		unlock_extent(io_tree, lockstart, lockend, cached_state);
72 
73 		if (ordered) {
74 			if (nowait) {
75 				btrfs_put_ordered_extent(ordered);
76 				ret = -EAGAIN;
77 				break;
78 			}
79 			/*
80 			 * If we are doing a DIO read and the ordered extent we
81 			 * found is for a buffered write, we can not wait for it
82 			 * to complete and retry, because if we do so we can
83 			 * deadlock with concurrent buffered writes on page
84 			 * locks. This happens only if our DIO read covers more
85 			 * than one extent map, if at this point it has already
86 			 * created an ordered extent for a previous extent map
87 			 * and locked its range in the inode's io tree, and a
88 			 * concurrent write against that previous extent map's
89 			 * range and this range started (we unlock the ranges
90 			 * in the io tree only when the bios complete and
91 			 * buffered writes always lock pages before attempting
92 			 * to lock the range in the io tree).
93 			 */
94 			if (writing ||
95 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
96 				btrfs_start_ordered_extent(ordered);
97 			else
98 				ret = nowait ? -EAGAIN : -ENOTBLK;
99 			btrfs_put_ordered_extent(ordered);
100 		} else {
101 			/*
102 			 * We could trigger writeback for this range (and wait
103 			 * for it to complete) and then invalidate the pages for
104 			 * this range (through invalidate_inode_pages2_range()),
105 			 * but that can lead us to a deadlock with a concurrent
106 			 * call to readahead (a buffered read or a defrag call
107 			 * triggered a readahead) on a page lock due to an
108 			 * ordered dio extent we created before but did not yet
109 			 * have a corresponding bio submitted (hence it cannot
110 			 * complete), which makes readahead wait for that
111 			 * ordered extent to complete while holding a lock on
112 			 * that page.
113 			 */
114 			ret = nowait ? -EAGAIN : -ENOTBLK;
115 		}
116 
117 		if (ret)
118 			break;
119 
120 		cond_resched();
121 	}
122 
123 	return ret;
124 }
125 
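/*
 * Create the extent map (except for NOCOW writes, which reuse the existing
 * one) and the ordered extent for a direct IO write against the given file
 * extent, stashing the ordered extent in dio_data so it can be completed or
 * cancelled later.  Returns the new extent map (NULL for NOCOW writes) or an
 * ERR_PTR on failure.
 */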
126 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
127 						  struct btrfs_dio_data *dio_data,
128 						  const u64 start,
129 						  const struct btrfs_file_extent *file_extent,
130 						  const int type)
131 {
132 	struct extent_map *em = NULL;
133 	struct btrfs_ordered_extent *ordered;
134 
135 	if (type != BTRFS_ORDERED_NOCOW) {
136 		em = btrfs_create_io_em(inode, start, file_extent, type);
137 		if (IS_ERR(em))
138 			goto out;
139 	}
140 
141 	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
142 					     (1 << type) |
143 					     (1 << BTRFS_ORDERED_DIRECT));
144 	if (IS_ERR(ordered)) {
145 		if (em) {
146 			free_extent_map(em);
147 			btrfs_drop_extent_map_range(inode, start,
148 					start + file_extent->num_bytes - 1, false);
149 		}
150 		em = ERR_CAST(ordered);
151 	} else {
152 		ASSERT(!dio_data->ordered);
153 		dio_data->ordered = ordered;
154 	}
155  out:
156 
157 	return em;
158 }
159 
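/*
 * Allocate a new data extent for a COW direct IO write at @start and create
 * the matching extent map and ordered extent.  On zoned filesystems a full
 * zone (-EAGAIN from the reservation) is handled by waiting for a zone finish
 * and retrying.
 */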
160 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
161 						  struct btrfs_dio_data *dio_data,
162 						  u64 start, u64 len)
163 {
164 	struct btrfs_root *root = inode->root;
165 	struct btrfs_fs_info *fs_info = root->fs_info;
166 	struct btrfs_file_extent file_extent;
167 	struct extent_map *em;
168 	struct btrfs_key ins;
169 	u64 alloc_hint;
170 	int ret;
171 
172 	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
173 again:
174 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
175 				   0, alloc_hint, &ins, 1, 1);
176 	if (ret == -EAGAIN) {
177 		ASSERT(btrfs_is_zoned(fs_info));
178 		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
179 			       TASK_UNINTERRUPTIBLE);
180 		goto again;
181 	}
182 	if (ret)
183 		return ERR_PTR(ret);
184 
185 	file_extent.disk_bytenr = ins.objectid;
186 	file_extent.disk_num_bytes = ins.offset;
187 	file_extent.num_bytes = ins.offset;
188 	file_extent.ram_bytes = ins.offset;
189 	file_extent.offset = 0;
190 	file_extent.compression = BTRFS_COMPRESS_NONE;
191 	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
192 				     BTRFS_ORDERED_REGULAR);
193 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
194 	if (IS_ERR(em))
195 		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
196 					   1);
197 
198 	return em;
199 }
200 
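/*
 * Prepare the mapping for a direct IO write at [start, start + *lenp).  Do a
 * NOCOW write into the existing (preallocated or nodatacow) extent if
 * possible, otherwise allocate a new extent, reserving the needed metadata
 * space in either case.  On success *map points to the extent map to write
 * into and *lenp may have been trimmed to the length actually covered.
 */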
201 static int btrfs_get_blocks_direct_write(struct extent_map **map,
202 					 struct inode *inode,
203 					 struct btrfs_dio_data *dio_data,
204 					 u64 start, u64 *lenp,
205 					 unsigned int iomap_flags)
206 {
207 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
208 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
209 	struct btrfs_file_extent file_extent;
210 	struct extent_map *em = *map;
211 	int type;
212 	u64 block_start;
213 	struct btrfs_block_group *bg;
214 	bool can_nocow = false;
215 	bool space_reserved = false;
216 	u64 len = *lenp;
217 	u64 prev_len;
218 	int ret = 0;
219 
220 	/*
221 	 * We don't allocate a new extent in the following cases:
222 	 *
223 	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
224 	 * existing extent.
225 	 * 2) The extent is marked as PREALLOC. We're good to go here and can
226 	 * just use the extent.
227 	 *
228 	 */
229 	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
230 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
231 	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
232 		if (em->flags & EXTENT_FLAG_PREALLOC)
233 			type = BTRFS_ORDERED_PREALLOC;
234 		else
235 			type = BTRFS_ORDERED_NOCOW;
236 		len = min(len, em->len - (start - em->start));
237 		block_start = extent_map_block_start(em) + (start - em->start);
238 
239 		if (can_nocow_extent(inode, start, &len,
240 				     &file_extent, false, false) == 1) {
241 			bg = btrfs_inc_nocow_writers(fs_info, block_start);
242 			if (bg)
243 				can_nocow = true;
244 		}
245 	}
246 
247 	prev_len = len;
248 	if (can_nocow) {
249 		struct extent_map *em2;
250 
251 		/* We can NOCOW, so only need to reserve metadata space. */
252 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
253 						      nowait);
254 		if (ret < 0) {
255 			/* Our caller expects us to free the input extent map. */
256 			free_extent_map(em);
257 			*map = NULL;
258 			btrfs_dec_nocow_writers(bg);
259 			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
260 				ret = -EAGAIN;
261 			goto out;
262 		}
263 		space_reserved = true;
264 
265 		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
266 					      &file_extent, type);
267 		btrfs_dec_nocow_writers(bg);
268 		if (type == BTRFS_ORDERED_PREALLOC) {
269 			free_extent_map(em);
270 			*map = em2;
271 			em = em2;
272 		}
273 
274 		if (IS_ERR(em2)) {
275 			ret = PTR_ERR(em2);
276 			goto out;
277 		}
278 
279 		dio_data->nocow_done = true;
280 	} else {
281 		/* Our caller expects us to free the input extent map. */
282 		free_extent_map(em);
283 		*map = NULL;
284 
285 		if (nowait) {
286 			ret = -EAGAIN;
287 			goto out;
288 		}
289 
290 		/*
291 		 * If we could not allocate data space before locking the file
292 		 * range and we can't do a NOCOW write, then we have to fail.
293 		 */
294 		if (!dio_data->data_space_reserved) {
295 			ret = -ENOSPC;
296 			goto out;
297 		}
298 
299 		/*
300 		 * We have to COW and we have already reserved data space before,
301 		 * so now we reserve only metadata.
302 		 */
303 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
304 						      false);
305 		if (ret < 0)
306 			goto out;
307 		space_reserved = true;
308 
309 		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
310 		if (IS_ERR(em)) {
311 			ret = PTR_ERR(em);
312 			goto out;
313 		}
314 		*map = em;
315 		len = min(len, em->len - (start - em->start));
316 		if (len < prev_len)
317 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
318 							prev_len - len, true);
319 	}
320 
321 	/*
322 	 * We have created our ordered extent, so we can now release our reservation
323 	 * for an outstanding extent.
324 	 */
325 	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
326 
327 	/*
328 	 * Need to update the i_size under the extent lock so buffered
329 	 * readers will get the updated i_size when we unlock.
330 	 */
331 	if (start + len > i_size_read(inode))
332 		i_size_write(inode, start + len);
333 out:
334 	if (ret && space_reserved) {
335 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
336 		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
337 	}
338 	*lenp = len;
339 	return ret;
340 }
341 
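/*
 * iomap_begin callback for direct IO: lock the file range, look up (or, for
 * writes, create) the extent covering it and translate that into the iomap
 * the iomap layer will use to build bios.  Returns -ENOTBLK (or -EAGAIN in
 * NOWAIT context) when we must fall back to buffered IO.
 */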
342 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
343 		loff_t length, unsigned int flags, struct iomap *iomap,
344 		struct iomap *srcmap)
345 {
346 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
347 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
348 	struct extent_map *em;
349 	struct extent_state *cached_state = NULL;
350 	struct btrfs_dio_data *dio_data = iter->private;
351 	u64 lockstart, lockend;
352 	const bool write = !!(flags & IOMAP_WRITE);
353 	int ret = 0;
354 	u64 len = length;
355 	const u64 data_alloc_len = length;
356 	bool unlock_extents = false;
357 
358 	/*
359 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
360 	 * we're NOWAIT we may submit a bio for a partial range and return
361 	 * EIOCBQUEUED, which would result in an errant short read.
362 	 *
363 	 * The best way to handle this would be to allow for partial completions
364 	 * of iocb's, so we could submit the partial bio, return and fault in
365 	 * the rest of the pages, and then submit the io for the rest of the
366 	 * range.  However we don't have that currently, so simply return
367 	 * -EAGAIN at this point so that the normal path is used.
368 	 */
369 	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
370 		return -EAGAIN;
371 
372 	/*
373 	 * Cap the size of reads to that usually seen in buffered I/O as we need
374 	 * to allocate a contiguous array for the checksums.
375 	 */
376 	if (!write)
377 		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
378 
379 	lockstart = start;
380 	lockend = start + len - 1;
381 
382 	/*
383 	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
384 	 * enough if we've written compressed pages to this area, so we need to
385 	 * flush the dirty pages again to make absolutely sure that any
386 	 * outstanding dirty pages are on disk - the first flush only starts
387 	 * compression on the data, while keeping the pages locked, so by the
388 	 * time the second flush returns we know bios for the compressed pages
389 	 * were submitted and finished, and the pages are no longer under writeback.
390 	 *
391 	 * If we have a NOWAIT request and we have any pages in the range that
392 	 * are locked, likely due to compression still in progress, we don't want
393 	 * to block on page locks. We also don't want to block on pages marked as
394 	 * dirty or under writeback (same as for the non-compression case).
395 	 * iomap_dio_rw() did the same check, but after that and before we got
396 	 * here, mmap'ed writes may have happened or buffered reads started
397 	 * (readpage() and readahead(), which lock pages), as we haven't locked
398 	 * the file range yet.
399 	 */
400 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
401 		     &BTRFS_I(inode)->runtime_flags)) {
402 		if (flags & IOMAP_NOWAIT) {
403 			if (filemap_range_needs_writeback(inode->i_mapping,
404 							  lockstart, lockend))
405 				return -EAGAIN;
406 		} else {
407 			ret = filemap_fdatawrite_range(inode->i_mapping, start,
408 						       start + length - 1);
409 			if (ret)
410 				return ret;
411 		}
412 	}
413 
414 	memset(dio_data, 0, sizeof(*dio_data));
415 
416 	/*
417 	 * We always try to allocate data space and must do it before locking
418 	 * the file range, to avoid deadlocks with concurrent writes to the same
419 	 * range if the range has several extents and the writes don't expand the
420 	 * current i_size (the inode lock is taken in shared mode). If we fail to
421 	 * allocate data space here we continue and later, after locking the
422 	 * file range, we fail with ENOSPC only if we figure out we can not do a
423 	 * NOCOW write.
424 	 */
425 	if (write && !(flags & IOMAP_NOWAIT)) {
426 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
427 						  &dio_data->data_reserved,
428 						  start, data_alloc_len, false);
429 		if (!ret)
430 			dio_data->data_space_reserved = true;
431 		else if (ret && !(BTRFS_I(inode)->flags &
432 				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
433 			goto err;
434 	}
435 
436 	/*
437 	 * If this errors out it's because we couldn't invalidate pagecache for
438 	 * this range and we need to fall back to buffered IO, or we are doing a
439 	 * NOWAIT read/write and we need to block.
440 	 */
441 	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
442 	if (ret < 0)
443 		goto err;
444 
445 	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
446 	if (IS_ERR(em)) {
447 		ret = PTR_ERR(em);
448 		goto unlock_err;
449 	}
450 
451 	/*
452 	 * Ok, for INLINE and COMPRESSED extents we need to fall back to buffered
453 	 * IO.  INLINE is special, and we could probably kludge it in here, but
454 	 * it's still buffered, so for safety let's just fall back to the generic
455 	 * buffered path.
456 	 *
457 	 * For COMPRESSED we _have_ to read the entire extent in so we can
458 	 * decompress it, so there will be buffering required no matter what we
459 	 * do, so go ahead and fall back to buffered.
460 	 *
461 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
462 	 * to buffered IO.  Don't blame me, this is the price we pay for using
463 	 * the generic code.
464 	 */
465 	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
466 		free_extent_map(em);
467 		/*
468 		 * If we are in a NOWAIT context, return -EAGAIN in order to
469 		 * fall back to buffered IO. This is not only because we can
470 		 * block with buffered IO (no support for NOWAIT semantics at
471 		 * the moment) but also to avoid returning short reads to user
472 		 * space - this happens if we were able to read some data from
473 		 * previous non-compressed extents and then when we fall back to
474 		 * buffered IO, at btrfs_file_read_iter() by calling
475 		 * filemap_read(), we fail to fault in pages for the read buffer,
476 		 * in which case filemap_read() returns a short read (the number
477 		 * of bytes previously read is > 0, so it does not return -EFAULT).
478 		 */
479 		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
480 		goto unlock_err;
481 	}
482 
483 	len = min(len, em->len - (start - em->start));
484 
485 	/*
486 	 * If we have a NOWAIT request and the range contains multiple extents
487 	 * (or a mix of extents and holes), then we return -EAGAIN to make the
488 	 * caller fallback to a context where it can do a blocking (without
489 	 * caller fall back to a context where it can do a blocking (without
490 	 * success to the caller, which is not optimal for writes and for reads
491 	 * it can result in unexpected behaviour for an application.
492 	 *
493 	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
494 	 * iomap_dio_rw(), we can end up returning less data then what the caller
495 	 * iomap_dio_rw(), we can end up returning less data than what the caller
496 	 * That is, the caller asked to read N bytes and we return less than that,
497 	 * which is wrong unless we are crossing EOF. This happens if we get a
498 	 * page fault error when trying to fault in pages for the buffer that is
499 	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
500 	 * have previously submitted bios for other extents in the range, in
501 	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
502 	 * those bios have completed by the time we get the page fault error,
503 	 * which we return back to our caller - we should only return EIOCBQUEUED
504 	 * after we have submitted bios for all the extents in the range.
505 	 */
506 	if ((flags & IOMAP_NOWAIT) && len < length) {
507 		free_extent_map(em);
508 		ret = -EAGAIN;
509 		goto unlock_err;
510 	}
511 
512 	if (write) {
513 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
514 						    start, &len, flags);
515 		if (ret < 0)
516 			goto unlock_err;
517 		unlock_extents = true;
518 		/* Recalc len in case the new em is smaller than requested */
519 		len = min(len, em->len - (start - em->start));
520 		if (dio_data->data_space_reserved) {
521 			u64 release_offset;
522 			u64 release_len = 0;
523 
524 			if (dio_data->nocow_done) {
525 				release_offset = start;
526 				release_len = data_alloc_len;
527 			} else if (len < data_alloc_len) {
528 				release_offset = start + len;
529 				release_len = data_alloc_len - len;
530 			}
531 
532 			if (release_len > 0)
533 				btrfs_free_reserved_data_space(BTRFS_I(inode),
534 							       dio_data->data_reserved,
535 							       release_offset,
536 							       release_len);
537 		}
538 	} else {
539 		/*
540 		 * We need to unlock only the end area that we aren't using.
541 		 * The rest is going to be unlocked by the endio routine.
542 		 */
543 		lockstart = start + len;
544 		if (lockstart < lockend)
545 			unlock_extents = true;
546 	}
547 
548 	if (unlock_extents)
549 		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
550 			      &cached_state);
551 	else
552 		free_extent_state(cached_state);
553 
554 	/*
555 	 * Translate extent map information to iomap.
556 	 * We trim the extents (and move the addr) even though iomap code does
557 	 * that, since we have locked only the parts we are performing I/O in.
558 	 */
559 	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
560 	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
561 		iomap->addr = IOMAP_NULL_ADDR;
562 		iomap->type = IOMAP_HOLE;
563 	} else {
564 		iomap->addr = extent_map_block_start(em) + (start - em->start);
565 		iomap->type = IOMAP_MAPPED;
566 	}
567 	iomap->offset = start;
568 	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
569 	iomap->length = len;
570 	free_extent_map(em);
571 
572 	return 0;
573 
574 unlock_err:
575 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
576 		      &cached_state);
577 err:
578 	if (dio_data->data_space_reserved) {
579 		btrfs_free_reserved_data_space(BTRFS_I(inode),
580 					       dio_data->data_reserved,
581 					       start, data_alloc_len);
582 		extent_changeset_free(dio_data->data_reserved);
583 	}
584 
585 	return ret;
586 }
587 
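/*
 * iomap_end callback for direct IO: clean up whatever iomap_begin set up for
 * the part of the range that was not submitted, unlocking the extent range
 * for reads and finishing (failing) the ordered extent for writes, and return
 * -ENOTBLK so the remainder is redone through buffered IO.
 */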
588 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
589 		ssize_t written, unsigned int flags, struct iomap *iomap)
590 {
591 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
592 	struct btrfs_dio_data *dio_data = iter->private;
593 	size_t submitted = dio_data->submitted;
594 	const bool write = !!(flags & IOMAP_WRITE);
595 	int ret = 0;
596 
597 	if (!write && (iomap->type == IOMAP_HOLE)) {
598 		/* If reading from a hole, unlock and return */
599 		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
600 			      NULL);
601 		return 0;
602 	}
603 
604 	if (submitted < length) {
605 		pos += submitted;
606 		length -= submitted;
607 		if (write)
608 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
609 						    pos, length, false);
610 		else
611 			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
612 				      pos + length - 1, NULL);
613 		ret = -ENOTBLK;
614 	}
615 	if (write) {
616 		btrfs_put_ordered_extent(dio_data->ordered);
617 		dio_data->ordered = NULL;
618 	}
619 
620 	if (write)
621 		extent_changeset_free(dio_data->data_reserved);
622 	return ret;
623 }
624 
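/*
 * End IO callback for direct IO bios.  Finish the ordered extent for writes or
 * unlock the extent range for reads, then hand the bio back to the iomap layer
 * for the actual direct IO completion.
 */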
625 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
626 {
627 	struct btrfs_dio_private *dip =
628 		container_of(bbio, struct btrfs_dio_private, bbio);
629 	struct btrfs_inode *inode = bbio->inode;
630 	struct bio *bio = &bbio->bio;
631 
632 	if (bio->bi_status) {
633 		btrfs_warn(inode->root->fs_info,
634 		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
635 			   btrfs_ino(inode), bio->bi_opf,
636 			   dip->file_offset, dip->bytes, bio->bi_status);
637 	}
638 
639 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
640 		btrfs_finish_ordered_extent(bbio->ordered, NULL,
641 					    dip->file_offset, dip->bytes,
642 					    !bio->bi_status);
643 	} else {
644 		unlock_extent(&inode->io_tree, dip->file_offset,
645 			      dip->file_offset + dip->bytes - 1, NULL);
646 	}
647 
648 	bbio->bio.bi_private = bbio->private;
649 	iomap_dio_bio_end_io(bio);
650 }
651 
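/*
 * A direct IO write may be submitted as multiple bios while we created a
 * single ordered extent for the whole range.  Split the ordered extent (and
 * its extent map, unless this is a NOCOW write) so that it matches exactly
 * the range covered by @bbio, and attach the resulting ordered extent to the
 * bio.
 */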
652 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
653 					struct btrfs_ordered_extent *ordered)
654 {
655 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
656 	u64 len = bbio->bio.bi_iter.bi_size;
657 	struct btrfs_ordered_extent *new;
658 	int ret;
659 
660 	/* Must always be called for the beginning of an ordered extent. */
661 	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
662 		return -EINVAL;
663 
664 	/* No need to split if the ordered extent covers the entire bio. */
665 	if (ordered->disk_num_bytes == len) {
666 		refcount_inc(&ordered->refs);
667 		bbio->ordered = ordered;
668 		return 0;
669 	}
670 
671 	/*
672 	 * Don't split the extent_map for NOCOW extents, as we're writing into
673 	 * a pre-existing one.
674 	 */
675 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
676 		ret = split_extent_map(bbio->inode, bbio->file_offset,
677 				       ordered->num_bytes, len,
678 				       ordered->disk_bytenr);
679 		if (ret)
680 			return ret;
681 	}
682 
683 	new = btrfs_split_ordered_extent(ordered, len);
684 	if (IS_ERR(new))
685 		return PTR_ERR(new);
686 	bbio->ordered = new;
687 	return 0;
688 }
689 
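/*
 * submit_io hook called by the iomap code for each direct IO bio.  Initialize
 * the btrfs_bio, record the file range in the btrfs_dio_private, account the
 * submitted bytes, split the ordered extent for partial writes and submit the
 * bio through the btrfs bio layer.
 */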
690 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
691 				loff_t file_offset)
692 {
693 	struct btrfs_bio *bbio = btrfs_bio(bio);
694 	struct btrfs_dio_private *dip =
695 		container_of(bbio, struct btrfs_dio_private, bbio);
696 	struct btrfs_dio_data *dio_data = iter->private;
697 
698 	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
699 		       btrfs_dio_end_io, bio->bi_private);
700 	bbio->inode = BTRFS_I(iter->inode);
701 	bbio->file_offset = file_offset;
702 
703 	dip->file_offset = file_offset;
704 	dip->bytes = bio->bi_iter.bi_size;
705 
706 	dio_data->submitted += bio->bi_iter.bi_size;
707 
708 	/*
709 	 * Check if we are doing a partial write.  If we are, we need to split
710 	 * the ordered extent to match the submitted bio.  Hang on to the
711 	 * remaining unfinishable ordered_extent in dio_data so that it can be
712 	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
713 	 * remaining pages is blocked on the outstanding ordered extent.
714 	 */
715 	if (iter->flags & IOMAP_WRITE) {
716 		int ret;
717 
718 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
719 		if (ret) {
720 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
721 						    file_offset, dip->bytes,
722 						    !ret);
723 			bio->bi_status = errno_to_blk_status(ret);
724 			iomap_dio_bio_end_io(bio);
725 			return;
726 		}
727 	}
728 
729 	btrfs_submit_bio(bbio, 0);
730 }
731 
732 static const struct iomap_ops btrfs_dio_iomap_ops = {
733 	.iomap_begin            = btrfs_dio_iomap_begin,
734 	.iomap_end              = btrfs_dio_iomap_end,
735 };
736 
737 static const struct iomap_dio_ops btrfs_dio_ops = {
738 	.submit_io		= btrfs_dio_submit_io,
739 	.bio_set		= &btrfs_dio_bioset,
740 };
741 
742 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
743 			      size_t done_before)
744 {
745 	struct btrfs_dio_data data = { 0 };
746 
747 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
748 			    IOMAP_DIO_PARTIAL, &data, done_before);
749 }
750 
751 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
752 					 size_t done_before)
753 {
754 	struct btrfs_dio_data data = { 0 };
755 
756 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
757 			    IOMAP_DIO_PARTIAL, &data, done_before);
758 }
759 
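/*
 * Direct IO requires both the file offset and the memory buffers to be
 * aligned to the filesystem's sector size; callers fall back to buffered IO
 * when this check fails.
 */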
760 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
761 			       const struct iov_iter *iter, loff_t offset)
762 {
763 	const u32 blocksize_mask = fs_info->sectorsize - 1;
764 
765 	if (offset & blocksize_mask)
766 		return -EINVAL;
767 
768 	if (iov_iter_alignment(iter) & blocksize_mask)
769 		return -EINVAL;
770 
771 	return 0;
772 }
773 
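/*
 * Entry point for direct IO writes.  Takes the inode lock (shared if
 * possible), performs the write through iomap with page faults disabled on
 * the iov_iter, retries after faulting in pages on -EFAULT, and falls back to
 * buffered IO (followed by a flush and page cache invalidation of the range)
 * when direct IO is not possible or did not write everything.
 */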
774 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
775 {
776 	struct file *file = iocb->ki_filp;
777 	struct inode *inode = file_inode(file);
778 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
779 	loff_t pos;
780 	ssize_t written = 0;
781 	ssize_t written_buffered;
782 	size_t prev_left = 0;
783 	loff_t endbyte;
784 	ssize_t ret;
785 	unsigned int ilock_flags = 0;
786 	struct iomap_dio *dio;
787 
788 	if (iocb->ki_flags & IOCB_NOWAIT)
789 		ilock_flags |= BTRFS_ILOCK_TRY;
790 
791 	/*
792 	 * If the write DIO is within EOF, use a shared lock, but only if the
793 	 * security bits will likely not be dropped by file_remove_privs() called
794 	 * from btrfs_write_check(). Both conditions need to be rechecked after
795 	 * the lock is acquired.
796 	 */
797 	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
798 		ilock_flags |= BTRFS_ILOCK_SHARED;
799 
800 relock:
801 	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
802 	if (ret < 0)
803 		return ret;
804 
805 	/* Shared lock cannot be used with security bits set. */
806 	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
807 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
808 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
809 		goto relock;
810 	}
811 
812 	ret = generic_write_checks(iocb, from);
813 	if (ret <= 0) {
814 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
815 		return ret;
816 	}
817 
818 	ret = btrfs_write_check(iocb, from, ret);
819 	if (ret < 0) {
820 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
821 		goto out;
822 	}
823 
824 	pos = iocb->ki_pos;
825 	/*
826 	 * Re-check since file size may have changed just before taking the
827 	 * lock or pos may have changed because of O_APPEND in generic_write_checks().
828 	 */
829 	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
830 	    pos + iov_iter_count(from) > i_size_read(inode)) {
831 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
832 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
833 		goto relock;
834 	}
835 
836 	if (check_direct_IO(fs_info, from, pos)) {
837 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
838 		goto buffered;
839 	}
840 
841 	/*
842 	 * The iov_iter can be mapped to the same file range we are writing to.
843 	 * If that's the case, then we will deadlock in the iomap code, because
844 	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
845 	 * an ordered extent, and after that it will fault in the pages that the
846 	 * iov_iter refers to. During the fault in we end up in the readahead
847 	 * pages code (starting at btrfs_readahead()), which will lock the range,
848 	 * find that ordered extent and then wait for it to complete (at
849 	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
850 	 * obviously the ordered extent can never complete as we didn't submit
851 	 * yet the respective bio(s). This always happens when the buffer is
852 	 * memory mapped to the same file range, since the iomap DIO code always
853 	 * invalidates pages in the target file range (after starting and waiting
854 	 * for any writeback).
855 	 *
856 	 * So here we disable page faults in the iov_iter and then retry if we
857 	 * got -EFAULT, faulting in the pages before the retry.
858 	 */
859 again:
860 	from->nofault = true;
861 	dio = btrfs_dio_write(iocb, from, written);
862 	from->nofault = false;
863 
864 	if (IS_ERR_OR_NULL(dio)) {
865 		ret = PTR_ERR_OR_ZERO(dio);
866 	} else {
867 		struct btrfs_file_private stack_private = { 0 };
868 		struct btrfs_file_private *private;
869 		const bool have_private = (file->private_data != NULL);
870 
871 		if (!have_private)
872 			file->private_data = &stack_private;
873 
874 		/*
875 		 * If we have a synchronous write, we must make sure the fsync
876 		 * triggered by the iomap_dio_complete() call below doesn't
877 		 * deadlock on the inode lock - we are already holding it and we
878 		 * can't call it after unlocking because we may need to complete
879 		 * partial writes due to the input buffer (or parts of it) not
880 		 * being already faulted in.
881 		 */
882 		private = file->private_data;
883 		private->fsync_skip_inode_lock = true;
884 		ret = iomap_dio_complete(dio);
885 		private->fsync_skip_inode_lock = false;
886 
887 		if (!have_private)
888 			file->private_data = NULL;
889 	}
890 
891 	/* No increment (+=) because iomap returns a cumulative value. */
892 	if (ret > 0)
893 		written = ret;
894 
895 	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
896 		const size_t left = iov_iter_count(from);
897 		/*
898 		 * We have more data left to write. Try to fault in as many of
899 		 * the remaining pages as possible and retry. We do this without
900 		 * releasing and locking again the inode, to prevent races with
901 		 * truncate.
902 		 *
903 		 * Also, in case the iov refers to pages in the file range of the
904 		 * file we want to write to (due to a mmap), we could enter an
905 		 * infinite loop if we retry after faulting the pages in, since
906 		 * iomap will invalidate any pages in the range early on, before
907 		 * it tries to fault in the pages of the iov. So we keep track of
908 		 * how much of the iov was left after the previous EFAULT and fall
909 		 * back to buffered IO in case we haven't made any progress.
910 		 */
911 		if (left == prev_left) {
912 			ret = -ENOTBLK;
913 		} else {
914 			fault_in_iov_iter_readable(from, left);
915 			prev_left = left;
916 			goto again;
917 		}
918 	}
919 
920 	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
921 
922 	/*
923 	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
924 	 * we must fall back to buffered IO.
925 	 */
926 	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
927 		goto out;
928 
929 buffered:
930 	/*
931 	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
932 	 * it must retry the operation in a context where blocking is acceptable,
933 	 * because even if we end up not blocking during the buffered IO attempt
934 	 * below, we will block when flushing and waiting for the IO.
935 	 */
936 	if (iocb->ki_flags & IOCB_NOWAIT) {
937 		ret = -EAGAIN;
938 		goto out;
939 	}
940 
941 	pos = iocb->ki_pos;
942 	written_buffered = btrfs_buffered_write(iocb, from);
943 	if (written_buffered < 0) {
944 		ret = written_buffered;
945 		goto out;
946 	}
947 	/*
948 	 * Ensure all data is persisted. We want the next direct IO read to be
949 	 * able to read what was just written.
950 	 */
951 	endbyte = pos + written_buffered - 1;
952 	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
953 	if (ret)
954 		goto out;
955 	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
956 	if (ret)
957 		goto out;
958 	written += written_buffered;
959 	iocb->ki_pos = pos + written_buffered;
960 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
961 				 endbyte >> PAGE_SHIFT);
962 out:
963 	return ret < 0 ? ret : written;
964 }
965 
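/*
 * Besides the alignment checks, reject direct reads whose iov_iter contains
 * two iovecs with the same base address, which is known to break the fault-in
 * retry logic in btrfs_direct_read().
 */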
966 static int check_direct_read(struct btrfs_fs_info *fs_info,
967 			     const struct iov_iter *iter, loff_t offset)
968 {
969 	int ret;
970 	int i, seg;
971 
972 	ret = check_direct_IO(fs_info, iter, offset);
973 	if (ret < 0)
974 		return ret;
975 
976 	if (!iter_is_iovec(iter))
977 		return 0;
978 
979 	for (seg = 0; seg < iter->nr_segs; seg++) {
980 		for (i = seg + 1; i < iter->nr_segs; i++) {
981 			const struct iovec *iov1 = iter_iov(iter) + seg;
982 			const struct iovec *iov2 = iter_iov(iter) + i;
983 
984 			if (iov1->iov_base == iov2->iov_base)
985 				return -EINVAL;
986 		}
987 	}
988 	return 0;
989 }
990 
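/*
 * Entry point for direct IO reads.  Runs the read through iomap with page
 * faults disabled, faulting in the destination pages and retrying on -EFAULT,
 * and returns a short read (so the caller can fall back to buffered IO for
 * the rest) when no progress can be made.
 */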
991 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
992 {
993 	struct inode *inode = file_inode(iocb->ki_filp);
994 	size_t prev_left = 0;
995 	ssize_t read = 0;
996 	ssize_t ret;
997 
998 	if (fsverity_active(inode))
999 		return 0;
1000 
1001 	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1002 		return 0;
1003 
1004 	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1005 again:
1006 	/*
1007 	 * This is similar to what we do for direct IO writes, see the comment
1008 	 * at btrfs_direct_write(), but we also disable page faults in addition
1009 	 * to disabling them only at the iov_iter level. This is because when
1010 	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1011 	 * which can still trigger page fault ins despite having set ->nofault
1012 	 * which can still fault pages in despite having set ->nofault
1013 	 * to true on our 'to' iov_iter.
1014 	 * The difference to direct IO writes is that we deadlock when trying
1015 	 * to lock the extent range in the inode's tree during he page reads
1016 	 * to lock the extent range in the inode's io tree during the page reads
1017 	 * our own ordered extent). This is because for direct IO reads,
1018 	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
1019 	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
1020 	 */
1021 	pagefault_disable();
1022 	to->nofault = true;
1023 	ret = btrfs_dio_read(iocb, to, read);
1024 	to->nofault = false;
1025 	pagefault_enable();
1026 
1027 	/* No increment (+=) because iomap returns a cumulative value. */
1028 	if (ret > 0)
1029 		read = ret;
1030 
1031 	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1032 		const size_t left = iov_iter_count(to);
1033 
1034 		if (left == prev_left) {
1035 			/*
1036 			 * We didn't make any progress since the last attempt,
1037 			 * fall back to a buffered read for the remainder of the
1038 			 * range. This is just to avoid any possibility of looping
1039 			 * for too long.
1040 			 */
1041 			ret = read;
1042 		} else {
1043 			/*
1044 			 * We made some progress since the last retry or this is
1045 			 * the first time we are retrying. Fault in as many pages
1046 			 * as possible and retry.
1047 			 */
1048 			fault_in_iov_iter_writeable(to, left);
1049 			prev_left = left;
1050 			goto again;
1051 		}
1052 	}
1053 	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1054 	return ret < 0 ? ret : read;
1055 }
1056 
1057 int __init btrfs_init_dio(void)
1058 {
1059 	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1060 			offsetof(struct btrfs_dio_private, bbio.bio),
1061 			BIOSET_NEED_BVECS))
1062 		return -ENOMEM;
1063 
1064 	return 0;
1065 }
1066 
1067 void __cold btrfs_destroy_dio(void)
1068 {
1069 	bioset_exit(&btrfs_dio_bioset);
1070 }
1071