// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"

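/*
 * State carried through a single direct I/O request, shared between
 * btrfs_dio_iomap_begin(), btrfs_dio_submit_io() and btrfs_dio_iomap_end()
 * via iter->private.
 */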
struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

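/*
 * Private data embedded in front of the btrfs_bio of each direct I/O bio,
 * allocated from btrfs_dio_bioset.
 */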
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* The direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, as we could have raced between the invalidate
		 * in generic_file_direct_write() and locking the extent. The
		 * invalidate needs to happen so that reads after a write do
		 * not get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point we have already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock a range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	if (ret)
		unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}

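/*
 * Create the extent map (except for NOCOW writes, which reuse an existing
 * extent) and the ordered extent for a direct I/O write, and remember the
 * ordered extent in dio_data so the submission and completion paths can
 * find it.
 */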
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1 << type) |
					     (1 << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
					start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:

	return em;
}

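/*
 * COW path for direct I/O writes: reserve a new data extent and set up the
 * extent map and ordered extent for it. On zoned filesystems the reservation
 * may return -EAGAIN, in which case we wait for a zone finish and retry.
 */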
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret == -EAGAIN) {
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}

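/*
 * Prepare the extent for a direct I/O write: decide between a NOCOW write
 * into an existing or preallocated extent and a COW write into a newly
 * allocated one, reserve the necessary metadata space (relying on the data
 * space reserved earlier for the COW case), and update i_size while the
 * extent range is still locked.
 */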
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

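/*
 * iomap_begin callback for btrfs direct I/O: flush compressed writeback if
 * needed, reserve data space for writes, lock the file range, map it to an
 * extent (allocating or reusing one for writes) and fill in the iomap that
 * the generic iomap DIO code will submit bios against.
 */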
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
				  lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

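/*
 * iomap_end callback: clean up after the generic DIO code. Unlock the range
 * for reads from holes, finish (for writes) or unlock (for reads) whatever
 * part of the range no bio was submitted for, and drop the reference on the
 * ordered extent held in dio_data.
 */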
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
				  pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					  pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

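/*
 * Per-bio completion: finish the ordered extent for writes, unlock the DIO
 * extent range for reads, then hand the bio back to iomap.
 */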
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
			   "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_dio_extent(&inode->io_tree, dip->file_offset,
				  dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

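/*
 * If the bio covers only the start of the ordered extent, split the ordered
 * extent (and, for COW writes, the extent map) so that it matches the bio,
 * and attach the resulting ordered extent to the bio.
 */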
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = split_extent_map(bbio->inode, bbio->file_offset,
				       ordered->num_bytes, len,
				       ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

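/*
 * submit_io callback for iomap: initialize the btrfs_bio, account the bytes
 * submitted in dio_data and, for partial writes, split the ordered extent
 * before handing the bio to the btrfs submission path.
 */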
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};

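/*
 * Reads use iomap_dio_rw() directly, while writes use __iomap_dio_rw() so
 * that btrfs_direct_write() can complete the DIO itself with
 * current->journal_info set to a stub (see the comment there about fsync
 * and the inode lock).
 */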
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL, &data, done_before);
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since the file size may have changed just before taking the
	 * lock, or pos may have changed because of O_APPEND in generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

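/*
 * In addition to the offset and alignment checks done by check_direct_IO(),
 * reject reads where two iovec segments share the same base address.
 */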
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page faults despite having set ->nofault
	 * to true on our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}