1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/writeback.h>
14 #include <linux/compat.h>
15 #include <linux/slab.h>
16 #include <linux/btrfs.h>
17 #include <linux/uio.h>
18 #include <linux/iversion.h>
19 #include <linux/fsverity.h>
20 #include "ctree.h"
21 #include "direct-io.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "btrfs_inode.h"
25 #include "tree-log.h"
26 #include "locking.h"
27 #include "qgroup.h"
28 #include "compression.h"
29 #include "delalloc-space.h"
30 #include "reflink.h"
31 #include "subpage.h"
32 #include "fs.h"
33 #include "accessors.h"
34 #include "extent-tree.h"
35 #include "file-item.h"
36 #include "ioctl.h"
37 #include "file.h"
38 #include "super.h"
39 #include "print-tree.h"
40
41 /*
42 * Unlock folio after btrfs_file_write() is done with it.
43 */
44 static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
45 u64 pos, u64 copied)
46 {
47 u64 block_start = round_down(pos, fs_info->sectorsize);
48 u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
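/*
 * Example (assuming a 4K sector size): pos = 6144 and copied = 100 round
 * out to the single block [4096, 8192), i.e. block_start = 4096 and
 * block_len = 4096.
 */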
49
50 ASSERT(block_len <= U32_MAX);
51 /*
52 * The folio checked bit is some magic around finding folios that have
53 * been modified without going through btrfs_dirty_folio(). Clear it here.
54 * There should be no need to mark the folios accessed, as
55 * prepare_one_folio() should have already marked them accessed when
56 * getting them from the page cache.
57 */
58 btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
59 folio_unlock(folio);
60 folio_put(folio);
61 }
62
63 /*
64 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
65 * - Mark newly dirtied folio as DELALLOC in the io tree.
66 * Used to advise which range is to be written back.
67 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
68 * - Update inode size for past EOF write
69 */
70 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
71 size_t write_bytes, struct extent_state **cached, bool noreserve)
72 {
73 struct btrfs_fs_info *fs_info = inode->root->fs_info;
74 int ret = 0;
75 u64 num_bytes;
76 u64 start_pos;
77 u64 end_of_last_block;
78 u64 end_pos = pos + write_bytes;
79 loff_t isize = i_size_read(&inode->vfs_inode);
80 unsigned int extra_bits = 0;
81
82 if (write_bytes == 0)
83 return 0;
84
85 if (noreserve)
86 extra_bits |= EXTENT_NORESERVE;
87
88 start_pos = round_down(pos, fs_info->sectorsize);
89 num_bytes = round_up(write_bytes + pos - start_pos,
90 fs_info->sectorsize);
91 ASSERT(num_bytes <= U32_MAX);
92 ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);
93
94 end_of_last_block = start_pos + num_bytes - 1;
95
96 /*
97 * The pages may have already been dirty; clear out the old accounting
98 * so we can set things up properly.
99 */
100 btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
101 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
102 cached);
103
104 ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
105 extra_bits, cached);
106 if (ret)
107 return ret;
108
109 btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
110 btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
111 btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
112
113 /*
114 * We've only changed i_size in RAM, and we haven't updated
115 * the disk i_size. There is no need to log the inode
116 * at this time.
117 */
118 if (end_pos > isize)
119 i_size_write(&inode->vfs_inode, end_pos);
120 return 0;
121 }
122
123 /*
124 * This is very complex, but the basic idea is to drop all extents
125 * in the range [args->start, args->end) for the given inode, from the
126 * subvolume (or log) tree given by @root.
127 *
128 * If an extent intersects the range but is not entirely inside the range
129 * it is either truncated or split. Anything entirely inside the range
130 * is deleted from the tree.
131 *
132 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
133 * to deal with that. We set the field 'bytes_found' of the arguments structure
134 * with the number of allocated bytes found in the target range, so that the
135 * caller can update the inode's number of bytes in an atomic way when
136 * replacing extents in a range to avoid races with stat(2).
137 */
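/*
 * The main loop below handles four ways a file extent item can overlap
 * the drop range (see the ASCII diagrams further down):
 *
 * 1) Range strictly inside the extent: the item is duplicated, the first
 *    copy is trimmed to end at args->start and the second copy then falls
 *    through to case 2.
 * 2) Range covers the front of the extent: the item's key offset is moved
 *    to args->end and the extent data offset/length are adjusted.
 * 3) Range covers the tail of the extent: the item's length is trimmed to
 *    end at args->start.
 * 4) Range covers the whole extent: the item is queued for deletion and,
 *    if needed, its extent reference is dropped.
 */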
138 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root, struct btrfs_inode *inode,
140 struct btrfs_drop_extents_args *args)
141 {
142 struct btrfs_fs_info *fs_info = root->fs_info;
143 struct extent_buffer *leaf;
144 struct btrfs_file_extent_item *fi;
145 struct btrfs_key key;
146 struct btrfs_key new_key;
147 u64 ino = btrfs_ino(inode);
148 u64 search_start = args->start;
149 u64 disk_bytenr = 0;
150 u64 num_bytes = 0;
151 u64 extent_offset = 0;
152 u64 extent_end = 0;
153 u64 last_end = args->start;
154 int del_nr = 0;
155 int del_slot = 0;
156 int extent_type;
157 int recow;
158 int ret;
159 int modify_tree = -1;
160 int update_refs;
161 int found = 0;
162 struct btrfs_path *path = args->path;
163
164 args->bytes_found = 0;
165 args->extent_inserted = false;
166
167 /* Must always have a path if ->replace_extent is true */
168 ASSERT(!(args->replace_extent && !args->path));
169
170 if (!path) {
171 path = btrfs_alloc_path();
172 if (!path) {
173 ret = -ENOMEM;
174 goto out;
175 }
176 }
177
178 if (args->drop_cache)
179 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
180
181 if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
182 modify_tree = 0;
183
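/*
 * When dropping items from a log tree (BTRFS_TREE_LOG_OBJECTID) we must
 * not touch extent references, since log items only mirror the subvolume
 * tree, which holds the actual references.
 */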
184 update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
185 while (1) {
186 recow = 0;
187 ret = btrfs_lookup_file_extent(trans, root, path, ino,
188 search_start, modify_tree);
189 if (ret < 0)
190 break;
191 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
192 leaf = path->nodes[0];
193 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
194 if (key.objectid == ino &&
195 key.type == BTRFS_EXTENT_DATA_KEY)
196 path->slots[0]--;
197 }
198 ret = 0;
199 next_slot:
200 leaf = path->nodes[0];
201 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
202 if (WARN_ON(del_nr > 0)) {
203 btrfs_print_leaf(leaf);
204 ret = -EINVAL;
205 break;
206 }
207 ret = btrfs_next_leaf(root, path);
208 if (ret < 0)
209 break;
210 if (ret > 0) {
211 ret = 0;
212 break;
213 }
214 leaf = path->nodes[0];
215 recow = 1;
216 }
217
218 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
219
220 if (key.objectid > ino)
221 break;
222 if (WARN_ON_ONCE(key.objectid < ino) ||
223 key.type < BTRFS_EXTENT_DATA_KEY) {
224 ASSERT(del_nr == 0);
225 path->slots[0]++;
226 goto next_slot;
227 }
228 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
229 break;
230
231 fi = btrfs_item_ptr(leaf, path->slots[0],
232 struct btrfs_file_extent_item);
233 extent_type = btrfs_file_extent_type(leaf, fi);
234
235 if (extent_type == BTRFS_FILE_EXTENT_REG ||
236 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
237 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
238 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
239 extent_offset = btrfs_file_extent_offset(leaf, fi);
240 extent_end = key.offset +
241 btrfs_file_extent_num_bytes(leaf, fi);
242 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
243 extent_end = key.offset +
244 btrfs_file_extent_ram_bytes(leaf, fi);
245 } else {
246 /* can't happen */
247 BUG();
248 }
249
250 /*
251 * Don't skip extent items representing 0 byte lengths. They
252 * used to be created (due to a bug) if we hit an -ENOSPC condition
253 * while punching holes. So if we find one here, just ensure we
254 * delete it, otherwise we would insert a new file extent item
255 * with the same key (offset) as that 0 byte length file
256 * extent item in the call to btrfs_setup_item_for_insert() later
257 * in this function.
258 */
259 if (extent_end == key.offset && extent_end >= search_start) {
260 last_end = extent_end;
261 goto delete_extent_item;
262 }
263
264 if (extent_end <= search_start) {
265 path->slots[0]++;
266 goto next_slot;
267 }
268
269 found = 1;
270 search_start = max(key.offset, args->start);
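/*
 * We are going to modify this extent item, but either we moved to this
 * leaf via btrfs_next_leaf() (recow) or the search was done without COW
 * intent (modify_tree == 0). Release the path and redo the search with
 * modify_tree set so we get a path we can safely modify.
 */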
271 if (recow || !modify_tree) {
272 modify_tree = -1;
273 btrfs_release_path(path);
274 continue;
275 }
276
277 /*
278 * | - range to drop - |
279 * | -------- extent -------- |
280 */
281 if (args->start > key.offset && args->end < extent_end) {
282 if (WARN_ON(del_nr > 0)) {
283 btrfs_print_leaf(leaf);
284 ret = -EINVAL;
285 break;
286 }
287 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
288 ret = -EOPNOTSUPP;
289 break;
290 }
291
292 memcpy(&new_key, &key, sizeof(new_key));
293 new_key.offset = args->start;
294 ret = btrfs_duplicate_item(trans, root, path,
295 &new_key);
296 if (ret == -EAGAIN) {
297 btrfs_release_path(path);
298 continue;
299 }
300 if (ret < 0)
301 break;
302
303 leaf = path->nodes[0];
304 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
305 struct btrfs_file_extent_item);
306 btrfs_set_file_extent_num_bytes(leaf, fi,
307 args->start - key.offset);
308
309 fi = btrfs_item_ptr(leaf, path->slots[0],
310 struct btrfs_file_extent_item);
311
312 extent_offset += args->start - key.offset;
313 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
314 btrfs_set_file_extent_num_bytes(leaf, fi,
315 extent_end - args->start);
316
317 if (update_refs && disk_bytenr > 0) {
318 struct btrfs_ref ref = {
319 .action = BTRFS_ADD_DELAYED_REF,
320 .bytenr = disk_bytenr,
321 .num_bytes = num_bytes,
322 .parent = 0,
323 .owning_root = btrfs_root_id(root),
324 .ref_root = btrfs_root_id(root),
325 };
326 btrfs_init_data_ref(&ref, new_key.objectid,
327 args->start - extent_offset,
328 0, false);
329 ret = btrfs_inc_extent_ref(trans, &ref);
330 if (unlikely(ret)) {
331 btrfs_abort_transaction(trans, ret);
332 break;
333 }
334 }
335 key.offset = args->start;
336 }
337 /*
338 * From here on out we will have actually dropped something, so
339 * last_end can be updated.
340 */
341 last_end = extent_end;
342
343 /*
344 * | ---- range to drop ----- |
345 * | -------- extent -------- |
346 */
347 if (args->start <= key.offset && args->end < extent_end) {
348 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
349 ret = -EOPNOTSUPP;
350 break;
351 }
352
353 memcpy(&new_key, &key, sizeof(new_key));
354 new_key.offset = args->end;
355 btrfs_set_item_key_safe(trans, path, &new_key);
356
357 extent_offset += args->end - key.offset;
358 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
359 btrfs_set_file_extent_num_bytes(leaf, fi,
360 extent_end - args->end);
361 if (update_refs && disk_bytenr > 0)
362 args->bytes_found += args->end - key.offset;
363 break;
364 }
365
366 search_start = extent_end;
367 /*
368 * | ---- range to drop ----- |
369 * | -------- extent -------- |
370 */
371 if (args->start > key.offset && args->end >= extent_end) {
372 if (WARN_ON(del_nr > 0)) {
373 btrfs_print_leaf(leaf);
374 ret = -EINVAL;
375 break;
376 }
377 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
378 ret = -EOPNOTSUPP;
379 break;
380 }
381
382 btrfs_set_file_extent_num_bytes(leaf, fi,
383 args->start - key.offset);
384 if (update_refs && disk_bytenr > 0)
385 args->bytes_found += extent_end - args->start;
386 if (args->end == extent_end)
387 break;
388
389 path->slots[0]++;
390 goto next_slot;
391 }
392
393 /*
394 * | ---- range to drop ----- |
395 * | ------ extent ------ |
396 */
397 if (args->start <= key.offset && args->end >= extent_end) {
398 delete_extent_item:
399 if (del_nr == 0) {
400 del_slot = path->slots[0];
401 del_nr = 1;
402 } else {
403 if (WARN_ON(del_slot + del_nr != path->slots[0])) {
404 btrfs_print_leaf(leaf);
405 ret = -EINVAL;
406 break;
407 }
408 del_nr++;
409 }
410
411 if (update_refs &&
412 extent_type == BTRFS_FILE_EXTENT_INLINE) {
413 args->bytes_found += extent_end - key.offset;
414 extent_end = ALIGN(extent_end,
415 fs_info->sectorsize);
416 } else if (update_refs && disk_bytenr > 0) {
417 struct btrfs_ref ref = {
418 .action = BTRFS_DROP_DELAYED_REF,
419 .bytenr = disk_bytenr,
420 .num_bytes = num_bytes,
421 .parent = 0,
422 .owning_root = btrfs_root_id(root),
423 .ref_root = btrfs_root_id(root),
424 };
425 btrfs_init_data_ref(&ref, key.objectid,
426 key.offset - extent_offset,
427 0, false);
428 ret = btrfs_free_extent(trans, &ref);
429 if (unlikely(ret)) {
430 btrfs_abort_transaction(trans, ret);
431 break;
432 }
433 args->bytes_found += extent_end - key.offset;
434 }
435
436 if (args->end == extent_end)
437 break;
438
439 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
440 path->slots[0]++;
441 goto next_slot;
442 }
443
444 ret = btrfs_del_items(trans, root, path, del_slot,
445 del_nr);
446 if (unlikely(ret)) {
447 btrfs_abort_transaction(trans, ret);
448 break;
449 }
450
451 del_nr = 0;
452 del_slot = 0;
453
454 btrfs_release_path(path);
455 continue;
456 }
457
458 BUG();
459 }
460
461 if (!ret && del_nr > 0) {
462 /*
463 * Set path->slots[0] to the first slot, so that after the delete,
464 * if items are moved off from our leaf to its immediate left or
465 * right neighbor leaves, we end up with a correct and adjusted
466 * path->slots[0] for our insertion (if args->replace_extent).
467 */
468 path->slots[0] = del_slot;
469 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
470 if (ret)
471 btrfs_abort_transaction(trans, ret);
472 }
473
474 leaf = path->nodes[0];
475 /*
476 * If btrfs_del_items() was called, it might have deleted a leaf, in
477 * which case it unlocked our path, so check path->locks[0] matches a
478 * write lock.
479 */
480 if (!ret && args->replace_extent &&
481 path->locks[0] == BTRFS_WRITE_LOCK &&
482 btrfs_leaf_free_space(leaf) >=
483 sizeof(struct btrfs_item) + args->extent_item_size) {
484
485 key.objectid = ino;
486 key.type = BTRFS_EXTENT_DATA_KEY;
487 key.offset = args->start;
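/*
 * If nothing was deleted, path->slots[0] may still point at an item with
 * a key smaller than ours, so step forward one slot to preserve the
 * leaf's key ordering when inserting the new item below.
 */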
488 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
489 struct btrfs_key slot_key;
490
491 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
492 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
493 path->slots[0]++;
494 }
495 btrfs_setup_item_for_insert(trans, root, path, &key,
496 args->extent_item_size);
497 args->extent_inserted = true;
498 }
499
500 if (!args->path)
501 btrfs_free_path(path);
502 else if (!args->extent_inserted)
503 btrfs_release_path(path);
504 out:
505 args->drop_end = found ? min(args->end, last_end) : args->end;
506
507 return ret;
508 }
509
510 static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
511 u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
512 {
513 struct btrfs_file_extent_item *fi;
514 struct btrfs_key key;
515 u64 extent_end;
516
517 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
518 return false;
519
520 btrfs_item_key_to_cpu(leaf, &key, slot);
521 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
522 return false;
523
524 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
525 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
526 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
527 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
528 btrfs_file_extent_compression(leaf, fi) ||
529 btrfs_file_extent_encryption(leaf, fi) ||
530 btrfs_file_extent_other_encoding(leaf, fi))
531 return false;
532
533 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
534 if ((*start && *start != key.offset) || (*end && *end != extent_end))
535 return false;
536
537 *start = key.offset;
538 *end = extent_end;
539 return true;
540 }
541
542 /*
543 * Mark extent in the range start - end as written.
544 *
545 * This changes extent type from 'pre-allocated' to 'regular'. If only
546 * part of extent is marked as written, the extent will be split into
547 * two or three.
548 */
549 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
550 struct btrfs_inode *inode, u64 start, u64 end)
551 {
552 struct btrfs_root *root = inode->root;
553 struct extent_buffer *leaf;
554 BTRFS_PATH_AUTO_FREE(path);
555 struct btrfs_file_extent_item *fi;
556 struct btrfs_ref ref = { 0 };
557 struct btrfs_key key;
558 struct btrfs_key new_key;
559 u64 bytenr;
560 u64 num_bytes;
561 u64 extent_end;
562 u64 orig_offset;
563 u64 other_start;
564 u64 other_end;
565 u64 split;
566 int del_nr = 0;
567 int del_slot = 0;
568 int recow;
569 int ret = 0;
570 u64 ino = btrfs_ino(inode);
571
572 path = btrfs_alloc_path();
573 if (!path)
574 return -ENOMEM;
575 again:
576 recow = 0;
577 split = start;
578 key.objectid = ino;
579 key.type = BTRFS_EXTENT_DATA_KEY;
580 key.offset = split;
581
582 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
583 if (ret < 0)
584 goto out;
585 if (ret > 0 && path->slots[0] > 0)
586 path->slots[0]--;
587
588 leaf = path->nodes[0];
589 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
590 if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
591 ret = -EINVAL;
592 btrfs_abort_transaction(trans, ret);
593 goto out;
594 }
595 fi = btrfs_item_ptr(leaf, path->slots[0],
596 struct btrfs_file_extent_item);
597 if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
598 ret = -EINVAL;
599 btrfs_abort_transaction(trans, ret);
600 goto out;
601 }
602 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
603 if (unlikely(key.offset > start || extent_end < end)) {
604 ret = -EINVAL;
605 btrfs_abort_transaction(trans, ret);
606 goto out;
607 }
608
609 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
610 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
611 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
612 memcpy(&new_key, &key, sizeof(new_key));
613
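/*
 * The written range is at the front of the prealloc extent. If the
 * previous item refers to the same physical extent, grow it to cover up
 * to @end and shrink this item to [end, extent_end).
 */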
614 if (start == key.offset && end < extent_end) {
615 other_start = 0;
616 other_end = start;
617 if (extent_mergeable(leaf, path->slots[0] - 1,
618 ino, bytenr, orig_offset,
619 &other_start, &other_end)) {
620 new_key.offset = end;
621 btrfs_set_item_key_safe(trans, path, &new_key);
622 fi = btrfs_item_ptr(leaf, path->slots[0],
623 struct btrfs_file_extent_item);
624 btrfs_set_file_extent_generation(leaf, fi,
625 trans->transid);
626 btrfs_set_file_extent_num_bytes(leaf, fi,
627 extent_end - end);
628 btrfs_set_file_extent_offset(leaf, fi,
629 end - orig_offset);
630 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
631 struct btrfs_file_extent_item);
632 btrfs_set_file_extent_generation(leaf, fi,
633 trans->transid);
634 btrfs_set_file_extent_num_bytes(leaf, fi,
635 end - other_start);
636 goto out;
637 }
638 }
639
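/*
 * The written range is at the tail of the prealloc extent. If the next
 * item refers to the same physical extent, shrink this item to
 * [key.offset, start) and grow the next one to cover [start, other_end).
 */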
640 if (start > key.offset && end == extent_end) {
641 other_start = end;
642 other_end = 0;
643 if (extent_mergeable(leaf, path->slots[0] + 1,
644 ino, bytenr, orig_offset,
645 &other_start, &other_end)) {
646 fi = btrfs_item_ptr(leaf, path->slots[0],
647 struct btrfs_file_extent_item);
648 btrfs_set_file_extent_num_bytes(leaf, fi,
649 start - key.offset);
650 btrfs_set_file_extent_generation(leaf, fi,
651 trans->transid);
652 path->slots[0]++;
653 new_key.offset = start;
654 btrfs_set_item_key_safe(trans, path, &new_key);
655
656 fi = btrfs_item_ptr(leaf, path->slots[0],
657 struct btrfs_file_extent_item);
658 btrfs_set_file_extent_generation(leaf, fi,
659 trans->transid);
660 btrfs_set_file_extent_num_bytes(leaf, fi,
661 other_end - start);
662 btrfs_set_file_extent_offset(leaf, fi,
663 start - orig_offset);
664 goto out;
665 }
666 }
667
668 while (start > key.offset || end < extent_end) {
669 if (key.offset == start)
670 split = end;
671
672 new_key.offset = split;
673 ret = btrfs_duplicate_item(trans, root, path, &new_key);
674 if (ret == -EAGAIN) {
675 btrfs_release_path(path);
676 goto again;
677 }
678 if (unlikely(ret < 0)) {
679 btrfs_abort_transaction(trans, ret);
680 goto out;
681 }
682
683 leaf = path->nodes[0];
684 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
685 struct btrfs_file_extent_item);
686 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
687 btrfs_set_file_extent_num_bytes(leaf, fi,
688 split - key.offset);
689
690 fi = btrfs_item_ptr(leaf, path->slots[0],
691 struct btrfs_file_extent_item);
692
693 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
694 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
695 btrfs_set_file_extent_num_bytes(leaf, fi,
696 extent_end - split);
697
698 ref.action = BTRFS_ADD_DELAYED_REF;
699 ref.bytenr = bytenr;
700 ref.num_bytes = num_bytes;
701 ref.parent = 0;
702 ref.owning_root = btrfs_root_id(root);
703 ref.ref_root = btrfs_root_id(root);
704 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
705 ret = btrfs_inc_extent_ref(trans, &ref);
706 if (unlikely(ret)) {
707 btrfs_abort_transaction(trans, ret);
708 goto out;
709 }
710
711 if (split == start) {
712 key.offset = start;
713 } else {
714 if (unlikely(start != key.offset)) {
715 ret = -EINVAL;
716 btrfs_abort_transaction(trans, ret);
717 goto out;
718 }
719 path->slots[0]--;
720 extent_end = end;
721 }
722 recow = 1;
723 }
724
725 other_start = end;
726 other_end = 0;
727
728 ref.action = BTRFS_DROP_DELAYED_REF;
729 ref.bytenr = bytenr;
730 ref.num_bytes = num_bytes;
731 ref.parent = 0;
732 ref.owning_root = btrfs_root_id(root);
733 ref.ref_root = btrfs_root_id(root);
734 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
735 if (extent_mergeable(leaf, path->slots[0] + 1,
736 ino, bytenr, orig_offset,
737 &other_start, &other_end)) {
738 if (recow) {
739 btrfs_release_path(path);
740 goto again;
741 }
742 extent_end = other_end;
743 del_slot = path->slots[0] + 1;
744 del_nr++;
745 ret = btrfs_free_extent(trans, &ref);
746 if (unlikely(ret)) {
747 btrfs_abort_transaction(trans, ret);
748 goto out;
749 }
750 }
751 other_start = 0;
752 other_end = start;
753 if (extent_mergeable(leaf, path->slots[0] - 1,
754 ino, bytenr, orig_offset,
755 &other_start, &other_end)) {
756 if (recow) {
757 btrfs_release_path(path);
758 goto again;
759 }
760 key.offset = other_start;
761 del_slot = path->slots[0];
762 del_nr++;
763 ret = btrfs_free_extent(trans, &ref);
764 if (unlikely(ret)) {
765 btrfs_abort_transaction(trans, ret);
766 goto out;
767 }
768 }
769 if (del_nr == 0) {
770 fi = btrfs_item_ptr(leaf, path->slots[0],
771 struct btrfs_file_extent_item);
772 btrfs_set_file_extent_type(leaf, fi,
773 BTRFS_FILE_EXTENT_REG);
774 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
775 } else {
776 fi = btrfs_item_ptr(leaf, del_slot - 1,
777 struct btrfs_file_extent_item);
778 btrfs_set_file_extent_type(leaf, fi,
779 BTRFS_FILE_EXTENT_REG);
780 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
781 btrfs_set_file_extent_num_bytes(leaf, fi,
782 extent_end - key.offset);
783
784 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
785 if (unlikely(ret < 0)) {
786 btrfs_abort_transaction(trans, ret);
787 goto out;
788 }
789 }
790 out:
791 return ret;
792 }
793
794 /*
795 * On error, return an unlocked folio and the error value.
796 * On success, return a locked folio and 0.
797 */
798 static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
799 u64 len)
800 {
801 u64 clamp_start = max_t(u64, pos, folio_pos(folio));
802 u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
803 const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
804 int ret = 0;
805
806 if (folio_test_uptodate(folio))
807 return 0;
808
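/*
 * If the write range within this folio is block aligned, every block it
 * touches will be completely overwritten, so there is no need to read
 * the folio from disk first.
 */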
809 if (IS_ALIGNED(clamp_start, blocksize) &&
810 IS_ALIGNED(clamp_end, blocksize))
811 return 0;
812
813 ret = btrfs_read_folio(NULL, folio);
814 if (ret)
815 return ret;
816 folio_lock(folio);
817 if (unlikely(!folio_test_uptodate(folio))) {
818 folio_unlock(folio);
819 return -EIO;
820 }
821
822 /*
823 * Since btrfs_read_folio() will unlock the folio before it returns,
824 * there is a window where btrfs_release_folio() can be called to
825 * release the folio. Here we check both the inode mapping and the folio
826 * private to make sure the folio was not released.
827 *
828 * The private flag check is essential for subpage cases, as we need to
829 * store an extra bitmap using folio private.
830 */
831 if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
832 folio_unlock(folio);
833 return -EAGAIN;
834 }
835 return 0;
836 }
837
838 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
839 {
840 gfp_t gfp;
841
842 gfp = btrfs_alloc_write_mask(inode->i_mapping);
843 if (nowait) {
844 gfp &= ~__GFP_DIRECT_RECLAIM;
845 gfp |= GFP_NOWAIT;
846 }
847
848 return gfp;
849 }
850
851 /*
852 * Get folio into the page cache and lock it.
853 */
854 static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
855 loff_t pos, size_t write_bytes,
856 bool nowait)
857 {
858 const pgoff_t index = pos >> PAGE_SHIFT;
859 gfp_t mask = get_prepare_gfp_flags(inode, nowait);
860 fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
861 fgf_set_order(write_bytes);
862 struct folio *folio;
863 int ret = 0;
864
865 again:
866 folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
867 if (IS_ERR(folio))
868 return PTR_ERR(folio);
869
870 ret = set_folio_extent_mapped(folio);
871 if (ret < 0) {
872 folio_unlock(folio);
873 folio_put(folio);
874 return ret;
875 }
876 ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
877 if (ret) {
878 /* The folio is already unlocked. */
879 folio_put(folio);
880 if (!nowait && ret == -EAGAIN) {
881 ret = 0;
882 goto again;
883 }
884 return ret;
885 }
886 *folio_ret = folio;
887 return 0;
888 }
889
890 /*
891 * Locks the extent and properly waits for data=ordered extents to finish
892 * before allowing the folios to be modified if needed.
893 *
894 * Return:
895 * 1 - the extent is locked
896 * 0 - the extent is not locked, and everything is OK
897 * -EAGAIN - need to prepare the folios again
898 */
899 static noinline int
900 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
901 loff_t pos, size_t write_bytes,
902 u64 *lockstart, u64 *lockend, bool nowait,
903 struct extent_state **cached_state)
904 {
905 struct btrfs_fs_info *fs_info = inode->root->fs_info;
906 u64 start_pos;
907 u64 last_pos;
908 int ret = 0;
909
910 start_pos = round_down(pos, fs_info->sectorsize);
911 last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
912
913 if (start_pos < inode->vfs_inode.i_size) {
914 struct btrfs_ordered_extent *ordered;
915
916 if (nowait) {
917 if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
918 last_pos, cached_state)) {
919 folio_unlock(folio);
920 folio_put(folio);
921 return -EAGAIN;
922 }
923 } else {
924 btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
925 cached_state);
926 }
927
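/*
 * If an ordered extent overlaps the range, release the folio and the
 * extent lock, wait for that ordered extent to complete and return
 * -EAGAIN so the caller prepares the folio again.
 */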
928 ordered = btrfs_lookup_ordered_range(inode, start_pos,
929 last_pos - start_pos + 1);
930 if (ordered &&
931 ordered->file_offset + ordered->num_bytes > start_pos &&
932 ordered->file_offset <= last_pos) {
933 btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
934 cached_state);
935 folio_unlock(folio);
936 folio_put(folio);
937 btrfs_start_ordered_extent(ordered);
938 btrfs_put_ordered_extent(ordered);
939 return -EAGAIN;
940 }
941 if (ordered)
942 btrfs_put_ordered_extent(ordered);
943
944 *lockstart = start_pos;
945 *lockend = last_pos;
946 ret = 1;
947 }
948
949 /*
950 * We should be called after prepare_one_folio(), which should have
951 * locked the folio.
952 */
953 WARN_ON(!folio_test_locked(folio));
954
955 return ret;
956 }
957
958 /*
959 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
960 *
961 * @pos: File offset.
962 * @write_bytes: The length to write, will be updated to the nocow writeable
963 * range.
964 * @nowait: Indicate if we can block or not (non-blocking IO context).
965 *
966 * This function will flush ordered extents in the range to ensure proper
967 * nocow checks.
968 *
969 * Return:
970 * > 0 If we can nocow, and updates @write_bytes.
971 * 0 If we can't do a nocow write.
972 * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
973 * root is in progress or because we are in a non-blocking IO
974 * context and need to block (@nowait is true).
975 * < 0 If an error happened.
976 *
977 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
978 */
979 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
980 size_t *write_bytes, bool nowait)
981 {
982 struct btrfs_fs_info *fs_info = inode->root->fs_info;
983 struct btrfs_root *root = inode->root;
984 struct extent_state *cached_state = NULL;
985 u64 lockstart, lockend;
986 u64 cur_offset;
987 int ret = 0;
988
989 if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
990 return 0;
991
992 if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
993 return -EAGAIN;
994
995 lockstart = round_down(pos, fs_info->sectorsize);
996 lockend = round_up(pos + *write_bytes,
997 fs_info->sectorsize) - 1;
998
999 if (nowait) {
1000 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1001 &cached_state)) {
1002 btrfs_drew_write_unlock(&root->snapshot_lock);
1003 return -EAGAIN;
1004 }
1005 } else {
1006 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1007 &cached_state);
1008 }
1009
1010 cur_offset = lockstart;
1011 while (cur_offset < lockend) {
1012 u64 num_bytes = lockend - cur_offset + 1;
1013
1014 ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
1015 if (ret <= 0) {
1016 /*
1017 * If cur_offset == lockstart it means we haven't found
1018 * any extent against which we can NOCOW, so unlock the
1019 * snapshot lock.
1020 */
1021 if (cur_offset == lockstart)
1022 btrfs_drew_write_unlock(&root->snapshot_lock);
1023 break;
1024 }
1025 cur_offset += num_bytes;
1026 }
1027
1028 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1029
1030 /*
1031 * cur_offset > lockstart means there's at least a partial range we can
1032 * NOCOW, and that range can cover one or more extents.
1033 */
1034 if (cur_offset > lockstart) {
1035 *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
1036 return 1;
1037 }
1038
1039 return ret;
1040 }
1041
1042 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1043 {
1044 btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1045 }
1046
1047 int btrfs_write_check(struct kiocb *iocb, size_t count)
1048 {
1049 struct file *file = iocb->ki_filp;
1050 struct inode *inode = file_inode(file);
1051 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1052 loff_t pos = iocb->ki_pos;
1053 int ret;
1054 loff_t oldsize;
1055
1056 /*
1057 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1058 * prealloc flags, as without those flags we always have to COW. We will
1059 * later check if we can really do a NOCOW write into the target range
1060 * (using can_nocow_extent() at btrfs_get_blocks_direct_write()).
1061 */
1062 if ((iocb->ki_flags & IOCB_NOWAIT) &&
1063 !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1064 return -EAGAIN;
1065
1066 ret = file_remove_privs(file);
1067 if (ret)
1068 return ret;
1069
1070 /*
1071 * We reserve space for updating the inode when we reserve space for the
1072 * extent we are going to write, so any ENOSPC is returned there. We don't
1073 * need to start yet another transaction to update the inode as we will
1074 * update the inode when we finish writing whatever data we write.
1075 */
1076 if (!IS_NOCMTIME(inode)) {
1077 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1078 inode_inc_iversion(inode);
1079 }
1080
1081 oldsize = i_size_read(inode);
1082 if (pos > oldsize) {
1083 /* Expand hole size to cover write data, preventing empty gap */
1084 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1085
1086 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1087 if (ret)
1088 return ret;
1089 }
1090
1091 return 0;
1092 }
1093
1094 static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
1095 u64 start, u64 len, bool only_release_metadata)
1096 {
1097 if (len == 0)
1098 return;
1099
1100 if (only_release_metadata) {
1101 btrfs_check_nocow_unlock(inode);
1102 btrfs_delalloc_release_metadata(inode, len, true);
1103 } else {
1104 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1105
1106 btrfs_delalloc_release_space(inode, data_reserved,
1107 round_down(start, fs_info->sectorsize),
1108 len, true);
1109 }
1110 }
1111
1112 /*
1113 * Reserve data and metadata space for this buffered write range.
1114 *
1115 * Return >0 for the number of bytes reserved, which is always block aligned.
1116 * Return <0 for error.
1117 */
1118 static ssize_t reserve_space(struct btrfs_inode *inode,
1119 struct extent_changeset **data_reserved,
1120 u64 start, size_t *len, bool nowait,
1121 bool *only_release_metadata)
1122 {
1123 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1124 const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
1125 size_t reserve_bytes;
1126 int ret;
1127
1128 ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
1129 if (ret < 0) {
1130 int can_nocow;
1131
1132 if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
1133 return -EAGAIN;
1134
1135 /*
1136 * If we don't have to COW at the offset, reserve metadata only.
1137 * The write length (*len) may get smaller than requested here.
1138 */
1139 can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
1140 if (can_nocow < 0)
1141 ret = can_nocow;
1142 if (can_nocow > 0)
1143 ret = 0;
1144 if (ret)
1145 return ret;
1146 *only_release_metadata = true;
1147 }
1148
1149 reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
1150 WARN_ON(reserve_bytes == 0);
1151 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
1152 reserve_bytes, nowait);
1153 if (ret) {
1154 if (!*only_release_metadata)
1155 btrfs_free_reserved_data_space(inode, *data_reserved,
1156 start, *len);
1157 else
1158 btrfs_check_nocow_unlock(inode);
1159
1160 if (nowait && ret == -ENOSPC)
1161 ret = -EAGAIN;
1162 return ret;
1163 }
1164 return reserve_bytes;
1165 }
1166
1167 /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
1168 static void shrink_reserved_space(struct btrfs_inode *inode,
1169 struct extent_changeset *data_reserved,
1170 u64 reserved_start, u64 reserved_len,
1171 u64 new_len, bool only_release_metadata)
1172 {
1173 const u64 diff = reserved_len - new_len;
1174
1175 ASSERT(new_len <= reserved_len);
1176 btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
1177 if (only_release_metadata)
1178 btrfs_delalloc_release_metadata(inode, diff, true);
1179 else
1180 btrfs_delalloc_release_space(inode, data_reserved,
1181 reserved_start + new_len, diff, true);
1182 }
1183
1184 /* Calculate the maximum amount of bytes we can write into one folio. */
1185 static size_t calc_write_bytes(const struct btrfs_inode *inode,
1186 const struct iov_iter *iter, u64 start)
1187 {
1188 const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
1189
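/*
 * Example (assuming a 64K max folio size): for start = 0x11000 the next
 * folio boundary is at 0x20000, so at most 0xf000 bytes can be copied,
 * further capped by the remaining bytes in the iterator.
 */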
1190 return min(max_folio_size - (start & (max_folio_size - 1)),
1191 iov_iter_count(iter));
1192 }
1193
1194 /*
1195 * Do the heavy-lifting work to copy one range into one folio of the page cache.
1196 *
1197 * Return > 0 in case we copied all bytes or just some of them.
1198 * Return 0 if no bytes were copied, in which case the caller should retry.
1199 * Return <0 on error.
1200 */
1201 static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
1202 struct extent_changeset **data_reserved, u64 start,
1203 bool nowait)
1204 {
1205 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1206 struct extent_state *cached_state = NULL;
1207 size_t write_bytes = calc_write_bytes(inode, iter, start);
1208 size_t copied;
1209 const u64 reserved_start = round_down(start, fs_info->sectorsize);
1210 u64 reserved_len;
1211 struct folio *folio = NULL;
1212 int extents_locked;
1213 u64 lockstart;
1214 u64 lockend;
1215 bool only_release_metadata = false;
1216 const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1217 int ret;
1218
1219 /*
1220 * Fault all pages before locking them in prepare_one_folio() to avoid
1221 * recursive lock.
1222 */
1223 if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
1224 return -EFAULT;
1225 extent_changeset_release(*data_reserved);
1226 ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
1227 &only_release_metadata);
1228 if (ret < 0)
1229 return ret;
1230 reserved_len = ret;
1231 /* Write range must be inside the reserved range. */
1232 ASSERT(reserved_start <= start);
1233 ASSERT(start + write_bytes <= reserved_start + reserved_len);
1234
1235 again:
1236 ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
1237 bdp_flags);
1238 if (ret) {
1239 btrfs_delalloc_release_extents(inode, reserved_len);
1240 release_space(inode, *data_reserved, reserved_start, reserved_len,
1241 only_release_metadata);
1242 return ret;
1243 }
1244
1245 ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
1246 if (ret) {
1247 btrfs_delalloc_release_extents(inode, reserved_len);
1248 release_space(inode, *data_reserved, reserved_start, reserved_len,
1249 only_release_metadata);
1250 return ret;
1251 }
1252
1253 /*
1254 * The reserved range goes beyond the current folio, shrink the reserved
1255 * space to the folio boundary.
1256 */
1257 if (reserved_start + reserved_len > folio_end(folio)) {
1258 const u64 last_block = folio_end(folio);
1259
1260 shrink_reserved_space(inode, *data_reserved, reserved_start,
1261 reserved_len, last_block - reserved_start,
1262 only_release_metadata);
1263 write_bytes = last_block - start;
1264 reserved_len = last_block - reserved_start;
1265 }
1266
1267 extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
1268 write_bytes, &lockstart,
1269 &lockend, nowait,
1270 &cached_state);
1271 if (extents_locked < 0) {
1272 if (!nowait && extents_locked == -EAGAIN)
1273 goto again;
1274
1275 btrfs_delalloc_release_extents(inode, reserved_len);
1276 release_space(inode, *data_reserved, reserved_start, reserved_len,
1277 only_release_metadata);
1278 ret = extents_locked;
1279 return ret;
1280 }
1281
1282 copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
1283 write_bytes, iter);
1284 flush_dcache_folio(folio);
1285
1286 if (unlikely(copied < write_bytes)) {
1287 u64 last_block;
1288
1289 /*
1290 * The original write range doesn't need an uptodate folio as
1291 * the range is block aligned. But now a short copy happened.
1292 * We cannot handle it without an uptodate folio.
1293 *
1294 * So just revert the range and we will retry.
1295 */
1296 if (!folio_test_uptodate(folio)) {
1297 iov_iter_revert(iter, copied);
1298 copied = 0;
1299 }
1300
1301 /* No copied bytes, unlock, release reserved space and exit. */
1302 if (copied == 0) {
1303 if (extents_locked)
1304 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
1305 &cached_state);
1306 else
1307 btrfs_free_extent_state(cached_state);
1308 btrfs_delalloc_release_extents(inode, reserved_len);
1309 release_space(inode, *data_reserved, reserved_start, reserved_len,
1310 only_release_metadata);
1311 btrfs_drop_folio(fs_info, folio, start, copied);
1312 return 0;
1313 }
1314
1315 /* Release the reserved space beyond the last block. */
1316 last_block = round_up(start + copied, fs_info->sectorsize);
1317
1318 shrink_reserved_space(inode, *data_reserved, reserved_start,
1319 reserved_len, last_block - reserved_start,
1320 only_release_metadata);
1321 reserved_len = last_block - reserved_start;
1322 }
1323
1324 ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
1325 only_release_metadata);
1326 /*
1327 * If we have not locked the extent range, because the range's start
1328 * offset is >= i_size, we might still have a non-NULL cached extent
1329 * state, acquired while marking the extent range as delalloc through
1330 * btrfs_dirty_folio(). Therefore free any possible cached extent state
1331 * to avoid a memory leak.
1332 */
1333 if (extents_locked)
1334 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1335 else
1336 btrfs_free_extent_state(cached_state);
1337
1338 btrfs_delalloc_release_extents(inode, reserved_len);
1339 if (ret) {
1340 btrfs_drop_folio(fs_info, folio, start, copied);
1341 release_space(inode, *data_reserved, reserved_start, reserved_len,
1342 only_release_metadata);
1343 return ret;
1344 }
1345 if (only_release_metadata)
1346 btrfs_check_nocow_unlock(inode);
1347
1348 btrfs_drop_folio(fs_info, folio, start, copied);
1349 return copied;
1350 }
1351
1352 ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1353 {
1354 struct file *file = iocb->ki_filp;
1355 loff_t pos;
1356 struct inode *inode = file_inode(file);
1357 struct extent_changeset *data_reserved = NULL;
1358 size_t num_written = 0;
1359 ssize_t ret;
1360 loff_t old_isize;
1361 unsigned int ilock_flags = 0;
1362 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1363
1364 if (nowait)
1365 ilock_flags |= BTRFS_ILOCK_TRY;
1366
1367 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1368 if (ret < 0)
1369 return ret;
1370
1371 /*
1372 * We can only trust the isize with the inode lock held, otherwise it can
1373 * race with other buffered writes and cause an incorrect call of
1374 * pagecache_isize_extended() that overwrites existing data.
1375 */
1376 old_isize = i_size_read(inode);
1377
1378 ret = generic_write_checks(iocb, iter);
1379 if (ret <= 0)
1380 goto out;
1381
1382 ret = btrfs_write_check(iocb, ret);
1383 if (ret < 0)
1384 goto out;
1385
1386 pos = iocb->ki_pos;
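/*
 * Copy the data folio by folio. copy_one_range() returns the number of
 * bytes copied (possibly short), 0 when the copy must be retried at the
 * same position, or a negative error.
 */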
1387 while (iov_iter_count(iter) > 0) {
1388 ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
1389 if (ret < 0)
1390 break;
1391 pos += ret;
1392 num_written += ret;
1393 cond_resched();
1394 }
1395
1396 extent_changeset_free(data_reserved);
1397 if (num_written > 0) {
1398 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1399 iocb->ki_pos += num_written;
1400 }
1401 out:
1402 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1403 return num_written ? num_written : ret;
1404 }
1405
1406 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1407 const struct btrfs_ioctl_encoded_io_args *encoded)
1408 {
1409 struct file *file = iocb->ki_filp;
1410 struct inode *inode = file_inode(file);
1411 loff_t count;
1412 ssize_t ret;
1413
1414 btrfs_inode_lock(BTRFS_I(inode), 0);
1415 count = encoded->len;
1416 ret = generic_write_checks_count(iocb, &count);
1417 if (ret == 0 && count != encoded->len) {
1418 /*
1419 * The write got truncated by generic_write_checks_count(). We
1420 * can't do a partial encoded write.
1421 */
1422 ret = -EFBIG;
1423 }
1424 if (ret || encoded->len == 0)
1425 goto out;
1426
1427 ret = btrfs_write_check(iocb, encoded->len);
1428 if (ret < 0)
1429 goto out;
1430
1431 ret = btrfs_do_encoded_write(iocb, from, encoded);
1432 out:
1433 btrfs_inode_unlock(BTRFS_I(inode), 0);
1434 return ret;
1435 }
1436
1437 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1438 const struct btrfs_ioctl_encoded_io_args *encoded)
1439 {
1440 struct file *file = iocb->ki_filp;
1441 struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1442 ssize_t num_written, num_sync;
1443
1444 /*
1445 * If the fs flips read-only due to some critical error, then even though
1446 * we have opened the file as writable, we have to stop this write operation
1447 * to ensure consistency.
1448 */
1449 if (BTRFS_FS_ERROR(inode->root->fs_info))
1450 return -EROFS;
1451
1452 if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1453 return -EOPNOTSUPP;
1454
1455 if (encoded) {
1456 num_written = btrfs_encoded_write(iocb, from, encoded);
1457 num_sync = encoded->len;
1458 } else if (iocb->ki_flags & IOCB_DIRECT) {
1459 num_written = btrfs_direct_write(iocb, from);
1460 num_sync = num_written;
1461 } else {
1462 num_written = btrfs_buffered_write(iocb, from);
1463 num_sync = num_written;
1464 }
1465
1466 btrfs_set_inode_last_sub_trans(inode);
1467
1468 if (num_sync > 0) {
1469 num_sync = generic_write_sync(iocb, num_sync);
1470 if (num_sync < 0)
1471 num_written = num_sync;
1472 }
1473
1474 return num_written;
1475 }
1476
1477 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1478 {
1479 return btrfs_do_write_iter(iocb, from, NULL);
1480 }
1481
1482 int btrfs_release_file(struct inode *inode, struct file *filp)
1483 {
1484 struct btrfs_file_private *private = filp->private_data;
1485
1486 if (private) {
1487 kfree(private->filldir_buf);
1488 btrfs_free_extent_state(private->llseek_cached_state);
1489 kfree(private);
1490 filp->private_data = NULL;
1491 }
1492
1493 /*
1494 * Set by setattr when we are about to truncate a file from a non-zero
1495 * size to a zero size. This tries to flush down new bytes that may
1496 * have been written if the application were using truncate to replace
1497 * a file in place.
1498 */
1499 if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1500 &BTRFS_I(inode)->runtime_flags))
1501 filemap_flush(inode->i_mapping);
1502 return 0;
1503 }
1504
1505 static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1506 {
1507 int ret;
1508 struct blk_plug plug;
1509
1510 /*
1511 * This is only called in fsync, which does synchronous writes, so a plug
1512 * can merge adjacent IOs as much as possible. This matters especially for
1513 * multiple disks using a RAID profile, where a large IO can be split into
1514 * several segments of stripe length (currently 64K).
1515 */
1516 blk_start_plug(&plug);
1517 ret = btrfs_fdatawrite_range(inode, start, end);
1518 blk_finish_plug(&plug);
1519
1520 return ret;
1521 }
1522
1523 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1524 {
1525 struct btrfs_inode *inode = ctx->inode;
1526 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1527
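/*
 * If the inode was already logged in the current transaction and there
 * are no ordered extents attached to the log context, the log is already
 * up to date and logging can be skipped.
 */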
1528 if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1529 list_empty(&ctx->ordered_extents))
1530 return true;
1531
1532 /*
1533 * If we are doing a fast fsync we cannot bail out if the inode's
1534 * last_trans is <= the last committed transaction, because we only
1535 * update the last_trans of the inode during ordered extent completion,
1536 * and for a fast fsync we don't wait for that, we only wait for the
1537 * writeback to complete.
1538 */
1539 if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1540 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1541 list_empty(&ctx->ordered_extents)))
1542 return true;
1543
1544 return false;
1545 }
1546
1547 /*
1548 * fsync call for both files and directories. This logs the inode into
1549 * the tree log instead of forcing full commits whenever possible.
1550 *
1551 * It needs to call filemap_fdatawait() so that all ordered extent updates
1552 * in the metadata btree are up to date for copying to the log.
1553 *
1554 * It drops the inode mutex before doing the tree log commit. This is an
1555 * important optimization for directories because holding the mutex prevents
1556 * new operations on the dir while we write to disk.
1557 */
1558 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1559 {
1560 struct dentry *dentry = file_dentry(file);
1561 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1562 struct btrfs_root *root = inode->root;
1563 struct btrfs_fs_info *fs_info = root->fs_info;
1564 struct btrfs_trans_handle *trans;
1565 struct btrfs_log_ctx ctx;
1566 int ret = 0, err;
1567 u64 len;
1568 bool full_sync;
1569 bool skip_ilock = false;
1570
1571 if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1572 skip_ilock = true;
1573 current->journal_info = NULL;
1574 btrfs_assert_inode_locked(inode);
1575 }
1576
1577 trace_btrfs_sync_file(file, datasync);
1578
1579 btrfs_init_log_ctx(&ctx, inode);
1580
1581 /*
1582 * Always set the range to a full range, otherwise we can get into
1583 * several problems, from missing file extent items to represent holes
1584 * when not using the NO_HOLES feature, to log tree corruption due to
1585 * races between hole detection during logging and completion of ordered
1586 * extents outside the range, to missing checksums due to ordered extents
1587 * for which we flushed only a subset of their pages.
1588 */
1589 start = 0;
1590 end = LLONG_MAX;
1591 len = (u64)LLONG_MAX + 1;
1592
1593 /*
1594 * We write the dirty pages in the range and wait until they complete
1595 * outside of the ->i_mutex. This way the dirty pages can be flushed by
1596 * multiple tasks, which improves performance. See
1597 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
1598 */
1599 ret = start_ordered_ops(inode, start, end);
1600 if (ret)
1601 goto out;
1602
1603 if (skip_ilock)
1604 down_write(&inode->i_mmap_lock);
1605 else
1606 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1607
1608 atomic_inc(&root->log_batch);
1609
1610 /*
1611 * Before we acquired the inode's lock and the mmap lock, someone may
1612 * have dirtied more pages in the target range. We need to make sure
1613 * that writeback for any such pages does not start while we are logging
1614 * the inode, because if it does, any of the following might happen when
1615 * we are not doing a full inode sync:
1616 *
1617 * 1) We log an extent after its writeback finishes but before its
1618 * checksums are added to the csum tree, leading to -EIO errors
1619 * when attempting to read the extent after a log replay.
1620 *
1621 * 2) We can end up logging an extent before its writeback finishes.
1622 * Therefore after the log replay we will have a file extent item
1623 * pointing to an unwritten extent (and no data checksums as well).
1624 *
1625 * So trigger writeback for any eventual new dirty pages and then we
1626 * wait for all ordered extents to complete below.
1627 */
1628 ret = start_ordered_ops(inode, start, end);
1629 if (ret) {
1630 if (skip_ilock)
1631 up_write(&inode->i_mmap_lock);
1632 else
1633 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1634 goto out;
1635 }
1636
1637 /*
1638 * Always check for the full sync flag while holding the inode's lock,
1639 * to avoid races with other tasks. The flag must be either set all the
1640 * time during logging or always off all the time while logging.
1641 * We check the flag here after starting delalloc above, because when
1642 * running delalloc the full sync flag may be set if we need to drop
1643 * extra extent map ranges due to temporary memory allocation failures.
1644 */
1645 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1646
1647 /*
1648 * We have to do this here to avoid the priority inversion of waiting on
1649 * IO of a lower priority task while holding a transaction open.
1650 *
1651 * For a full fsync we wait for the ordered extents to complete while
1652 * for a fast fsync we wait just for writeback to complete, and then
1653 * attach the ordered extents to the transaction so that a transaction
1654 * commit waits for their completion, to avoid data loss if we fsync,
1655 * the current transaction commits before the ordered extents complete
1656 * and a power failure happens right after that.
1657 *
1658 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1659 * logical address recorded in the ordered extent may change. We need
1660 * to wait for the IO to stabilize the logical address.
1661 */
1662 if (full_sync || btrfs_is_zoned(fs_info)) {
1663 ret = btrfs_wait_ordered_range(inode, start, len);
1664 clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1665 } else {
1666 /*
1667 * Get our ordered extents as soon as possible to avoid doing
1668 * checksum lookups in the csum tree, and use instead the
1669 * checksums attached to the ordered extents.
1670 */
1671 btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1672 ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1673 if (ret)
1674 goto out_release_extents;
1675
1676 /*
1677 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1678 * starting and waiting for writeback, because for buffered IO
1679 * it may have been set during the end IO callback
1680 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1681 * case an error happened and we need to wait for ordered
1682 * extents to complete so that any extent maps that point to
1683 * unwritten locations are dropped and we don't log them.
1684 */
1685 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1686 ret = btrfs_wait_ordered_range(inode, start, len);
1687 }
1688
1689 if (ret)
1690 goto out_release_extents;
1691
1692 atomic_inc(&root->log_batch);
1693
1694 if (skip_inode_logging(&ctx)) {
1695 /*
1696 * We've had everything committed since the last time we were
1697 * modified so clear this flag in case it was set for whatever
1698 * reason, it's no longer relevant.
1699 */
1700 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1701 /*
1702 * An ordered extent might have started before and completed
1703 * already with io errors, in which case the inode was not
1704 * updated and we end up here. So check the inode's mapping
1705 * for any errors that might have happened since we last
1706 * called fsync.
1707 */
1708 ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1709 goto out_release_extents;
1710 }
1711
1712 btrfs_init_log_ctx_scratch_eb(&ctx);
1713
1714 /*
1715 * We use start here because we will need to wait on the IO to complete
1716 * in btrfs_sync_log, which could require joining a transaction (for
1717 * example checking cross references in the nocow path). If we use join
1718 * here we could get into a situation where we're waiting on IO to
1719 * happen that is blocked on a transaction trying to commit. With start
1720 * we inc the extwriter counter, so we wait for all extwriters to exit
1721 * before we start blocking joiners. This comment is to keep somebody
1722 * from thinking they are super smart and changing this to
1723 * btrfs_join_transaction *cough*Josef*cough*.
1724 */
1725 trans = btrfs_start_transaction(root, 0);
1726 if (IS_ERR(trans)) {
1727 ret = PTR_ERR(trans);
1728 goto out_release_extents;
1729 }
1730 trans->in_fsync = true;
1731
1732 ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1733 /*
1734 * Scratch eb no longer needed, release before syncing log or commit
1735 * transaction, to avoid holding unnecessary memory during such long
1736 * operations.
1737 */
1738 if (ctx.scratch_eb) {
1739 free_extent_buffer(ctx.scratch_eb);
1740 ctx.scratch_eb = NULL;
1741 }
1742 btrfs_release_log_ctx_extents(&ctx);
1743 if (ret < 0) {
1744 /* Fallthrough and commit/free transaction. */
1745 ret = BTRFS_LOG_FORCE_COMMIT;
1746 }
1747
1748 /* we've logged all the items and now have a consistent
1749 * version of the file in the log. It is possible that
1750 * someone will come in and modify the file, but that's
1751 * fine because the log is consistent on disk, and we
1752 * have references to all of the file's extents
1753 *
1754 * It is possible that someone will come in and log the
1755 * file again, but that will end up using the synchronization
1756 * inside btrfs_sync_log to keep things safe.
1757 */
1758 if (skip_ilock)
1759 up_write(&inode->i_mmap_lock);
1760 else
1761 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1762
1763 if (ret == BTRFS_NO_LOG_SYNC) {
1764 ret = btrfs_end_transaction(trans);
1765 goto out;
1766 }
1767
1768 /* We successfully logged the inode, attempt to sync the log. */
1769 if (!ret) {
1770 ret = btrfs_sync_log(trans, root, &ctx);
1771 if (!ret) {
1772 ret = btrfs_end_transaction(trans);
1773 goto out;
1774 }
1775 }
1776
1777 /*
1778 * At this point we need to commit the transaction because we had
1779 * btrfs_need_log_full_commit() or some other error.
1780 *
1781 * If we didn't do a full sync we have to stop the trans handle, wait on
1782 * the ordered extents, start it again and commit the transaction. If
1783 * we attempt to wait on the ordered extents here we could deadlock with
1784 * something like fallocate() that is holding the extent lock and trying
1785 * to start a transaction, while some other thread is trying to commit
1786 * the transaction and we (fsync) are currently holding the transaction
1787 * open.
1788 */
1789 if (!full_sync) {
1790 ret = btrfs_end_transaction(trans);
1791 if (ret)
1792 goto out;
1793 ret = btrfs_wait_ordered_range(inode, start, len);
1794 if (ret)
1795 goto out;
1796
1797 /*
1798 * This is safe to use here because we're only interested in
1799 * making sure the transaction that had the ordered extents is
1800 * committed. We aren't waiting on anything past this point,
1801 * we're purely getting the transaction and committing it.
1802 */
1803 trans = btrfs_attach_transaction_barrier(root);
1804 if (IS_ERR(trans)) {
1805 ret = PTR_ERR(trans);
1806
1807 /*
1808 * We committed the transaction and there's no currently
1809 * running transaction, this means everything we care
1810 * about made it to disk and we are done.
1811 */
1812 if (ret == -ENOENT)
1813 ret = 0;
1814 goto out;
1815 }
1816 }
1817
1818 ret = btrfs_commit_transaction(trans);
1819 out:
1820 free_extent_buffer(ctx.scratch_eb);
1821 ASSERT(list_empty(&ctx.list));
1822 ASSERT(list_empty(&ctx.conflict_inodes));
1823 err = file_check_and_advance_wb_err(file);
1824 if (!ret)
1825 ret = err;
1826 return ret > 0 ? -EIO : ret;
1827
1828 out_release_extents:
1829 btrfs_release_log_ctx_extents(&ctx);
1830 if (skip_ilock)
1831 up_write(&inode->i_mmap_lock);
1832 else
1833 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1834 goto out;
1835 }
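/*
 * Illustrative sketch (not part of the original source or the kernel build):
 * roughly how user space drives the fsync path above. A buffered write
 * dirties pages and creates delalloc, and fsync(2) then reaches
 * btrfs_sync_file() through the file's ->fsync operation, which persists the
 * data and logs or commits the metadata. Path and error handling are made up
 * and trimmed for brevity.
 *
 *     int fd = open("/mnt/btrfs/file", O_WRONLY | O_CREAT, 0644);
 *     write(fd, buf, len);   // buffered write, creates delalloc
 *     fsync(fd);             // ends up in btrfs_sync_file()
 *     close(fd);
 */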
1836
1837 /*
1838 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1839 * called from a page fault handler when a page is first dirtied. Hence we must
1840 * be careful to check for EOF conditions here. We set the page up correctly
1841 * for a written page which means we get ENOSPC checking when writing into
1842 * holes and correct delalloc and unwritten extent mapping on filesystems that
1843 * support these features.
1844 *
1845 * We are not allowed to take the i_mutex here so we have to play games to
1846 * protect against truncate races as the page could now be beyond EOF. Because
1847 * truncate_setsize() writes the inode size before removing pages, once we have
1848 * the page lock we can determine safely if the page is beyond EOF. If it is not
1849 * beyond EOF, then the page is guaranteed safe against truncation until we
1850 * unlock the page.
1851 */
1852 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1853 {
1854 struct page *page = vmf->page;
1855 struct folio *folio = page_folio(page);
1856 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
1857 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1858 struct extent_io_tree *io_tree = &inode->io_tree;
1859 struct btrfs_ordered_extent *ordered;
1860 struct extent_state *cached_state = NULL;
1861 struct extent_changeset *data_reserved = NULL;
1862 unsigned long zero_start;
1863 loff_t size;
1864 size_t fsize = folio_size(folio);
1865 int ret;
1866 bool only_release_metadata = false;
1867 u64 reserved_space;
1868 u64 page_start;
1869 u64 page_end;
1870 u64 end;
1871
1872 reserved_space = fsize;
1873
1874 sb_start_pagefault(inode->vfs_inode.i_sb);
1875 page_start = folio_pos(folio);
1876 page_end = page_start + folio_size(folio) - 1;
1877 end = page_end;
1878
1879 /*
1880 * Reserving delalloc space after obtaining the page lock can lead to
1881 * deadlock. For example, if a dirty page is locked by this function
1882 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1883 * dirty page write out, then the btrfs_writepages() function could
1884 * end up waiting indefinitely to get a lock on the page currently
1885 * being processed by btrfs_page_mkwrite() function.
1886 */
1887 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
1888 reserved_space, false);
1889 if (ret < 0) {
1890 size_t write_bytes = reserved_space;
1891
1892 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
1893 goto out_noreserve;
1894
1895 only_release_metadata = true;
1896
1897 /*
1898 * Can't write the whole range, there may be shared extents or
1899 * holes in the range, bail out with @only_release_metadata set
1900 * to true so that we unlock the nocow lock before returning the
1901 * error.
1902 */
1903 if (write_bytes < reserved_space)
1904 goto out_noreserve;
1905 }
1906 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
1907 reserved_space, false);
1908 if (ret < 0) {
1909 if (!only_release_metadata)
1910 btrfs_free_reserved_data_space(inode, data_reserved,
1911 page_start, reserved_space);
1912 goto out_noreserve;
1913 }
1914
1915 ret = file_update_time(vmf->vma->vm_file);
1916 if (ret < 0)
1917 goto out;
1918 again:
1919 down_read(&inode->i_mmap_lock);
1920 folio_lock(folio);
1921 size = i_size_read(&inode->vfs_inode);
1922
1923 if ((folio->mapping != inode->vfs_inode.i_mapping) ||
1924 (page_start >= size)) {
1925 /* Page got truncated out from underneath us. */
1926 goto out_unlock;
1927 }
1928 folio_wait_writeback(folio);
1929
1930 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
1931 ret = set_folio_extent_mapped(folio);
1932 if (ret < 0) {
1933 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1934 goto out_unlock;
1935 }
1936
1937 /*
1938 * We can't set the delalloc bits if there are pending ordered
1939 * extents. Drop our locks and wait for them to finish.
1940 */
1941 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
1942 if (ordered) {
1943 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1944 folio_unlock(folio);
1945 up_read(&inode->i_mmap_lock);
1946 btrfs_start_ordered_extent(ordered);
1947 btrfs_put_ordered_extent(ordered);
1948 goto again;
1949 }
1950
1951 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
1952 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1953 if (reserved_space < fsize) {
1954 const u64 to_free = fsize - reserved_space;
1955
1956 end = page_start + reserved_space - 1;
1957 if (only_release_metadata)
1958 btrfs_delalloc_release_metadata(inode, to_free, true);
1959 else
1960 btrfs_delalloc_release_space(inode, data_reserved,
1961 end + 1, to_free, true);
1962 }
1963 }
1964
1965 /*
1966 * page_mkwrite gets called when the page is first dirtied after it's
1967 * faulted in, but write(2) could also dirty a page and set delalloc
1968 * bits. In that case, for space accounting reasons, we still need to
1969 * clear any delalloc bits within this page range since we had to
1970 * reserve data and metadata space before lock_page() (see the comments above).
1971 */
1972 btrfs_clear_extent_bit(io_tree, page_start, end,
1973 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1974 EXTENT_DEFRAG, &cached_state);
1975
1976 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
1977 if (ret < 0) {
1978 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1979 goto out_unlock;
1980 }
1981
1982 /* Page is wholly or partially inside EOF. */
1983 if (page_start + folio_size(folio) > size)
1984 zero_start = offset_in_folio(folio, size);
1985 else
1986 zero_start = fsize;
1987
1988 if (zero_start != fsize)
1989 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1990
1991 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1992 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1993 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1994
1995 btrfs_set_inode_last_sub_trans(inode);
1996
1997 if (only_release_metadata)
1998 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
1999 &cached_state);
2000
2001 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
2002 up_read(&inode->i_mmap_lock);
2003
2004 btrfs_delalloc_release_extents(inode, fsize);
2005 if (only_release_metadata)
2006 btrfs_check_nocow_unlock(inode);
2007 sb_end_pagefault(inode->vfs_inode.i_sb);
2008 extent_changeset_free(data_reserved);
2009 return VM_FAULT_LOCKED;
2010
2011 out_unlock:
2012 folio_unlock(folio);
2013 up_read(&inode->i_mmap_lock);
2014 out:
2015 btrfs_delalloc_release_extents(inode, fsize);
2016 if (only_release_metadata)
2017 btrfs_delalloc_release_metadata(inode, reserved_space, true);
2018 else
2019 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2020 reserved_space, true);
2021 extent_changeset_free(data_reserved);
2022 out_noreserve:
2023 if (only_release_metadata)
2024 btrfs_check_nocow_unlock(inode);
2025
2026 sb_end_pagefault(inode->vfs_inode.i_sb);
2027
2028 if (ret < 0)
2029 return vmf_error(ret);
2030
2031 /* Make the VM retry the fault. */
2032 return VM_FAULT_NOPAGE;
2033 }
2034
2035 static const struct vm_operations_struct btrfs_file_vm_ops = {
2036 .fault = filemap_fault,
2037 .map_pages = filemap_map_pages,
2038 .page_mkwrite = btrfs_page_mkwrite,
2039 };
2040
2041 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
2042 {
2043 struct file *filp = desc->file;
2044 struct address_space *mapping = filp->f_mapping;
2045
2046 if (!mapping->a_ops->read_folio)
2047 return -ENOEXEC;
2048
2049 file_accessed(filp);
2050 desc->vm_ops = &btrfs_file_vm_ops;
2051
2052 return 0;
2053 }
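/*
 * Illustrative sketch (assumption, not from the original source): how a
 * mapping set up through btrfs_file_mmap_prepare() above ends up in
 * btrfs_page_mkwrite(). The first store into a clean page of a shared
 * writable mapping triggers a write fault, and the VM calls ->page_mkwrite
 * from btrfs_file_vm_ops before letting the store proceed. Path and sizes
 * are made up.
 *
 *     int fd = open("/mnt/btrfs/file", O_RDWR);
 *     char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *     p[0] = 'x';   // write fault -> btrfs_page_mkwrite() reserves space,
 *                   // marks the range delalloc and dirties the folio
 *     munmap(p, 4096);
 *     close(fd);
 */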
2054
2055 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2056 int slot, u64 start, u64 end)
2057 {
2058 struct btrfs_file_extent_item *fi;
2059 struct btrfs_key key;
2060
2061 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2062 return false;
2063
2064 btrfs_item_key_to_cpu(leaf, &key, slot);
2065 if (key.objectid != btrfs_ino(inode) ||
2066 key.type != BTRFS_EXTENT_DATA_KEY)
2067 return false;
2068
2069 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2070
2071 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2072 return false;
2073
2074 if (btrfs_file_extent_disk_bytenr(leaf, fi))
2075 return false;
2076
2077 if (key.offset == end)
2078 return true;
2079 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2080 return true;
2081 return false;
2082 }
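/*
 * Worked example (illustrative, made-up values): with a 4K sector size,
 * suppose fill_holes() below needs to record a hole for [8K, 12K). If the
 * slot holds a zero-bytenr (hole) extent item starting at key.offset == 12K,
 * the new hole ends exactly where that item starts and the two can be
 * merged. Likewise, if an existing hole item covers [4K, 8K), its end
 * matches our start and the item can simply be extended. hole_mergeable()
 * above detects both cases.
 */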
2083
2084 static int fill_holes(struct btrfs_trans_handle *trans,
2085 struct btrfs_inode *inode,
2086 struct btrfs_path *path, u64 offset, u64 end)
2087 {
2088 struct btrfs_fs_info *fs_info = trans->fs_info;
2089 struct btrfs_root *root = inode->root;
2090 struct extent_buffer *leaf;
2091 struct btrfs_file_extent_item *fi;
2092 struct extent_map *hole_em;
2093 struct btrfs_key key;
2094 int ret;
2095
2096 if (btrfs_fs_incompat(fs_info, NO_HOLES))
2097 goto out;
2098
2099 key.objectid = btrfs_ino(inode);
2100 key.type = BTRFS_EXTENT_DATA_KEY;
2101 key.offset = offset;
2102
2103 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2104 if (ret <= 0) {
2105 /*
2106 * We should have dropped this offset, so if we find it then
2107 * something has gone horribly wrong.
2108 */
2109 if (ret == 0)
2110 ret = -EINVAL;
2111 return ret;
2112 }
2113
2114 leaf = path->nodes[0];
2115 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2116 u64 num_bytes;
2117
2118 path->slots[0]--;
2119 fi = btrfs_item_ptr(leaf, path->slots[0],
2120 struct btrfs_file_extent_item);
2121 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2122 end - offset;
2123 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2124 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2125 btrfs_set_file_extent_offset(leaf, fi, 0);
2126 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2127 goto out;
2128 }
2129
2130 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2131 u64 num_bytes;
2132
2133 key.offset = offset;
2134 btrfs_set_item_key_safe(trans, path, &key);
2135 fi = btrfs_item_ptr(leaf, path->slots[0],
2136 struct btrfs_file_extent_item);
2137 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2138 offset;
2139 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2140 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2141 btrfs_set_file_extent_offset(leaf, fi, 0);
2142 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2143 goto out;
2144 }
2145 btrfs_release_path(path);
2146
2147 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2148 end - offset);
2149 if (ret)
2150 return ret;
2151
2152 out:
2153 btrfs_release_path(path);
2154
2155 hole_em = btrfs_alloc_extent_map();
2156 if (!hole_em) {
2157 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2158 btrfs_set_inode_full_sync(inode);
2159 } else {
2160 hole_em->start = offset;
2161 hole_em->len = end - offset;
2162 hole_em->ram_bytes = hole_em->len;
2163
2164 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2165 hole_em->disk_num_bytes = 0;
2166 hole_em->generation = trans->transid;
2167
2168 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2169 btrfs_free_extent_map(hole_em);
2170 if (ret)
2171 btrfs_set_inode_full_sync(inode);
2172 }
2173
2174 return 0;
2175 }
2176
2177 /*
2178 * Find a hole extent on the given inode and change start/len to the end of the
2179 * hole extent (a hole/vacuum extent whose em->start <= start &&
2180 * em->start + em->len > start).
2181 * When a hole extent is found, return 1 and modify start/len.
2182 */
2183 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2184 {
2185 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2186 struct extent_map *em;
2187 int ret = 0;
2188
2189 em = btrfs_get_extent(inode, NULL,
2190 round_down(*start, fs_info->sectorsize),
2191 round_up(*len, fs_info->sectorsize));
2192 if (IS_ERR(em))
2193 return PTR_ERR(em);
2194
2195 /* Hole or vacuum extent (only exists in no-hole mode) */
2196 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2197 ret = 1;
2198 *len = em->start + em->len > *start + *len ?
2199 0 : *start + *len - em->start - em->len;
2200 *start = em->start + em->len;
2201 }
2202 btrfs_free_extent_map(em);
2203 return ret;
2204 }
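/*
 * Worked example (illustrative, made-up values): with *start == 16K and
 * *len == 32K, if the extent map at 16K is a hole covering [16K, 40K), the
 * function above returns 1 and adjusts the range to *start == 40K and
 * *len == 8K, i.e. it skips the leading hole and leaves only the tail that
 * still needs processing. If the hole covers the whole range, *len becomes 0.
 */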
2205
2206 /*
2207 * Check if there is no folio in the range.
2208 *
2209 * We cannot utilize filemap_range_has_page() in a filemap with large folios
2210 * as we can hit the following false positive:
2211 *
2212 * start end
2213 * | |
2214 * |//|//|//|//| | | | | | | | |//|//|
2215 * \ / \ /
2216 * Folio A Folio B
2217 *
2218 * Large folios A and B cover the start and end indexes.
2219 * In that case filemap_range_has_page() will always return true, but the above
2220 * case is fine for btrfs_punch_hole_lock_range() usage.
2221 *
2222 * So here we only ensure that no other folio is in the range, excluding the
2223 * head/tail large folios.
2224 */
2225 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2226 {
2227 struct folio_batch fbatch;
2228 bool ret = false;
2229 /*
2230 * For the subpage case, if the range is not at a page boundary, we could
2231 * have pages at the leading/trailing parts of the range.
2232 * This could lead to an endless loop since filemap_range_has_page()
2233 * will always return true.
2234 * So here we need to do extra page alignment for
2235 * filemap_range_has_page().
2236 *
2237 * And do not decrease page_lockend right now, as it can be 0.
2238 */
2239 const u64 page_lockstart = round_up(start, PAGE_SIZE);
2240 const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2241 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
2242 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2243 pgoff_t tmp = start_index;
2244 int found_folios;
2245
2246 /* The same page or adjacent pages. */
2247 if (page_lockend <= page_lockstart)
2248 return false;
2249
2250 folio_batch_init(&fbatch);
2251 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2252 for (int i = 0; i < found_folios; i++) {
2253 struct folio *folio = fbatch.folios[i];
2254
2255 /* A large folio begins before the start. Not a target. */
2256 if (folio->index < start_index)
2257 continue;
2258 /* A large folio extends beyond the end. Not a target. */
2259 if (folio_next_index(folio) > end_index)
2260 continue;
2261 /* A folio doesn't cover the head/tail index. Found a target. */
2262 ret = true;
2263 break;
2264 }
2265 folio_batch_release(&fbatch);
2266 return ret;
2267 }
2268
2269 static void btrfs_punch_hole_lock_range(struct inode *inode,
2270 const u64 lockstart, const u64 lockend,
2271 struct extent_state **cached_state)
2272 {
2273 while (1) {
2274 truncate_pagecache_range(inode, lockstart, lockend);
2275
2276 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2277 cached_state);
2278 /*
2279 * We can't have ordered extents in the range, nor dirty/writeback
2280 * pages, because we have locked the inode's VFS lock in exclusive
2281 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2282 * we have flushed all delalloc in the range and we have waited
2283 * for any ordered extents in the range to complete.
2284 * We can race with anyone reading pages from this range, so after
2285 * locking the range check if we have pages in the range, and if
2286 * we do, unlock the range and retry.
2287 */
2288 if (!check_range_has_page(inode, lockstart, lockend))
2289 break;
2290
2291 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2292 cached_state);
2293 }
2294
2295 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2296 }
2297
2298 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2299 struct btrfs_inode *inode,
2300 struct btrfs_path *path,
2301 struct btrfs_replace_extent_info *extent_info,
2302 const u64 replace_len,
2303 const u64 bytes_to_drop)
2304 {
2305 struct btrfs_fs_info *fs_info = trans->fs_info;
2306 struct btrfs_root *root = inode->root;
2307 struct btrfs_file_extent_item *extent;
2308 struct extent_buffer *leaf;
2309 struct btrfs_key key;
2310 int slot;
2311 int ret;
2312
2313 if (replace_len == 0)
2314 return 0;
2315
2316 if (extent_info->disk_offset == 0 &&
2317 btrfs_fs_incompat(fs_info, NO_HOLES)) {
2318 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2319 return 0;
2320 }
2321
2322 key.objectid = btrfs_ino(inode);
2323 key.type = BTRFS_EXTENT_DATA_KEY;
2324 key.offset = extent_info->file_offset;
2325 ret = btrfs_insert_empty_item(trans, root, path, &key,
2326 sizeof(struct btrfs_file_extent_item));
2327 if (ret)
2328 return ret;
2329 leaf = path->nodes[0];
2330 slot = path->slots[0];
2331 write_extent_buffer(leaf, extent_info->extent_buf,
2332 btrfs_item_ptr_offset(leaf, slot),
2333 sizeof(struct btrfs_file_extent_item));
2334 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2335 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2336 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2337 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2338 if (extent_info->is_new_extent)
2339 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2340 btrfs_release_path(path);
2341
2342 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2343 replace_len);
2344 if (ret)
2345 return ret;
2346
2347 /* If it's a hole, nothing more needs to be done. */
2348 if (extent_info->disk_offset == 0) {
2349 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2350 return 0;
2351 }
2352
2353 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2354
2355 if (extent_info->is_new_extent && extent_info->insertions == 0) {
2356 key.objectid = extent_info->disk_offset;
2357 key.type = BTRFS_EXTENT_ITEM_KEY;
2358 key.offset = extent_info->disk_len;
2359 ret = btrfs_alloc_reserved_file_extent(trans, root,
2360 btrfs_ino(inode),
2361 extent_info->file_offset,
2362 extent_info->qgroup_reserved,
2363 &key);
2364 } else {
2365 struct btrfs_ref ref = {
2366 .action = BTRFS_ADD_DELAYED_REF,
2367 .bytenr = extent_info->disk_offset,
2368 .num_bytes = extent_info->disk_len,
2369 .owning_root = btrfs_root_id(root),
2370 .ref_root = btrfs_root_id(root),
2371 };
2372 u64 ref_offset;
2373
2374 ref_offset = extent_info->file_offset - extent_info->data_offset;
2375 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2376 ret = btrfs_inc_extent_ref(trans, &ref);
2377 }
2378
2379 extent_info->insertions++;
2380
2381 return ret;
2382 }
2383
2384 /*
2385 * The respective range must have been previously locked, as well as the inode.
2386 * The end offset is inclusive (last byte of the range).
2387 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2388 * the file range with an extent.
2389 * When not punching a hole, we don't want to end up in a state where we dropped
2390 * extents without inserting a new one, so we must abort the transaction to avoid
2391 * a corruption.
2392 */
2393 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2394 struct btrfs_path *path, const u64 start,
2395 const u64 end,
2396 struct btrfs_replace_extent_info *extent_info,
2397 struct btrfs_trans_handle **trans_out)
2398 {
2399 struct btrfs_drop_extents_args drop_args = { 0 };
2400 struct btrfs_root *root = inode->root;
2401 struct btrfs_fs_info *fs_info = root->fs_info;
2402 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2403 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2404 struct btrfs_trans_handle *trans = NULL;
2405 struct btrfs_block_rsv rsv;
2406 unsigned int rsv_count;
2407 u64 cur_offset;
2408 u64 len = end - start;
2409 int ret = 0;
2410
2411 if (end <= start)
2412 return -EINVAL;
2413
2414 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
2415 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
2416 rsv.failfast = true;
2417
2418 /*
2419 * 1 - update the inode
2420 * 1 - removing the extents in the range
2421 * 1 - adding the hole extent if no_holes isn't set or if we are
2422 * replacing the range with a new extent
2423 */
2424 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2425 rsv_count = 3;
2426 else
2427 rsv_count = 2;
2428
2429 trans = btrfs_start_transaction(root, rsv_count);
2430 if (IS_ERR(trans)) {
2431 ret = PTR_ERR(trans);
2432 trans = NULL;
2433 goto out_release;
2434 }
2435
2436 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
2437 min_size, false);
2438 if (WARN_ON(ret))
2439 goto out_trans;
2440 trans->block_rsv = &rsv;
2441
2442 cur_offset = start;
2443 drop_args.path = path;
2444 drop_args.end = end + 1;
2445 drop_args.drop_cache = true;
2446 while (cur_offset < end) {
2447 drop_args.start = cur_offset;
2448 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2449 /* If we are punching a hole decrement the inode's byte count */
2450 if (!extent_info)
2451 btrfs_update_inode_bytes(inode, 0,
2452 drop_args.bytes_found);
2453 if (ret != -ENOSPC) {
2454 /*
2455 * The only time we don't want to abort is if we are
2456 * attempting to clone a partial inline extent, in which
2457 * case we'll get EOPNOTSUPP. However if we aren't
2458 * cloning we need to abort no matter what, because if we
2459 * got EOPNOTSUPP via prealloc then we messed up and
2460 * need to abort.
2461 */
2462 if (unlikely(ret &&
2463 (ret != -EOPNOTSUPP ||
2464 (extent_info && extent_info->is_new_extent))))
2465 btrfs_abort_transaction(trans, ret);
2466 break;
2467 }
2468
2469 trans->block_rsv = &fs_info->trans_block_rsv;
2470
2471 if (!extent_info && cur_offset < drop_args.drop_end &&
2472 cur_offset < ino_size) {
2473 ret = fill_holes(trans, inode, path, cur_offset,
2474 drop_args.drop_end);
2475 if (unlikely(ret)) {
2476 /*
2477 * If we failed then we didn't insert our hole
2478 * entries for the area we dropped, so now the
2479 * fs is corrupted, so we must abort the
2480 * transaction.
2481 */
2482 btrfs_abort_transaction(trans, ret);
2483 break;
2484 }
2485 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2486 /*
2487 * We are past the i_size here, but since we didn't
2488 * insert holes we need to clear the mapped area so we
2489 * know to not set disk_i_size in this area until a new
2490 * file extent is inserted here.
2491 */
2492 ret = btrfs_inode_clear_file_extent_range(inode,
2493 cur_offset,
2494 drop_args.drop_end - cur_offset);
2495 if (unlikely(ret)) {
2496 /*
2497 * We couldn't clear our area, so we could
2498 * presumably adjust up and corrupt the fs, so
2499 * we need to abort.
2500 */
2501 btrfs_abort_transaction(trans, ret);
2502 break;
2503 }
2504 }
2505
2506 if (extent_info &&
2507 drop_args.drop_end > extent_info->file_offset) {
2508 u64 replace_len = drop_args.drop_end -
2509 extent_info->file_offset;
2510
2511 ret = btrfs_insert_replace_extent(trans, inode, path,
2512 extent_info, replace_len,
2513 drop_args.bytes_found);
2514 if (unlikely(ret)) {
2515 btrfs_abort_transaction(trans, ret);
2516 break;
2517 }
2518 extent_info->data_len -= replace_len;
2519 extent_info->data_offset += replace_len;
2520 extent_info->file_offset += replace_len;
2521 }
2522
2523 /*
2524 * We are releasing our handle on the transaction, balancing the
2525 * dirty pages of the btree inode and flushing delayed items, and
2526 * then getting a new transaction handle, which may now point to a
2527 * new transaction in case someone else may have committed the
2528 * transaction we used to replace/drop file extent items. So
2529 * bump the inode's iversion and update mtime and ctime except
2530 * if we are called from a dedupe context. This is because a
2531 * power failure/crash may happen after the transaction is
2532 * committed and before we finish replacing/dropping all the
2533 * file extent items we need.
2534 */
2535 inode_inc_iversion(&inode->vfs_inode);
2536
2537 if (!extent_info || extent_info->update_times)
2538 inode_set_mtime_to_ts(&inode->vfs_inode,
2539 inode_set_ctime_current(&inode->vfs_inode));
2540
2541 ret = btrfs_update_inode(trans, inode);
2542 if (ret)
2543 break;
2544
2545 btrfs_end_transaction(trans);
2546 btrfs_btree_balance_dirty(fs_info);
2547
2548 trans = btrfs_start_transaction(root, rsv_count);
2549 if (IS_ERR(trans)) {
2550 ret = PTR_ERR(trans);
2551 trans = NULL;
2552 break;
2553 }
2554
2555 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2556 &rsv, min_size, false);
2557 if (WARN_ON(ret))
2558 break;
2559 trans->block_rsv = &rsv;
2560
2561 cur_offset = drop_args.drop_end;
2562 len = end - cur_offset;
2563 if (!extent_info && len) {
2564 ret = find_first_non_hole(inode, &cur_offset, &len);
2565 if (unlikely(ret < 0))
2566 break;
2567 if (ret && !len) {
2568 ret = 0;
2569 break;
2570 }
2571 }
2572 }
2573
2574 /*
2575 * If we were cloning, force the next fsync to be a full one since we
2576 * replaced (or just dropped in the case of cloning holes when
2577 * NO_HOLES is enabled) file extent items and did not set up new extent
2578 * maps for the replacement extents (or holes).
2579 */
2580 if (extent_info && !extent_info->is_new_extent)
2581 btrfs_set_inode_full_sync(inode);
2582
2583 if (ret)
2584 goto out_trans;
2585
2586 trans->block_rsv = &fs_info->trans_block_rsv;
2587 /*
2588 * If we are using the NO_HOLES feature we might already have had a
2589 * hole that overlaps a part of the region [lockstart, lockend] and
2590 * ends at (or beyond) lockend. Since we have no file extent items to
2591 * represent holes, drop_end can be less than lockend and so we must
2592 * make sure we have an extent map representing the existing hole (the
2593 * call to __btrfs_drop_extents() might have dropped the existing extent
2594 * map representing the existing hole), otherwise the fast fsync path
2595 * will not record the existence of the hole region
2596 * [existing_hole_start, lockend].
2597 */
2598 if (drop_args.drop_end <= end)
2599 drop_args.drop_end = end + 1;
2600 /*
2601 * Don't insert a file hole extent item if it's for a range beyond EOF
2602 * (because it's useless) or if it represents a zero-length range (when
2603 * cur_offset == drop_end).
2604 */
2605 if (!extent_info && cur_offset < ino_size &&
2606 cur_offset < drop_args.drop_end) {
2607 ret = fill_holes(trans, inode, path, cur_offset,
2608 drop_args.drop_end);
2609 if (unlikely(ret)) {
2610 /* Same comment as above. */
2611 btrfs_abort_transaction(trans, ret);
2612 goto out_trans;
2613 }
2614 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2615 /* See the comment in the loop above for the reasoning here. */
2616 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2617 drop_args.drop_end - cur_offset);
2618 if (unlikely(ret)) {
2619 btrfs_abort_transaction(trans, ret);
2620 goto out_trans;
2621 }
2622
2623 }
2624 if (extent_info) {
2625 ret = btrfs_insert_replace_extent(trans, inode, path,
2626 extent_info, extent_info->data_len,
2627 drop_args.bytes_found);
2628 if (unlikely(ret)) {
2629 btrfs_abort_transaction(trans, ret);
2630 goto out_trans;
2631 }
2632 }
2633
2634 out_trans:
2635 if (!trans)
2636 goto out_release;
2637
2638 trans->block_rsv = &fs_info->trans_block_rsv;
2639 if (ret)
2640 btrfs_end_transaction(trans);
2641 else
2642 *trans_out = trans;
2643 out_release:
2644 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
2645 return ret;
2646 }
2647
2648 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2649 {
2650 struct inode *inode = file_inode(file);
2651 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2652 struct btrfs_root *root = BTRFS_I(inode)->root;
2653 struct extent_state *cached_state = NULL;
2654 struct btrfs_path *path;
2655 struct btrfs_trans_handle *trans = NULL;
2656 u64 lockstart;
2657 u64 lockend;
2658 u64 tail_start;
2659 u64 tail_len;
2660 const u64 orig_start = offset;
2661 const u64 orig_end = offset + len - 1;
2662 int ret = 0;
2663 bool same_block;
2664 u64 ino_size;
2665 bool truncated_block = false;
2666 bool updated_inode = false;
2667
2668 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2669
2670 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2671 if (ret)
2672 goto out_only_mutex;
2673
2674 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2675 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2676 if (ret < 0)
2677 goto out_only_mutex;
2678 if (ret && !len) {
2679 /* Already in a large hole */
2680 ret = 0;
2681 goto out_only_mutex;
2682 }
2683
2684 ret = file_modified(file);
2685 if (ret)
2686 goto out_only_mutex;
2687
2688 lockstart = round_up(offset, fs_info->sectorsize);
2689 lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2690 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2691 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2692 /*
2693 * Only do this if we are in the same block and we aren't doing the
2694 * entire block.
2695 */
2696 if (same_block && len < fs_info->sectorsize) {
2697 if (offset < ino_size) {
2698 truncated_block = true;
2699 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
2700 orig_start, orig_end);
2701 } else {
2702 ret = 0;
2703 }
2704 goto out_only_mutex;
2705 }
2706
2707 /* zero back part of the first block */
2708 if (offset < ino_size) {
2709 truncated_block = true;
2710 ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
2711 if (ret) {
2712 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2713 return ret;
2714 }
2715 }
2716
2717 /* Check the aligned pages after the first unaligned page. If
2718 * offset != orig_start, the first unaligned page and several
2719 * following pages are already in holes, so the extra check
2720 * can be skipped. */
2721 if (offset == orig_start) {
2722 /* after truncate page, check hole again */
2723 len = offset + len - lockstart;
2724 offset = lockstart;
2725 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2726 if (ret < 0)
2727 goto out_only_mutex;
2728 if (ret && !len) {
2729 ret = 0;
2730 goto out_only_mutex;
2731 }
2732 lockstart = offset;
2733 }
2734
2735 /* Check the tail unaligned part is in a hole */
2736 tail_start = lockend + 1;
2737 tail_len = offset + len - tail_start;
2738 if (tail_len) {
2739 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2740 if (unlikely(ret < 0))
2741 goto out_only_mutex;
2742 if (!ret) {
2743 /* zero the front end of the last page */
2744 if (tail_start + tail_len < ino_size) {
2745 truncated_block = true;
2746 ret = btrfs_truncate_block(BTRFS_I(inode),
2747 tail_start + tail_len - 1,
2748 orig_start, orig_end);
2749 if (ret)
2750 goto out_only_mutex;
2751 }
2752 }
2753 }
2754
2755 if (lockend < lockstart) {
2756 ret = 0;
2757 goto out_only_mutex;
2758 }
2759
2760 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2761
2762 path = btrfs_alloc_path();
2763 if (!path) {
2764 ret = -ENOMEM;
2765 goto out;
2766 }
2767
2768 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2769 lockend, NULL, &trans);
2770 btrfs_free_path(path);
2771 if (ret)
2772 goto out;
2773
2774 ASSERT(trans != NULL);
2775 inode_inc_iversion(inode);
2776 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2777 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2778 updated_inode = true;
2779 btrfs_end_transaction(trans);
2780 btrfs_btree_balance_dirty(fs_info);
2781 out:
2782 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2783 &cached_state);
2784 out_only_mutex:
2785 if (!updated_inode && truncated_block && !ret) {
2786 /*
2787 * If we only end up zeroing part of a page, we still need to
2788 * update the inode item, so that all the time fields are
2789 * updated as well as the necessary btrfs inode in memory fields
2790 * for detecting, at fsync time, if the inode isn't yet in the
2791 * log tree or it's there but not up to date.
2792 */
2793 struct timespec64 now = inode_set_ctime_current(inode);
2794
2795 inode_inc_iversion(inode);
2796 inode_set_mtime_to_ts(inode, now);
2797 trans = btrfs_start_transaction(root, 1);
2798 if (IS_ERR(trans)) {
2799 ret = PTR_ERR(trans);
2800 } else {
2801 int ret2;
2802
2803 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2804 ret2 = btrfs_end_transaction(trans);
2805 if (!ret)
2806 ret = ret2;
2807 }
2808 }
2809 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2810 return ret;
2811 }
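/*
 * Illustrative sketch (not from the original source): punching a hole from
 * user space, which is what drives btrfs_punch_hole() above.
 * FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE; partial
 * blocks at the edges are zeroed while the file extent items for the fully
 * covered blocks are dropped. Path and offsets are made up.
 *
 *     int fd = open("/mnt/btrfs/file", O_RDWR);
 *     fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *               4096, 65536);   // drop [4K, 4K + 64K)
 *     close(fd);
 */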
2812
2813 /* Helper structure to record which range is already reserved */
2814 struct falloc_range {
2815 struct list_head list;
2816 u64 start;
2817 u64 len;
2818 };
2819
2820 /*
2821 * Helper function to add falloc range
2822 *
2823 * The caller should have locked the larger extent range containing
2824 * [start, start + len)
2825 */
2826 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2827 {
2828 struct falloc_range *range = NULL;
2829
2830 if (!list_empty(head)) {
2831 /*
2832 * As fallocate iterates by bytenr order, we only need to check
2833 * the last range.
2834 */
2835 range = list_last_entry(head, struct falloc_range, list);
2836 if (range->start + range->len == start) {
2837 range->len += len;
2838 return 0;
2839 }
2840 }
2841
2842 range = kmalloc(sizeof(*range), GFP_KERNEL);
2843 if (!range)
2844 return -ENOMEM;
2845 range->start = start;
2846 range->len = len;
2847 list_add_tail(&range->list, head);
2848 return 0;
2849 }
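/*
 * Worked example (illustrative): fallocate walks the target range in
 * increasing file offset order, so consecutive calls such as
 * add_falloc_range(&list, 0, 4K) followed by add_falloc_range(&list, 4K, 8K)
 * leave a single entry covering [0, 12K) instead of two entries, because the
 * second range starts exactly where the last recorded one ends.
 */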
2850
2851 static int btrfs_fallocate_update_isize(struct inode *inode,
2852 const u64 end,
2853 const int mode)
2854 {
2855 struct btrfs_trans_handle *trans;
2856 struct btrfs_root *root = BTRFS_I(inode)->root;
2857 int ret;
2858 int ret2;
2859
2860 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2861 return 0;
2862
2863 trans = btrfs_start_transaction(root, 1);
2864 if (IS_ERR(trans))
2865 return PTR_ERR(trans);
2866
2867 inode_set_ctime_current(inode);
2868 i_size_write(inode, end);
2869 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2870 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2871 ret2 = btrfs_end_transaction(trans);
2872
2873 return ret ? ret : ret2;
2874 }
2875
2876 enum {
2877 RANGE_BOUNDARY_WRITTEN_EXTENT,
2878 RANGE_BOUNDARY_PREALLOC_EXTENT,
2879 RANGE_BOUNDARY_HOLE,
2880 };
2881
2882 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2883 u64 offset)
2884 {
2885 const u64 sectorsize = inode->root->fs_info->sectorsize;
2886 struct extent_map *em;
2887 int ret;
2888
2889 offset = round_down(offset, sectorsize);
2890 em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2891 if (IS_ERR(em))
2892 return PTR_ERR(em);
2893
2894 if (em->disk_bytenr == EXTENT_MAP_HOLE)
2895 ret = RANGE_BOUNDARY_HOLE;
2896 else if (em->flags & EXTENT_FLAG_PREALLOC)
2897 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2898 else
2899 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2900
2901 btrfs_free_extent_map(em);
2902 return ret;
2903 }
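/*
 * Worked example (illustrative, made-up values): with a 4K sector size, a
 * call with offset == 6K classifies the block at [4K, 8K). If it maps to a
 * hole, the caller extends its allocation range to cover it; if it maps to a
 * written extent, the caller zeroes the partial block in place; if it is
 * already a prealloc extent, nothing needs to be done for that boundary.
 */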
2904
2905 static int btrfs_zero_range(struct inode *inode,
2906 loff_t offset,
2907 loff_t len,
2908 const int mode)
2909 {
2910 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2911 struct extent_map *em;
2912 struct extent_changeset *data_reserved = NULL;
2913 int ret;
2914 u64 alloc_hint = 0;
2915 const u64 sectorsize = fs_info->sectorsize;
2916 const u64 orig_start = offset;
2917 const u64 orig_end = offset + len - 1;
2918 u64 alloc_start = round_down(offset, sectorsize);
2919 u64 alloc_end = round_up(offset + len, sectorsize);
2920 u64 bytes_to_reserve = 0;
2921 bool space_reserved = false;
2922
2923 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
2924 alloc_end - alloc_start);
2925 if (IS_ERR(em)) {
2926 ret = PTR_ERR(em);
2927 goto out;
2928 }
2929
2930 /*
2931 * Avoid hole punching and extent allocation for some cases. More cases
2932 * could be considered, but these are unlikely to be common and we keep things
2933 * as simple as possible for now. Also, intentionally, if the target
2934 * range contains one or more prealloc extents together with regular
2935 * extents and holes, we drop all the existing extents and allocate a
2936 * new prealloc extent, so that we get a larger contiguous disk extent.
2937 */
2938 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2939 const u64 em_end = em->start + em->len;
2940
2941 if (em_end >= offset + len) {
2942 /*
2943 * The whole range is already a prealloc extent,
2944 * do nothing except updating the inode's i_size if
2945 * needed.
2946 */
2947 btrfs_free_extent_map(em);
2948 ret = btrfs_fallocate_update_isize(inode, offset + len,
2949 mode);
2950 goto out;
2951 }
2952 /*
2953 * Part of the range is already a prealloc extent, so operate
2954 * only on the remaining part of the range.
2955 */
2956 alloc_start = em_end;
2957 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2958 len = offset + len - alloc_start;
2959 offset = alloc_start;
2960 alloc_hint = btrfs_extent_map_block_start(em) + em->len;
2961 }
2962 btrfs_free_extent_map(em);
2963
2964 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2965 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2966 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
2967 if (IS_ERR(em)) {
2968 ret = PTR_ERR(em);
2969 goto out;
2970 }
2971
2972 if (em->flags & EXTENT_FLAG_PREALLOC) {
2973 btrfs_free_extent_map(em);
2974 ret = btrfs_fallocate_update_isize(inode, offset + len,
2975 mode);
2976 goto out;
2977 }
2978 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
2979 btrfs_free_extent_map(em);
2980 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
2981 orig_start, orig_end);
2982 if (!ret)
2983 ret = btrfs_fallocate_update_isize(inode,
2984 offset + len,
2985 mode);
2986 return ret;
2987 }
2988 btrfs_free_extent_map(em);
2989 alloc_start = round_down(offset, sectorsize);
2990 alloc_end = alloc_start + sectorsize;
2991 goto reserve_space;
2992 }
2993
2994 alloc_start = round_up(offset, sectorsize);
2995 alloc_end = round_down(offset + len, sectorsize);
2996
2997 /*
2998 * For unaligned ranges, check the pages at the boundaries, they might
2999 * map to an extent, in which case we need to partially zero them, or
3000 * they might map to a hole, in which case we need our allocation range
3001 * to cover them.
3002 */
3003 if (!IS_ALIGNED(offset, sectorsize)) {
3004 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3005 offset);
3006 if (ret < 0)
3007 goto out;
3008 if (ret == RANGE_BOUNDARY_HOLE) {
3009 alloc_start = round_down(offset, sectorsize);
3010 ret = 0;
3011 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3012 ret = btrfs_truncate_block(BTRFS_I(inode), offset,
3013 orig_start, orig_end);
3014 if (ret)
3015 goto out;
3016 } else {
3017 ret = 0;
3018 }
3019 }
3020
3021 if (!IS_ALIGNED(offset + len, sectorsize)) {
3022 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3023 offset + len);
3024 if (ret < 0)
3025 goto out;
3026 if (ret == RANGE_BOUNDARY_HOLE) {
3027 alloc_end = round_up(offset + len, sectorsize);
3028 ret = 0;
3029 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3030 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
3031 orig_start, orig_end);
3032 if (ret)
3033 goto out;
3034 } else {
3035 ret = 0;
3036 }
3037 }
3038
3039 reserve_space:
3040 if (alloc_start < alloc_end) {
3041 struct extent_state *cached_state = NULL;
3042 const u64 lockstart = alloc_start;
3043 const u64 lockend = alloc_end - 1;
3044
3045 bytes_to_reserve = alloc_end - alloc_start;
3046 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3047 bytes_to_reserve);
3048 if (ret < 0)
3049 goto out;
3050 space_reserved = true;
3051 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3052 &cached_state);
3053 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3054 alloc_start, bytes_to_reserve);
3055 if (ret) {
3056 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
3057 lockend, &cached_state);
3058 goto out;
3059 }
3060 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3061 alloc_end - alloc_start,
3062 fs_info->sectorsize,
3063 offset + len, &alloc_hint);
3064 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3065 &cached_state);
3066 /* btrfs_prealloc_file_range releases reserved space on error */
3067 if (ret) {
3068 space_reserved = false;
3069 goto out;
3070 }
3071 }
3072 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3073 out:
3074 if (ret && space_reserved)
3075 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3076 alloc_start, bytes_to_reserve);
3077 extent_changeset_free(data_reserved);
3078
3079 return ret;
3080 }
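/*
 * Illustrative sketch (not from the original source): the user-space side of
 * btrfs_zero_range() above. FALLOC_FL_ZERO_RANGE makes the byte range read
 * back as zeros, typically by turning it into prealloc (unwritten) extents
 * and zeroing the unaligned head/tail blocks in place. Path and offsets are
 * made up.
 *
 *     int fd = open("/mnt/btrfs/file", O_RDWR);
 *     fallocate(fd, FALLOC_FL_ZERO_RANGE, 1024, 10000);   // reads as zeros
 *     close(fd);
 */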
3081
3082 static long btrfs_fallocate(struct file *file, int mode,
3083 loff_t offset, loff_t len)
3084 {
3085 struct inode *inode = file_inode(file);
3086 struct extent_state *cached_state = NULL;
3087 struct extent_changeset *data_reserved = NULL;
3088 struct falloc_range *range;
3089 struct falloc_range *tmp;
3090 LIST_HEAD(reserve_list);
3091 u64 cur_offset;
3092 u64 last_byte;
3093 u64 alloc_start;
3094 u64 alloc_end;
3095 u64 alloc_hint = 0;
3096 u64 locked_end;
3097 u64 actual_end = 0;
3098 u64 data_space_needed = 0;
3099 u64 data_space_reserved = 0;
3100 u64 qgroup_reserved = 0;
3101 struct extent_map *em;
3102 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3103 int ret;
3104
3105 /* Do not allow fallocate in ZONED mode */
3106 if (btrfs_is_zoned(inode_to_fs_info(inode)))
3107 return -EOPNOTSUPP;
3108
3109 alloc_start = round_down(offset, blocksize);
3110 alloc_end = round_up(offset + len, blocksize);
3111 cur_offset = alloc_start;
3112
3113 /* Make sure we aren't being given some crap mode. */
3114 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3115 FALLOC_FL_ZERO_RANGE))
3116 return -EOPNOTSUPP;
3117
3118 if (mode & FALLOC_FL_PUNCH_HOLE)
3119 return btrfs_punch_hole(file, offset, len);
3120
3121 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3122
3123 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3124 ret = inode_newsize_ok(inode, offset + len);
3125 if (ret)
3126 goto out;
3127 }
3128
3129 ret = file_modified(file);
3130 if (ret)
3131 goto out;
3132
3133 /*
3134 * TODO: Move these two operations after we have checked
3135 * accurate reserved space, or fallocate can still fail, but
3136 * with the pages truncated or the size expanded.
3137 *
3138 * But that's a minor problem and won't do much harm.
3139 */
3140 if (alloc_start > inode->i_size) {
3141 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3142 alloc_start);
3143 if (ret)
3144 goto out;
3145 } else if (offset + len > inode->i_size) {
3146 /*
3147 * If we are fallocating from the end of the file onward we
3148 * need to zero out the end of the block if i_size lands in the
3149 * middle of a block.
3150 */
3151 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
3152 inode->i_size, (u64)-1);
3153 if (ret)
3154 goto out;
3155 }
3156
3157 /*
3158 * We have locked the inode at the VFS level (in exclusive mode) and we
3159 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3160 * locking the file range, flush all delalloc in the range and wait for
3161 * all ordered extents in the range to complete. After this we can lock
3162 * the file range and, due to the previous locking we did, we know there
3163 * can't be more delalloc or ordered extents in the range.
3164 */
3165 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
3166 alloc_end - alloc_start);
3167 if (ret)
3168 goto out;
3169
3170 if (mode & FALLOC_FL_ZERO_RANGE) {
3171 ret = btrfs_zero_range(inode, offset, len, mode);
3172 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3173 return ret;
3174 }
3175
3176 locked_end = alloc_end - 1;
3177 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3178 &cached_state);
3179
3180 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3181
3182 /* First, check if we exceed the qgroup limit */
3183 while (cur_offset < alloc_end) {
3184 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3185 alloc_end - cur_offset);
3186 if (IS_ERR(em)) {
3187 ret = PTR_ERR(em);
3188 break;
3189 }
3190 last_byte = min(btrfs_extent_map_end(em), alloc_end);
3191 actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
3192 last_byte = ALIGN(last_byte, blocksize);
3193 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
3194 (cur_offset >= inode->i_size &&
3195 !(em->flags & EXTENT_FLAG_PREALLOC))) {
3196 const u64 range_len = last_byte - cur_offset;
3197
3198 ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3199 if (ret < 0) {
3200 btrfs_free_extent_map(em);
3201 break;
3202 }
3203 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3204 &data_reserved, cur_offset, range_len);
3205 if (ret < 0) {
3206 btrfs_free_extent_map(em);
3207 break;
3208 }
3209 qgroup_reserved += range_len;
3210 data_space_needed += range_len;
3211 }
3212 btrfs_free_extent_map(em);
3213 cur_offset = last_byte;
3214 }
3215
3216 if (!ret && data_space_needed > 0) {
3217 /*
3218 * We are safe to reserve space here as we can't have delalloc
3219 * in the range, see above.
3220 */
3221 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3222 data_space_needed);
3223 if (!ret)
3224 data_space_reserved = data_space_needed;
3225 }
3226
3227 /*
3228 * If ret is still 0, it means we're OK to fallocate.
3229 * Otherwise just clean up the list and exit.
3230 */
3231 list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3232 if (!ret) {
3233 ret = btrfs_prealloc_file_range(inode, mode,
3234 range->start,
3235 range->len, blocksize,
3236 offset + len, &alloc_hint);
3237 /*
3238 * btrfs_prealloc_file_range() releases space even
3239 * if it returns an error.
3240 */
3241 data_space_reserved -= range->len;
3242 qgroup_reserved -= range->len;
3243 } else if (data_space_reserved > 0) {
3244 btrfs_free_reserved_data_space(BTRFS_I(inode),
3245 data_reserved, range->start,
3246 range->len);
3247 data_space_reserved -= range->len;
3248 qgroup_reserved -= range->len;
3249 } else if (qgroup_reserved > 0) {
3250 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3251 range->start, range->len, NULL);
3252 qgroup_reserved -= range->len;
3253 }
3254 list_del(&range->list);
3255 kfree(range);
3256 }
3257 if (ret < 0)
3258 goto out_unlock;
3259
3260 /*
3261 * We didn't need to allocate any more space, but we still extended the
3262 * size of the file so we need to update i_size and the inode item.
3263 */
3264 ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3265 out_unlock:
3266 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3267 &cached_state);
3268 out:
3269 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3270 extent_changeset_free(data_reserved);
3271 return ret;
3272 }
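/*
 * Illustrative sketch (not from the original source): plain preallocation,
 * the default mode handled by btrfs_fallocate() above. Without
 * FALLOC_FL_KEEP_SIZE the file size is extended to cover the new range; with
 * it, space is reserved past EOF without changing i_size. Path and sizes are
 * made up.
 *
 *     int fd = open("/mnt/btrfs/file", O_RDWR | O_CREAT, 0644);
 *     fallocate(fd, 0, 0, 1 << 20);                         // prealloc 1 MiB
 *     fallocate(fd, FALLOC_FL_KEEP_SIZE, 1 << 20, 1 << 20); // reserve past EOF
 *     close(fd);
 */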
3273
3274 /*
3275 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3276 * that has unflushed and/or flushing delalloc. There might be other adjacent
3277 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3278 * looping while it gets adjacent subranges, merging them together.
3279 */
3280 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3281 struct extent_state **cached_state,
3282 bool *search_io_tree,
3283 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3284 {
3285 u64 len = end + 1 - start;
3286 u64 delalloc_len = 0;
3287 struct btrfs_ordered_extent *oe;
3288 u64 oe_start;
3289 u64 oe_end;
3290
3291 /*
3292 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3293 * means we have delalloc (dirty pages) for which writeback has not
3294 * started yet.
3295 */
3296 if (*search_io_tree) {
3297 spin_lock(&inode->lock);
3298 if (inode->delalloc_bytes > 0) {
3299 spin_unlock(&inode->lock);
3300 *delalloc_start_ret = start;
3301 delalloc_len = btrfs_count_range_bits(&inode->io_tree,
3302 delalloc_start_ret, end,
3303 len, EXTENT_DELALLOC, 1,
3304 cached_state);
3305 } else {
3306 spin_unlock(&inode->lock);
3307 }
3308 }
3309
3310 if (delalloc_len > 0) {
3311 /*
3312 * If delalloc was found then *delalloc_start_ret has a sector size
3313 * aligned value (rounded down).
3314 */
3315 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3316
3317 if (*delalloc_start_ret == start) {
3318 /* Delalloc for the whole range, nothing more to do. */
3319 if (*delalloc_end_ret == end)
3320 return true;
3321 /* Else trim our search range for ordered extents. */
3322 start = *delalloc_end_ret + 1;
3323 len = end + 1 - start;
3324 }
3325 } else {
3326 /* No delalloc, future calls don't need to search again. */
3327 *search_io_tree = false;
3328 }
3329
3330 /*
3331 * Now also check if there's any ordered extent in the range.
3332 * We do this because:
3333 *
3334 * 1) When delalloc is flushed, the file range is locked, we clear the
3335 * EXTENT_DELALLOC bit from the io tree and create an extent map and
3336 * an ordered extent for the write. So we might just have been called
3337 * after delalloc is flushed and before the ordered extent completes
3338 * and inserts the new file extent item in the subvolume's btree;
3339 *
3340 * 2) We may have an ordered extent created by flushing delalloc for a
3341 * subrange that starts before the subrange we found marked with
3342 * EXTENT_DELALLOC in the io tree.
3343 *
3344 * We could also use the extent map tree to find such delalloc that is
3345 * being flushed, but using the ordered extents tree is more efficient
3346 * because it's usually much smaller as ordered extents are removed from
3347 * the tree once they complete. With the extent maps, we may have them
3348 * in the extent map tree for a very long time, and they were either
3349 * created by previous writes or loaded by read operations.
3350 */
3351 oe = btrfs_lookup_first_ordered_range(inode, start, len);
3352 if (!oe)
3353 return (delalloc_len > 0);
3354
3355 /* The ordered extent may span beyond our search range. */
3356 oe_start = max(oe->file_offset, start);
3357 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3358
3359 btrfs_put_ordered_extent(oe);
3360
3361 /* Don't have unflushed delalloc, return the ordered extent range. */
3362 if (delalloc_len == 0) {
3363 *delalloc_start_ret = oe_start;
3364 *delalloc_end_ret = oe_end;
3365 return true;
3366 }
3367
3368 /*
3369 * We have both unflushed delalloc (io_tree) and an ordered extent.
3370 * If the ranges are adjacent, return a combined range, otherwise
3371 * return the leftmost range.
3372 */
3373 if (oe_start < *delalloc_start_ret) {
3374 if (oe_end < *delalloc_start_ret)
3375 *delalloc_end_ret = oe_end;
3376 *delalloc_start_ret = oe_start;
3377 } else if (*delalloc_end_ret + 1 == oe_start) {
3378 *delalloc_end_ret = oe_end;
3379 }
3380
3381 return true;
3382 }
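/*
 * Worked example (illustrative, made-up values): searching [0, 1M - 1], if
 * the io tree has EXTENT_DELALLOC for [0, 128K - 1] and an ordered extent
 * covers [128K, 256K - 1], the two ranges are adjacent and the function
 * above reports the combined range [0, 256K - 1]. If instead the ordered
 * extent started at 512K, only the leftmost range [0, 128K - 1] would be
 * returned and a later call would pick up the ordered extent.
 */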
3383
3384 /*
3385 * Check if there's delalloc in a given range.
3386 *
3387 * @inode: The inode.
3388 * @start: The start offset of the range. It does not need to be
3389 * sector size aligned.
3390 * @end: The end offset (inclusive value) of the search range.
3391 * It does not need to be sector size aligned.
3392 * @cached_state: Extent state record used for speeding up delalloc
3393 * searches in the inode's io_tree. Can be NULL.
3394 * @delalloc_start_ret: Output argument, set to the start offset of the
3395 * subrange found with delalloc (may not be sector size
3396 * aligned).
3397 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
3398 * of the subrange found with delalloc.
3399 *
3400 * Returns true if a subrange with delalloc is found within the given range, and
3401 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3402 * end offsets of the subrange.
3403 */
3404 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3405 struct extent_state **cached_state,
3406 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3407 {
3408 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3409 u64 prev_delalloc_end = 0;
3410 bool search_io_tree = true;
3411 bool ret = false;
3412
3413 while (cur_offset <= end) {
3414 u64 delalloc_start;
3415 u64 delalloc_end;
3416 bool delalloc;
3417
3418 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3419 cached_state, &search_io_tree,
3420 &delalloc_start,
3421 &delalloc_end);
3422 if (!delalloc)
3423 break;
3424
3425 if (prev_delalloc_end == 0) {
3426 /* First subrange found. */
3427 *delalloc_start_ret = max(delalloc_start, start);
3428 *delalloc_end_ret = delalloc_end;
3429 ret = true;
3430 } else if (delalloc_start == prev_delalloc_end + 1) {
3431 /* Subrange adjacent to the previous one, merge them. */
3432 *delalloc_end_ret = delalloc_end;
3433 } else {
3434 /* Subrange not adjacent to the previous one, exit. */
3435 break;
3436 }
3437
3438 prev_delalloc_end = delalloc_end;
3439 cur_offset = delalloc_end + 1;
3440 cond_resched();
3441 }
3442
3443 return ret;
3444 }
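
/*
 * Illustrative sketch, not part of the original file: how a caller (e.g.
 * fiemap or lseek) might probe a byte range for delalloc with the helper
 * above. The function name is hypothetical; a NULL cached state is allowed,
 * as documented above, and just forgoes the io_tree search caching.
 */
static inline bool example_range_has_delalloc(struct btrfs_inode *inode,
					      u64 start, u64 end)
{
	u64 delalloc_start;
	u64 delalloc_end;

	return btrfs_find_delalloc_in_range(inode, start, end, NULL,
					    &delalloc_start, &delalloc_end);
}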
3445
3446 /*
3447 * Check if there's a hole or delalloc range in a range representing a hole (or
3448 * prealloc extent) found in the inode's subvolume btree.
3449 *
3450 * @inode: The inode.
3451 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
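 * @cached_state: Extent state record used to speed up delalloc searches in
 *                the inode's io_tree. Can be NULL.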
3452 * @start: Start offset of the hole region. It does not need to be sector
3453 * size aligned.
3454 * @end: End offset (inclusive value) of the hole region. It does not
3455 * need to be sector size aligned.
3456 * @start_ret: Return parameter, used to set the start of the subrange in the
3457 * hole that matches the search criteria (seek mode), if such
3458 * subrange is found (return value of the function is true).
3459 * The value returned here may not be sector size aligned.
3460 *
3461 * Returns true if a subrange matching the given seek mode is found, and if one
3462 * is found, it updates @start_ret with the start of the subrange.
3463 */
3464 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3465 struct extent_state **cached_state,
3466 u64 start, u64 end, u64 *start_ret)
3467 {
3468 u64 delalloc_start;
3469 u64 delalloc_end;
3470 bool delalloc;
3471
3472 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3473 &delalloc_start, &delalloc_end);
3474 if (delalloc && whence == SEEK_DATA) {
3475 *start_ret = delalloc_start;
3476 return true;
3477 }
3478
3479 if (delalloc && whence == SEEK_HOLE) {
3480 /*
3481 		 * We found delalloc but it starts after our start offset. So we
3482 		 * have a hole between our start offset and the delalloc start.
3483 */
3484 if (start < delalloc_start) {
3485 *start_ret = start;
3486 return true;
3487 }
3488 /*
3489 * Delalloc range starts at our start offset.
3490 * If the delalloc range's length is smaller than our range,
3491 * then it means we have a hole that starts where the delalloc
3492 * subrange ends.
3493 */
3494 if (delalloc_end < end) {
3495 *start_ret = delalloc_end + 1;
3496 return true;
3497 }
3498
3499 /* There's delalloc for the whole range. */
3500 return false;
3501 }
3502
3503 if (!delalloc && whence == SEEK_HOLE) {
3504 *start_ret = start;
3505 return true;
3506 }
3507
3508 /*
3509 * No delalloc in the range and we are seeking for data. The caller has
3510 * to iterate to the next extent item in the subvolume btree.
3511 */
3512 return false;
3513 }
3514
3515 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3516 {
3517 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3518 struct btrfs_file_private *private;
3519 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3520 struct extent_state *cached_state = NULL;
3521 struct extent_state **delalloc_cached_state;
3522 const loff_t i_size = i_size_read(&inode->vfs_inode);
3523 const u64 ino = btrfs_ino(inode);
3524 struct btrfs_root *root = inode->root;
3525 struct btrfs_path *path;
3526 struct btrfs_key key;
3527 u64 last_extent_end;
3528 u64 lockstart;
3529 u64 lockend;
3530 u64 start;
3531 int ret;
3532 bool found = false;
3533
3534 if (i_size == 0 || offset >= i_size)
3535 return -ENXIO;
3536
3537 /*
3538 * Quick path. If the inode has no prealloc extents and its number of
3539 * bytes used matches its i_size, then it can not have holes.
3540 */
3541 if (whence == SEEK_HOLE &&
3542 !(inode->flags & BTRFS_INODE_PREALLOC) &&
3543 inode_get_bytes(&inode->vfs_inode) == i_size)
3544 return i_size;
3545
3546 spin_lock(&inode->lock);
3547 private = file->private_data;
3548 spin_unlock(&inode->lock);
3549
3550 if (private && private->owner_task != current) {
3551 /*
3552 * Not allocated by us, don't use it as its cached state is used
3553 		 * by the task that allocated it, and we want neither to mess
3554 		 * with it nor to get incorrect results, because it reflects a
3555 		 * state that is invalid for the current task.
3556 */
3557 private = NULL;
3558 } else if (!private) {
3559 private = kzalloc(sizeof(*private), GFP_KERNEL);
3560 /*
3561 * No worries if memory allocation failed.
3562 * The private structure is used only for speeding up multiple
3563 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3564 * so everything will still be correct.
3565 */
3566 if (private) {
3567 bool free = false;
3568
3569 private->owner_task = current;
3570
3571 spin_lock(&inode->lock);
3572 if (file->private_data)
3573 free = true;
3574 else
3575 file->private_data = private;
3576 spin_unlock(&inode->lock);
3577
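			/*
			 * Another task published its own private structure
			 * while we were allocating ours: keep theirs and free
			 * our copy.
			 */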
3578 if (free) {
3579 kfree(private);
3580 private = NULL;
3581 }
3582 }
3583 }
3584
3585 if (private)
3586 delalloc_cached_state = &private->llseek_cached_state;
3587 else
3588 delalloc_cached_state = NULL;
3589
3590 /*
3591 	 * The offset can be negative; in that case we start looking for
3592 	 * DATA/HOLE from the very start of the file.
3593 */
3594 start = max_t(loff_t, 0, offset);
3595
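	/*
	 * Lock a sector aligned range covering [start, i_size). The locked
	 * range end is inclusive and spans at least one sector.
	 */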
3596 lockstart = round_down(start, fs_info->sectorsize);
3597 lockend = round_up(i_size, fs_info->sectorsize);
3598 if (lockend <= lockstart)
3599 lockend = lockstart + fs_info->sectorsize;
3600 lockend--;
3601
3602 path = btrfs_alloc_path();
3603 if (!path)
3604 return -ENOMEM;
3605 path->reada = READA_FORWARD;
3606
3607 key.objectid = ino;
3608 key.type = BTRFS_EXTENT_DATA_KEY;
3609 key.offset = start;
3610
3611 last_extent_end = lockstart;
3612
3613 btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3614
3615 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3616 if (ret < 0) {
3617 goto out;
3618 } else if (ret > 0 && path->slots[0] > 0) {
3619 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3620 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3621 path->slots[0]--;
3622 }
3623
3624 while (start < i_size) {
3625 struct extent_buffer *leaf = path->nodes[0];
3626 struct btrfs_file_extent_item *extent;
3627 u64 extent_end;
3628 u8 type;
3629
3630 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3631 ret = btrfs_next_leaf(root, path);
3632 if (ret < 0)
3633 goto out;
3634 else if (ret > 0)
3635 break;
3636
3637 leaf = path->nodes[0];
3638 }
3639
3640 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3641 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3642 break;
3643
3644 extent_end = btrfs_file_extent_end(path);
3645
3646 /*
3647 * In the first iteration we may have a slot that points to an
3648 * extent that ends before our start offset, so skip it.
3649 */
3650 if (extent_end <= start) {
3651 path->slots[0]++;
3652 continue;
3653 }
3654
3655 /* We have an implicit hole, NO_HOLES feature is likely set. */
3656 if (last_extent_end < key.offset) {
3657 u64 search_start = last_extent_end;
3658 u64 found_start;
3659
3660 /*
3661 * First iteration, @start matches @offset and it's
3662 * within the hole.
3663 */
3664 if (start == offset)
3665 search_start = offset;
3666
3667 found = find_desired_extent_in_hole(inode, whence,
3668 delalloc_cached_state,
3669 search_start,
3670 key.offset - 1,
3671 &found_start);
3672 if (found) {
3673 start = found_start;
3674 break;
3675 }
3676 /*
3677 * Didn't find data or a hole (due to delalloc) in the
3678 			 * implicit hole range, so we need to analyze the extent item.
3679 */
3680 }
3681
3682 extent = btrfs_item_ptr(leaf, path->slots[0],
3683 struct btrfs_file_extent_item);
3684 type = btrfs_file_extent_type(leaf, extent);
3685
3686 /*
3687 * Can't access the extent's disk_bytenr field if this is an
3688 * inline extent, since at that offset, it's where the extent
3689 * data starts.
3690 */
3691 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3692 (type == BTRFS_FILE_EXTENT_REG &&
3693 btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3694 /*
3695 * Explicit hole or prealloc extent, search for delalloc.
3696 * A prealloc extent is treated like a hole.
3697 */
3698 u64 search_start = key.offset;
3699 u64 found_start;
3700
3701 /*
3702 * First iteration, @start matches @offset and it's
3703 * within the hole.
3704 */
3705 if (start == offset)
3706 search_start = offset;
3707
3708 found = find_desired_extent_in_hole(inode, whence,
3709 delalloc_cached_state,
3710 search_start,
3711 extent_end - 1,
3712 &found_start);
3713 if (found) {
3714 start = found_start;
3715 break;
3716 }
3717 /*
3718 * Didn't find data or a hole (due to delalloc) in the
3719 			 * hole or prealloc extent range, so we need to check
3720 			 * the next extent item.
3721 */
3722 } else {
3723 /*
3724 * Found a regular or inline extent.
3725 * If we are seeking for data, adjust the start offset
3726 * and stop, we're done.
3727 */
3728 if (whence == SEEK_DATA) {
3729 start = max_t(u64, key.offset, offset);
3730 found = true;
3731 break;
3732 }
3733 /*
3734 * Else, we are seeking for a hole, check the next file
3735 * extent item.
3736 */
3737 }
3738
3739 start = extent_end;
3740 last_extent_end = extent_end;
3741 path->slots[0]++;
3742 if (fatal_signal_pending(current)) {
3743 ret = -EINTR;
3744 goto out;
3745 }
3746 cond_resched();
3747 }
3748
3749 /* We have an implicit hole from the last extent found up to i_size. */
3750 if (!found && start < i_size) {
3751 found = find_desired_extent_in_hole(inode, whence,
3752 delalloc_cached_state, start,
3753 i_size - 1, &start);
3754 if (!found)
3755 start = i_size;
3756 }
3757
3758 out:
3759 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3760 btrfs_free_path(path);
3761
3762 if (ret < 0)
3763 return ret;
3764
3765 if (whence == SEEK_DATA && start >= i_size)
3766 return -ENXIO;
3767
3768 return min_t(loff_t, start, i_size);
3769 }
3770
3771 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3772 {
3773 struct inode *inode = file->f_mapping->host;
3774
3775 switch (whence) {
3776 default:
3777 return generic_file_llseek(file, offset, whence);
3778 case SEEK_DATA:
3779 case SEEK_HOLE:
3780 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3781 offset = find_desired_extent(file, offset, whence);
3782 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3783 break;
3784 }
3785
3786 if (offset < 0)
3787 return offset;
3788
3789 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3790 }
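
/*
 * Illustrative userspace sketch, not part of the original file, showing the
 * SEEK_DATA/SEEK_HOLE semantics implemented above: lseek() returns the next
 * offset with data (or the next hole) at or after the given offset, and fails
 * with ENXIO when seeking for data at or beyond the end of the file. The
 * helper name and file descriptor are assumptions for the example; the block
 * is guarded with #if 0 since it is user space code.
 */
#if 0
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>

static void example_dump_data_regions(int fd)
{
	off_t data = 0;

	for (;;) {
		off_t hole;

		data = lseek(fd, data, SEEK_DATA);
		if (data < 0) {
			/* ENXIO means no more data at or after this offset. */
			if (errno != ENXIO)
				perror("lseek");
			break;
		}
		/* Every file has an implicit hole at its end, so this succeeds. */
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
		data = hole;
	}
}
#endif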
3791
3792 static int btrfs_file_open(struct inode *inode, struct file *filp)
3793 {
3794 int ret;
3795
3796 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3797
3798 ret = fsverity_file_open(inode, filp);
3799 if (ret)
3800 return ret;
3801 return generic_file_open(inode, filp);
3802 }
3803
3804 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3805 {
3806 ssize_t ret = 0;
3807
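	/*
	 * For O_DIRECT, try the direct read path first. If it did not consume
	 * the whole iter (for example because part of the range has to go
	 * through the page cache) and we are not at EOF, finish with buffered
	 * reads, passing the bytes already read so the return value
	 * accumulates both parts.
	 */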
3808 if (iocb->ki_flags & IOCB_DIRECT) {
3809 ret = btrfs_direct_read(iocb, to);
3810 if (ret < 0 || !iov_iter_count(to) ||
3811 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3812 return ret;
3813 }
3814
3815 return filemap_read(iocb, to, ret);
3816 }
3817
3818 const struct file_operations btrfs_file_operations = {
3819 .llseek = btrfs_file_llseek,
3820 .read_iter = btrfs_file_read_iter,
3821 .splice_read = filemap_splice_read,
3822 .write_iter = btrfs_file_write_iter,
3823 .splice_write = iter_file_splice_write,
3824 .mmap_prepare = btrfs_file_mmap_prepare,
3825 .open = btrfs_file_open,
3826 .release = btrfs_release_file,
3827 .get_unmapped_area = thp_get_unmapped_area,
3828 .fsync = btrfs_sync_file,
3829 .fallocate = btrfs_fallocate,
3830 .unlocked_ioctl = btrfs_ioctl,
3831 #ifdef CONFIG_COMPAT
3832 .compat_ioctl = btrfs_compat_ioctl,
3833 #endif
3834 .remap_file_range = btrfs_remap_file_range,
3835 .uring_cmd = btrfs_uring_cmd,
3836 .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3837 };
3838
3839 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3840 {
3841 struct address_space *mapping = inode->vfs_inode.i_mapping;
3842 int ret;
3843
3844 /*
3845 * So with compression we will find and lock a dirty page and clear the
3846 * first one as dirty, setup an async extent, and immediately return
3847 * with the entire range locked but with nobody actually marked with
3848 * writeback. So we can't just filemap_write_and_wait_range() and
3849 * expect it to work since it will just kick off a thread to do the
3850 * actual work. So we need to call filemap_fdatawrite_range _again_
3851 * since it will wait on the page lock, which won't be unlocked until
3852 * after the pages have been marked as writeback and so we're good to go
3853 * from there. We have to do this otherwise we'll miss the ordered
3854 * extents and that results in badness. Please Josef, do not think you
3855 * know better and pull this out at some point in the future, it is
3856 * right and you are wrong.
3857 */
3858 ret = filemap_fdatawrite_range(mapping, start, end);
3859 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3860 ret = filemap_fdatawrite_range(mapping, start, end);
3861
3862 return ret;
3863 }
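
/*
 * Illustrative sketch, not part of the original file: starting writeback for
 * a whole file with the helper above. The function name is hypothetical; the
 * end offset is inclusive, mirroring filemap_fdatawrite_range(), and callers
 * that need completion must still wait for the resulting ordered extents.
 */
static inline int example_start_file_writeback(struct btrfs_inode *inode)
{
	const loff_t isize = i_size_read(&inode->vfs_inode);

	if (isize == 0)
		return 0;
	return btrfs_fdatawrite_range(inode, 0, isize - 1);
}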
3864