1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/filelock.h>
14 #include <linux/writeback.h>
15 #include <linux/compat.h>
16 #include <linux/slab.h>
17 #include <linux/btrfs.h>
18 #include <linux/uio.h>
19 #include <linux/iversion.h>
20 #include <linux/fsverity.h>
21 #include "ctree.h"
22 #include "direct-io.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "btrfs_inode.h"
26 #include "tree-log.h"
27 #include "locking.h"
28 #include "qgroup.h"
29 #include "compression.h"
30 #include "delalloc-space.h"
31 #include "reflink.h"
32 #include "subpage.h"
33 #include "fs.h"
34 #include "accessors.h"
35 #include "extent-tree.h"
36 #include "file-item.h"
37 #include "ioctl.h"
38 #include "file.h"
39 #include "super.h"
40 #include "print-tree.h"
41
42 /*
43 * Unlock folio after btrfs_file_write() is done with it.
44 */
btrfs_drop_folio(struct btrfs_fs_info * fs_info,struct folio * folio,u64 pos,u64 copied)45 static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
46 u64 pos, u64 copied)
47 {
48 u64 block_start = round_down(pos, fs_info->sectorsize);
49 u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
50
51 ASSERT(block_len <= U32_MAX);
52 /*
53 * Folio checked is some magic around finding folios that have been
54 * modified without going through btrfs_dirty_folio(). Clear it here.
55 * There should be no need to mark the pages accessed as
56 * prepare_one_folio() should have marked them accessed in
57 * prepare_one_folio() via find_or_create_page()
58 */
59 btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
60 folio_unlock(folio);
61 folio_put(folio);
62 }
63
64 /*
65 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
66 * - Mark newly dirtied folio as DELALLOC in the io tree.
67 * Used to advise which range is to be written back.
68 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
69 * - Update inode size for past EOF write
70 */
btrfs_dirty_folio(struct btrfs_inode * inode,struct folio * folio,loff_t pos,size_t write_bytes,struct extent_state ** cached,bool noreserve)71 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
72 size_t write_bytes, struct extent_state **cached, bool noreserve)
73 {
74 struct btrfs_fs_info *fs_info = inode->root->fs_info;
75 int ret = 0;
76 u64 num_bytes;
77 u64 start_pos;
78 u64 end_of_last_block;
79 const u64 end_pos = pos + write_bytes;
80 loff_t isize = i_size_read(&inode->vfs_inode);
81 unsigned int extra_bits = 0;
82
83 if (write_bytes == 0)
84 return 0;
85
86 if (noreserve)
87 extra_bits |= EXTENT_NORESERVE;
88
89 start_pos = round_down(pos, fs_info->sectorsize);
90 num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize);
91 ASSERT(num_bytes <= U32_MAX);
92 ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos);
93
94 end_of_last_block = start_pos + num_bytes - 1;
95
96 /*
97 * The pages may have already been dirty, clear out old accounting so
98 * we can set things up properly
99 */
100 btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
101 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
102 cached);
103
104 ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
105 extra_bits, cached);
106 if (ret)
107 return ret;
108
109 btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
110 btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
111 btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
112
113 /*
114 * we've only changed i_size in ram, and we haven't updated
115 * the disk i_size. There is no need to log the inode
116 * at this time.
117 */
118 if (end_pos > isize)
119 i_size_write(&inode->vfs_inode, end_pos);
120 return 0;
121 }
122
/*
 * Drop all file extent items of @inode, in the tree @root, that fall in the
 * range [args->start, args->end).
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree. Inline extents can only be dropped as a whole,
 * an attempt to drop part of one fails with -EOPNOTSUPP.
 *
 * Fields of @args read by this function:
 *   start, end       - the range to drop, @end is exclusive
 *   path             - optional path to use; if NULL a path is allocated here
 *                      and freed before returning
 *   drop_cache       - also drop the extent maps covering the range
 *   replace_extent   - after dropping, try to make room in the current leaf
 *                      for a new file extent item keyed at @start (requires
 *                      @path to be set)
 *   extent_item_size - size of the item to make room for
 *
 * Fields of @args written by this function:
 *   bytes_found      - bytes accounted for the dropped parts of extents (only
 *                      inline extents and extents with a non-zero disk
 *                      bytenr are counted, and nothing is counted when @root
 *                      is the log tree)
 *   extent_inserted  - true if room for the new item was set up, in which
 *                      case the path is left pointing at it
 *   drop_end         - args->end if nothing was found to drop, otherwise the
 *                      smaller of args->end and the end of the last extent
 *                      processed
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 *
 * Return 0 on success or a negative errno on failure.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	/* Run of adjacent items in the current leaf queued for deletion. */
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	/*
	 * When the range starts at or beyond the on-disk i_size and no item is
	 * going to be inserted, start with modify_tree == 0. If an extent to
	 * drop is found anyway, the search is redone with modify_tree == -1
	 * (see the "recow || !modify_tree" check below).
	 * NOTE(review): modify_tree == 0 presumably makes the lookup read-only,
	 * confirm against btrfs_lookup_file_extent().
	 */
	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	/* No extent reference updates when operating on the log tree. */
	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		/*
		 * No exact match on the first search: the previous item may be
		 * an extent of this inode that starts before the range and
		 * reaches into it, so step back to it.
		 */
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Pending deletions must not span a leaf boundary. */
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				/* No more leaves, we are done. */
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			/*
			 * We moved to another leaf. If it holds something to
			 * drop, the search is redone before modifying it.
			 */
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		/* Extent ends before the range we care about, skip it. */
		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			/* Redo the search from search_start with modify_tree == -1. */
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 *
		 * Split the item at args->start. The second half is handled by
		 * the "range covers the start of the extent" case below.
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			/* First half: [key.offset, args->start). */
			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			/* Second half: [args->start, extent_end). */
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			/* Two items now point at the extent, add a reference. */
			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 *
		 * Trim the front of the extent: move its key to args->end.
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			/* The extent reaches past the range, nothing left to do. */
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 *
		 * Trim the tail of the extent: shorten it to end at args->start.
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 *
		 * The whole item goes away. Deletions are batched: adjacent
		 * slots are collected in del_slot/del_nr and removed together.
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				/* The batch must be a contiguous run of slots. */
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			/* Keep batching while there are more items in this leaf. */
			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			/*
			 * Last item of the leaf: flush the batch now and search
			 * again from search_start.
			 */
			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		/* The four cases above cover every possible overlap. */
		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are move off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		/*
		 * Nothing was deleted here, so the slot may still point at an
		 * item that sorts before the new key; insert after it then.
		 */
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	/*
	 * Free the path if it was allocated here. A caller supplied path is
	 * released unless it points at the newly set up item.
	 */
	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}
509
extent_mergeable(struct extent_buffer * leaf,int slot,u64 objectid,u64 bytenr,u64 orig_offset,u64 * start,u64 * end)510 static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
511 u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
512 {
513 struct btrfs_file_extent_item *fi;
514 struct btrfs_key key;
515 u64 extent_end;
516
517 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
518 return false;
519
520 btrfs_item_key_to_cpu(leaf, &key, slot);
521 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
522 return false;
523
524 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
525 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
526 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
527 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
528 btrfs_file_extent_compression(leaf, fi) ||
529 btrfs_file_extent_encryption(leaf, fi) ||
530 btrfs_file_extent_other_encoding(leaf, fi))
531 return false;
532
533 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
534 if ((*start && *start != key.offset) || (*end && *end != extent_end))
535 return false;
536
537 *start = key.offset;
538 *end = extent_end;
539 return true;
540 }
541
/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three. Where possible the written part is merged with an adjacent
 * regular extent item that refers to the same disk extent instead.
 *
 * The range must lie entirely inside a single prealloc file extent item of
 * @inode, otherwise the transaction is aborted and -EINVAL is returned.
 *
 * Return 0 on success or a negative errno on failure. Every failure other
 * than the path allocation and the initial tree search aborts the transaction.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	/*
	 * NOTE(review): no return path below frees the path explicitly, this
	 * macro presumably declares it with automatic cleanup - confirm.
	 */
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	/* (Re)start: look up the file extent item covering @start. */
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		return ret;
	/* No exact match, the covering item is the previous one. */
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	/* The item must cover the whole of [start, end). */
	if (unlikely(key.offset > start || extent_end < end)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	/* File offset that corresponds to the start of the disk extent. */
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	/*
	 * The written range is the head of the prealloc item and the previous
	 * item is a mergeable regular extent: grow the previous item up to
	 * @end and shrink this one to start at @end. No split is needed.
	 */
	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			goto mark_dirty;
		}
	}

	/*
	 * The written range is the tail of the prealloc item and the next item
	 * is a mergeable regular extent: shrink this item to end at @start and
	 * pull the start of the next item back to @start.
	 */
	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			goto mark_dirty;
		}
	}

	/*
	 * Split the prealloc item until the current item covers exactly
	 * [start, end). At most two passes: one split at @start (when the item
	 * begins before it) and one at @end (when the item extends past it).
	 */
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		/* Left half: [key.offset, split). */
		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		/* Right half: [split, extent_end). */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		/* One more item refers to the disk extent, add a reference. */
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		if (split == start) {
			/* Continue with the right half, it starts at @start. */
			key.offset = start;
		} else {
			if (unlikely(start != key.offset)) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			/* Continue with the left half, it now ends at @end. */
			path->slots[0]--;
			extent_end = end;
		}
		/* The leaf was modified, a merge below needs a fresh search. */
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	/*
	 * Each neighbour merged into the current item removes one item that
	 * refers to the disk extent, so one reference is dropped per merge.
	 */
	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	/* Try to merge with the next item: it is the one deleted. */
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	other_start = 0;
	other_end = start;
	/* Try to merge with the previous item: the current one is deleted. */
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	if (del_nr == 0) {
		/* No merge: just flip the item from prealloc to regular. */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		/*
		 * The item right before the deleted run survives and is made
		 * to cover the whole merged range [key.offset, extent_end).
		 */
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

mark_dirty:
	ret = btrfs_inode_set_file_extent_range(inode, start, end - start);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}
798
799 /*
800 * On error return an unlocked folio and the error value
801 * On success return a locked folio and 0
802 */
prepare_uptodate_folio(struct inode * inode,struct folio * folio,u64 pos,u64 len)803 static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
804 u64 len)
805 {
806 u64 clamp_start = max_t(u64, pos, folio_pos(folio));
807 u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
808 const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
809 int ret = 0;
810
811 if (folio_test_uptodate(folio))
812 return 0;
813
814 if (IS_ALIGNED(clamp_start, blocksize) &&
815 IS_ALIGNED(clamp_end, blocksize))
816 return 0;
817
818 ret = btrfs_read_folio(NULL, folio);
819 if (ret)
820 return ret;
821 folio_lock(folio);
822 if (unlikely(!folio_test_uptodate(folio))) {
823 folio_unlock(folio);
824 return -EIO;
825 }
826
827 /*
828 * Since btrfs_read_folio() will unlock the folio before it returns,
829 * there is a window where btrfs_release_folio() can be called to
830 * release the page. Here we check both inode mapping and page
831 * private to make sure the page was not released.
832 *
833 * The private flag check is essential for subpage as we need to store
834 * extra bitmap using folio private.
835 */
836 if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
837 folio_unlock(folio);
838 return -EAGAIN;
839 }
840 return 0;
841 }
842
get_prepare_gfp_flags(struct inode * inode,bool nowait)843 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
844 {
845 gfp_t gfp;
846
847 gfp = btrfs_alloc_write_mask(inode->i_mapping);
848 if (nowait) {
849 gfp &= ~__GFP_DIRECT_RECLAIM;
850 gfp |= GFP_NOWAIT;
851 }
852
853 return gfp;
854 }
855
856 /*
857 * Get folio into the page cache and lock it.
858 */
prepare_one_folio(struct inode * inode,struct folio ** folio_ret,loff_t pos,size_t write_bytes,bool nowait)859 static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
860 loff_t pos, size_t write_bytes,
861 bool nowait)
862 {
863 const pgoff_t index = pos >> PAGE_SHIFT;
864 gfp_t mask = get_prepare_gfp_flags(inode, nowait);
865 fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
866 fgf_set_order(write_bytes);
867 struct folio *folio;
868 int ret;
869
870 again:
871 folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
872 if (IS_ERR(folio))
873 return PTR_ERR(folio);
874
875 ret = set_folio_extent_mapped(folio);
876 if (ret < 0) {
877 folio_unlock(folio);
878 folio_put(folio);
879 return ret;
880 }
881 ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
882 if (ret) {
883 /* The folio is already unlocked. */
884 folio_put(folio);
885 if (!nowait && ret == -EAGAIN)
886 goto again;
887 return ret;
888 }
889 *folio_ret = folio;
890 return 0;
891 }
892
893 /*
894 * Locks the extent and properly waits for data=ordered extents to finish
895 * before allowing the folios to be modified if need.
896 *
897 * Return:
898 * 1 - the extent is locked
899 * 0 - the extent is not locked, and everything is OK
900 * -EAGAIN - need to prepare the folios again
901 */
902 static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode * inode,struct folio * folio,loff_t pos,size_t write_bytes,u64 * lockstart,u64 * lockend,bool nowait,struct extent_state ** cached_state)903 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
904 loff_t pos, size_t write_bytes,
905 u64 *lockstart, u64 *lockend, bool nowait,
906 struct extent_state **cached_state)
907 {
908 struct btrfs_fs_info *fs_info = inode->root->fs_info;
909 u64 start_pos;
910 u64 last_pos;
911 int ret = 0;
912
913 start_pos = round_down(pos, fs_info->sectorsize);
914 last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
915
916 if (start_pos < inode->vfs_inode.i_size) {
917 struct btrfs_ordered_extent *ordered;
918
919 if (nowait) {
920 if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
921 last_pos, cached_state)) {
922 folio_unlock(folio);
923 folio_put(folio);
924 return -EAGAIN;
925 }
926 } else {
927 btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
928 cached_state);
929 }
930
931 ordered = btrfs_lookup_ordered_range(inode, start_pos,
932 last_pos - start_pos + 1);
933 if (ordered &&
934 ordered->file_offset + ordered->num_bytes > start_pos &&
935 ordered->file_offset <= last_pos) {
936 btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
937 cached_state);
938 folio_unlock(folio);
939 folio_put(folio);
940 btrfs_start_ordered_extent(ordered);
941 btrfs_put_ordered_extent(ordered);
942 return -EAGAIN;
943 }
944 if (ordered)
945 btrfs_put_ordered_extent(ordered);
946
947 *lockstart = start_pos;
948 *lockend = last_pos;
949 ret = 1;
950 }
951
952 /*
953 * We should be called after prepare_one_folio() which should have locked
954 * all pages in the range.
955 */
956 WARN_ON(!folio_test_locked(folio));
957
958 return ret;
959 }
960
961 /*
962 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
963 *
964 * @pos: File offset.
965 * @write_bytes: The length to write, will be updated to the nocow writeable
966 * range.
967 * @nowait: Indicate if we can block or not (non-blocking IO context).
968 *
969 * This function will flush ordered extents in the range to ensure proper
970 * nocow checks.
971 *
972 * Return:
973 * > 0 If we can nocow, and updates @write_bytes.
974 * 0 If we can't do a nocow write.
975 * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
976 * root is in progress or because we are in a non-blocking IO
977 * context and need to block (@nowait is true).
978 * < 0 If an error happened.
979 *
980 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
981 */
btrfs_check_nocow_lock(struct btrfs_inode * inode,loff_t pos,size_t * write_bytes,bool nowait)982 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
983 size_t *write_bytes, bool nowait)
984 {
985 struct btrfs_fs_info *fs_info = inode->root->fs_info;
986 struct btrfs_root *root = inode->root;
987 struct extent_state *cached_state = NULL;
988 u64 lockstart, lockend;
989 u64 cur_offset;
990 int ret = 0;
991
992 if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
993 return 0;
994
995 if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
996 return -EAGAIN;
997
998 lockstart = round_down(pos, fs_info->sectorsize);
999 lockend = round_up(pos + *write_bytes,
1000 fs_info->sectorsize) - 1;
1001
1002 if (nowait) {
1003 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1004 &cached_state)) {
1005 btrfs_drew_write_unlock(&root->snapshot_lock);
1006 return -EAGAIN;
1007 }
1008 } else {
1009 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1010 &cached_state);
1011 }
1012
1013 cur_offset = lockstart;
1014 while (cur_offset < lockend) {
1015 u64 num_bytes = lockend - cur_offset + 1;
1016
1017 ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
1018 if (ret <= 0) {
1019 /*
1020 * If cur_offset == lockstart it means we haven't found
1021 * any extent against which we can NOCOW, so unlock the
1022 * snapshot lock.
1023 */
1024 if (cur_offset == lockstart)
1025 btrfs_drew_write_unlock(&root->snapshot_lock);
1026 break;
1027 }
1028 cur_offset += num_bytes;
1029 }
1030
1031 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1032
1033 /*
1034 * cur_offset > lockstart means there's at least a partial range we can
1035 * NOCOW, and that range can cover one or more extents.
1036 */
1037 if (cur_offset > lockstart) {
1038 *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
1039 return 1;
1040 }
1041
1042 return ret;
1043 }
1044
btrfs_check_nocow_unlock(struct btrfs_inode * inode)1045 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1046 {
1047 btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1048 }
1049
/*
 * Checks and preparation done before writing @count bytes at iocb->ki_pos:
 * reject NOWAIT writes that would always need COW, remove privileges,
 * update the timestamps and, when writing past EOF, expand the file to
 * cover the gap.
 *
 * Return 0 on success, -EAGAIN for a NOWAIT write to an inode without the
 * nodatacow or prealloc flags, or the error returned by file_remove_privs()
 * or btrfs_cont_expand().
 */
int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	const loff_t pos = iocb->ki_pos;
	loff_t isize;
	loff_t hole_end;
	int ret;

	/*
	 * Bail out early on NOWAIT writes when the inode has neither the
	 * nodatacow nor the prealloc flag, because without them the write
	 * always has to COW. Whether the target range can really be written
	 * in place is checked later (using can_nocow_extent() at
	 * btrfs_get_blocks_direct_write()).
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!(BTRFS_I(inode)->flags &
		      (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			return -EAGAIN;
	}

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * Space for updating the inode is reserved together with the space
	 * for the extent being written, so an ENOSPC will show up there.
	 * There is no need to start another transaction for the inode update
	 * here, the inode gets updated once the data write finishes.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	isize = i_size_read(inode);
	if (pos <= isize)
		return 0;

	/* Expand hole size to cover write data, preventing empty gap */
	hole_end = round_up(pos + count, fs_info->sectorsize);

	return btrfs_cont_expand(BTRFS_I(inode), isize, hole_end);
}
1096
/*
 * Give back space that was reserved for a buffered write.
 *
 * @inode:                 inode the reservation was made for
 * @data_reserved:         changeset tracking the reserved data space
 * @start:                 file offset of the reservation, rounded down to a
 *                         block boundary before data space is released
 * @len:                   number of bytes to release, nothing is done for 0
 * @only_release_metadata: true if only metadata was reserved; the nocow lock
 *                         is dropped as well in that case
 */
static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
			  u64 start, u64 len, bool only_release_metadata)
{
	if (len == 0)
		return;

	if (only_release_metadata) {
		btrfs_check_nocow_unlock(inode);
		btrfs_delalloc_release_metadata(inode, len, true);
		return;
	}

	btrfs_delalloc_release_space(inode, data_reserved,
				     round_down(start, inode->root->fs_info->sectorsize),
				     len, true);
}
1114
1115 /*
1116 * Reserve data and metadata space for this buffered write range.
1117 *
1118 * Return >0 for the number of bytes reserved, which is always block aligned.
1119 * Return <0 for error.
1120 */
reserve_space(struct btrfs_inode * inode,struct extent_changeset ** data_reserved,u64 start,size_t * len,bool nowait,bool * only_release_metadata)1121 static ssize_t reserve_space(struct btrfs_inode *inode,
1122 struct extent_changeset **data_reserved,
1123 u64 start, size_t *len, bool nowait,
1124 bool *only_release_metadata)
1125 {
1126 const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1127 const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
1128 size_t reserve_bytes;
1129 int ret;
1130
1131 ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
1132 if (ret < 0) {
1133 int can_nocow;
1134
1135 if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
1136 return -EAGAIN;
1137
1138 /*
1139 * If we don't have to COW at the offset, reserve metadata only.
1140 * write_bytes may get smaller than requested here.
1141 */
1142 can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
1143 if (can_nocow < 0)
1144 ret = can_nocow;
1145 if (can_nocow > 0)
1146 ret = 0;
1147 if (ret)
1148 return ret;
1149 *only_release_metadata = true;
1150 }
1151
1152 reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
1153 WARN_ON(reserve_bytes == 0);
1154 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
1155 reserve_bytes, nowait);
1156 if (ret) {
1157 if (!*only_release_metadata)
1158 btrfs_free_reserved_data_space(inode, *data_reserved,
1159 start, *len);
1160 else
1161 btrfs_check_nocow_unlock(inode);
1162
1163 if (nowait && ret == -ENOSPC)
1164 ret = -EAGAIN;
1165 return ret;
1166 }
1167 return reserve_bytes;
1168 }
1169
1170 /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
shrink_reserved_space(struct btrfs_inode * inode,struct extent_changeset * data_reserved,u64 reserved_start,u64 reserved_len,u64 new_len,bool only_release_metadata)1171 static void shrink_reserved_space(struct btrfs_inode *inode,
1172 struct extent_changeset *data_reserved,
1173 u64 reserved_start, u64 reserved_len,
1174 u64 new_len, bool only_release_metadata)
1175 {
1176 const u64 diff = reserved_len - new_len;
1177
1178 ASSERT(new_len <= reserved_len);
1179 btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
1180 if (only_release_metadata)
1181 btrfs_delalloc_release_metadata(inode, diff, true);
1182 else
1183 btrfs_delalloc_release_space(inode, data_reserved,
1184 reserved_start + new_len, diff, true);
1185 }
1186
1187 /* Calculate the maximum amount of bytes we can write into one folio. */
calc_write_bytes(const struct btrfs_inode * inode,const struct iov_iter * iter,u64 start)1188 static size_t calc_write_bytes(const struct btrfs_inode *inode,
1189 const struct iov_iter *iter, u64 start)
1190 {
1191 const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
1192
1193 return min(max_folio_size - (start & (max_folio_size - 1)),
1194 iov_iter_count(iter));
1195 }
1196
1197 /*
1198 * Do the heavy-lifting work to copy one range into one folio of the page cache.
1199 *
1200 * Return > 0 in case we copied all bytes or just some of them.
1201 * Return 0 if no bytes were copied, in which case the caller should retry.
1202 * Return <0 on error.
1203 */
copy_one_range(struct btrfs_inode * inode,struct iov_iter * iter,struct extent_changeset ** data_reserved,u64 start,bool nowait)1204 static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
1205 struct extent_changeset **data_reserved, u64 start,
1206 bool nowait)
1207 {
1208 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1209 struct extent_state *cached_state = NULL;
1210 size_t write_bytes = calc_write_bytes(inode, iter, start);
1211 size_t copied;
1212 const u64 reserved_start = round_down(start, fs_info->sectorsize);
1213 u64 reserved_len;
1214 struct folio *folio = NULL;
1215 int extents_locked;
1216 u64 lockstart;
1217 u64 lockend;
1218 bool only_release_metadata = false;
1219 const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1220 int ret;
1221
1222 /*
1223 * Fault all pages before locking them in prepare_one_folio() to avoid
1224 * recursive lock.
1225 */
1226 if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
1227 return -EFAULT;
1228 extent_changeset_release(*data_reserved);
1229 ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
1230 &only_release_metadata);
1231 if (ret < 0)
1232 return ret;
1233 reserved_len = ret;
1234 /* Write range must be inside the reserved range. */
1235 ASSERT(reserved_start <= start);
1236 ASSERT(start + write_bytes <= reserved_start + reserved_len);
1237
1238 again:
1239 ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
1240 bdp_flags);
1241 if (ret) {
1242 btrfs_delalloc_release_extents(inode, reserved_len);
1243 release_space(inode, *data_reserved, reserved_start, reserved_len,
1244 only_release_metadata);
1245 return ret;
1246 }
1247
1248 ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
1249 if (ret) {
1250 btrfs_delalloc_release_extents(inode, reserved_len);
1251 release_space(inode, *data_reserved, reserved_start, reserved_len,
1252 only_release_metadata);
1253 return ret;
1254 }
1255
1256 /*
1257 * The reserved range goes beyond the current folio, shrink the reserved
1258 * space to the folio boundary.
1259 */
1260 if (reserved_start + reserved_len > folio_next_pos(folio)) {
1261 const u64 last_block = folio_next_pos(folio);
1262
1263 shrink_reserved_space(inode, *data_reserved, reserved_start,
1264 reserved_len, last_block - reserved_start,
1265 only_release_metadata);
1266 write_bytes = last_block - start;
1267 reserved_len = last_block - reserved_start;
1268 }
1269
1270 extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
1271 write_bytes, &lockstart,
1272 &lockend, nowait,
1273 &cached_state);
1274 if (extents_locked < 0) {
1275 if (!nowait && extents_locked == -EAGAIN)
1276 goto again;
1277
1278 btrfs_delalloc_release_extents(inode, reserved_len);
1279 release_space(inode, *data_reserved, reserved_start, reserved_len,
1280 only_release_metadata);
1281 return extents_locked;
1282 }
1283
1284 copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
1285 write_bytes, iter);
1286 flush_dcache_folio(folio);
1287
1288 if (unlikely(copied < write_bytes)) {
1289 u64 last_block;
1290
1291 /*
1292 * The original write range doesn't need an uptodate folio as
1293 * the range is block aligned. But now a short copy happened.
1294 * We cannot handle it without an uptodate folio.
1295 *
1296 * So just revert the range and we will retry.
1297 */
1298 if (!folio_test_uptodate(folio)) {
1299 iov_iter_revert(iter, copied);
1300 copied = 0;
1301 }
1302
1303 /* No copied bytes, unlock, release reserved space and exit. */
1304 if (copied == 0) {
1305 if (extents_locked)
1306 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
1307 &cached_state);
1308 else
1309 btrfs_free_extent_state(cached_state);
1310 btrfs_delalloc_release_extents(inode, reserved_len);
1311 release_space(inode, *data_reserved, reserved_start, reserved_len,
1312 only_release_metadata);
1313 btrfs_drop_folio(fs_info, folio, start, copied);
1314 return 0;
1315 }
1316
1317 /* Release the reserved space beyond the last block. */
1318 last_block = round_up(start + copied, fs_info->sectorsize);
1319
1320 shrink_reserved_space(inode, *data_reserved, reserved_start,
1321 reserved_len, last_block - reserved_start,
1322 only_release_metadata);
1323 reserved_len = last_block - reserved_start;
1324 }
1325
1326 ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
1327 only_release_metadata);
1328 /*
1329 * If we have not locked the extent range, because the range's start
1330 * offset is >= i_size, we might still have a non-NULL cached extent
1331 * state, acquired while marking the extent range as delalloc through
1332 * btrfs_dirty_page(). Therefore free any possible cached extent state
1333 * to avoid a memory leak.
1334 */
1335 if (extents_locked)
1336 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1337 else
1338 btrfs_free_extent_state(cached_state);
1339
1340 btrfs_delalloc_release_extents(inode, reserved_len);
1341 if (ret) {
1342 btrfs_drop_folio(fs_info, folio, start, copied);
1343 release_space(inode, *data_reserved, reserved_start, reserved_len,
1344 only_release_metadata);
1345 return ret;
1346 }
1347 if (only_release_metadata)
1348 btrfs_check_nocow_unlock(inode);
1349
1350 btrfs_drop_folio(fs_info, folio, start, copied);
1351 return copied;
1352 }
1353
btrfs_buffered_write(struct kiocb * iocb,struct iov_iter * iter)1354 ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1355 {
1356 struct file *file = iocb->ki_filp;
1357 loff_t pos;
1358 struct inode *inode = file_inode(file);
1359 struct extent_changeset *data_reserved = NULL;
1360 size_t num_written = 0;
1361 ssize_t ret;
1362 loff_t old_isize;
1363 unsigned int ilock_flags = 0;
1364 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1365
1366 if (nowait)
1367 ilock_flags |= BTRFS_ILOCK_TRY;
1368
1369 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1370 if (ret < 0)
1371 return ret;
1372
1373 /*
1374 * We can only trust the isize with inode lock held, or it can race with
1375 * other buffered writes and cause incorrect call of
1376 * pagecache_isize_extended() to overwrite existing data.
1377 */
1378 old_isize = i_size_read(inode);
1379
1380 ret = generic_write_checks(iocb, iter);
1381 if (ret <= 0)
1382 goto out;
1383
1384 ret = btrfs_write_check(iocb, ret);
1385 if (ret < 0)
1386 goto out;
1387
1388 pos = iocb->ki_pos;
1389 while (iov_iter_count(iter) > 0) {
1390 ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
1391 if (ret < 0)
1392 break;
1393 pos += ret;
1394 num_written += ret;
1395 cond_resched();
1396 }
1397
1398 extent_changeset_free(data_reserved);
1399 if (num_written > 0) {
1400 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1401 iocb->ki_pos += num_written;
1402 }
1403 out:
1404 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1405 return num_written ? num_written : ret;
1406 }
1407
btrfs_encoded_write(struct kiocb * iocb,struct iov_iter * from,const struct btrfs_ioctl_encoded_io_args * encoded)1408 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1409 const struct btrfs_ioctl_encoded_io_args *encoded)
1410 {
1411 struct file *file = iocb->ki_filp;
1412 struct inode *inode = file_inode(file);
1413 loff_t count;
1414 ssize_t ret;
1415
1416 btrfs_inode_lock(BTRFS_I(inode), 0);
1417 count = encoded->len;
1418 ret = generic_write_checks_count(iocb, &count);
1419 if (ret == 0 && count != encoded->len) {
1420 /*
1421 * The write got truncated by generic_write_checks_count(). We
1422 * can't do a partial encoded write.
1423 */
1424 ret = -EFBIG;
1425 }
1426 if (ret || encoded->len == 0)
1427 goto out;
1428
1429 ret = btrfs_write_check(iocb, encoded->len);
1430 if (ret < 0)
1431 goto out;
1432
1433 ret = btrfs_do_encoded_write(iocb, from, encoded);
1434 out:
1435 btrfs_inode_unlock(BTRFS_I(inode), 0);
1436 return ret;
1437 }
1438
btrfs_do_write_iter(struct kiocb * iocb,struct iov_iter * from,const struct btrfs_ioctl_encoded_io_args * encoded)1439 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1440 const struct btrfs_ioctl_encoded_io_args *encoded)
1441 {
1442 struct file *file = iocb->ki_filp;
1443 struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1444 ssize_t num_written, num_sync;
1445
1446 if (btrfs_is_shutdown(inode->root->fs_info))
1447 return -EIO;
1448 /*
1449 * If the fs flips readonly due to some impossible error, although we
1450 * have opened a file as writable, we have to stop this write operation
1451 * to ensure consistency.
1452 */
1453 if (unlikely(BTRFS_FS_ERROR(inode->root->fs_info)))
1454 return -EROFS;
1455
1456 if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1457 return -EOPNOTSUPP;
1458
1459 if (encoded) {
1460 num_written = btrfs_encoded_write(iocb, from, encoded);
1461 num_sync = encoded->len;
1462 } else if (iocb->ki_flags & IOCB_DIRECT) {
1463 num_written = btrfs_direct_write(iocb, from);
1464 num_sync = num_written;
1465 } else {
1466 num_written = btrfs_buffered_write(iocb, from);
1467 num_sync = num_written;
1468 }
1469
1470 btrfs_set_inode_last_sub_trans(inode);
1471
1472 if (num_sync > 0) {
1473 num_sync = generic_write_sync(iocb, num_sync);
1474 if (num_sync < 0)
1475 num_written = num_sync;
1476 }
1477
1478 return num_written;
1479 }
1480
btrfs_file_write_iter(struct kiocb * iocb,struct iov_iter * from)1481 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1482 {
1483 return btrfs_do_write_iter(iocb, from, NULL);
1484 }
1485
btrfs_release_file(struct inode * inode,struct file * filp)1486 int btrfs_release_file(struct inode *inode, struct file *filp)
1487 {
1488 struct btrfs_file_private *private = filp->private_data;
1489
1490 if (private) {
1491 kfree(private->filldir_buf);
1492 btrfs_free_extent_state(private->llseek_cached_state);
1493 kfree(private);
1494 filp->private_data = NULL;
1495 }
1496
1497 /*
1498 * Set by setattr when we are about to truncate a file from a non-zero
1499 * size to a zero size. This tries to flush down new bytes that may
1500 * have been written if the application were using truncate to replace
1501 * a file in place.
1502 */
1503 if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1504 &BTRFS_I(inode)->runtime_flags))
1505 filemap_flush(inode->i_mapping);
1506 return 0;
1507 }
1508
/*
 * Start writeback for the range [@start, @end] of @inode under a block plug.
 *
 * Only fsync calls this, and fsync does synchronous writes, so plugging lets
 * adjacent IOs merge as much as possible. This matters especially with
 * multiple disks using a raid profile, where a large IO can be split into
 * several segments of stripe length (currently 64K).
 *
 * Return the result of btrfs_fdatawrite_range().
 */
static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct blk_plug plug;
	int err;

	blk_start_plug(&plug);
	err = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);

	return err;
}
1526
skip_inode_logging(const struct btrfs_log_ctx * ctx)1527 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1528 {
1529 struct btrfs_inode *inode = ctx->inode;
1530 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1531
1532 if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1533 list_empty(&ctx->ordered_extents))
1534 return true;
1535
1536 /*
1537 * If we are doing a fast fsync we can not bail out if the inode's
1538 * last_trans is <= then the last committed transaction, because we only
1539 * update the last_trans of the inode during ordered extent completion,
1540 * and for a fast fsync we don't wait for that, we only wait for the
1541 * writeback to complete.
1542 */
1543 if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1544 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1545 list_empty(&ctx->ordered_extents)))
1546 return true;
1547
1548 return false;
1549 }
1550
1551 /*
1552 * fsync call for both files and directories. This logs the inode into
1553 * the tree log instead of forcing full commits whenever possible.
1554 *
1555 * It needs to call filemap_fdatawait so that all ordered extent updates are
1556 * in the metadata btree are up to date for copying to the log.
1557 *
1558 * It drops the inode mutex before doing the tree log commit. This is an
1559 * important optimization for directories because holding the mutex prevents
1560 * new operations on the dir while we write to disk.
1561 */
btrfs_sync_file(struct file * file,loff_t start,loff_t end,int datasync)1562 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1563 {
1564 struct dentry *dentry = file_dentry(file);
1565 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1566 struct btrfs_root *root = inode->root;
1567 struct btrfs_fs_info *fs_info = root->fs_info;
1568 struct btrfs_trans_handle *trans;
1569 struct btrfs_log_ctx ctx;
1570 int ret = 0, err;
1571 u64 len;
1572 bool full_sync;
1573 bool skip_ilock = false;
1574
1575 if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1576 skip_ilock = true;
1577 current->journal_info = NULL;
1578 btrfs_assert_inode_locked(inode);
1579 }
1580
1581 trace_btrfs_sync_file(file, datasync);
1582
1583 btrfs_init_log_ctx(&ctx, inode);
1584
1585 /*
1586 * Always set the range to a full range, otherwise we can get into
1587 * several problems, from missing file extent items to represent holes
1588 * when not using the NO_HOLES feature, to log tree corruption due to
1589 * races between hole detection during logging and completion of ordered
1590 * extents outside the range, to missing checksums due to ordered extents
1591 * for which we flushed only a subset of their pages.
1592 */
1593 start = 0;
1594 end = LLONG_MAX;
1595 len = (u64)LLONG_MAX + 1;
1596
1597 /*
1598 * We write the dirty pages in the range and wait until they complete
1599 * out of the ->i_mutex. If so, we can flush the dirty pages by
1600 * multi-task, and make the performance up. See
1601 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1602 */
1603 ret = start_ordered_ops(inode, start, end);
1604 if (ret)
1605 goto out;
1606
1607 if (skip_ilock)
1608 down_write(&inode->i_mmap_lock);
1609 else
1610 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1611
1612 atomic_inc(&root->log_batch);
1613
1614 /*
1615 * Before we acquired the inode's lock and the mmap lock, someone may
1616 * have dirtied more pages in the target range. We need to make sure
1617 * that writeback for any such pages does not start while we are logging
1618 * the inode, because if it does, any of the following might happen when
1619 * we are not doing a full inode sync:
1620 *
1621 * 1) We log an extent after its writeback finishes but before its
1622 * checksums are added to the csum tree, leading to -EIO errors
1623 * when attempting to read the extent after a log replay.
1624 *
1625 * 2) We can end up logging an extent before its writeback finishes.
1626 * Therefore after the log replay we will have a file extent item
1627 * pointing to an unwritten extent (and no data checksums as well).
1628 *
1629 * So trigger writeback for any eventual new dirty pages and then we
1630 * wait for all ordered extents to complete below.
1631 */
1632 ret = start_ordered_ops(inode, start, end);
1633 if (ret) {
1634 if (skip_ilock)
1635 up_write(&inode->i_mmap_lock);
1636 else
1637 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1638 goto out;
1639 }
1640
1641 /*
1642 * Always check for the full sync flag while holding the inode's lock,
1643 * to avoid races with other tasks. The flag must be either set all the
1644 * time during logging or always off all the time while logging.
1645 * We check the flag here after starting delalloc above, because when
1646 * running delalloc the full sync flag may be set if we need to drop
1647 * extra extent map ranges due to temporary memory allocation failures.
1648 */
1649 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1650
1651 /*
1652 * We have to do this here to avoid the priority inversion of waiting on
1653 * IO of a lower priority task while holding a transaction open.
1654 *
1655 * For a full fsync we wait for the ordered extents to complete while
1656 * for a fast fsync we wait just for writeback to complete, and then
1657 * attach the ordered extents to the transaction so that a transaction
1658 * commit waits for their completion, to avoid data loss if we fsync,
1659 * the current transaction commits before the ordered extents complete
1660 * and a power failure happens right after that.
1661 *
1662 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1663 * logical address recorded in the ordered extent may change. We need
1664 * to wait for the IO to stabilize the logical address.
1665 */
1666 if (full_sync || btrfs_is_zoned(fs_info)) {
1667 ret = btrfs_wait_ordered_range(inode, start, len);
1668 clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1669 } else {
1670 /*
1671 * Get our ordered extents as soon as possible to avoid doing
1672 * checksum lookups in the csum tree, and use instead the
1673 * checksums attached to the ordered extents.
1674 */
1675 btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1676 ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1677 if (ret)
1678 goto out_release_extents;
1679
1680 /*
1681 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1682 * starting and waiting for writeback, because for buffered IO
1683 * it may have been set during the end IO callback
1684 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1685 * case an error happened and we need to wait for ordered
1686 * extents to complete so that any extent maps that point to
1687 * unwritten locations are dropped and we don't log them.
1688 */
1689 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1690 ret = btrfs_wait_ordered_range(inode, start, len);
1691 }
1692
1693 if (ret)
1694 goto out_release_extents;
1695
1696 atomic_inc(&root->log_batch);
1697
1698 if (skip_inode_logging(&ctx)) {
1699 /*
1700 * We've had everything committed since the last time we were
1701 * modified so clear this flag in case it was set for whatever
1702 * reason, it's no longer relevant.
1703 */
1704 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1705 /*
1706 * An ordered extent might have started before and completed
1707 * already with io errors, in which case the inode was not
1708 * updated and we end up here. So check the inode's mapping
1709 * for any errors that might have happened since we last
1710 * checked called fsync.
1711 */
1712 ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1713 goto out_release_extents;
1714 }
1715
1716 btrfs_init_log_ctx_scratch_eb(&ctx);
1717
1718 /*
1719 * We use start here because we will need to wait on the IO to complete
1720 * in btrfs_sync_log, which could require joining a transaction (for
1721 * example checking cross references in the nocow path). If we use join
1722 * here we could get into a situation where we're waiting on IO to
1723 * happen that is blocked on a transaction trying to commit. With start
1724 * we inc the extwriter counter, so we wait for all extwriters to exit
1725 * before we start blocking joiners. This comment is to keep somebody
1726 * from thinking they are super smart and changing this to
1727 * btrfs_join_transaction *cough*Josef*cough*.
1728 */
1729 trans = btrfs_start_transaction(root, 0);
1730 if (IS_ERR(trans)) {
1731 ret = PTR_ERR(trans);
1732 goto out_release_extents;
1733 }
1734 trans->in_fsync = true;
1735
1736 ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1737 /*
1738 * Scratch eb no longer needed, release before syncing log or commit
1739 * transaction, to avoid holding unnecessary memory during such long
1740 * operations.
1741 */
1742 if (ctx.scratch_eb) {
1743 free_extent_buffer(ctx.scratch_eb);
1744 ctx.scratch_eb = NULL;
1745 }
1746 btrfs_release_log_ctx_extents(&ctx);
1747 if (ret < 0) {
1748 /* Fallthrough and commit/free transaction. */
1749 ret = BTRFS_LOG_FORCE_COMMIT;
1750 }
1751
1752 /* we've logged all the items and now have a consistent
1753 * version of the file in the log. It is possible that
1754 * someone will come in and modify the file, but that's
1755 * fine because the log is consistent on disk, and we
1756 * have references to all of the file's extents
1757 *
1758 * It is possible that someone will come in and log the
1759 * file again, but that will end up using the synchronization
1760 * inside btrfs_sync_log to keep things safe.
1761 */
1762 if (skip_ilock)
1763 up_write(&inode->i_mmap_lock);
1764 else
1765 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1766
1767 if (ret == BTRFS_NO_LOG_SYNC) {
1768 ret = btrfs_end_transaction(trans);
1769 goto out;
1770 }
1771
1772 /* We successfully logged the inode, attempt to sync the log. */
1773 if (!ret) {
1774 ret = btrfs_sync_log(trans, root, &ctx);
1775 if (!ret) {
1776 ret = btrfs_end_transaction(trans);
1777 goto out;
1778 }
1779 }
1780
1781 /*
1782 * At this point we need to commit the transaction because we had
1783 * btrfs_need_log_full_commit() or some other error.
1784 *
1785 * If we didn't do a full sync we have to stop the trans handle, wait on
1786 * the ordered extents, start it again and commit the transaction. If
1787 * we attempt to wait on the ordered extents here we could deadlock with
1788 * something like fallocate() that is holding the extent lock trying to
1789 * start a transaction while some other thread is trying to commit the
1790 * transaction while we (fsync) are currently holding the transaction
1791 * open.
1792 */
1793 if (!full_sync) {
1794 ret = btrfs_end_transaction(trans);
1795 if (ret)
1796 goto out;
1797 ret = btrfs_wait_ordered_range(inode, start, len);
1798 if (ret)
1799 goto out;
1800
1801 /*
1802 * This is safe to use here because we're only interested in
1803 * making sure the transaction that had the ordered extents is
1804 * committed. We aren't waiting on anything past this point,
1805 * we're purely getting the transaction and committing it.
1806 */
1807 trans = btrfs_attach_transaction_barrier(root);
1808 if (IS_ERR(trans)) {
1809 ret = PTR_ERR(trans);
1810
1811 /*
1812 * We committed the transaction and there's no currently
1813 * running transaction, this means everything we care
1814 * about made it to disk and we are done.
1815 */
1816 if (ret == -ENOENT)
1817 ret = 0;
1818 goto out;
1819 }
1820 }
1821
1822 ret = btrfs_commit_transaction(trans);
1823 out:
1824 free_extent_buffer(ctx.scratch_eb);
1825 ASSERT(list_empty(&ctx.list));
1826 ASSERT(list_empty(&ctx.conflict_inodes));
1827 err = file_check_and_advance_wb_err(file);
1828 if (!ret)
1829 ret = err;
1830 return ret > 0 ? -EIO : ret;
1831
1832 out_release_extents:
1833 btrfs_release_log_ctx_extents(&ctx);
1834 if (skip_ilock)
1835 up_write(&inode->i_mmap_lock);
1836 else
1837 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1838 goto out;
1839 }
1840
1841 /*
1842 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1843 * called from a page fault handler when a page is first dirtied. Hence we must
1844 * be careful to check for EOF conditions here. We set the page up correctly
1845 * for a written page which means we get ENOSPC checking when writing into
1846 * holes and correct delalloc and unwritten extent mapping on filesystems that
1847 * support these features.
1848 *
1849 * We are not allowed to take the i_mutex here so we have to play games to
1850 * protect against truncate races as the page could now be beyond EOF. Because
1851 * truncate_setsize() writes the inode size before removing pages, once we have
1852 * the page lock we can determine safely if the page is beyond EOF. If it is not
1853 * beyond EOF, then the page is guaranteed safe against truncation until we
1854 * unlock the page.
1855 */
btrfs_page_mkwrite(struct vm_fault * vmf)1856 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1857 {
1858 struct page *page = vmf->page;
1859 struct folio *folio = page_folio(page);
1860 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
1861 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1862 struct extent_io_tree *io_tree = &inode->io_tree;
1863 struct btrfs_ordered_extent *ordered;
1864 struct extent_state *cached_state = NULL;
1865 struct extent_changeset *data_reserved = NULL;
1866 unsigned long zero_start;
1867 loff_t size;
1868 size_t fsize = folio_size(folio);
1869 int ret;
1870 bool only_release_metadata = false;
1871 u64 reserved_space;
1872 u64 page_start;
1873 u64 page_end;
1874 u64 end;
1875
1876 reserved_space = fsize;
1877
1878 sb_start_pagefault(inode->vfs_inode.i_sb);
1879 page_start = folio_pos(folio);
1880 page_end = page_start + folio_size(folio) - 1;
1881 end = page_end;
1882
1883 /*
1884 * Reserving delalloc space after obtaining the page lock can lead to
1885 * deadlock. For example, if a dirty page is locked by this function
1886 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1887 * dirty page write out, then the btrfs_writepages() function could
1888 * end up waiting indefinitely to get a lock on the page currently
1889 * being processed by btrfs_page_mkwrite() function.
1890 */
1891 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
1892 reserved_space, false);
1893 if (ret < 0) {
1894 size_t write_bytes = reserved_space;
1895
1896 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
1897 goto out_noreserve;
1898
1899 only_release_metadata = true;
1900
1901 /*
1902 * Can't write the whole range, there may be shared extents or
1903 * holes in the range, bail out with @only_release_metadata set
1904 * to true so that we unlock the nocow lock before returning the
1905 * error.
1906 */
1907 if (write_bytes < reserved_space)
1908 goto out_noreserve;
1909 }
1910 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
1911 reserved_space, false);
1912 if (ret < 0) {
1913 if (!only_release_metadata)
1914 btrfs_free_reserved_data_space(inode, data_reserved,
1915 page_start, reserved_space);
1916 goto out_noreserve;
1917 }
1918
1919 ret = file_update_time(vmf->vma->vm_file);
1920 if (ret < 0)
1921 goto out;
1922 again:
1923 down_read(&inode->i_mmap_lock);
1924 folio_lock(folio);
1925 size = i_size_read(&inode->vfs_inode);
1926
1927 if ((folio->mapping != inode->vfs_inode.i_mapping) ||
1928 (page_start >= size)) {
1929 /* Page got truncated out from underneath us. */
1930 goto out_unlock;
1931 }
1932 folio_wait_writeback(folio);
1933
1934 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
1935 ret = set_folio_extent_mapped(folio);
1936 if (ret < 0) {
1937 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1938 goto out_unlock;
1939 }
1940
1941 /*
1942 * We can't set the delalloc bits if there are pending ordered
1943 * extents. Drop our locks and wait for them to finish.
1944 */
1945 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
1946 if (ordered) {
1947 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1948 folio_unlock(folio);
1949 up_read(&inode->i_mmap_lock);
1950 btrfs_start_ordered_extent(ordered);
1951 btrfs_put_ordered_extent(ordered);
1952 goto again;
1953 }
1954
1955 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
1956 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1957 if (reserved_space < fsize) {
1958 const u64 to_free = fsize - reserved_space;
1959
1960 end = page_start + reserved_space - 1;
1961 if (only_release_metadata)
1962 btrfs_delalloc_release_metadata(inode, to_free, true);
1963 else
1964 btrfs_delalloc_release_space(inode, data_reserved,
1965 end + 1, to_free, true);
1966 }
1967 }
1968
1969 /*
1970 * page_mkwrite gets called when the page is firstly dirtied after it's
1971 * faulted in, but write(2) could also dirty a page and set delalloc
1972 * bits, thus in this case for space account reason, we still need to
1973 * clear any delalloc bits within this page range since we have to
1974 * reserve data&meta space before lock_page() (see above comments).
1975 */
1976 btrfs_clear_extent_bit(io_tree, page_start, end,
1977 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1978 EXTENT_DEFRAG, &cached_state);
1979
1980 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
1981 if (ret < 0) {
1982 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1983 goto out_unlock;
1984 }
1985
1986 /* Page is wholly or partially inside EOF. */
1987 if (page_start + folio_size(folio) > size)
1988 zero_start = offset_in_folio(folio, size);
1989 else
1990 zero_start = fsize;
1991
1992 if (zero_start != fsize)
1993 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1994
1995 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1996 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1997 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1998
1999 btrfs_set_inode_last_sub_trans(inode);
2000
2001 if (only_release_metadata)
2002 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
2003 &cached_state);
2004
2005 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
2006 up_read(&inode->i_mmap_lock);
2007
2008 btrfs_delalloc_release_extents(inode, fsize);
2009 if (only_release_metadata)
2010 btrfs_check_nocow_unlock(inode);
2011 sb_end_pagefault(inode->vfs_inode.i_sb);
2012 extent_changeset_free(data_reserved);
2013 return VM_FAULT_LOCKED;
2014
2015 out_unlock:
2016 folio_unlock(folio);
2017 up_read(&inode->i_mmap_lock);
2018 out:
2019 btrfs_delalloc_release_extents(inode, fsize);
2020 if (only_release_metadata)
2021 btrfs_delalloc_release_metadata(inode, reserved_space, true);
2022 else
2023 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2024 reserved_space, true);
2025 out_noreserve:
2026 if (only_release_metadata)
2027 btrfs_check_nocow_unlock(inode);
2028
2029 sb_end_pagefault(inode->vfs_inode.i_sb);
2030
2031 extent_changeset_free(data_reserved);
2032
2033 if (ret < 0)
2034 return vmf_error(ret);
2035
2036 /* Make the VM retry the fault. */
2037 return VM_FAULT_NOPAGE;
2038 }
2039
2040 static const struct vm_operations_struct btrfs_file_vm_ops = {
2041 .fault = filemap_fault,
2042 .map_pages = filemap_map_pages,
2043 .page_mkwrite = btrfs_page_mkwrite,
2044 };
2045
btrfs_file_mmap_prepare(struct vm_area_desc * desc)2046 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
2047 {
2048 struct file *filp = desc->file;
2049 struct address_space *mapping = filp->f_mapping;
2050
2051 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))
2052 return -EIO;
2053 if (!mapping->a_ops->read_folio)
2054 return -ENOEXEC;
2055
2056 file_accessed(filp);
2057 desc->vm_ops = &btrfs_file_vm_ops;
2058
2059 return 0;
2060 }
2061
hole_mergeable(struct btrfs_inode * inode,struct extent_buffer * leaf,int slot,u64 start,u64 end)2062 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2063 int slot, u64 start, u64 end)
2064 {
2065 struct btrfs_file_extent_item *fi;
2066 struct btrfs_key key;
2067
2068 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2069 return false;
2070
2071 btrfs_item_key_to_cpu(leaf, &key, slot);
2072 if (key.objectid != btrfs_ino(inode) ||
2073 key.type != BTRFS_EXTENT_DATA_KEY)
2074 return false;
2075
2076 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2077
2078 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2079 return false;
2080
2081 if (btrfs_file_extent_disk_bytenr(leaf, fi))
2082 return false;
2083
2084 if (key.offset == end)
2085 return true;
2086 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2087 return true;
2088 return false;
2089 }
2090
/* Extend the hole file extent item at @slot of @leaf by @extra bytes. */
static void fill_holes_grow_item(struct btrfs_trans_handle *trans,
				 struct extent_buffer *leaf, int slot,
				 u64 extra)
{
	struct btrfs_file_extent_item *item;
	u64 new_len;

	item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	new_len = btrfs_file_extent_num_bytes(leaf, item) + extra;
	btrfs_set_file_extent_num_bytes(leaf, item, new_len);
	btrfs_set_file_extent_ram_bytes(leaf, item, new_len);
	btrfs_set_file_extent_offset(leaf, item, 0);
	btrfs_set_file_extent_generation(leaf, item, trans->transid);
}

/*
 * Represent the range [@offset, @end) of @inode as a hole.
 *
 * Unless the NO_HOLES feature is enabled, the hole is recorded in the tree:
 * an adjacent hole item (previous or current slot) is extended when one is
 * mergeable, otherwise a new hole extent item is inserted. In all cases the
 * in-memory extent maps of the range are then replaced by a hole extent map;
 * if that cannot be done the inode is flagged for a full sync instead.
 *
 * Returns 0 on success, -EINVAL if an item already exists at @offset, or the
 * error returned by the tree search / hole insertion.
 */
static int fill_holes(struct btrfs_trans_handle *trans,
		      struct btrfs_inode *inode,
		      struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	const u64 hole_len = end - offset;
	struct extent_buffer *leaf;
	struct extent_map *em;
	struct btrfs_key key;
	int ret;

	/* No hole items exist with NO_HOLES, only the extent map is updated. */
	if (btrfs_fs_incompat(fs_info, NO_HOLES))
		goto update_em;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		return ret;
	/*
	 * This offset should have been dropped already, finding an item for it
	 * means something has gone horribly wrong.
	 */
	if (ret == 0)
		return -EINVAL;

	leaf = path->nodes[0];

	/* Try to extend a hole item that ends where we start. */
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
		path->slots[0]--;
		fill_holes_grow_item(trans, leaf, path->slots[0], hole_len);
		goto update_em;
	}

	/* Try to pull back a hole item that starts where we end. */
	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
		key.offset = offset;
		btrfs_set_item_key_safe(trans, path, &key);
		fill_holes_grow_item(trans, leaf, path->slots[0], hole_len);
		goto update_em;
	}
	btrfs_release_path(path);

	/* Nothing to merge with, insert a brand new hole item. */
	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
				       hole_len);
	if (ret)
		return ret;

update_em:
	btrfs_release_path(path);

	em = btrfs_alloc_extent_map();
	if (em) {
		em->start = offset;
		em->len = hole_len;
		em->ram_bytes = em->len;

		em->disk_bytenr = EXTENT_MAP_HOLE;
		em->disk_num_bytes = 0;
		em->generation = trans->transid;

		ret = btrfs_replace_extent_map_range(inode, em, true);
		btrfs_free_extent_map(em);
		if (ret)
			btrfs_set_inode_full_sync(inode);
	} else {
		/* No memory for the map: drop stale ones and force a full sync. */
		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
		btrfs_set_inode_full_sync(inode);
	}

	return 0;
}
2183
2184 /*
2185 * Find a hole extent on given inode and change start/len to the end of hole
2186 * extent.(hole/vacuum extent whose em->start <= start &&
2187 * em->start + em->len > start)
2188 * When a hole extent is found, return 1 and modify start/len.
2189 */
find_first_non_hole(struct btrfs_inode * inode,u64 * start,u64 * len)2190 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2191 {
2192 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2193 struct extent_map *em;
2194 int ret = 0;
2195
2196 em = btrfs_get_extent(inode, NULL,
2197 round_down(*start, fs_info->sectorsize),
2198 round_up(*len, fs_info->sectorsize));
2199 if (IS_ERR(em))
2200 return PTR_ERR(em);
2201
2202 /* Hole or vacuum extent(only exists in no-hole mode) */
2203 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2204 const u64 em_end = btrfs_extent_map_end(em);
2205
2206 ret = 1;
2207 *len = (em_end > *start + *len) ? 0 : (*start + *len - em_end);
2208 *start = em_end;
2209 }
2210 btrfs_free_extent_map(em);
2211 return ret;
2212 }
2213
2214 /*
2215 * Check if there is no folio in the range.
2216 *
2217 * We cannot utilize filemap_range_has_page() in a filemap with large folios
2218 * as we can hit the following false positive:
2219 *
2220 * start end
2221 * | |
2222 * |//|//|//|//| | | | | | | | |//|//|
2223 * \ / \ /
2224 * Folio A Folio B
2225 *
2226 * That large folio A and B cover the start and end indexes.
2227 * In that case filemap_range_has_page() will always return true, but the above
2228 * case is fine for btrfs_punch_hole_lock_range() usage.
2229 *
2230 * So here we only ensure that no other folios is in the range, excluding the
2231 * head/tail large folio.
2232 */
check_range_has_page(struct inode * inode,u64 start,u64 end)2233 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2234 {
2235 struct folio_batch fbatch;
2236 bool ret = false;
2237 /*
2238 * For subpage case, if the range is not at page boundary, we could
2239 * have pages at the leading/tailing part of the range.
2240 * This could lead to dead loop since filemap_range_has_page()
2241 * will always return true.
2242 * So here we need to do extra page alignment for
2243 * filemap_range_has_page().
2244 *
2245 * And do not decrease page_lockend right now, as it can be 0.
2246 */
2247 const u64 page_lockstart = round_up(start, PAGE_SIZE);
2248 const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2249 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
2250 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2251 pgoff_t tmp = start_index;
2252 int found_folios;
2253
2254 /* The same page or adjacent pages. */
2255 if (page_lockend <= page_lockstart)
2256 return false;
2257
2258 folio_batch_init(&fbatch);
2259 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2260 for (int i = 0; i < found_folios; i++) {
2261 struct folio *folio = fbatch.folios[i];
2262
2263 /* A large folio begins before the start. Not a target. */
2264 if (folio->index < start_index)
2265 continue;
2266 /* A large folio extends beyond the end. Not a target. */
2267 if (folio_next_index(folio) > end_index)
2268 continue;
2269 /* A folio doesn't cover the head/tail index. Found a target. */
2270 ret = true;
2271 break;
2272 }
2273 folio_batch_release(&fbatch);
2274 return ret;
2275 }
2276
btrfs_punch_hole_lock_range(struct inode * inode,const u64 lockstart,const u64 lockend,struct extent_state ** cached_state)2277 static void btrfs_punch_hole_lock_range(struct inode *inode,
2278 const u64 lockstart, const u64 lockend,
2279 struct extent_state **cached_state)
2280 {
2281 while (1) {
2282 truncate_pagecache_range(inode, lockstart, lockend);
2283
2284 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2285 cached_state);
2286 /*
2287 * We can't have ordered extents in the range, nor dirty/writeback
2288 * pages, because we have locked the inode's VFS lock in exclusive
2289 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2290 * we have flushed all delalloc in the range and we have waited
2291 * for any ordered extents in the range to complete.
2292 * We can race with anyone reading pages from this range, so after
2293 * locking the range check if we have pages in the range, and if
2294 * we do, unlock the range and retry.
2295 */
2296 if (!check_range_has_page(inode, lockstart, lockend))
2297 break;
2298
2299 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2300 cached_state);
2301 }
2302
2303 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2304 }
2305
btrfs_insert_replace_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_replace_extent_info * extent_info,const u64 replace_len,const u64 bytes_to_drop)2306 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2307 struct btrfs_inode *inode,
2308 struct btrfs_path *path,
2309 struct btrfs_replace_extent_info *extent_info,
2310 const u64 replace_len,
2311 const u64 bytes_to_drop)
2312 {
2313 struct btrfs_fs_info *fs_info = trans->fs_info;
2314 struct btrfs_root *root = inode->root;
2315 struct btrfs_file_extent_item *extent;
2316 struct extent_buffer *leaf;
2317 struct btrfs_key key;
2318 int slot;
2319 int ret;
2320
2321 if (replace_len == 0)
2322 return 0;
2323
2324 if (extent_info->disk_offset == 0 &&
2325 btrfs_fs_incompat(fs_info, NO_HOLES)) {
2326 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2327 return 0;
2328 }
2329
2330 key.objectid = btrfs_ino(inode);
2331 key.type = BTRFS_EXTENT_DATA_KEY;
2332 key.offset = extent_info->file_offset;
2333 ret = btrfs_insert_empty_item(trans, root, path, &key,
2334 sizeof(struct btrfs_file_extent_item));
2335 if (ret)
2336 return ret;
2337 leaf = path->nodes[0];
2338 slot = path->slots[0];
2339 write_extent_buffer(leaf, extent_info->extent_buf,
2340 btrfs_item_ptr_offset(leaf, slot),
2341 sizeof(struct btrfs_file_extent_item));
2342 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2343 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2344 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2345 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2346 if (extent_info->is_new_extent)
2347 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2348 btrfs_release_path(path);
2349
2350 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2351 replace_len);
2352 if (ret)
2353 return ret;
2354
2355 /* If it's a hole, nothing more needs to be done. */
2356 if (extent_info->disk_offset == 0) {
2357 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2358 return 0;
2359 }
2360
2361 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2362
2363 if (extent_info->is_new_extent && extent_info->insertions == 0) {
2364 key.objectid = extent_info->disk_offset;
2365 key.type = BTRFS_EXTENT_ITEM_KEY;
2366 key.offset = extent_info->disk_len;
2367 ret = btrfs_alloc_reserved_file_extent(trans, root,
2368 btrfs_ino(inode),
2369 extent_info->file_offset,
2370 extent_info->qgroup_reserved,
2371 &key);
2372 } else {
2373 struct btrfs_ref ref = {
2374 .action = BTRFS_ADD_DELAYED_REF,
2375 .bytenr = extent_info->disk_offset,
2376 .num_bytes = extent_info->disk_len,
2377 .owning_root = btrfs_root_id(root),
2378 .ref_root = btrfs_root_id(root),
2379 };
2380 u64 ref_offset;
2381
2382 ref_offset = extent_info->file_offset - extent_info->data_offset;
2383 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2384 ret = btrfs_inc_extent_ref(trans, &ref);
2385 }
2386
2387 extent_info->insertions++;
2388
2389 return ret;
2390 }
2391
2392 /*
2393 * The respective range must have been previously locked, as well as the inode.
2394 * The end offset is inclusive (last byte of the range).
2395 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2396 * the file range with an extent.
2397 * When not punching a hole, we don't want to end up in a state where we dropped
2398 * extents without inserting a new one, so we must abort the transaction to avoid
2399 * a corruption.
2400 */
btrfs_replace_file_extents(struct btrfs_inode * inode,struct btrfs_path * path,const u64 start,const u64 end,struct btrfs_replace_extent_info * extent_info,struct btrfs_trans_handle ** trans_out)2401 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2402 struct btrfs_path *path, const u64 start,
2403 const u64 end,
2404 struct btrfs_replace_extent_info *extent_info,
2405 struct btrfs_trans_handle **trans_out)
2406 {
2407 struct btrfs_drop_extents_args drop_args = { 0 };
2408 struct btrfs_root *root = inode->root;
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2411 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2412 struct btrfs_trans_handle *trans = NULL;
2413 struct btrfs_block_rsv rsv;
2414 unsigned int rsv_count;
2415 u64 cur_offset;
2416 u64 len = end - start;
2417 int ret = 0;
2418
2419 if (end <= start)
2420 return -EINVAL;
2421
2422 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
2423 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
2424 rsv.failfast = true;
2425
2426 /*
2427 * 1 - update the inode
2428 * 1 - removing the extents in the range
2429 * 1 - adding the hole extent if no_holes isn't set or if we are
2430 * replacing the range with a new extent
2431 */
2432 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2433 rsv_count = 3;
2434 else
2435 rsv_count = 2;
2436
2437 trans = btrfs_start_transaction(root, rsv_count);
2438 if (IS_ERR(trans)) {
2439 ret = PTR_ERR(trans);
2440 trans = NULL;
2441 goto out_release;
2442 }
2443
2444 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
2445 min_size, false);
2446 if (WARN_ON(ret))
2447 goto out_trans;
2448 trans->block_rsv = &rsv;
2449
2450 cur_offset = start;
2451 drop_args.path = path;
2452 drop_args.end = end + 1;
2453 drop_args.drop_cache = true;
2454 while (cur_offset < end) {
2455 drop_args.start = cur_offset;
2456 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2457 /* If we are punching a hole decrement the inode's byte count */
2458 if (!extent_info)
2459 btrfs_update_inode_bytes(inode, 0,
2460 drop_args.bytes_found);
2461 if (ret != -ENOSPC) {
2462 /*
2463 * The only time we don't want to abort is if we are
2464 * attempting to clone a partial inline extent, in which
2465 * case we'll get EOPNOTSUPP. However if we aren't
2466 * clone we need to abort no matter what, because if we
2467 * got EOPNOTSUPP via prealloc then we messed up and
2468 * need to abort.
2469 */
2470 if (unlikely(ret &&
2471 (ret != -EOPNOTSUPP ||
2472 (extent_info && extent_info->is_new_extent))))
2473 btrfs_abort_transaction(trans, ret);
2474 break;
2475 }
2476
2477 trans->block_rsv = &fs_info->trans_block_rsv;
2478
2479 if (!extent_info && cur_offset < drop_args.drop_end &&
2480 cur_offset < ino_size) {
2481 ret = fill_holes(trans, inode, path, cur_offset,
2482 drop_args.drop_end);
2483 if (unlikely(ret)) {
2484 /*
2485 * If we failed then we didn't insert our hole
2486 * entries for the area we dropped, so now the
2487 * fs is corrupted, so we must abort the
2488 * transaction.
2489 */
2490 btrfs_abort_transaction(trans, ret);
2491 break;
2492 }
2493 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2494 /*
2495 * We are past the i_size here, but since we didn't
2496 * insert holes we need to clear the mapped area so we
2497 * know to not set disk_i_size in this area until a new
2498 * file extent is inserted here.
2499 */
2500 ret = btrfs_inode_clear_file_extent_range(inode,
2501 cur_offset,
2502 drop_args.drop_end - cur_offset);
2503 if (unlikely(ret)) {
2504 /*
2505 * We couldn't clear our area, so we could
2506 * presumably adjust up and corrupt the fs, so
2507 * we need to abort.
2508 */
2509 btrfs_abort_transaction(trans, ret);
2510 break;
2511 }
2512 }
2513
2514 if (extent_info &&
2515 drop_args.drop_end > extent_info->file_offset) {
2516 u64 replace_len = drop_args.drop_end -
2517 extent_info->file_offset;
2518
2519 ret = btrfs_insert_replace_extent(trans, inode, path,
2520 extent_info, replace_len,
2521 drop_args.bytes_found);
2522 if (unlikely(ret)) {
2523 btrfs_abort_transaction(trans, ret);
2524 break;
2525 }
2526 extent_info->data_len -= replace_len;
2527 extent_info->data_offset += replace_len;
2528 extent_info->file_offset += replace_len;
2529 }
2530
2531 /*
2532 * We are releasing our handle on the transaction, balance the
2533 * dirty pages of the btree inode and flush delayed items, and
2534 * then get a new transaction handle, which may now point to a
2535 * new transaction in case someone else may have committed the
2536 * transaction we used to replace/drop file extent items. So
2537 * bump the inode's iversion and update mtime and ctime except
2538 * if we are called from a dedupe context. This is because a
2539 * power failure/crash may happen after the transaction is
2540 * committed and before we finish replacing/dropping all the
2541 * file extent items we need.
2542 */
2543 inode_inc_iversion(&inode->vfs_inode);
2544
2545 if (!extent_info || extent_info->update_times)
2546 inode_set_mtime_to_ts(&inode->vfs_inode,
2547 inode_set_ctime_current(&inode->vfs_inode));
2548
2549 ret = btrfs_update_inode(trans, inode);
2550 if (ret)
2551 break;
2552
2553 btrfs_end_transaction(trans);
2554 btrfs_btree_balance_dirty(fs_info);
2555
2556 trans = btrfs_start_transaction(root, rsv_count);
2557 if (IS_ERR(trans)) {
2558 ret = PTR_ERR(trans);
2559 trans = NULL;
2560 break;
2561 }
2562
2563 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2564 &rsv, min_size, false);
2565 if (WARN_ON(ret))
2566 break;
2567 trans->block_rsv = &rsv;
2568
2569 cur_offset = drop_args.drop_end;
2570 len = end - cur_offset;
2571 if (!extent_info && len) {
2572 ret = find_first_non_hole(inode, &cur_offset, &len);
2573 if (unlikely(ret < 0))
2574 break;
2575 if (ret && !len) {
2576 ret = 0;
2577 break;
2578 }
2579 }
2580 }
2581
2582 /*
2583 * If we were cloning, force the next fsync to be a full one since we
2584 * we replaced (or just dropped in the case of cloning holes when
2585 * NO_HOLES is enabled) file extent items and did not setup new extent
2586 * maps for the replacement extents (or holes).
2587 */
2588 if (extent_info && !extent_info->is_new_extent)
2589 btrfs_set_inode_full_sync(inode);
2590
2591 if (ret)
2592 goto out_trans;
2593
2594 trans->block_rsv = &fs_info->trans_block_rsv;
2595 /*
2596 * If we are using the NO_HOLES feature we might have had already an
2597 * hole that overlaps a part of the region [lockstart, lockend] and
2598 * ends at (or beyond) lockend. Since we have no file extent items to
2599 * represent holes, drop_end can be less than lockend and so we must
2600 * make sure we have an extent map representing the existing hole (the
2601 * call to __btrfs_drop_extents() might have dropped the existing extent
2602 * map representing the existing hole), otherwise the fast fsync path
2603 * will not record the existence of the hole region
2604 * [existing_hole_start, lockend].
2605 */
2606 if (drop_args.drop_end <= end)
2607 drop_args.drop_end = end + 1;
2608 /*
2609 * Don't insert file hole extent item if it's for a range beyond eof
2610 * (because it's useless) or if it represents a 0 bytes range (when
2611 * cur_offset == drop_end).
2612 */
2613 if (!extent_info && cur_offset < ino_size &&
2614 cur_offset < drop_args.drop_end) {
2615 ret = fill_holes(trans, inode, path, cur_offset,
2616 drop_args.drop_end);
2617 if (unlikely(ret)) {
2618 /* Same comment as above. */
2619 btrfs_abort_transaction(trans, ret);
2620 goto out_trans;
2621 }
2622 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2623 /* See the comment in the loop above for the reasoning here. */
2624 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2625 drop_args.drop_end - cur_offset);
2626 if (unlikely(ret)) {
2627 btrfs_abort_transaction(trans, ret);
2628 goto out_trans;
2629 }
2630
2631 }
2632 if (extent_info) {
2633 ret = btrfs_insert_replace_extent(trans, inode, path,
2634 extent_info, extent_info->data_len,
2635 drop_args.bytes_found);
2636 if (unlikely(ret)) {
2637 btrfs_abort_transaction(trans, ret);
2638 goto out_trans;
2639 }
2640 }
2641
2642 out_trans:
2643 if (!trans)
2644 goto out_release;
2645
2646 trans->block_rsv = &fs_info->trans_block_rsv;
2647 if (ret)
2648 btrfs_end_transaction(trans);
2649 else
2650 *trans_out = trans;
2651 out_release:
2652 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
2653 return ret;
2654 }
2655
/*
 * Punch a hole of @len bytes starting at @offset in @file.
 *
 * The inode is locked with BTRFS_ILOCK_MMAP for the whole operation. Leading
 * parts of the range that already are holes are skipped, the partial blocks
 * at both ends of the range are zeroed with btrfs_truncate_block() and the
 * file extents of the sector aligned part in between are dropped with
 * btrfs_replace_file_extents().
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_root *root = binode->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	const u64 orig_start = offset;
	const u64 orig_end = offset + len - 1;
	u64 lockstart;
	u64 lockend;
	u64 tail_start;
	u64 tail_len;
	u64 ino_size;
	bool same_block;
	bool truncated_block = false;
	bool updated_inode = false;
	int ret;

	btrfs_inode_lock(binode, BTRFS_ILOCK_MMAP);

	ret = btrfs_wait_ordered_range(binode, offset, len);
	if (ret)
		goto out_only_mutex;

	ino_size = round_up(inode->i_size, fs_info->sectorsize);
	ret = find_first_non_hole(binode, &offset, &len);
	if (ret < 0)
		goto out_only_mutex;
	if (ret && !len) {
		/* The whole range already sits in a large hole. */
		ret = 0;
		goto out_only_mutex;
	}

	ret = file_modified(file);
	if (ret)
		goto out_only_mutex;

	lockstart = round_up(offset, fs_info->sectorsize);
	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));

	/*
	 * The range lies within a single block without covering all of it:
	 * zeroing that part of the block is all there is to do.
	 */
	if (same_block && len < fs_info->sectorsize) {
		if (offset < ino_size) {
			truncated_block = true;
			ret = btrfs_truncate_block(binode, offset + len - 1,
						   orig_start, orig_end);
		} else {
			ret = 0;
		}
		goto out_only_mutex;
	}

	/* Zero the back part of the first block. */
	if (offset < ino_size) {
		truncated_block = true;
		ret = btrfs_truncate_block(binode, offset, orig_start, orig_end);
		if (ret)
			goto out_only_mutex;
	}

	/*
	 * Check the aligned pages after the first unaligned page. If
	 * offset != orig_start, the first unaligned page and several following
	 * pages are already in holes, so the extra check can be skipped.
	 */
	if (offset == orig_start) {
		/* After truncating the page, look for a hole again. */
		len = offset + len - lockstart;
		offset = lockstart;
		ret = find_first_non_hole(binode, &offset, &len);
		if (ret < 0)
			goto out_only_mutex;
		if (ret && !len) {
			ret = 0;
			goto out_only_mutex;
		}
		lockstart = offset;
	}

	/* Check whether the unaligned tail part is in a hole. */
	tail_start = lockend + 1;
	tail_len = offset + len - tail_start;
	if (tail_len) {
		ret = find_first_non_hole(binode, &tail_start, &tail_len);
		if (unlikely(ret < 0))
			goto out_only_mutex;
		/* Not a hole: zero the front end of the last page. */
		if (!ret && tail_start + tail_len < ino_size) {
			truncated_block = true;
			ret = btrfs_truncate_block(binode,
						   tail_start + tail_len - 1,
						   orig_start, orig_end);
			if (ret)
				goto out_only_mutex;
		}
	}

	/* No full block left in the range, nothing to drop. */
	if (lockend < lockstart) {
		ret = 0;
		goto out_only_mutex;
	}

	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_replace_file_extents(binode, path, lockstart, lockend,
					 NULL, &trans);
	btrfs_free_path(path);
	if (ret)
		goto out;

	ASSERT(trans != NULL);
	inode_inc_iversion(inode);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	ret = btrfs_update_inode(trans, binode);
	updated_inode = true;
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	btrfs_unlock_extent(&binode->io_tree, lockstart, lockend,
			    &cached_state);
out_only_mutex:
	if (!updated_inode && truncated_block && !ret) {
		/*
		 * Even if only part of a page ended up being zeroed, the inode
		 * item still has to be updated so that all the time fields get
		 * updated, as well as the in-memory btrfs inode fields used at
		 * fsync time to detect that the inode is not yet in the log
		 * tree, or is there but not up to date.
		 */
		struct timespec64 now = inode_set_ctime_current(inode);

		inode_inc_iversion(inode);
		inode_set_mtime_to_ts(inode, now);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			int ret2;

			ret = btrfs_update_inode(trans, binode);
			ret2 = btrfs_end_transaction(trans);
			if (!ret)
				ret = ret2;
		}
	}
	btrfs_inode_unlock(binode, BTRFS_ILOCK_MMAP);
	return ret;
}
2820
2821 /* Helper structure to record which range is already reserved */
2822 struct falloc_range {
2823 struct list_head list;
2824 u64 start;
2825 u64 len;
2826 };
2827
2828 /*
2829 * Helper function to add falloc range
2830 *
2831 * Caller should have locked the larger range of extent containing
2832 * [start, len)
2833 */
add_falloc_range(struct list_head * head,u64 start,u64 len)2834 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2835 {
2836 struct falloc_range *range = NULL;
2837
2838 if (!list_empty(head)) {
2839 /*
2840 * As fallocate iterates by bytenr order, we only need to check
2841 * the last range.
2842 */
2843 range = list_last_entry(head, struct falloc_range, list);
2844 if (range->start + range->len == start) {
2845 range->len += len;
2846 return 0;
2847 }
2848 }
2849
2850 range = kmalloc_obj(*range);
2851 if (!range)
2852 return -ENOMEM;
2853 range->start = start;
2854 range->len = len;
2855 list_add_tail(&range->list, head);
2856 return 0;
2857 }
2858
btrfs_fallocate_update_isize(struct inode * inode,const u64 end,const int mode)2859 static int btrfs_fallocate_update_isize(struct inode *inode,
2860 const u64 end,
2861 const int mode)
2862 {
2863 struct btrfs_trans_handle *trans;
2864 struct btrfs_root *root = BTRFS_I(inode)->root;
2865 u64 range_start;
2866 u64 range_end;
2867 int ret;
2868 int ret2;
2869
2870 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2871 return 0;
2872
2873 range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
2874 range_end = round_up(end, root->fs_info->sectorsize);
2875
2876 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
2877 range_end - range_start);
2878 if (ret)
2879 return ret;
2880
2881 trans = btrfs_start_transaction(root, 1);
2882 if (IS_ERR(trans))
2883 return PTR_ERR(trans);
2884
2885 inode_set_ctime_current(inode);
2886 i_size_write(inode, end);
2887 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2888 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2889 ret2 = btrfs_end_transaction(trans);
2890
2891 return ret ? ret : ret2;
2892 }
2893
/*
 * Classification of the block found at a boundary of a zero range operation,
 * as returned by btrfs_zero_range_check_range_boundary().
 */
enum {
	/* The block maps to an extent that is neither a hole nor prealloc. */
	RANGE_BOUNDARY_WRITTEN_EXTENT,
	/* The block maps to an extent flagged with EXTENT_FLAG_PREALLOC. */
	RANGE_BOUNDARY_PREALLOC_EXTENT,
	/* The block maps to a hole (disk_bytenr is EXTENT_MAP_HOLE). */
	RANGE_BOUNDARY_HOLE,
};
2899
/*
 * Classify the block that contains @offset.
 *
 * Looks up the extent map covering the sector size aligned block containing
 * @offset and reports what kind of extent backs it.
 *
 * Returns one of the RANGE_BOUNDARY_* values, or a negative errno if the
 * extent map lookup failed.
 */
static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
						 u64 offset)
{
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	struct extent_map *em;
	int boundary;

	offset = round_down(offset, sectorsize);

	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
	if (IS_ERR(em))
		return PTR_ERR(em);

	/* A hole takes precedence over the prealloc flag. */
	boundary = RANGE_BOUNDARY_WRITTEN_EXTENT;
	if (em->disk_bytenr == EXTENT_MAP_HOLE)
		boundary = RANGE_BOUNDARY_HOLE;
	else if (em->flags & EXTENT_FLAG_PREALLOC)
		boundary = RANGE_BOUNDARY_PREALLOC_EXTENT;

	btrfs_free_extent_map(em);

	return boundary;
}
2922
/*
 * Implement FALLOC_FL_ZERO_RANGE for the byte range [offset, offset + len).
 *
 * Parts of the range already covered by a prealloc extent are left alone.
 * Unaligned boundary blocks that are backed by a written extent are zeroed
 * with btrfs_truncate_block(), and the remaining block aligned range is
 * preallocated with btrfs_prealloc_file_range(). On success the inode's
 * i_size is updated through btrfs_fallocate_update_isize(), which is a no-op
 * when FALLOC_FL_KEEP_SIZE is set in @mode.
 *
 * Returns 0 on success, otherwise the error returned by the step that failed.
 */
static int btrfs_zero_range(struct inode *inode,
			    loff_t offset,
			    loff_t len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = fs_info->sectorsize;
	/* The range as requested by the caller; @offset and @len may be trimmed below. */
	const u64 orig_start = offset;
	const u64 orig_end = offset + len - 1;
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
			      alloc_end - alloc_start);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
		const u64 em_end = btrfs_extent_map_end(em);

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			btrfs_free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = btrfs_extent_map_block_start(em) + em->len;
	}
	btrfs_free_extent_map(em);

	/* The (remaining) range starts and ends within the same block. */
	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		/* Block is already prealloc, only i_size may need an update. */
		if (em->flags & EXTENT_FLAG_PREALLOC) {
			btrfs_free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/* Partial block that is not a hole: zero it in place. */
		if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
			btrfs_free_extent_map(em);
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
						   orig_start, orig_end);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			/*
			 * Nothing was reserved in this function so far, so the
			 * cleanup under the 'out' label is not needed.
			 */
			return ret;
		}
		btrfs_free_extent_map(em);
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	/*
	 * Start from the block aligned interior of the range, the boundary
	 * checks below widen it when a boundary block is a hole.
	 */
	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset,
						   orig_start, orig_end);
			if (ret)
				goto out;
		} else {
			/* Prealloc extent at the boundary, nothing to do. */
			ret = 0;
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset + len);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
						   orig_start, orig_end);
			if (ret)
				goto out;
		} else {
			/* Prealloc extent at the boundary, nothing to do. */
			ret = 0;
		}
	}

reserve_space:
	/* Only allocate if at least one full block is left to cover. */
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					    &cached_state);
		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret) {
			btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
					    lockend, &cached_state);
			goto out;
		}
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						fs_info->sectorsize,
						offset + len, &alloc_hint);
		btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				    &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
		if (ret) {
			space_reserved = false;
			goto out;
		}
	}
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
	/* On failure release the data space that is still reserved here. */
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}
3099
/*
 * The ->fallocate handler, operating on the byte range [offset, offset + len).
 *
 * Accepted mode bits are FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE and
 * FALLOC_FL_ZERO_RANGE. Hole punching is delegated to btrfs_punch_hole() and
 * zero range to btrfs_zero_range(). For plain preallocation, every hole in
 * the block aligned range (and every non-prealloc extent starting at or
 * beyond i_size) is recorded and reserved first, then preallocated with
 * btrfs_prealloc_file_range().
 *
 * Returns 0 on success, -EIO if the filesystem is shut down, -EOPNOTSUPP for
 * zoned filesystems or unsupported mode bits, or the error of the step that
 * failed.
 */
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	LIST_HEAD(reserve_list);
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	u64 data_space_needed = 0;
	u64 data_space_reserved = 0;
	u64 qgroup_reserved = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
	int ret;

	if (btrfs_is_shutdown(inode_to_fs_info(inode)))
		return -EIO;

	/* Do not allow fallocate in ZONED mode */
	if (btrfs_is_zoned(inode_to_fs_info(inode)))
		return -EOPNOTSUPP;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
	cur_offset = alloc_start;

	/* Make sure we aren't being given some crap mode */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(file, offset, len);

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	/* When the file is going to grow, validate the new size first. */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}

	ret = file_modified(file);
	if (ret)
		goto out;

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else if (offset + len > inode->i_size) {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
		 */
		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
					   inode->i_size, (u64)-1);
		if (ret)
			goto out;
	}

	/*
	 * We have locked the inode at the VFS level (in exclusive mode) and we
	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
	 * locking the file range, flush all delalloc in the range and wait for
	 * all ordered extents in the range to complete. After this we can lock
	 * the file range and, due to the previous locking we did, we know there
	 * can't be more delalloc or ordered extents in the range.
	 */
	ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	/* Zero range has its own implementation and does its own range locking. */
	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
		return ret;
	}

	locked_end = alloc_end - 1;
	btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			  &cached_state);

	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);

	/* First, check if we exceed the qgroup limit */
	while (cur_offset < alloc_end) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
				      alloc_end - cur_offset);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}
		last_byte = min(btrfs_extent_map_end(em), alloc_end);
		actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		/*
		 * Record and reserve the range if it is a hole, or if it lies
		 * at or beyond i_size and is not already a prealloc extent.
		 */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
			const u64 range_len = last_byte - cur_offset;

			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
			if (ret < 0) {
				btrfs_free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
					&data_reserved, cur_offset, range_len);
			if (ret < 0) {
				btrfs_free_extent_map(em);
				break;
			}
			qgroup_reserved += range_len;
			data_space_needed += range_len;
		}
		btrfs_free_extent_map(em);
		cur_offset = last_byte;
	}

	if (!ret && data_space_needed > 0) {
		/*
		 * We are safe to reserve space here as we can't have delalloc
		 * in the range, see above.
		 */
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      data_space_needed);
		if (!ret)
			data_space_reserved = data_space_needed;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret) {
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, blocksize,
					offset + len, &alloc_hint);
			/*
			 * btrfs_prealloc_file_range() releases space even
			 * if it returns an error.
			 */
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (data_space_reserved > 0) {
			/* Error path: give back the data space reservation. */
			btrfs_free_reserved_data_space(BTRFS_I(inode),
					       data_reserved, range->start,
					       range->len);
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (qgroup_reserved > 0) {
			/* Error path: only the qgroup reservation is left. */
			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
					       range->start, range->len, NULL);
			qgroup_reserved -= range->len;
		}
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			    &cached_state);
out:
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	extent_changeset_free(data_reserved);
	return ret;
}
3294
/*
 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
 * that has unflushed and/or flushing delalloc. There might be other adjacent
 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
 * looping while it gets adjacent subranges, and merging them together.
 *
 * @inode:              The inode.
 * @start:              Start offset of the search range.
 * @end:                End offset (inclusive value) of the search range.
 * @cached_state:       Passed to btrfs_count_range_bits() when searching the
 *                      inode's io_tree.
 * @search_io_tree:     In/out flag. The io_tree is searched only if it is
 *                      true, and it is set to false when no EXTENT_DELALLOC
 *                      range was found, so later calls skip that search.
 * @delalloc_start_ret: Output argument, start offset of the subrange found.
 * @delalloc_end_ret:   Output argument, end offset (inclusive value) of the
 *                      subrange found.
 *
 * Returns true if a subrange with delalloc or an ordered extent was found, in
 * which case both output arguments are set.
 */
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
				   struct extent_state **cached_state,
				   bool *search_io_tree,
				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 len = end + 1 - start;
	u64 delalloc_len = 0;
	struct btrfs_ordered_extent *oe;
	u64 oe_start;
	u64 oe_end;

	/*
	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
	 * means we have delalloc (dirty pages) for which writeback has not
	 * started yet.
	 */
	if (*search_io_tree) {
		spin_lock(&inode->lock);
		if (inode->delalloc_bytes > 0) {
			spin_unlock(&inode->lock);
			*delalloc_start_ret = start;
			delalloc_len = btrfs_count_range_bits(&inode->io_tree,
							      delalloc_start_ret, end,
							      len, EXTENT_DELALLOC,
							      true, cached_state);
		} else {
			spin_unlock(&inode->lock);
		}
	}

	if (delalloc_len > 0) {
		/*
		 * If delalloc was found then *delalloc_start_ret has a sector size
		 * aligned value (rounded down).
		 */
		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;

		if (*delalloc_start_ret == start) {
			/* Delalloc for the whole range, nothing more to do. */
			if (*delalloc_end_ret == end)
				return true;
			/* Else trim our search range for ordered extents. */
			start = *delalloc_end_ret + 1;
			len = end + 1 - start;
		}
	} else {
		/* No delalloc, future calls don't need to search again. */
		*search_io_tree = false;
	}

	/*
	 * Now also check if there's any ordered extent in the range.
	 * We do this because:
	 *
	 * 1) When delalloc is flushed, the file range is locked, we clear the
	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
	 *    an ordered extent for the write. So we might just have been called
	 *    after delalloc is flushed and before the ordered extent completes
	 *    and inserts the new file extent item in the subvolume's btree;
	 *
	 * 2) We may have an ordered extent created by flushing delalloc for a
	 *    subrange that starts before the subrange we found marked with
	 *    EXTENT_DELALLOC in the io tree.
	 *
	 * We could also use the extent map tree to find such delalloc that is
	 * being flushed, but using the ordered extents tree is more efficient
	 * because it's usually much smaller as ordered extents are removed from
	 * the tree once they complete. With the extent maps, we may have them
	 * in the extent map tree for a very long time, and they were either
	 * created by previous writes or loaded by read operations.
	 */
	oe = btrfs_lookup_first_ordered_range(inode, start, len);
	if (!oe)
		return (delalloc_len > 0);

	/* The ordered extent may span beyond our search range. */
	oe_start = max(oe->file_offset, start);
	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);

	btrfs_put_ordered_extent(oe);

	/* Don't have unflushed delalloc, return the ordered extent range. */
	if (delalloc_len == 0) {
		*delalloc_start_ret = oe_start;
		*delalloc_end_ret = oe_end;
		return true;
	}

	/*
	 * We have both unflushed delalloc (io_tree) and an ordered extent.
	 * If the ranges are adjacent return a combined range, otherwise
	 * return the leftmost range.
	 */
	if (oe_start < *delalloc_start_ret) {
		/* Not adjacent: the result ends where the ordered extent ends. */
		if (oe_end < *delalloc_start_ret)
			*delalloc_end_ret = oe_end;
		*delalloc_start_ret = oe_start;
	} else if (*delalloc_end_ret + 1 == oe_start) {
		*delalloc_end_ret = oe_end;
	}

	return true;
}
3404
3405 /*
3406 * Check if there's delalloc in a given range.
3407 *
3408 * @inode: The inode.
3409 * @start: The start offset of the range. It does not need to be
3410 * sector size aligned.
3411 * @end: The end offset (inclusive value) of the search range.
3412 * It does not need to be sector size aligned.
3413 * @cached_state: Extent state record used for speeding up delalloc
3414 * searches in the inode's io_tree. Can be NULL.
3415 * @delalloc_start_ret: Output argument, set to the start offset of the
3416 * subrange found with delalloc (may not be sector size
3417 * aligned).
3418 * @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
3419 * of the subrange found with delalloc.
3420 *
3421 * Returns true if a subrange with delalloc is found within the given range, and
3422 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3423 * end offsets of the subrange.
3424 */
btrfs_find_delalloc_in_range(struct btrfs_inode * inode,u64 start,u64 end,struct extent_state ** cached_state,u64 * delalloc_start_ret,u64 * delalloc_end_ret)3425 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3426 struct extent_state **cached_state,
3427 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3428 {
3429 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3430 u64 prev_delalloc_end = 0;
3431 bool search_io_tree = true;
3432 bool ret = false;
3433
3434 while (cur_offset <= end) {
3435 u64 delalloc_start;
3436 u64 delalloc_end;
3437 bool delalloc;
3438
3439 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3440 cached_state, &search_io_tree,
3441 &delalloc_start,
3442 &delalloc_end);
3443 if (!delalloc)
3444 break;
3445
3446 if (prev_delalloc_end == 0) {
3447 /* First subrange found. */
3448 *delalloc_start_ret = max(delalloc_start, start);
3449 *delalloc_end_ret = delalloc_end;
3450 ret = true;
3451 } else if (delalloc_start == prev_delalloc_end + 1) {
3452 /* Subrange adjacent to the previous one, merge them. */
3453 *delalloc_end_ret = delalloc_end;
3454 } else {
3455 /* Subrange not adjacent to the previous one, exit. */
3456 break;
3457 }
3458
3459 prev_delalloc_end = delalloc_end;
3460 cur_offset = delalloc_end + 1;
3461 cond_resched();
3462 }
3463
3464 return ret;
3465 }
3466
3467 /*
3468 * Check if there's a hole or delalloc range in a range representing a hole (or
3469 * prealloc extent) found in the inode's subvolume btree.
3470 *
3471 * @inode: The inode.
3472 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3473 * @start: Start offset of the hole region. It does not need to be sector
3474 * size aligned.
3475 * @end: End offset (inclusive value) of the hole region. It does not
3476 * need to be sector size aligned.
3477 * @start_ret: Return parameter, used to set the start of the subrange in the
3478 * hole that matches the search criteria (seek mode), if such
3479 * subrange is found (return value of the function is true).
3480 * The value returned here may not be sector size aligned.
3481 *
3482 * Returns true if a subrange matching the given seek mode is found, and if one
3483 * is found, it updates @start_ret with the start of the subrange.
3484 */
find_desired_extent_in_hole(struct btrfs_inode * inode,int whence,struct extent_state ** cached_state,u64 start,u64 end,u64 * start_ret)3485 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3486 struct extent_state **cached_state,
3487 u64 start, u64 end, u64 *start_ret)
3488 {
3489 u64 delalloc_start;
3490 u64 delalloc_end;
3491 bool delalloc;
3492
3493 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3494 &delalloc_start, &delalloc_end);
3495 if (delalloc && whence == SEEK_DATA) {
3496 *start_ret = delalloc_start;
3497 return true;
3498 }
3499
3500 if (delalloc && whence == SEEK_HOLE) {
3501 /*
3502 * We found delalloc but it starts after out start offset. So we
3503 * have a hole between our start offset and the delalloc start.
3504 */
3505 if (start < delalloc_start) {
3506 *start_ret = start;
3507 return true;
3508 }
3509 /*
3510 * Delalloc range starts at our start offset.
3511 * If the delalloc range's length is smaller than our range,
3512 * then it means we have a hole that starts where the delalloc
3513 * subrange ends.
3514 */
3515 if (delalloc_end < end) {
3516 *start_ret = delalloc_end + 1;
3517 return true;
3518 }
3519
3520 /* There's delalloc for the whole range. */
3521 return false;
3522 }
3523
3524 if (!delalloc && whence == SEEK_HOLE) {
3525 *start_ret = start;
3526 return true;
3527 }
3528
3529 /*
3530 * No delalloc in the range and we are seeking for data. The caller has
3531 * to iterate to the next extent item in the subvolume btree.
3532 */
3533 return false;
3534 }
3535
/*
 * Find the offset of the next data or hole at or after @offset, for lseek()
 * with SEEK_DATA or SEEK_HOLE (passed in @whence).
 *
 * The file extent items of the inode are walked with the file range locked in
 * the inode's io_tree. Holes (implicit or explicit) and prealloc extents are
 * additionally checked for delalloc with find_desired_extent_in_hole().
 *
 * Returns the offset found (never beyond i_size), -ENXIO if the file is empty,
 * @offset is at or beyond i_size, or no data was found for SEEK_DATA, -ENOMEM
 * if a path could not be allocated, -EINTR if a fatal signal is pending, or
 * another negative error from the btree search.
 */
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	int ret;
	bool found = false;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it can not have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, don't use it as its cached state is used
		 * by the task that allocated it and we want neither to mess
		 * with it nor to get incorrect results because it reflects an
		 * invalid state for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc_obj(*private);
		/*
		 * No worries if memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			/*
			 * Another task may have installed its own private data
			 * in the meanwhile, in which case ours is dropped.
			 */
			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	/* Lock from the block containing @start up to the block containing i_size. */
	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		/*
		 * No exact match. If the previous item is a file extent item
		 * of this inode, step back to it as it may contain @start.
		 */
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		/* Stop once we are past the file extent items of this inode. */
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the extent.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * explicit hole or prealloc extent range, so need to
			 * analyze the next extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}
3791
/*
 * The ->llseek handler.
 *
 * SEEK_DATA and SEEK_HOLE are resolved by find_desired_extent() while holding
 * the inode lock in shared mode, and the resulting offset is applied with
 * vfs_setpos(). Every other seek mode is handed to generic_file_llseek().
 *
 * Returns the new file position or a negative errno.
 */
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	if (whence != SEEK_DATA && whence != SEEK_HOLE)
		return generic_file_llseek(file, offset, whence);

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	offset = find_desired_extent(file, offset, whence);
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);

	/* A negative value is an error from find_desired_extent(). */
	if (offset < 0)
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
3812
btrfs_file_open(struct inode * inode,struct file * filp)3813 static int btrfs_file_open(struct inode *inode, struct file *filp)
3814 {
3815 int ret;
3816
3817 if (btrfs_is_shutdown(inode_to_fs_info(inode)))
3818 return -EIO;
3819
3820 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3821
3822 ret = fsverity_file_open(inode, filp);
3823 if (ret)
3824 return ret;
3825 return generic_file_open(inode, filp);
3826 }
3827
btrfs_file_read_iter(struct kiocb * iocb,struct iov_iter * to)3828 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3829 {
3830 ssize_t ret = 0;
3831
3832 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))
3833 return -EIO;
3834
3835 if (iocb->ki_flags & IOCB_DIRECT) {
3836 ret = btrfs_direct_read(iocb, to);
3837 if (ret < 0 || !iov_iter_count(to) ||
3838 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3839 return ret;
3840 }
3841
3842 return filemap_read(iocb, to, ret);
3843 }
3844
btrfs_file_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)3845 static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
3846 struct pipe_inode_info *pipe,
3847 size_t len, unsigned int flags)
3848 {
3849 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))
3850 return -EIO;
3851
3852 return filemap_splice_read(in, ppos, pipe, len, flags);
3853 }
3854
/*
 * File operations table built around the handlers of this file, plus a few
 * generic VFS helpers and handlers that are not defined in this part of the
 * file.
 */
const struct file_operations btrfs_file_operations = {
	/* Adds SEEK_DATA/SEEK_HOLE support on top of generic_file_llseek(). */
	.llseek		= btrfs_file_llseek,
	.read_iter	= btrfs_file_read_iter,
	.splice_read	= btrfs_file_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.splice_write	= iter_file_splice_write,
	.mmap_prepare	= btrfs_file_mmap_prepare,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= btrfs_sync_file,
	/* Preallocation, hole punching and zero range. */
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
	/* Only wired up when the kernel is built with CONFIG_COMPAT. */
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
	.uring_cmd	= btrfs_uring_cmd,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
	.setlease	= generic_setlease,
};
3876
/*
 * Start writeback for the range [start, end] of the inode's page cache.
 *
 * Returns 0 on success or the error from filemap_fdatawrite_range().
 */
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	int ret;

	ret = filemap_fdatawrite_range(mapping, start, end);
	if (ret)
		return ret;

	/*
	 * With compression the first pass finds and locks a dirty page, clears
	 * its dirty flag, sets up an async extent and returns right away with
	 * the whole range locked but nothing marked as under writeback yet,
	 * since a separate thread does the actual work. That is why a plain
	 * filemap_write_and_wait_range() is not good enough here.
	 *
	 * Calling filemap_fdatawrite_range() a second time makes us wait on
	 * the page lock, which is only released after the pages have been
	 * marked as writeback. Without it we would miss the ordered extents,
	 * which results in badness. Do not remove this second call, it is
	 * intentional and required.
	 */
	if (!test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
		return 0;

	return filemap_fdatawrite_range(mapping, start, end);
}
3902