// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/filelock.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "tree-log.h"
#include "locking.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"
#include "print-tree.h"

/*
 * Unlock folio after btrfs_file_write() is done with it.
 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
{
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	/*
	 * The folio checked flag is some magic around finding folios that
	 * were modified without going through btrfs_dirty_folio(). Clear it
	 * here. There should be no need to mark the folio accessed, as
	 * prepare_one_folio() should have marked it accessed via
	 * find_or_create_page().
	 */
	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
	folio_unlock(folio);
	folio_put(folio);
}
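
/*
 * Editor's worked example (a sketch, not kernel code, assuming a 4K
 * sectorsize): a write at pos = 5000 with copied = 3000 clamps to
 * block_start = round_down(5000, 4096) = 4096 and
 * block_len = round_up(8000, 4096) - 4096 = 4096, i.e. exactly the one
 * block touched by the copy.
 */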

/*
 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
 * - Mark the newly dirtied folio as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark the modified folio as Uptodate/Dirty and not needing COW fixup.
 * - Update the inode size for writes past EOF.
 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	const u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);
	ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The range may already have been dirty; clear out the old accounting
	 * so we can set things up properly.
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       cached);

	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (ret)
		return ret;

	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);

	/*
	 * We've only changed i_size in RAM and we haven't updated the disk
	 * i_size. There is no need to log the inode at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}
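
/*
 * Editor's worked example (a sketch, assuming a 4K sectorsize): for
 * pos = 6000 and write_bytes = 100, start_pos = round_down(6000, 4096) =
 * 4096, num_bytes = round_up(6100 - 4096, 4096) = 4096 and
 * end_of_last_block = 8191, so the whole containing block is marked
 * delalloc even though only 100 bytes were copied into it.
 */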

/*
 * This is very complex, but the basic idea is to drop all extents in the
 * range [start, end).
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true. */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* Can't happen. */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (due to a bug) if we hit an -ENOSPC
		 * condition while punching holes. So if we find one here,
		 * just ensure we delete it, otherwise we would insert a new
		 * file extent item with the same key (offset) as that 0 bytes
		 * length file extent item in the call to
		 * btrfs_setup_item_for_insert() later in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to the first slot, so that after the
		 * delete, if items are moved off from our leaf to its
		 * immediate left or right neighbor leaves, we end up with a
		 * correct and adjusted path->slots[0] for our insertion (if
		 * args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}
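
/*
 * Editor's worked example (a sketch of the overlap cases above): with a
 * file extent item covering [0, 16384) and a drop range of [4096, 12288),
 * the item is first duplicated and the front copy trimmed to [0, 4096),
 * then the back copy is re-keyed and trimmed to [12288, 16384). A range
 * that fully covers an extent deletes the item outright, matching the
 * four diagrammed cases in the loop above.
 */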

static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
			     u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return false;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return false;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return false;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return false;

	*start = key.offset;
	*end = extent_end;
	return true;
}
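
/*
 * Editor's note: two file extent items are considered mergeable here only
 * when they point into the same on-disk extent (same disk_bytenr) at
 * offsets consistent with one original allocation (key.offset minus the
 * item's extent offset equals orig_offset for both), with no compression,
 * encryption or other encoding. E.g. REG items at file offsets 0 and 8192
 * referencing bytenr X with extent offsets 0 and 8192 qualify.
 */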

/*
 * Mark the extent in the range [start, end) as written.
 *
 * This changes the extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		return ret;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (unlikely(key.offset > start || extent_end < end)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			return 0;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			return 0;
		}
	}

	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		if (split == start) {
			key.offset = start;
		} else {
			if (unlikely(start != key.offset)) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	return 0;
}
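
/*
 * Editor's worked example (a sketch): marking [4096, 8192) written inside
 * a preallocated extent covering [0, 12288) takes the split path twice,
 * leaving three items: [0, 4096) and [8192, 12288) still PREALLOC and
 * [4096, 8192) converted to REG, with one extra data ref taken per split
 * via btrfs_inc_extent_ref().
 */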

/*
 * On error, return an unlocked folio and the error value.
 * On success, return a locked folio and 0.
 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len)
{
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
	u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
	int ret = 0;

	if (folio_test_uptodate(folio))
		return 0;

	if (IS_ALIGNED(clamp_start, blocksize) &&
	    IS_ALIGNED(clamp_end, blocksize))
		return 0;

	ret = btrfs_read_folio(NULL, folio);
	if (ret)
		return ret;
	folio_lock(folio);
	if (unlikely(!folio_test_uptodate(folio))) {
		folio_unlock(folio);
		return -EIO;
	}

	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the folio. Here we check both the inode mapping and the
	 * folio private flag to make sure the folio was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * an extra bitmap in the folio private data.
	 */
	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
		folio_unlock(folio);
		return -EAGAIN;
	}
	return 0;
}
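
/*
 * Editor's note: the read above is skipped when the clamped range is
 * fully block aligned, because every touched block will be completely
 * overwritten. E.g. with 4K blocks, a write of [4096, 12288) needs no
 * read-modify-write, while [4096, 10000) must first read the folio so
 * the tail of the last block stays valid.
 */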

static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t gfp;

	gfp = btrfs_alloc_write_mask(inode->i_mapping);
	if (nowait) {
		gfp &= ~__GFP_DIRECT_RECLAIM;
		gfp |= GFP_NOWAIT;
	}

	return gfp;
}

/*
 * Get the folio into the page cache and lock it.
 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool nowait)
{
	const pgoff_t index = pos >> PAGE_SHIFT;
	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
			  fgf_set_order(write_bytes);
	struct folio *folio;
	int ret;

again:
	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ret;
	}
	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
	if (ret) {
		/* The folio is already unlocked. */
		folio_put(folio);
		if (!nowait && ret == -EAGAIN)
			goto again;
		return ret;
	}
	*folio_ret = folio;
	return 0;
}
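
/*
 * Editor's note: -EAGAIN from prepare_uptodate_folio() means the folio
 * was released while it was unlocked for reading. In the blocking case we
 * simply grab a fresh folio and try again, while nowait callers get the
 * error back so they can retry from a context that may block.
 */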

/*
 * Locks the extent and properly waits for data=ordered extents to finish
 * before allowing the folios to be modified if needed.
 *
 * Return:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folios again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 start_pos;
	u64 last_pos;
	int ret = 0;

	start_pos = round_down(pos, fs_info->sectorsize);
	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;

	if (start_pos < inode->vfs_inode.i_size) {
		struct btrfs_ordered_extent *ordered;

		if (nowait) {
			if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
						   last_pos, cached_state)) {
				folio_unlock(folio);
				folio_put(folio);
				return -EAGAIN;
			}
		} else {
			btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
					  cached_state);
		}

		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
		if (ordered &&
		    ordered->file_offset + ordered->num_bytes > start_pos &&
		    ordered->file_offset <= last_pos) {
			btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
					    cached_state);
			folio_unlock(folio);
			folio_put(folio);
			btrfs_start_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
	}

	/*
	 * We should be called after prepare_one_folio(), which should have
	 * locked the folio.
	 */
	WARN_ON(!folio_test_locked(folio));

	return ret;
}
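
/*
 * Editor's sketch of consuming the tristate result (this mirrors what
 * copy_one_range() below does): a negative value means the folio was
 * already unlocked and, for -EAGAIN, preparation must be redone; 1 means
 * the caller must later btrfs_unlock_extent() on [*lockstart, *lockend];
 * 0 means nothing was locked and there is nothing to undo.
 */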

/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:	 File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *		 range.
 * @nowait:	 Indicate if we can block or not (non-blocking IO context).
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0		If we can nocow, and updates @write_bytes.
 * 0		If we can't do a nocow write.
 * -EAGAIN	If we can't do a nocow write because snapshotting of the inode's
 *		root is in progress or because we are in a non-blocking IO
 *		context and need to block (@nowait is true).
 * < 0		If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	u64 lockstart, lockend;
	u64 cur_offset;
	int ret = 0;

	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return 0;

	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
		return -EAGAIN;

	lockstart = round_down(pos, fs_info->sectorsize);
	lockend = round_up(pos + *write_bytes,
			   fs_info->sectorsize) - 1;

	if (nowait) {
		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
						  &cached_state)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return -EAGAIN;
		}
	} else {
		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
						   &cached_state);
	}

	cur_offset = lockstart;
	while (cur_offset < lockend) {
		u64 num_bytes = lockend - cur_offset + 1;

		ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
		if (ret <= 0) {
			/*
			 * If cur_offset == lockstart it means we haven't found
			 * any extent against which we can NOCOW, so unlock the
			 * snapshot lock.
			 */
			if (cur_offset == lockstart)
				btrfs_drew_write_unlock(&root->snapshot_lock);
			break;
		}
		cur_offset += num_bytes;
	}

	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	/*
	 * cur_offset > lockstart means there's at least a partial range we can
	 * NOCOW, and that range can cover one or more extents.
	 */
	if (cur_offset > lockstart) {
		*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
		return 1;
	}

	return ret;
}
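
/*
 * Editor's sketch of the expected calling pattern (see also
 * reserve_space() below): a return > 0 transfers ownership of the
 * snapshot drew lock to the caller, which must pair it with
 * btrfs_check_nocow_unlock() once the NOCOW write (or its cleanup) is
 * done; 0 and negative returns leave no lock held.
 */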

void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}

int btrfs_write_check(struct kiocb *iocb, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos = iocb->ki_pos;
	int ret;
	loff_t oldsize;

	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really NOCOW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
		return -EAGAIN;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	/*
	 * We reserve space for updating the inode when we reserve space for
	 * the extent we are going to write, so any ENOSPC error surfaces
	 * there. We don't need to start yet another transaction to update the
	 * inode as we will update the inode when we finish writing whatever
	 * data we write.
	 */
	if (!IS_NOCMTIME(inode)) {
		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
		inode_inc_iversion(inode);
	}

	oldsize = i_size_read(inode);
	if (pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);

		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
		if (ret)
			return ret;
	}

	return 0;
}

static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
			  u64 start, u64 len, bool only_release_metadata)
{
	if (len == 0)
		return;

	if (only_release_metadata) {
		btrfs_check_nocow_unlock(inode);
		btrfs_delalloc_release_metadata(inode, len, true);
	} else {
		const struct btrfs_fs_info *fs_info = inode->root->fs_info;

		btrfs_delalloc_release_space(inode, data_reserved,
					     round_down(start, fs_info->sectorsize),
					     len, true);
	}
}

/*
 * Reserve data and metadata space for this buffered write range.
 *
 * Return >0 for the number of bytes reserved, which is always block aligned.
 * Return <0 for error.
 */
static ssize_t reserve_space(struct btrfs_inode *inode,
			     struct extent_changeset **data_reserved,
			     u64 start, size_t *len, bool nowait,
			     bool *only_release_metadata)
{
	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
	size_t reserve_bytes;
	int ret;

	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
	if (ret < 0) {
		int can_nocow;

		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
			return -EAGAIN;

		/*
		 * If we don't have to COW at the offset, reserve metadata only.
		 * write_bytes may get smaller than requested here.
		 */
		can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
		if (can_nocow < 0)
			ret = can_nocow;
		if (can_nocow > 0)
			ret = 0;
		if (ret)
			return ret;
		*only_release_metadata = true;
	}

	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
	WARN_ON(reserve_bytes == 0);
	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
					      reserve_bytes, nowait);
	if (ret) {
		if (!*only_release_metadata)
			btrfs_free_reserved_data_space(inode, *data_reserved,
						       start, *len);
		else
			btrfs_check_nocow_unlock(inode);

		if (nowait && ret == -ENOSPC)
			ret = -EAGAIN;
		return ret;
	}
	return reserve_bytes;
}
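
/*
 * Editor's worked example (a sketch, assuming a 4K sectorsize): for
 * start = 5000 and *len = 100, block_offset = 5000 & 4095 = 904 and
 * reserve_bytes = round_up(100 + 904, 4096) = 4096, so a full block of
 * data and metadata space is reserved even for a sub-block write.
 */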

/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
static void shrink_reserved_space(struct btrfs_inode *inode,
				  struct extent_changeset *data_reserved,
				  u64 reserved_start, u64 reserved_len,
				  u64 new_len, bool only_release_metadata)
{
	const u64 diff = reserved_len - new_len;

	ASSERT(new_len <= reserved_len);
	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
	if (only_release_metadata)
		btrfs_delalloc_release_metadata(inode, diff, true);
	else
		btrfs_delalloc_release_space(inode, data_reserved,
					     reserved_start + new_len, diff, true);
}

/* Calculate the maximum amount of bytes we can write into one folio. */
static size_t calc_write_bytes(const struct btrfs_inode *inode,
			       const struct iov_iter *iter, u64 start)
{
	const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);

	return min(max_folio_size - (start & (max_folio_size - 1)),
		   iov_iter_count(iter));
}
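
/*
 * Editor's worked example (a sketch, with an assumed maximum folio size
 * of 64K): a write starting at file offset 61440 can copy at most
 * 65536 - (61440 & 65535) = 4096 bytes before crossing a folio boundary,
 * capped further by the bytes remaining in @iter.
 */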

/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
			  struct extent_changeset **data_reserved, u64 start,
			  bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	size_t write_bytes = calc_write_bytes(inode, iter, start);
	size_t copied;
	const u64 reserved_start = round_down(start, fs_info->sectorsize);
	u64 reserved_len;
	struct folio *folio = NULL;
	int extents_locked;
	u64 lockstart;
	u64 lockend;
	bool only_release_metadata = false;
	const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
	int ret;

	/*
	 * Fault all pages before locking them in prepare_one_folio() to avoid
	 * recursive lock.
	 */
	if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
		return -EFAULT;
	extent_changeset_release(*data_reserved);
	ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
			    &only_release_metadata);
	if (ret < 0)
		return ret;
	reserved_len = ret;
	/* Write range must be inside the reserved range. */
	ASSERT(reserved_start <= start);
	ASSERT(start + write_bytes <= reserved_start + reserved_len);

again:
	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
						    bdp_flags);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	/*
	 * The reserved range goes beyond the current folio, shrink the reserved
	 * space to the folio boundary.
	 */
	if (reserved_start + reserved_len > folio_next_pos(folio)) {
		const u64 last_block = folio_next_pos(folio);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		write_bytes = last_block - start;
		reserved_len = last_block - reserved_start;
	}

	extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
							 write_bytes, &lockstart,
							 &lockend, nowait,
							 &cached_state);
	if (extents_locked < 0) {
		if (!nowait && extents_locked == -EAGAIN)
			goto again;

		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return extents_locked;
	}

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
					     write_bytes, iter);
	flush_dcache_folio(folio);

	if (unlikely(copied < write_bytes)) {
		u64 last_block;

		/*
		 * The original write range doesn't need an uptodate folio as
		 * the range is block aligned. But now a short copy happened.
		 * We cannot handle it without an uptodate folio.
		 *
		 * So just revert the range and we will retry.
		 */
		if (!folio_test_uptodate(folio)) {
			iov_iter_revert(iter, copied);
			copied = 0;
		}

		/* No copied bytes, unlock, release reserved space and exit. */
		if (copied == 0) {
			if (extents_locked)
				btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
						    &cached_state);
			else
				btrfs_free_extent_state(cached_state);
			btrfs_delalloc_release_extents(inode, reserved_len);
			release_space(inode, *data_reserved, reserved_start, reserved_len,
				      only_release_metadata);
			btrfs_drop_folio(fs_info, folio, start, copied);
			return 0;
		}

		/* Release the reserved space beyond the last block. */
		last_block = round_up(start + copied, fs_info->sectorsize);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		reserved_len = last_block - reserved_start;
	}

	ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
				only_release_metadata);
	/*
	 * If we have not locked the extent range, because the range's start
	 * offset is >= i_size, we might still have a non-NULL cached extent
	 * state, acquired while marking the extent range as delalloc through
	 * btrfs_dirty_folio(). Therefore free any possible cached extent
	 * state to avoid a memory leak.
	 */
	if (extents_locked)
		btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	else
		btrfs_free_extent_state(cached_state);

	btrfs_delalloc_release_extents(inode, reserved_len);
	if (ret) {
		btrfs_drop_folio(fs_info, folio, start, copied);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);

	btrfs_drop_folio(fs_info, folio, start, copied);
	return copied;
}

ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
	struct inode *inode = file_inode(file);
	struct extent_changeset *data_reserved = NULL;
	size_t num_written = 0;
	ssize_t ret;
	loff_t old_isize;
	unsigned int ilock_flags = 0;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);

	if (nowait)
		ilock_flags |= BTRFS_ILOCK_TRY;

	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/*
	 * We can only trust the isize with the inode lock held, otherwise it
	 * can race with other buffered writes and cause an incorrect call of
	 * pagecache_isize_extended() that could overwrite existing data.
	 */
	old_isize = i_size_read(inode);

	ret = generic_write_checks(iocb, iter);
	if (ret <= 0)
		goto out;

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0)
		goto out;

	pos = iocb->ki_pos;
	while (iov_iter_count(iter) > 0) {
		ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
		if (ret < 0)
			break;
		pos += ret;
		num_written += ret;
		cond_resched();
	}

	extent_changeset_free(data_reserved);
	if (num_written > 0) {
		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
		iocb->ki_pos += num_written;
	}
out:
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
	return num_written ? num_written : ret;
}

static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t count;
	ssize_t ret;

	btrfs_inode_lock(BTRFS_I(inode), 0);
	count = encoded->len;
	ret = generic_write_checks_count(iocb, &count);
	if (ret == 0 && count != encoded->len) {
		/*
		 * The write got truncated by generic_write_checks_count(). We
		 * can't do a partial encoded write.
		 */
		ret = -EFBIG;
	}
	if (ret || encoded->len == 0)
		goto out;

	ret = btrfs_write_check(iocb, encoded->len);
	if (ret < 0)
		goto out;

	ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
	btrfs_inode_unlock(BTRFS_I(inode), 0);
	return ret;
}

ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct file *file = iocb->ki_filp;
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
	ssize_t num_written, num_sync;

	if (btrfs_is_shutdown(inode->root->fs_info))
		return -EIO;
	/*
	 * If the fs flips readonly due to some impossible error, even though
	 * we have opened the file as writable, we have to stop this write
	 * operation to ensure consistency.
	 */
	if (BTRFS_FS_ERROR(inode->root->fs_info))
		return -EROFS;

	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (encoded) {
		num_written = btrfs_encoded_write(iocb, from, encoded);
		num_sync = encoded->len;
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = btrfs_direct_write(iocb, from);
		num_sync = num_written;
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		num_sync = num_written;
	}

	btrfs_set_inode_last_sub_trans(inode);

	if (num_sync > 0) {
		num_sync = generic_write_sync(iocb, num_sync);
		if (num_sync < 0)
			num_written = num_sync;
	}

	return num_written;
}

static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	return btrfs_do_write_iter(iocb, from, NULL);
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private) {
		kfree(private->filldir_buf);
		btrfs_free_extent_state(private->llseek_cached_state);
		kfree(private);
		filp->private_data = NULL;
	}

	/*
	 * Set by setattr when we are about to truncate a file from a non-zero
	 * size to a zero size. This tries to flush down new bytes that may
	 * have been written if the application were using truncate to replace
	 * a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible. Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	ret = btrfs_fdatawrite_range(inode, start, end);
	blk_finish_plug(&plug);

	return ret;
}

static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
	struct btrfs_inode *inode = ctx->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
	    list_empty(&ctx->ordered_extents))
		return true;

	/*
	 * If we are doing a fast fsync we can not bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
	     list_empty(&ctx->ordered_extents)))
		return true;

	return false;
}

/*
 * Fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;
	bool full_sync;
	bool skip_ilock = false;

	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		skip_ilock = true;
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);
	}

	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	start = 0;
	end = LLONG_MAX;
	len = (u64)LLONG_MAX + 1;

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so the dirty pages can be flushed by
	 * multiple tasks, improving performance. See
	 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	if (skip_ilock)
		down_write(&inode->i_mmap_lock);
	else
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range. We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		if (skip_ilock)
			up_write(&inode->i_mmap_lock);
		else
			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
		goto out;
	}

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks. The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change. We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
		goto out_release_extents;
	}
1710
1711 btrfs_init_log_ctx_scratch_eb(&ctx);
1712
1713 /*
1714 * We use start here because we will need to wait on the IO to complete
1715 * in btrfs_sync_log, which could require joining a transaction (for
1716 * example checking cross references in the nocow path). If we use join
1717 * here we could get into a situation where we're waiting on IO to
1718 * happen that is blocked on a transaction trying to commit. With start
1719 * we inc the extwriter counter, so we wait for all extwriters to exit
1720 * before we start blocking joiners. This comment is to keep somebody
1721 * from thinking they are super smart and changing this to
1722 * btrfs_join_transaction *cough*Josef*cough*.
1723 */
1724 trans = btrfs_start_transaction(root, 0);
1725 if (IS_ERR(trans)) {
1726 ret = PTR_ERR(trans);
1727 goto out_release_extents;
1728 }
1729 trans->in_fsync = true;
1730
1731 ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1732 /*
1733 * Scratch eb no longer needed, release before syncing log or commit
1734 * transaction, to avoid holding unnecessary memory during such long
1735 * operations.
1736 */
1737 if (ctx.scratch_eb) {
1738 free_extent_buffer(ctx.scratch_eb);
1739 ctx.scratch_eb = NULL;
1740 }
1741 btrfs_release_log_ctx_extents(&ctx);
1742 if (ret < 0) {
1743 /* Fallthrough and commit/free transaction. */
1744 ret = BTRFS_LOG_FORCE_COMMIT;
1745 }
1746
1747 /* We've logged all the items and now have a consistent
1748 * version of the file in the log. It is possible that
1749 * someone will come in and modify the file, but that's
1750 * fine because the log is consistent on disk, and we
1751 * have references to all of the file's extents.
1752 *
1753 * It is possible that someone will come in and log the
1754 * file again, but that will end up using the synchronization
1755 * inside btrfs_sync_log to keep things safe.
1756 */
1757 if (skip_ilock)
1758 up_write(&inode->i_mmap_lock);
1759 else
1760 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1761
1762 if (ret == BTRFS_NO_LOG_SYNC) {
1763 ret = btrfs_end_transaction(trans);
1764 goto out;
1765 }
1766
1767 /* We successfully logged the inode, attempt to sync the log. */
1768 if (!ret) {
1769 ret = btrfs_sync_log(trans, root, &ctx);
1770 if (!ret) {
1771 ret = btrfs_end_transaction(trans);
1772 goto out;
1773 }
1774 }
1775
1776 /*
1777 * At this point we need to commit the transaction because we had
1778 * btrfs_need_log_full_commit() or some other error.
1779 *
1780 * If we didn't do a full sync we have to stop the trans handle, wait on
1781 * the ordered extents, start it again and commit the transaction. If
1782 * we attempt to wait on the ordered extents here we could deadlock with
1783 * something like fallocate() that is holding the extent lock trying to
1784 * start a transaction while some other thread is trying to commit the
1785 * transaction while we (fsync) are currently holding the transaction
1786 * open.
1787 */
1788 if (!full_sync) {
1789 ret = btrfs_end_transaction(trans);
1790 if (ret)
1791 goto out;
1792 ret = btrfs_wait_ordered_range(inode, start, len);
1793 if (ret)
1794 goto out;
1795
1796 /*
1797 * This is safe to use here because we're only interested in
1798 * making sure the transaction that had the ordered extents is
1799 * committed. We aren't waiting on anything past this point,
1800 * we're purely getting the transaction and committing it.
1801 */
1802 trans = btrfs_attach_transaction_barrier(root);
1803 if (IS_ERR(trans)) {
1804 ret = PTR_ERR(trans);
1805
1806 /*
1807 * We committed the transaction and there's no currently
1808 * running transaction, this means everything we care
1809 * about made it to disk and we are done.
1810 */
1811 if (ret == -ENOENT)
1812 ret = 0;
1813 goto out;
1814 }
1815 }
1816
1817 ret = btrfs_commit_transaction(trans);
1818 out:
1819 free_extent_buffer(ctx.scratch_eb);
1820 ASSERT(list_empty(&ctx.list));
1821 ASSERT(list_empty(&ctx.conflict_inodes));
1822 err = file_check_and_advance_wb_err(file);
1823 if (!ret)
1824 ret = err;
1825 return ret > 0 ? -EIO : ret;
1826
1827 out_release_extents:
1828 btrfs_release_log_ctx_extents(&ctx);
1829 if (skip_ilock)
1830 up_write(&inode->i_mmap_lock);
1831 else
1832 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1833 goto out;
1834 }
1835
1836 /*
1837 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1838 * called from a page fault handler when a page is first dirtied. Hence we must
1839 * be careful to check for EOF conditions here. We set the page up correctly
1840 * for a written page which means we get ENOSPC checking when writing into
1841 * holes and correct delalloc and unwritten extent mapping on filesystems that
1842 * support these features.
1843 *
1844 * We are not allowed to take the i_mutex here so we have to play games to
1845 * protect against truncate races as the page could now be beyond EOF. Because
1846 * truncate_setsize() writes the inode size before removing pages, once we have
1847 * the page lock we can determine safely if the page is beyond EOF. If it is not
1848 * beyond EOF, then the page is guaranteed safe against truncation until we
1849 * unlock the page.
1850 */
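/*
 * Illustrative userspace trigger (an assumption for clarity, not part of
 * this file): the fault path below runs on the first write to a clean,
 * shared, file-backed mapping, e.g.:
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	p[0] = 1;	// first dirtying write faults into btrfs_page_mkwrite()
 */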
1851 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1852 {
1853 struct page *page = vmf->page;
1854 struct folio *folio = page_folio(page);
1855 struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
1856 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1857 struct extent_io_tree *io_tree = &inode->io_tree;
1858 struct btrfs_ordered_extent *ordered;
1859 struct extent_state *cached_state = NULL;
1860 struct extent_changeset *data_reserved = NULL;
1861 unsigned long zero_start;
1862 loff_t size;
1863 size_t fsize = folio_size(folio);
1864 int ret;
1865 bool only_release_metadata = false;
1866 u64 reserved_space;
1867 u64 page_start;
1868 u64 page_end;
1869 u64 end;
1870
1871 reserved_space = fsize;
1872
1873 sb_start_pagefault(inode->vfs_inode.i_sb);
1874 page_start = folio_pos(folio);
1875 page_end = page_start + folio_size(folio) - 1;
1876 end = page_end;
1877
1878 /*
1879 * Reserving delalloc space after obtaining the page lock can lead to
1880 * deadlock. For example, if a dirty page is locked by this function
1881 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1882 * dirty page write out, then the btrfs_writepages() function could
1883 * end up waiting indefinitely to get a lock on the page currently
1884 * being processed by btrfs_page_mkwrite() function.
1885 */
1886 ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
1887 reserved_space, false);
1888 if (ret < 0) {
1889 size_t write_bytes = reserved_space;
1890
1891 if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
1892 goto out_noreserve;
1893
1894 only_release_metadata = true;
1895
1896 /*
1897 * Can't write the whole range, there may be shared extents or
1898 * holes in the range, bail out with @only_release_metadata set
1899 * to true so that we unlock the nocow lock before returning the
1900 * error.
1901 */
1902 if (write_bytes < reserved_space)
1903 goto out_noreserve;
1904 }
1905 ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
1906 reserved_space, false);
1907 if (ret < 0) {
1908 if (!only_release_metadata)
1909 btrfs_free_reserved_data_space(inode, data_reserved,
1910 page_start, reserved_space);
1911 goto out_noreserve;
1912 }
1913
1914 ret = file_update_time(vmf->vma->vm_file);
1915 if (ret < 0)
1916 goto out;
1917 again:
1918 down_read(&inode->i_mmap_lock);
1919 folio_lock(folio);
1920 size = i_size_read(&inode->vfs_inode);
1921
1922 if ((folio->mapping != inode->vfs_inode.i_mapping) ||
1923 (page_start >= size)) {
1924 /* Page got truncated out from underneath us. */
1925 goto out_unlock;
1926 }
1927 folio_wait_writeback(folio);
1928
1929 btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
1930 ret = set_folio_extent_mapped(folio);
1931 if (ret < 0) {
1932 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1933 goto out_unlock;
1934 }
1935
1936 /*
1937 * We can't set the delalloc bits if there are pending ordered
1938 * extents. Drop our locks and wait for them to finish.
1939 */
1940 ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
1941 if (ordered) {
1942 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1943 folio_unlock(folio);
1944 up_read(&inode->i_mmap_lock);
1945 btrfs_start_ordered_extent(ordered);
1946 btrfs_put_ordered_extent(ordered);
1947 goto again;
1948 }
1949
1950 if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
1951 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1952 if (reserved_space < fsize) {
1953 const u64 to_free = fsize - reserved_space;
1954
1955 end = page_start + reserved_space - 1;
1956 if (only_release_metadata)
1957 btrfs_delalloc_release_metadata(inode, to_free, true);
1958 else
1959 btrfs_delalloc_release_space(inode, data_reserved,
1960 end + 1, to_free, true);
1961 }
1962 }
1963
1964 /*
1965 * page_mkwrite gets called when the page is first dirtied after it's
1966 * faulted in, but write(2) could also dirty a page and set delalloc
1967 * bits, thus in this case for space accounting reasons, we still need to
1968 * clear any delalloc bits within this page range since we have to
1969 * reserve data&meta space before lock_page() (see above comments).
1970 */
1971 btrfs_clear_extent_bit(io_tree, page_start, end,
1972 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1973 EXTENT_DEFRAG, &cached_state);
1974
1975 ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
1976 if (ret < 0) {
1977 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1978 goto out_unlock;
1979 }
1980
1981 /* Page is wholly or partially inside EOF. */
1982 if (page_start + folio_size(folio) > size)
1983 zero_start = offset_in_folio(folio, size);
1984 else
1985 zero_start = fsize;
1986
1987 if (zero_start != fsize)
1988 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1989
1990 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1991 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1992 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1993
1994 btrfs_set_inode_last_sub_trans(inode);
1995
1996 if (only_release_metadata)
1997 btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
1998 &cached_state);
1999
2000 btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
2001 up_read(&inode->i_mmap_lock);
2002
2003 btrfs_delalloc_release_extents(inode, fsize);
2004 if (only_release_metadata)
2005 btrfs_check_nocow_unlock(inode);
2006 sb_end_pagefault(inode->vfs_inode.i_sb);
2007 extent_changeset_free(data_reserved);
2008 return VM_FAULT_LOCKED;
2009
2010 out_unlock:
2011 folio_unlock(folio);
2012 up_read(&inode->i_mmap_lock);
2013 out:
2014 btrfs_delalloc_release_extents(inode, fsize);
2015 if (only_release_metadata)
2016 btrfs_delalloc_release_metadata(inode, reserved_space, true);
2017 else
2018 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2019 reserved_space, true);
2020 out_noreserve:
2021 if (only_release_metadata)
2022 btrfs_check_nocow_unlock(inode);
2023
2024 sb_end_pagefault(inode->vfs_inode.i_sb);
2025
2026 extent_changeset_free(data_reserved);
2027
2028 if (ret < 0)
2029 return vmf_error(ret);
2030
2031 /* Make the VM retry the fault. */
2032 return VM_FAULT_NOPAGE;
2033 }
2034
2035 static const struct vm_operations_struct btrfs_file_vm_ops = {
2036 .fault = filemap_fault,
2037 .map_pages = filemap_map_pages,
2038 .page_mkwrite = btrfs_page_mkwrite,
2039 };
2040
2041 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
2042 {
2043 struct file *filp = desc->file;
2044 struct address_space *mapping = filp->f_mapping;
2045
2046 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))
2047 return -EIO;
2048 if (!mapping->a_ops->read_folio)
2049 return -ENOEXEC;
2050
2051 file_accessed(filp);
2052 desc->vm_ops = &btrfs_file_vm_ops;
2053
2054 return 0;
2055 }
2056
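/*
 * Check whether the file extent item at @slot is a hole (a regular extent
 * with a zero disk_bytenr) that is contiguous with [start, end), i.e. it
 * either starts at @end or ends at @start, so the hole being inserted can
 * be merged with it instead of adding a new item.
 */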
2057 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2058 int slot, u64 start, u64 end)
2059 {
2060 struct btrfs_file_extent_item *fi;
2061 struct btrfs_key key;
2062
2063 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2064 return false;
2065
2066 btrfs_item_key_to_cpu(leaf, &key, slot);
2067 if (key.objectid != btrfs_ino(inode) ||
2068 key.type != BTRFS_EXTENT_DATA_KEY)
2069 return false;
2070
2071 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2072
2073 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2074 return false;
2075
2076 if (btrfs_file_extent_disk_bytenr(leaf, fi))
2077 return false;
2078
2079 if (key.offset == end)
2080 return true;
2081 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2082 return true;
2083 return false;
2084 }
2085
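/*
 * Insert (or merge, see hole_mergeable() above) a hole file extent item for
 * the just-dropped range [offset, end) and keep the extent map tree in sync.
 * With the NO_HOLES feature no item is needed and only the extent maps are
 * updated.
 */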
2086 static int fill_holes(struct btrfs_trans_handle *trans,
2087 struct btrfs_inode *inode,
2088 struct btrfs_path *path, u64 offset, u64 end)
2089 {
2090 struct btrfs_fs_info *fs_info = trans->fs_info;
2091 struct btrfs_root *root = inode->root;
2092 struct extent_buffer *leaf;
2093 struct btrfs_file_extent_item *fi;
2094 struct extent_map *hole_em;
2095 struct btrfs_key key;
2096 int ret;
2097
2098 if (btrfs_fs_incompat(fs_info, NO_HOLES))
2099 goto out;
2100
2101 key.objectid = btrfs_ino(inode);
2102 key.type = BTRFS_EXTENT_DATA_KEY;
2103 key.offset = offset;
2104
2105 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2106 if (ret <= 0) {
2107 /*
2108 * We should have dropped this offset, so if we find it then
2109 * something has gone horribly wrong.
2110 */
2111 if (ret == 0)
2112 ret = -EINVAL;
2113 return ret;
2114 }
2115
2116 leaf = path->nodes[0];
2117 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2118 u64 num_bytes;
2119
2120 path->slots[0]--;
2121 fi = btrfs_item_ptr(leaf, path->slots[0],
2122 struct btrfs_file_extent_item);
2123 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2124 end - offset;
2125 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2126 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2127 btrfs_set_file_extent_offset(leaf, fi, 0);
2128 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2129 goto out;
2130 }
2131
2132 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2133 u64 num_bytes;
2134
2135 key.offset = offset;
2136 btrfs_set_item_key_safe(trans, path, &key);
2137 fi = btrfs_item_ptr(leaf, path->slots[0],
2138 struct btrfs_file_extent_item);
2139 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2140 offset;
2141 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2142 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2143 btrfs_set_file_extent_offset(leaf, fi, 0);
2144 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2145 goto out;
2146 }
2147 btrfs_release_path(path);
2148
2149 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2150 end - offset);
2151 if (ret)
2152 return ret;
2153
2154 out:
2155 btrfs_release_path(path);
2156
2157 hole_em = btrfs_alloc_extent_map();
2158 if (!hole_em) {
2159 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2160 btrfs_set_inode_full_sync(inode);
2161 } else {
2162 hole_em->start = offset;
2163 hole_em->len = end - offset;
2164 hole_em->ram_bytes = hole_em->len;
2165
2166 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2167 hole_em->disk_num_bytes = 0;
2168 hole_em->generation = trans->transid;
2169
2170 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2171 btrfs_free_extent_map(hole_em);
2172 if (ret)
2173 btrfs_set_inode_full_sync(inode);
2174 }
2175
2176 return 0;
2177 }
2178
2179 /*
2180 * Find a hole extent on the given inode and change *start/*len to point to
2181 * the end of the hole extent (a hole or vacuum extent whose em->start <= start
2182 * && em->start + em->len > start; vacuum extents only exist in NO_HOLES mode).
2183 * When a hole extent is found, return 1 and modify *start/*len.
2184 */
2185 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2186 {
2187 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2188 struct extent_map *em;
2189 int ret = 0;
2190
2191 em = btrfs_get_extent(inode, NULL,
2192 round_down(*start, fs_info->sectorsize),
2193 round_up(*len, fs_info->sectorsize));
2194 if (IS_ERR(em))
2195 return PTR_ERR(em);
2196
2197 /* Hole or vacuum extent (vacuum extents only exist in NO_HOLES mode). */
2198 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2199 const u64 em_end = btrfs_extent_map_end(em);
2200
2201 ret = 1;
2202 *len = (em_end > *start + *len) ? 0 : (*start + *len - em_end);
2203 *start = em_end;
2204 }
2205 btrfs_free_extent_map(em);
2206 return ret;
2207 }
2208
2209 /*
2210 * Check if there is no folio in the range.
2211 *
2212 * We cannot utilize filemap_range_has_page() in a filemap with large folios
2213 * as we can hit the following false positive:
2214 *
2215 * start end
2216 * | |
2217 * |//|//|//|//| | | | | | | | |//|//|
2218 * \ / \ /
2219 * Folio A Folio B
2220 *
2221 * Large folios A and B cover the start and end indexes.
2222 * In that case filemap_range_has_page() will always return true, but the
2223 * above case is fine for btrfs_punch_hole_lock_range() usage.
2224 *
2225 * So here we only ensure that no other folios are in the range, excluding
2226 * the head/tail large folios.
2227 */
2228 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2229 {
2230 struct folio_batch fbatch;
2231 bool ret = false;
2232 /*
2233 * For the subpage case, if the range is not at a page boundary, we could
2234 * have pages at the leading/trailing part of the range.
2235 * This could lead to an infinite loop since filemap_range_has_page()
2236 * will always return true.
2237 * So here we need to do extra page alignment for
2238 * filemap_range_has_page().
2239 *
2240 * And do not decrease page_lockend right now, as it can be 0.
2241 */
2242 const u64 page_lockstart = round_up(start, PAGE_SIZE);
2243 const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2244 const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
2245 const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2246 pgoff_t tmp = start_index;
2247 int found_folios;
2248
2249 /* The same page or adjacent pages. */
2250 if (page_lockend <= page_lockstart)
2251 return false;
2252
2253 folio_batch_init(&fbatch);
2254 found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2255 for (int i = 0; i < found_folios; i++) {
2256 struct folio *folio = fbatch.folios[i];
2257
2258 /* A large folio begins before the start. Not a target. */
2259 if (folio->index < start_index)
2260 continue;
2261 /* A large folio extends beyond the end. Not a target. */
2262 if (folio_next_index(folio) > end_index)
2263 continue;
2264 /* A folio doesn't cover the head/tail index. Found a target. */
2265 ret = true;
2266 break;
2267 }
2268 folio_batch_release(&fbatch);
2269 return ret;
2270 }
2271
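/*
 * Truncate the page cache and lock the extent range for hole punching,
 * retrying until no folio remains in the range (see check_range_has_page()
 * above for why a folio may sneak back in between the truncation and the
 * extent lock).
 */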
2272 static void btrfs_punch_hole_lock_range(struct inode *inode,
2273 const u64 lockstart, const u64 lockend,
2274 struct extent_state **cached_state)
2275 {
2276 while (1) {
2277 truncate_pagecache_range(inode, lockstart, lockend);
2278
2279 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2280 cached_state);
2281 /*
2282 * We can't have ordered extents in the range, nor dirty/writeback
2283 * pages, because we have locked the inode's VFS lock in exclusive
2284 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2285 * we have flushed all delalloc in the range and we have waited
2286 * for any ordered extents in the range to complete.
2287 * We can race with anyone reading pages from this range, so after
2288 * locking the range check if we have pages in the range, and if
2289 * we do, unlock the range and retry.
2290 */
2291 if (!check_range_has_page(inode, lockstart, lockend))
2292 break;
2293
2294 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2295 cached_state);
2296 }
2297
2298 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2299 }
2300
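/*
 * Insert a file extent item for a replacement extent (or hole) covering
 * @replace_len bytes at extent_info->file_offset, updating the inode's byte
 * accounting and adding a reference to the underlying disk extent when the
 * replacement isn't a hole.
 */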
2301 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2302 struct btrfs_inode *inode,
2303 struct btrfs_path *path,
2304 struct btrfs_replace_extent_info *extent_info,
2305 const u64 replace_len,
2306 const u64 bytes_to_drop)
2307 {
2308 struct btrfs_fs_info *fs_info = trans->fs_info;
2309 struct btrfs_root *root = inode->root;
2310 struct btrfs_file_extent_item *extent;
2311 struct extent_buffer *leaf;
2312 struct btrfs_key key;
2313 int slot;
2314 int ret;
2315
2316 if (replace_len == 0)
2317 return 0;
2318
2319 if (extent_info->disk_offset == 0 &&
2320 btrfs_fs_incompat(fs_info, NO_HOLES)) {
2321 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2322 return 0;
2323 }
2324
2325 key.objectid = btrfs_ino(inode);
2326 key.type = BTRFS_EXTENT_DATA_KEY;
2327 key.offset = extent_info->file_offset;
2328 ret = btrfs_insert_empty_item(trans, root, path, &key,
2329 sizeof(struct btrfs_file_extent_item));
2330 if (ret)
2331 return ret;
2332 leaf = path->nodes[0];
2333 slot = path->slots[0];
2334 write_extent_buffer(leaf, extent_info->extent_buf,
2335 btrfs_item_ptr_offset(leaf, slot),
2336 sizeof(struct btrfs_file_extent_item));
2337 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2338 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2339 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2340 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2341 if (extent_info->is_new_extent)
2342 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2343 btrfs_release_path(path);
2344
2345 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2346 replace_len);
2347 if (ret)
2348 return ret;
2349
2350 /* If it's a hole, nothing more needs to be done. */
2351 if (extent_info->disk_offset == 0) {
2352 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2353 return 0;
2354 }
2355
2356 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2357
2358 if (extent_info->is_new_extent && extent_info->insertions == 0) {
2359 key.objectid = extent_info->disk_offset;
2360 key.type = BTRFS_EXTENT_ITEM_KEY;
2361 key.offset = extent_info->disk_len;
2362 ret = btrfs_alloc_reserved_file_extent(trans, root,
2363 btrfs_ino(inode),
2364 extent_info->file_offset,
2365 extent_info->qgroup_reserved,
2366 &key);
2367 } else {
2368 struct btrfs_ref ref = {
2369 .action = BTRFS_ADD_DELAYED_REF,
2370 .bytenr = extent_info->disk_offset,
2371 .num_bytes = extent_info->disk_len,
2372 .owning_root = btrfs_root_id(root),
2373 .ref_root = btrfs_root_id(root),
2374 };
2375 u64 ref_offset;
2376
2377 ref_offset = extent_info->file_offset - extent_info->data_offset;
2378 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2379 ret = btrfs_inc_extent_ref(trans, &ref);
2380 }
2381
2382 extent_info->insertions++;
2383
2384 return ret;
2385 }
2386
2387 /*
2388 * The respective range must have been previously locked, as well as the inode.
2389 * The end offset is inclusive (last byte of the range).
2390 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2391 * the file range with an extent.
2392 * When not punching a hole, we don't want to end up in a state where we dropped
2393 * extents without inserting a new one, so we must abort the transaction to avoid
2394 * a corruption.
2395 */
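/*
 * Note: the work is done in chunks. Each loop iteration below drops as many
 * extents as fit in the transaction's reservation (btrfs_drop_extents()
 * returns -ENOSPC when it stops early), fills holes or inserts replacement
 * extents for the dropped subrange, and then restarts the transaction.
 */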
2396 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2397 struct btrfs_path *path, const u64 start,
2398 const u64 end,
2399 struct btrfs_replace_extent_info *extent_info,
2400 struct btrfs_trans_handle **trans_out)
2401 {
2402 struct btrfs_drop_extents_args drop_args = { 0 };
2403 struct btrfs_root *root = inode->root;
2404 struct btrfs_fs_info *fs_info = root->fs_info;
2405 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2406 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2407 struct btrfs_trans_handle *trans = NULL;
2408 struct btrfs_block_rsv rsv;
2409 unsigned int rsv_count;
2410 u64 cur_offset;
2411 u64 len = end - start;
2412 int ret = 0;
2413
2414 if (end <= start)
2415 return -EINVAL;
2416
2417 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
2418 rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
2419 rsv.failfast = true;
2420
2421 /*
2422 * 1 - update the inode
2423 * 1 - removing the extents in the range
2424 * 1 - adding the hole extent if no_holes isn't set or if we are
2425 * replacing the range with a new extent
2426 */
2427 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2428 rsv_count = 3;
2429 else
2430 rsv_count = 2;
2431
2432 trans = btrfs_start_transaction(root, rsv_count);
2433 if (IS_ERR(trans)) {
2434 ret = PTR_ERR(trans);
2435 trans = NULL;
2436 goto out_release;
2437 }
2438
2439 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
2440 min_size, false);
2441 if (WARN_ON(ret))
2442 goto out_trans;
2443 trans->block_rsv = &rsv;
2444
2445 cur_offset = start;
2446 drop_args.path = path;
2447 drop_args.end = end + 1;
2448 drop_args.drop_cache = true;
2449 while (cur_offset < end) {
2450 drop_args.start = cur_offset;
2451 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2452 /* If we are punching a hole decrement the inode's byte count */
2453 if (!extent_info)
2454 btrfs_update_inode_bytes(inode, 0,
2455 drop_args.bytes_found);
2456 if (ret != -ENOSPC) {
2457 /*
2458 * The only time we don't want to abort is if we are
2459 * attempting to clone a partial inline extent, in which
2460 * case we'll get EOPNOTSUPP. However if we aren't
2461 * cloning we need to abort no matter what, because if we
2462 * got EOPNOTSUPP via prealloc then we messed up and
2463 * need to abort.
2464 */
2465 if (unlikely(ret &&
2466 (ret != -EOPNOTSUPP ||
2467 (extent_info && extent_info->is_new_extent))))
2468 btrfs_abort_transaction(trans, ret);
2469 break;
2470 }
2471
2472 trans->block_rsv = &fs_info->trans_block_rsv;
2473
2474 if (!extent_info && cur_offset < drop_args.drop_end &&
2475 cur_offset < ino_size) {
2476 ret = fill_holes(trans, inode, path, cur_offset,
2477 drop_args.drop_end);
2478 if (unlikely(ret)) {
2479 /*
2480 * If we failed then we didn't insert our hole
2481 * entries for the area we dropped, so now the
2482 * fs is corrupted, so we must abort the
2483 * transaction.
2484 */
2485 btrfs_abort_transaction(trans, ret);
2486 break;
2487 }
2488 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2489 /*
2490 * We are past the i_size here, but since we didn't
2491 * insert holes we need to clear the mapped area so we
2492 * know to not set disk_i_size in this area until a new
2493 * file extent is inserted here.
2494 */
2495 ret = btrfs_inode_clear_file_extent_range(inode,
2496 cur_offset,
2497 drop_args.drop_end - cur_offset);
2498 if (unlikely(ret)) {
2499 /*
2500 * We couldn't clear our area, so we could
2501 * presumably adjust up and corrupt the fs, so
2502 * we need to abort.
2503 */
2504 btrfs_abort_transaction(trans, ret);
2505 break;
2506 }
2507 }
2508
2509 if (extent_info &&
2510 drop_args.drop_end > extent_info->file_offset) {
2511 u64 replace_len = drop_args.drop_end -
2512 extent_info->file_offset;
2513
2514 ret = btrfs_insert_replace_extent(trans, inode, path,
2515 extent_info, replace_len,
2516 drop_args.bytes_found);
2517 if (unlikely(ret)) {
2518 btrfs_abort_transaction(trans, ret);
2519 break;
2520 }
2521 extent_info->data_len -= replace_len;
2522 extent_info->data_offset += replace_len;
2523 extent_info->file_offset += replace_len;
2524 }
2525
2526 /*
2527 * We are releasing our handle on the transaction, balance the
2528 * dirty pages of the btree inode and flush delayed items, and
2529 * then get a new transaction handle, which may now point to a
2530 * new transaction in case someone else may have committed the
2531 * transaction we used to replace/drop file extent items. So
2532 * bump the inode's iversion and update mtime and ctime unless we
2533 * are called from a dedupe context. This is because a
2534 * power failure/crash may happen after the transaction is
2535 * committed and before we finish replacing/dropping all the
2536 * file extent items we need.
2537 */
2538 inode_inc_iversion(&inode->vfs_inode);
2539
2540 if (!extent_info || extent_info->update_times)
2541 inode_set_mtime_to_ts(&inode->vfs_inode,
2542 inode_set_ctime_current(&inode->vfs_inode));
2543
2544 ret = btrfs_update_inode(trans, inode);
2545 if (ret)
2546 break;
2547
2548 btrfs_end_transaction(trans);
2549 btrfs_btree_balance_dirty(fs_info);
2550
2551 trans = btrfs_start_transaction(root, rsv_count);
2552 if (IS_ERR(trans)) {
2553 ret = PTR_ERR(trans);
2554 trans = NULL;
2555 break;
2556 }
2557
2558 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2559 &rsv, min_size, false);
2560 if (WARN_ON(ret))
2561 break;
2562 trans->block_rsv = &rsv;
2563
2564 cur_offset = drop_args.drop_end;
2565 len = end - cur_offset;
2566 if (!extent_info && len) {
2567 ret = find_first_non_hole(inode, &cur_offset, &len);
2568 if (unlikely(ret < 0))
2569 break;
2570 if (ret && !len) {
2571 ret = 0;
2572 break;
2573 }
2574 }
2575 }
2576
2577 /*
2578 * If we were cloning, force the next fsync to be a full one since
2579 * we replaced (or just dropped in the case of cloning holes when
2580 * NO_HOLES is enabled) file extent items and did not setup new extent
2581 * maps for the replacement extents (or holes).
2582 */
2583 if (extent_info && !extent_info->is_new_extent)
2584 btrfs_set_inode_full_sync(inode);
2585
2586 if (ret)
2587 goto out_trans;
2588
2589 trans->block_rsv = &fs_info->trans_block_rsv;
2590 /*
2591 * If we are using the NO_HOLES feature we might have already had a
2592 * hole that overlaps a part of the region [lockstart, lockend] and
2593 * ends at (or beyond) lockend. Since we have no file extent items to
2594 * represent holes, drop_end can be less than lockend and so we must
2595 * make sure we have an extent map representing the existing hole (the
2596 * call to btrfs_drop_extents() might have dropped the existing extent
2597 * map representing the existing hole), otherwise the fast fsync path
2598 * will not record the existence of the hole region
2599 * [existing_hole_start, lockend].
2600 */
2601 if (drop_args.drop_end <= end)
2602 drop_args.drop_end = end + 1;
2603 /*
2604 * Don't insert a file hole extent item if it's for a range beyond EOF
2605 * (because it's useless) or if it represents a zero-length range (when
2606 * cur_offset == drop_args.drop_end).
2607 */
2608 if (!extent_info && cur_offset < ino_size &&
2609 cur_offset < drop_args.drop_end) {
2610 ret = fill_holes(trans, inode, path, cur_offset,
2611 drop_args.drop_end);
2612 if (unlikely(ret)) {
2613 /* Same comment as above. */
2614 btrfs_abort_transaction(trans, ret);
2615 goto out_trans;
2616 }
2617 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2618 /* See the comment in the loop above for the reasoning here. */
2619 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2620 drop_args.drop_end - cur_offset);
2621 if (unlikely(ret)) {
2622 btrfs_abort_transaction(trans, ret);
2623 goto out_trans;
2624 }
2626 }
2627 if (extent_info) {
2628 ret = btrfs_insert_replace_extent(trans, inode, path,
2629 extent_info, extent_info->data_len,
2630 drop_args.bytes_found);
2631 if (unlikely(ret)) {
2632 btrfs_abort_transaction(trans, ret);
2633 goto out_trans;
2634 }
2635 }
2636
2637 out_trans:
2638 if (!trans)
2639 goto out_release;
2640
2641 trans->block_rsv = &fs_info->trans_block_rsv;
2642 if (ret)
2643 btrfs_end_transaction(trans);
2644 else
2645 *trans_out = trans;
2646 out_release:
2647 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
2648 return ret;
2649 }
2650
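/*
 * Punch a hole over [offset, offset + len). Illustrative userspace entry
 * point (not part of this file):
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
 */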
2651 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2652 {
2653 struct inode *inode = file_inode(file);
2654 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2655 struct btrfs_root *root = BTRFS_I(inode)->root;
2656 struct extent_state *cached_state = NULL;
2657 struct btrfs_path *path;
2658 struct btrfs_trans_handle *trans = NULL;
2659 u64 lockstart;
2660 u64 lockend;
2661 u64 tail_start;
2662 u64 tail_len;
2663 const u64 orig_start = offset;
2664 const u64 orig_end = offset + len - 1;
2665 int ret = 0;
2666 bool same_block;
2667 u64 ino_size;
2668 bool truncated_block = false;
2669 bool updated_inode = false;
2670
2671 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2672
2673 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2674 if (ret)
2675 goto out_only_mutex;
2676
2677 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2678 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2679 if (ret < 0)
2680 goto out_only_mutex;
2681 if (ret && !len) {
2682 /* Already in a large hole */
2683 ret = 0;
2684 goto out_only_mutex;
2685 }
2686
2687 ret = file_modified(file);
2688 if (ret)
2689 goto out_only_mutex;
2690
2691 lockstart = round_up(offset, fs_info->sectorsize);
2692 lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2693 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2694 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2695 /*
2696 * Only do this if we are in the same block and we aren't doing the
2697 * entire block.
2698 */
2699 if (same_block && len < fs_info->sectorsize) {
2700 if (offset < ino_size) {
2701 truncated_block = true;
2702 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
2703 orig_start, orig_end);
2704 } else {
2705 ret = 0;
2706 }
2707 goto out_only_mutex;
2708 }
2709
2710 /* zero back part of the first block */
2711 if (offset < ino_size) {
2712 truncated_block = true;
2713 ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
2714 if (ret) {
2715 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2716 return ret;
2717 }
2718 }
2719
2720 /* Check the aligned pages after the first unaligned page. If
2721 * offset != orig_start, the first unaligned page and possibly several
2722 * following pages are already inside holes, so the extra check can be
2723 * skipped. */
2724 if (offset == orig_start) {
2725 /* After truncating the page, check the hole again. */
2726 len = offset + len - lockstart;
2727 offset = lockstart;
2728 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2729 if (ret < 0)
2730 goto out_only_mutex;
2731 if (ret && !len) {
2732 ret = 0;
2733 goto out_only_mutex;
2734 }
2735 lockstart = offset;
2736 }
2737
2738 /* Check whether the unaligned tail part is in a hole. */
2739 tail_start = lockend + 1;
2740 tail_len = offset + len - tail_start;
2741 if (tail_len) {
2742 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2743 if (unlikely(ret < 0))
2744 goto out_only_mutex;
2745 if (!ret) {
2746 /* zero the front end of the last page */
2747 if (tail_start + tail_len < ino_size) {
2748 truncated_block = true;
2749 ret = btrfs_truncate_block(BTRFS_I(inode),
2750 tail_start + tail_len - 1,
2751 orig_start, orig_end);
2752 if (ret)
2753 goto out_only_mutex;
2754 }
2755 }
2756 }
2757
2758 if (lockend < lockstart) {
2759 ret = 0;
2760 goto out_only_mutex;
2761 }
2762
2763 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2764
2765 path = btrfs_alloc_path();
2766 if (!path) {
2767 ret = -ENOMEM;
2768 goto out;
2769 }
2770
2771 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2772 lockend, NULL, &trans);
2773 btrfs_free_path(path);
2774 if (ret)
2775 goto out;
2776
2777 ASSERT(trans != NULL);
2778 inode_inc_iversion(inode);
2779 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2780 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2781 updated_inode = true;
2782 btrfs_end_transaction(trans);
2783 btrfs_btree_balance_dirty(fs_info);
2784 out:
2785 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2786 &cached_state);
2787 out_only_mutex:
2788 if (!updated_inode && truncated_block && !ret) {
2789 /*
2790 * If we only end up zeroing part of a page, we still need to
2791 * update the inode item, so that all the time fields are
2792 * updated as well as the necessary in-memory btrfs inode fields
2793 * for detecting, at fsync time, if the inode isn't yet in the
2794 * log tree or it's there but not up to date.
2795 */
2796 struct timespec64 now = inode_set_ctime_current(inode);
2797
2798 inode_inc_iversion(inode);
2799 inode_set_mtime_to_ts(inode, now);
2800 trans = btrfs_start_transaction(root, 1);
2801 if (IS_ERR(trans)) {
2802 ret = PTR_ERR(trans);
2803 } else {
2804 int ret2;
2805
2806 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2807 ret2 = btrfs_end_transaction(trans);
2808 if (!ret)
2809 ret = ret2;
2810 }
2811 }
2812 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2813 return ret;
2814 }
2815
2816 /* Helper structure to record which range is already reserved */
2817 struct falloc_range {
2818 struct list_head list;
2819 u64 start;
2820 u64 len;
2821 };
2822
2823 /*
2824 * Helper function to add a falloc range
2825 *
2826 * Caller should have locked a larger extent range containing
2827 * [start, start + len)
2828 */
2829 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2830 {
2831 struct falloc_range *range = NULL;
2832
2833 if (!list_empty(head)) {
2834 /*
2835 * As fallocate iterates in increasing file offset order, we only
2836 * need to check the last range.
2837 */
2838 range = list_last_entry(head, struct falloc_range, list);
2839 if (range->start + range->len == start) {
2840 range->len += len;
2841 return 0;
2842 }
2843 }
2844
2845 range = kmalloc_obj(*range);
2846 if (!range)
2847 return -ENOMEM;
2848 range->start = start;
2849 range->len = len;
2850 list_add_tail(&range->list, head);
2851 return 0;
2852 }
2853
2854 static int btrfs_fallocate_update_isize(struct inode *inode,
2855 const u64 end,
2856 const int mode)
2857 {
2858 struct btrfs_trans_handle *trans;
2859 struct btrfs_root *root = BTRFS_I(inode)->root;
2860 u64 range_start;
2861 u64 range_end;
2862 int ret;
2863 int ret2;
2864
2865 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2866 return 0;
2867
2868 range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
2869 range_end = round_up(end, root->fs_info->sectorsize);
2870
2871 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
2872 range_end - range_start);
2873 if (ret)
2874 return ret;
2875
2876 trans = btrfs_start_transaction(root, 1);
2877 if (IS_ERR(trans))
2878 return PTR_ERR(trans);
2879
2880 inode_set_ctime_current(inode);
2881 i_size_write(inode, end);
2882 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2883 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2884 ret2 = btrfs_end_transaction(trans);
2885
2886 return ret ? ret : ret2;
2887 }
2888
2889 enum {
2890 RANGE_BOUNDARY_WRITTEN_EXTENT,
2891 RANGE_BOUNDARY_PREALLOC_EXTENT,
2892 RANGE_BOUNDARY_HOLE,
2893 };
2894
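/*
 * Classify the block containing @offset as backed by a hole, a prealloc
 * extent or a written extent, so that btrfs_zero_range() can decide between
 * extending its allocation range and zeroing the partial block in place.
 */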
2895 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2896 u64 offset)
2897 {
2898 const u64 sectorsize = inode->root->fs_info->sectorsize;
2899 struct extent_map *em;
2900 int ret;
2901
2902 offset = round_down(offset, sectorsize);
2903 em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2904 if (IS_ERR(em))
2905 return PTR_ERR(em);
2906
2907 if (em->disk_bytenr == EXTENT_MAP_HOLE)
2908 ret = RANGE_BOUNDARY_HOLE;
2909 else if (em->flags & EXTENT_FLAG_PREALLOC)
2910 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2911 else
2912 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2913
2914 btrfs_free_extent_map(em);
2915 return ret;
2916 }
2917
2918 static int btrfs_zero_range(struct inode *inode,
2919 loff_t offset,
2920 loff_t len,
2921 const int mode)
2922 {
2923 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2924 struct extent_map *em;
2925 struct extent_changeset *data_reserved = NULL;
2926 int ret;
2927 u64 alloc_hint = 0;
2928 const u64 sectorsize = fs_info->sectorsize;
2929 const u64 orig_start = offset;
2930 const u64 orig_end = offset + len - 1;
2931 u64 alloc_start = round_down(offset, sectorsize);
2932 u64 alloc_end = round_up(offset + len, sectorsize);
2933 u64 bytes_to_reserve = 0;
2934 bool space_reserved = false;
2935
2936 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
2937 alloc_end - alloc_start);
2938 if (IS_ERR(em)) {
2939 ret = PTR_ERR(em);
2940 goto out;
2941 }
2942
2943 /*
2944 * Avoid hole punching and extent allocation for some cases. More cases
2945 * could be considered, but these are unlikely to be common and we keep
2946 * things as simple as possible for now. Also, intentionally, if the target
2947 * range contains one or more prealloc extents together with regular
2948 * extents and holes, we drop all the existing extents and allocate a
2949 * new prealloc extent, so that we get a larger contiguous disk extent.
2950 */
2951 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2952 const u64 em_end = btrfs_extent_map_end(em);
2953
2954 if (em_end >= offset + len) {
2955 /*
2956 * The whole range is already a prealloc extent,
2957 * do nothing except updating the inode's i_size if
2958 * needed.
2959 */
2960 btrfs_free_extent_map(em);
2961 ret = btrfs_fallocate_update_isize(inode, offset + len,
2962 mode);
2963 goto out;
2964 }
2965 /*
2966 * Part of the range is already a prealloc extent, so operate
2967 * only on the remaining part of the range.
2968 */
2969 alloc_start = em_end;
2970 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2971 len = offset + len - alloc_start;
2972 offset = alloc_start;
2973 alloc_hint = btrfs_extent_map_block_start(em) + em->len;
2974 }
2975 btrfs_free_extent_map(em);
2976
2977 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2978 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2979 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
2980 if (IS_ERR(em)) {
2981 ret = PTR_ERR(em);
2982 goto out;
2983 }
2984
2985 if (em->flags & EXTENT_FLAG_PREALLOC) {
2986 btrfs_free_extent_map(em);
2987 ret = btrfs_fallocate_update_isize(inode, offset + len,
2988 mode);
2989 goto out;
2990 }
2991 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
2992 btrfs_free_extent_map(em);
2993 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
2994 orig_start, orig_end);
2995 if (!ret)
2996 ret = btrfs_fallocate_update_isize(inode,
2997 offset + len,
2998 mode);
2999 return ret;
3000 }
3001 btrfs_free_extent_map(em);
3002 alloc_start = round_down(offset, sectorsize);
3003 alloc_end = alloc_start + sectorsize;
3004 goto reserve_space;
3005 }
3006
3007 alloc_start = round_up(offset, sectorsize);
3008 alloc_end = round_down(offset + len, sectorsize);
3009
3010 /*
3011 * For unaligned ranges, check the pages at the boundaries, they might
3012 * map to an extent, in which case we need to partially zero them, or
3013 * they might map to a hole, in which case we need our allocation range
3014 * to cover them.
3015 */
3016 if (!IS_ALIGNED(offset, sectorsize)) {
3017 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3018 offset);
3019 if (ret < 0)
3020 goto out;
3021 if (ret == RANGE_BOUNDARY_HOLE) {
3022 alloc_start = round_down(offset, sectorsize);
3023 ret = 0;
3024 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3025 ret = btrfs_truncate_block(BTRFS_I(inode), offset,
3026 orig_start, orig_end);
3027 if (ret)
3028 goto out;
3029 } else {
3030 ret = 0;
3031 }
3032 }
3033
3034 if (!IS_ALIGNED(offset + len, sectorsize)) {
3035 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3036 offset + len);
3037 if (ret < 0)
3038 goto out;
3039 if (ret == RANGE_BOUNDARY_HOLE) {
3040 alloc_end = round_up(offset + len, sectorsize);
3041 ret = 0;
3042 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3043 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
3044 orig_start, orig_end);
3045 if (ret)
3046 goto out;
3047 } else {
3048 ret = 0;
3049 }
3050 }
3051
3052 reserve_space:
3053 if (alloc_start < alloc_end) {
3054 struct extent_state *cached_state = NULL;
3055 const u64 lockstart = alloc_start;
3056 const u64 lockend = alloc_end - 1;
3057
3058 bytes_to_reserve = alloc_end - alloc_start;
3059 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3060 bytes_to_reserve);
3061 if (ret < 0)
3062 goto out;
3063 space_reserved = true;
3064 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3065 &cached_state);
3066 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3067 alloc_start, bytes_to_reserve);
3068 if (ret) {
3069 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
3070 lockend, &cached_state);
3071 goto out;
3072 }
3073 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3074 alloc_end - alloc_start,
3075 fs_info->sectorsize,
3076 offset + len, &alloc_hint);
3077 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3078 &cached_state);
3079 /* btrfs_prealloc_file_range releases reserved space on error */
3080 if (ret) {
3081 space_reserved = false;
3082 goto out;
3083 }
3084 }
3085 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3086 out:
3087 if (ret && space_reserved)
3088 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3089 alloc_start, bytes_to_reserve);
3090 extent_changeset_free(data_reserved);
3091
3092 return ret;
3093 }
3094
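/*
 * Entry point for fallocate(2). Illustrative userspace calls reaching the
 * branches below (not part of this file):
 *
 *	fallocate(fd, 0, offset, len);				// preallocate
 *	fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);	// btrfs_zero_range()
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE |
 *		      FALLOC_FL_KEEP_SIZE, offset, len);	// btrfs_punch_hole()
 */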
3095 static long btrfs_fallocate(struct file *file, int mode,
3096 loff_t offset, loff_t len)
3097 {
3098 struct inode *inode = file_inode(file);
3099 struct extent_state *cached_state = NULL;
3100 struct extent_changeset *data_reserved = NULL;
3101 struct falloc_range *range;
3102 struct falloc_range *tmp;
3103 LIST_HEAD(reserve_list);
3104 u64 cur_offset;
3105 u64 last_byte;
3106 u64 alloc_start;
3107 u64 alloc_end;
3108 u64 alloc_hint = 0;
3109 u64 locked_end;
3110 u64 actual_end = 0;
3111 u64 data_space_needed = 0;
3112 u64 data_space_reserved = 0;
3113 u64 qgroup_reserved = 0;
3114 struct extent_map *em;
3115 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3116 int ret;
3117
3118 if (btrfs_is_shutdown(inode_to_fs_info(inode)))
3119 return -EIO;
3120
3121 /* Do not allow fallocate in ZONED mode */
3122 if (btrfs_is_zoned(inode_to_fs_info(inode)))
3123 return -EOPNOTSUPP;
3124
3125 alloc_start = round_down(offset, blocksize);
3126 alloc_end = round_up(offset + len, blocksize);
3127 cur_offset = alloc_start;
3128
3129 /* Make sure we aren't being given some crap mode. */
3130 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3131 FALLOC_FL_ZERO_RANGE))
3132 return -EOPNOTSUPP;
3133
3134 if (mode & FALLOC_FL_PUNCH_HOLE)
3135 return btrfs_punch_hole(file, offset, len);
3136
3137 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3138
3139 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3140 ret = inode_newsize_ok(inode, offset + len);
3141 if (ret)
3142 goto out;
3143 }
3144
3145 ret = file_modified(file);
3146 if (ret)
3147 goto out;
3148
3149 /*
3150 * TODO: Move these two operations after we have checked the
3151 * accurate reserved space, or fallocate can still fail but
3152 * with the page truncated or the size expanded.
3153 *
3154 * But that's a minor problem and won't do much harm anyway.
3155 */
3156 if (alloc_start > inode->i_size) {
3157 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3158 alloc_start);
3159 if (ret)
3160 goto out;
3161 } else if (offset + len > inode->i_size) {
3162 /*
3163 * If we are fallocating from the end of the file onward we
3164 * need to zero out the end of the block if i_size lands in the
3165 * middle of a block.
3166 */
3167 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
3168 inode->i_size, (u64)-1);
3169 if (ret)
3170 goto out;
3171 }
3172
3173 /*
3174 * We have locked the inode at the VFS level (in exclusive mode) and we
3175 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3176 * locking the file range, flush all delalloc in the range and wait for
3177 * all ordered extents in the range to complete. After this we can lock
3178 * the file range and, due to the previous locking we did, we know there
3179 * can't be more delalloc or ordered extents in the range.
3180 */
3181 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
3182 alloc_end - alloc_start);
3183 if (ret)
3184 goto out;
3185
3186 if (mode & FALLOC_FL_ZERO_RANGE) {
3187 ret = btrfs_zero_range(inode, offset, len, mode);
3188 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3189 return ret;
3190 }
3191
3192 locked_end = alloc_end - 1;
3193 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3194 &cached_state);
3195
3196 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3197
3198 /* First, check if we exceed the qgroup limit */
3199 while (cur_offset < alloc_end) {
3200 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3201 alloc_end - cur_offset);
3202 if (IS_ERR(em)) {
3203 ret = PTR_ERR(em);
3204 break;
3205 }
3206 last_byte = min(btrfs_extent_map_end(em), alloc_end);
3207 actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
3208 last_byte = ALIGN(last_byte, blocksize);
3209 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
3210 (cur_offset >= inode->i_size &&
3211 !(em->flags & EXTENT_FLAG_PREALLOC))) {
3212 const u64 range_len = last_byte - cur_offset;
3213
3214 ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3215 if (ret < 0) {
3216 btrfs_free_extent_map(em);
3217 break;
3218 }
3219 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3220 &data_reserved, cur_offset, range_len);
3221 if (ret < 0) {
3222 btrfs_free_extent_map(em);
3223 break;
3224 }
3225 qgroup_reserved += range_len;
3226 data_space_needed += range_len;
3227 }
3228 btrfs_free_extent_map(em);
3229 cur_offset = last_byte;
3230 }
3231
3232 if (!ret && data_space_needed > 0) {
3233 /*
3234 * We are safe to reserve space here as we can't have delalloc
3235 * in the range, see above.
3236 */
3237 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3238 data_space_needed);
3239 if (!ret)
3240 data_space_reserved = data_space_needed;
3241 }
3242
3243 /*
3244 * If ret is still 0, it means we're OK to fallocate.
3245 * Otherwise just clean up the list and exit.
3246 */
3247 list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3248 if (!ret) {
3249 ret = btrfs_prealloc_file_range(inode, mode,
3250 range->start,
3251 range->len, blocksize,
3252 offset + len, &alloc_hint);
3253 /*
3254 * btrfs_prealloc_file_range() releases space even
3255 * if it returns an error.
3256 */
3257 data_space_reserved -= range->len;
3258 qgroup_reserved -= range->len;
3259 } else if (data_space_reserved > 0) {
3260 btrfs_free_reserved_data_space(BTRFS_I(inode),
3261 data_reserved, range->start,
3262 range->len);
3263 data_space_reserved -= range->len;
3264 qgroup_reserved -= range->len;
3265 } else if (qgroup_reserved > 0) {
3266 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3267 range->start, range->len, NULL);
3268 qgroup_reserved -= range->len;
3269 }
3270 list_del(&range->list);
3271 kfree(range);
3272 }
3273 if (ret < 0)
3274 goto out_unlock;
3275
3276 /*
3277 * We didn't need to allocate any more space, but we still extended the
3278 * size of the file so we need to update i_size and the inode item.
3279 */
3280 ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3281 out_unlock:
3282 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3283 &cached_state);
3284 out:
3285 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3286 extent_changeset_free(data_reserved);
3287 return ret;
3288 }
3289
3290 /*
3291 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3292 * that has unflushed and/or flushing delalloc. There might be other adjacent
3293 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3294 * looping while it gets adjacent subranges, merging them together.
3295 */
3296 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3297 struct extent_state **cached_state,
3298 bool *search_io_tree,
3299 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3300 {
3301 u64 len = end + 1 - start;
3302 u64 delalloc_len = 0;
3303 struct btrfs_ordered_extent *oe;
3304 u64 oe_start;
3305 u64 oe_end;
3306
3307 /*
3308 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3309 * means we have delalloc (dirty pages) for which writeback has not
3310 * started yet.
3311 */
3312 if (*search_io_tree) {
3313 spin_lock(&inode->lock);
3314 if (inode->delalloc_bytes > 0) {
3315 spin_unlock(&inode->lock);
3316 *delalloc_start_ret = start;
3317 delalloc_len = btrfs_count_range_bits(&inode->io_tree,
3318 delalloc_start_ret, end,
3319 len, EXTENT_DELALLOC, 1,
3320 cached_state);
3321 } else {
3322 spin_unlock(&inode->lock);
3323 }
3324 }
3325
3326 if (delalloc_len > 0) {
3327 /*
3328 * If delalloc was found then *delalloc_start_ret has a sector size
3329 * aligned value (rounded down).
3330 */
3331 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3332
3333 if (*delalloc_start_ret == start) {
3334 /* Delalloc for the whole range, nothing more to do. */
3335 if (*delalloc_end_ret == end)
3336 return true;
3337 /* Else trim our search range for ordered extents. */
3338 start = *delalloc_end_ret + 1;
3339 len = end + 1 - start;
3340 }
3341 } else {
3342 /* No delalloc, future calls don't need to search again. */
3343 *search_io_tree = false;
3344 }
3345
3346 /*
3347 * Now also check if there's any ordered extent in the range.
3348 * We do this because:
3349 *
3350 * 1) When delalloc is flushed, the file range is locked, we clear the
3351 * EXTENT_DELALLOC bit from the io tree and create an extent map and
3352 * an ordered extent for the write. So we might just have been called
3353 * after delalloc is flushed and before the ordered extent completes
3354 * and inserts the new file extent item in the subvolume's btree;
3355 *
3356 * 2) We may have an ordered extent created by flushing delalloc for a
3357 * subrange that starts before the subrange we found marked with
3358 * EXTENT_DELALLOC in the io tree.
3359 *
3360 * We could also use the extent map tree to find such delalloc that is
3361 * being flushed, but using the ordered extents tree is more efficient
3362 * because it's usually much smaller as ordered extents are removed from
3363 * the tree once they complete. With the extent maps, we may have them
3364 * in the extent map tree for a very long time, and they were either
3365 * created by previous writes or loaded by read operations.
3366 */
3367 oe = btrfs_lookup_first_ordered_range(inode, start, len);
3368 if (!oe)
3369 return (delalloc_len > 0);
3370
3371 /* The ordered extent may span beyond our search range. */
3372 oe_start = max(oe->file_offset, start);
3373 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3374
3375 btrfs_put_ordered_extent(oe);
3376
3377 /* Don't have unflushed delalloc, return the ordered extent range. */
3378 if (delalloc_len == 0) {
3379 *delalloc_start_ret = oe_start;
3380 *delalloc_end_ret = oe_end;
3381 return true;
3382 }
3383
3384 /*
3385 * We have both unflushed delalloc (io_tree) and an ordered extent.
3386 * If the ranges are adjacent, return a combined range; otherwise
3387 * return the leftmost range.
3388 */
3389 if (oe_start < *delalloc_start_ret) {
3390 if (oe_end < *delalloc_start_ret)
3391 *delalloc_end_ret = oe_end;
3392 *delalloc_start_ret = oe_start;
3393 } else if (*delalloc_end_ret + 1 == oe_start) {
3394 *delalloc_end_ret = oe_end;
3395 }
3396
3397 return true;
3398 }
3399
3400 /*
3401 * Check if there's delalloc in a given range.
3402 *
3403 * @inode: The inode.
3404 * @start: The start offset of the range. It does not need to be
3405 * sector size aligned.
3406 * @end: The end offset (inclusive value) of the search range.
3407 * It does not need to be sector size aligned.
3408 * @cached_state: Extent state record used for speeding up delalloc
3409 * searches in the inode's io_tree. Can be NULL.
3410 * @delalloc_start_ret: Output argument, set to the start offset of the
3411 * subrange found with delalloc (may not be sector size
3412 * aligned).
3413 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
3414 * of the subrange found with delalloc.
3415 *
3416 * Returns true if a subrange with delalloc is found within the given range, and
3417 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3418 * end offsets of the subrange.
3419 */
3420 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3421 struct extent_state **cached_state,
3422 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3423 {
3424 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3425 u64 prev_delalloc_end = 0;
3426 bool search_io_tree = true;
3427 bool ret = false;
3428
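	/*
	 * Iterate over the whole search range, merging adjacent delalloc
	 * subranges found in the io tree or the ordered extents tree into a
	 * single contiguous range, and stop at the first gap.
	 */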
3429 while (cur_offset <= end) {
3430 u64 delalloc_start;
3431 u64 delalloc_end;
3432 bool delalloc;
3433
3434 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3435 cached_state, &search_io_tree,
3436 &delalloc_start,
3437 &delalloc_end);
3438 if (!delalloc)
3439 break;
3440
3441 if (prev_delalloc_end == 0) {
3442 /* First subrange found. */
3443 *delalloc_start_ret = max(delalloc_start, start);
3444 *delalloc_end_ret = delalloc_end;
3445 ret = true;
3446 } else if (delalloc_start == prev_delalloc_end + 1) {
3447 /* Subrange adjacent to the previous one, merge them. */
3448 *delalloc_end_ret = delalloc_end;
3449 } else {
3450 /* Subrange not adjacent to the previous one, exit. */
3451 break;
3452 }
3453
3454 prev_delalloc_end = delalloc_end;
3455 cur_offset = delalloc_end + 1;
3456 cond_resched();
3457 }
3458
3459 return ret;
3460 }
3461
3462 /*
3463 * Check if there's a hole or delalloc range in a range representing a hole (or
3464 * prealloc extent) found in the inode's subvolume btree.
3465 *
3466 * @inode: The inode.
3467 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3468 * @start: Start offset of the hole region. It does not need to be sector
3469 * size aligned.
3470 * @end: End offset (inclusive value) of the hole region. It does not
3471 * need to be sector size aligned.
3472 * @start_ret: Return parameter, used to set the start of the subrange in the
3473 * hole that matches the search criteria (seek mode), if such a
3474 * subrange is found (return value of the function is true).
3475 * The value returned here may not be sector size aligned.
3476 *
3477 * Returns true if a subrange matching the given seek mode is found, and if one
3478 * is found, it updates @start_ret with the start of the subrange.
3479 */
3480 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3481 struct extent_state **cached_state,
3482 u64 start, u64 end, u64 *start_ret)
3483 {
3484 u64 delalloc_start;
3485 u64 delalloc_end;
3486 bool delalloc;
3487
3488 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3489 &delalloc_start, &delalloc_end);
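	/* When seeking data, any delalloc subrange found counts as data. */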
3490 if (delalloc && whence == SEEK_DATA) {
3491 *start_ret = delalloc_start;
3492 return true;
3493 }
3494
3495 if (delalloc && whence == SEEK_HOLE) {
3496 /*
3497 * We found delalloc; if it starts after our start offset, we have
3498 * a hole between our start offset and the delalloc start.
3499 */
3500 if (start < delalloc_start) {
3501 *start_ret = start;
3502 return true;
3503 }
3504 /*
3505 * Delalloc range starts at our start offset.
3506 * If the delalloc range's length is smaller than our range,
3507 * then it means we have a hole that starts where the delalloc
3508 * subrange ends.
3509 */
3510 if (delalloc_end < end) {
3511 *start_ret = delalloc_end + 1;
3512 return true;
3513 }
3514
3515 /* There's delalloc for the whole range. */
3516 return false;
3517 }
3518
3519 if (!delalloc && whence == SEEK_HOLE) {
3520 *start_ret = start;
3521 return true;
3522 }
3523
3524 /*
3525 * No delalloc in the range and we are seeking for data. The caller has
3526 * to iterate to the next extent item in the subvolume btree.
3527 */
3528 return false;
3529 }
3530
3531 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3532 {
3533 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3534 struct btrfs_file_private *private;
3535 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3536 struct extent_state *cached_state = NULL;
3537 struct extent_state **delalloc_cached_state;
3538 const loff_t i_size = i_size_read(&inode->vfs_inode);
3539 const u64 ino = btrfs_ino(inode);
3540 struct btrfs_root *root = inode->root;
3541 struct btrfs_path *path;
3542 struct btrfs_key key;
3543 u64 last_extent_end;
3544 u64 lockstart;
3545 u64 lockend;
3546 u64 start;
3547 int ret;
3548 bool found = false;
3549
3550 if (i_size == 0 || offset >= i_size)
3551 return -ENXIO;
3552
3553 /*
3554 * Quick path. If the inode has no prealloc extents and its number of
3555 * bytes used matches its i_size, then it cannot have holes.
3556 */
3557 if (whence == SEEK_HOLE &&
3558 !(inode->flags & BTRFS_INODE_PREALLOC) &&
3559 inode_get_bytes(&inode->vfs_inode) == i_size)
3560 return i_size;
3561
3562 spin_lock(&inode->lock);
3563 private = file->private_data;
3564 spin_unlock(&inode->lock);
3565
3566 if (private && private->owner_task != current) {
3567 /*
3568 * Not allocated by us, so don't use it: its cached state is used
3569 * by the task that allocated it, and we neither want to mess with
3570 * it nor get incorrect results, because it reflects a state that is
3571 * invalid for the current task.
3572 */
3573 private = NULL;
3574 } else if (!private) {
3575 private = kzalloc_obj(*private);
3576 /*
3577 * No worries if the memory allocation fails.
3578 * The private structure is only used to speed up multiple lseek
3579 * SEEK_HOLE/SEEK_DATA calls on a file when there's delalloc, so
3580 * everything will still be correct without it.
3581 */
3582 if (private) {
3583 bool free = false;
3584
3585 private->owner_task = current;
3586
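			/*
			 * Publish the new private structure under the inode's
			 * lock. If another task raced with us and installed
			 * one first, free ours.
			 */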
3587 spin_lock(&inode->lock);
3588 if (file->private_data)
3589 free = true;
3590 else
3591 file->private_data = private;
3592 spin_unlock(&inode->lock);
3593
3594 if (free) {
3595 kfree(private);
3596 private = NULL;
3597 }
3598 }
3599 }
3600
3601 if (private)
3602 delalloc_cached_state = &private->llseek_cached_state;
3603 else
3604 delalloc_cached_state = NULL;
3605
3606 /*
3607 * offset can be negative, in which case we start looking for DATA/HOLE
3608 * from the very start of the file.
3609 */
3610 start = max_t(loff_t, 0, offset);
3611
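	/*
	 * Lock the sector-aligned range from our start offset up to i_size,
	 * making sure it spans at least one sector.
	 */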
3612 lockstart = round_down(start, fs_info->sectorsize);
3613 lockend = round_up(i_size, fs_info->sectorsize);
3614 if (lockend <= lockstart)
3615 lockend = lockstart + fs_info->sectorsize;
3616 lockend--;
3617
3618 path = btrfs_alloc_path();
3619 if (!path)
3620 return -ENOMEM;
3621 path->reada = READA_FORWARD;
3622
3623 key.objectid = ino;
3624 key.type = BTRFS_EXTENT_DATA_KEY;
3625 key.offset = start;
3626
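	/*
	 * Track where the last extent item seen ends, so that a gap between
	 * it and the next item can be detected as an implicit hole (possible
	 * with the NO_HOLES feature).
	 */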
3627 last_extent_end = lockstart;
3628
3629 btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3630
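	/*
	 * Look up the first file extent item at or after @start. If the
	 * search lands past it, step back one slot in case the previous item
	 * is a file extent item of this inode that covers @start.
	 */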
3631 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3632 if (ret < 0) {
3633 goto out;
3634 } else if (ret > 0 && path->slots[0] > 0) {
3635 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3636 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3637 path->slots[0]--;
3638 }
3639
3640 while (start < i_size) {
3641 struct extent_buffer *leaf = path->nodes[0];
3642 struct btrfs_file_extent_item *extent;
3643 u64 extent_end;
3644 u8 type;
3645
3646 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3647 ret = btrfs_next_leaf(root, path);
3648 if (ret < 0)
3649 goto out;
3650 else if (ret > 0)
3651 break;
3652
3653 leaf = path->nodes[0];
3654 }
3655
3656 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3657 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3658 break;
3659
3660 extent_end = btrfs_file_extent_end(path);
3661
3662 /*
3663 * In the first iteration we may have a slot that points to an
3664 * extent that ends before our start offset, so skip it.
3665 */
3666 if (extent_end <= start) {
3667 path->slots[0]++;
3668 continue;
3669 }
3670
3671 /* We have an implicit hole, NO_HOLES feature is likely set. */
3672 if (last_extent_end < key.offset) {
3673 u64 search_start = last_extent_end;
3674 u64 found_start;
3675
3676 /*
3677 * First iteration, @start matches @offset and it's
3678 * within the hole.
3679 */
3680 if (start == offset)
3681 search_start = offset;
3682
3683 found = find_desired_extent_in_hole(inode, whence,
3684 delalloc_cached_state,
3685 search_start,
3686 key.offset - 1,
3687 &found_start);
3688 if (found) {
3689 start = found_start;
3690 break;
3691 }
3692 /*
3693 * Didn't find data or a hole (due to delalloc) in the
3694 * implicit hole range, so we need to analyze the extent item.
3695 */
3696 }
3697
3698 extent = btrfs_item_ptr(leaf, path->slots[0],
3699 struct btrfs_file_extent_item);
3700 type = btrfs_file_extent_type(leaf, extent);
3701
3702 /*
3703 * Can't access the extent's disk_bytenr field if this is an
3704 * inline extent, since that offset is where the extent
3705 * data starts.
3706 */
3707 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3708 (type == BTRFS_FILE_EXTENT_REG &&
3709 btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3710 /*
3711 * Explicit hole or prealloc extent, search for delalloc.
3712 * A prealloc extent is treated like a hole.
3713 */
3714 u64 search_start = key.offset;
3715 u64 found_start;
3716
3717 /*
3718 * First iteration, @start matches @offset and it's
3719 * within the hole.
3720 */
3721 if (start == offset)
3722 search_start = offset;
3723
3724 found = find_desired_extent_in_hole(inode, whence,
3725 delalloc_cached_state,
3726 search_start,
3727 extent_end - 1,
3728 &found_start);
3729 if (found) {
3730 start = found_start;
3731 break;
3732 }
3733 /*
3734 * Didn't find data or a hole (due to delalloc) in the
3735 * explicit hole or prealloc range, so we need to analyze
3736 * the next extent item.
3737 */
3738 } else {
3739 /*
3740 * Found a regular or inline extent.
3741 * If we are seeking for data, adjust the start offset
3742 * and stop, we're done.
3743 */
3744 if (whence == SEEK_DATA) {
3745 start = max_t(u64, key.offset, offset);
3746 found = true;
3747 break;
3748 }
3749 /*
3750 * Else, we are seeking for a hole, check the next file
3751 * extent item.
3752 */
3753 }
3754
3755 start = extent_end;
3756 last_extent_end = extent_end;
3757 path->slots[0]++;
3758 if (fatal_signal_pending(current)) {
3759 ret = -EINTR;
3760 goto out;
3761 }
3762 cond_resched();
3763 }
3764
3765 /* We have an implicit hole from the last extent found up to i_size. */
3766 if (!found && start < i_size) {
3767 found = find_desired_extent_in_hole(inode, whence,
3768 delalloc_cached_state, start,
3769 i_size - 1, &start);
3770 if (!found)
3771 start = i_size;
3772 }
3773
3774 out:
3775 btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3776 btrfs_free_path(path);
3777
3778 if (ret < 0)
3779 return ret;
3780
3781 if (whence == SEEK_DATA && start >= i_size)
3782 return -ENXIO;
3783
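	/* For SEEK_HOLE, clamp to i_size, as extents may extend beyond it. */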
3784 return min_t(loff_t, start, i_size);
3785 }
3786
3787 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3788 {
3789 struct inode *inode = file->f_mapping->host;
3790
3791 switch (whence) {
3792 default:
3793 return generic_file_llseek(file, offset, whence);
3794 case SEEK_DATA:
3795 case SEEK_HOLE:
3796 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3797 offset = find_desired_extent(file, offset, whence);
3798 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3799 break;
3800 }
3801
3802 if (offset < 0)
3803 return offset;
3804
3805 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3806 }
3807
3808 static int btrfs_file_open(struct inode *inode, struct file *filp)
3809 {
3810 int ret;
3811
3812 if (btrfs_is_shutdown(inode_to_fs_info(inode)))
3813 return -EIO;
3814
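	/* Support non-blocking (RWF_NOWAIT) IO and O_DIRECT on this file. */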
3815 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3816
3817 ret = fsverity_file_open(inode, filp);
3818 if (ret)
3819 return ret;
3820 return generic_file_open(inode, filp);
3821 }
3822
3823 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3824 {
3825 ssize_t ret = 0;
3826
3827 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))
3828 return -EIO;
3829
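	/*
	 * For direct IO, if the read didn't consume the whole iterator and we
	 * aren't at EOF, fall back to buffered IO for the remainder, passing
	 * along the number of bytes already read.
	 */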
3830 if (iocb->ki_flags & IOCB_DIRECT) {
3831 ret = btrfs_direct_read(iocb, to);
3832 if (ret < 0 || !iov_iter_count(to) ||
3833 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3834 return ret;
3835 }
3836
3837 return filemap_read(iocb, to, ret);
3838 }
3839
3840 static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
3841 struct pipe_inode_info *pipe,
3842 size_t len, unsigned int flags)
3843 {
3844 if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))
3845 return -EIO;
3846
3847 return filemap_splice_read(in, ppos, pipe, len, flags);
3848 }
3849
3850 const struct file_operations btrfs_file_operations = {
3851 .llseek = btrfs_file_llseek,
3852 .read_iter = btrfs_file_read_iter,
3853 .splice_read = btrfs_file_splice_read,
3854 .write_iter = btrfs_file_write_iter,
3855 .splice_write = iter_file_splice_write,
3856 .mmap_prepare = btrfs_file_mmap_prepare,
3857 .open = btrfs_file_open,
3858 .release = btrfs_release_file,
3859 .get_unmapped_area = thp_get_unmapped_area,
3860 .fsync = btrfs_sync_file,
3861 .fallocate = btrfs_fallocate,
3862 .unlocked_ioctl = btrfs_ioctl,
3863 #ifdef CONFIG_COMPAT
3864 .compat_ioctl = btrfs_compat_ioctl,
3865 #endif
3866 .remap_file_range = btrfs_remap_file_range,
3867 .uring_cmd = btrfs_uring_cmd,
3868 .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3869 .setlease = generic_setlease,
3870 };
3871
3872 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3873 {
3874 struct address_space *mapping = inode->vfs_inode.i_mapping;
3875 int ret;
3876
3877 /*
3878 * So with compression we will find and lock a dirty page and clear the
3879 * first one as dirty, setup an async extent, and immediately return
3880 * with the entire range locked but with nobody actually marked with
3881 * writeback. So we can't just filemap_write_and_wait_range() and
3882 * expect it to work since it will just kick off a thread to do the
3883 * actual work. So we need to call filemap_fdatawrite_range _again_
3884 * since it will wait on the page lock, which won't be unlocked until
3885 * after the pages have been marked as writeback and so we're good to go
3886 * from there. We have to do this otherwise we'll miss the ordered
3887 * extents and that results in badness. Please Josef, do not think you
3888 * know better and pull this out at some point in the future, it is
3889 * right and you are wrong.
3890 */
3891 ret = filemap_fdatawrite_range(mapping, start, end);
3892 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3893 ret = filemap_fdatawrite_range(mapping, start, end);
3894
3895 return ret;
3896 }
3897