1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/writeback.h>
14 #include <linux/compat.h>
15 #include <linux/slab.h>
16 #include <linux/btrfs.h>
17 #include <linux/uio.h>
18 #include <linux/iversion.h>
19 #include <linux/fsverity.h>
20 #include "ctree.h"
21 #include "direct-io.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "btrfs_inode.h"
25 #include "tree-log.h"
26 #include "locking.h"
27 #include "qgroup.h"
28 #include "compression.h"
29 #include "delalloc-space.h"
30 #include "reflink.h"
31 #include "subpage.h"
32 #include "fs.h"
33 #include "accessors.h"
34 #include "extent-tree.h"
35 #include "file-item.h"
36 #include "ioctl.h"
37 #include "file.h"
38 #include "super.h"
39 #include "print-tree.h"
40
41 /*
42 * Unlock folio after btrfs_file_write() is done with it.
43 */
static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
			     u64 pos, u64 copied)
46 {
47 u64 block_start = round_down(pos, fs_info->sectorsize);
48 u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
49
50 ASSERT(block_len <= U32_MAX);
	/*
	 * Folio checked is some magic around finding folios that have been
	 * modified without going through btrfs_dirty_folio(). Clear it here.
	 * There should be no need to mark the folio accessed, as
	 * prepare_one_folio() should have already marked it accessed when it
	 * got the folio from the page cache.
	 */
58 btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
59 folio_unlock(folio);
60 folio_put(folio);
61 }
62
63 /*
64 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
65 * - Mark newly dirtied folio as DELALLOC in the io tree.
66 * Used to advise which range is to be written back.
67 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
68 * - Update inode size for past EOF write
69 */
int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
		      size_t write_bytes, struct extent_state **cached, bool noreserve)
72 {
73 struct btrfs_fs_info *fs_info = inode->root->fs_info;
74 int ret = 0;
75 u64 num_bytes;
76 u64 start_pos;
77 u64 end_of_last_block;
78 u64 end_pos = pos + write_bytes;
79 loff_t isize = i_size_read(&inode->vfs_inode);
80 unsigned int extra_bits = 0;
81
82 if (write_bytes == 0)
83 return 0;
84
85 if (noreserve)
86 extra_bits |= EXTENT_NORESERVE;
87
88 start_pos = round_down(pos, fs_info->sectorsize);
89 num_bytes = round_up(write_bytes + pos - start_pos,
90 fs_info->sectorsize);
91 ASSERT(num_bytes <= U32_MAX);
92 ASSERT(folio_pos(folio) <= pos &&
93 folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
94
95 end_of_last_block = start_pos + num_bytes - 1;
96
97 /*
98 * The pages may have already been dirty, clear out old accounting so
99 * we can set things up properly
100 */
101 clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
102 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
103 cached);
104
105 ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
106 extra_bits, cached);
107 if (ret)
108 return ret;
109
110 btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
111 btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
112 btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
113
	/*
	 * We've only changed i_size in RAM and we haven't updated the disk
	 * i_size. There is no need to log the inode at this time.
	 */
119 if (end_pos > isize)
120 i_size_write(&inode->vfs_inode, end_pos);
121 return 0;
122 }
123
/*
 * This is very complex, but the basic idea is to drop all extents in the
 * range [args->start, args->end).
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split. Anything entirely inside the range
 * is deleted from the tree.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 */
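/*
 * Illustrative usage sketch (not a real call site, variable names are made
 * up): a caller that wants to drop the extents in a range typically does
 * something like
 *
 *	struct btrfs_drop_extents_args drop_args = { 0 };
 *
 *	drop_args.start = file_pos;
 *	drop_args.end = file_pos + num_bytes;
 *	drop_args.drop_cache = true;
 *	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 *
 * and can then look at drop_args.bytes_found and drop_args.drop_end.
 */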
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
142 {
143 struct btrfs_fs_info *fs_info = root->fs_info;
144 struct extent_buffer *leaf;
145 struct btrfs_file_extent_item *fi;
146 struct btrfs_key key;
147 struct btrfs_key new_key;
148 u64 ino = btrfs_ino(inode);
149 u64 search_start = args->start;
150 u64 disk_bytenr = 0;
151 u64 num_bytes = 0;
152 u64 extent_offset = 0;
153 u64 extent_end = 0;
154 u64 last_end = args->start;
155 int del_nr = 0;
156 int del_slot = 0;
157 int extent_type;
158 int recow;
159 int ret;
160 int modify_tree = -1;
161 int update_refs;
162 int found = 0;
163 struct btrfs_path *path = args->path;
164
165 args->bytes_found = 0;
166 args->extent_inserted = false;
167
168 /* Must always have a path if ->replace_extent is true */
169 ASSERT(!(args->replace_extent && !args->path));
170
171 if (!path) {
172 path = btrfs_alloc_path();
173 if (!path) {
174 ret = -ENOMEM;
175 goto out;
176 }
177 }
178
179 if (args->drop_cache)
180 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
181
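	/*
	 * If the range starts at or beyond the current disk i_size and no
	 * replacement extent will be inserted, we don't expect to find any
	 * file extent items to modify, so the tree search below can be done
	 * without COWing extent buffers (modify_tree == 0).
	 */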
182 if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
183 modify_tree = 0;
184
185 update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
186 while (1) {
187 recow = 0;
188 ret = btrfs_lookup_file_extent(trans, root, path, ino,
189 search_start, modify_tree);
190 if (ret < 0)
191 break;
192 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
193 leaf = path->nodes[0];
194 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
195 if (key.objectid == ino &&
196 key.type == BTRFS_EXTENT_DATA_KEY)
197 path->slots[0]--;
198 }
199 ret = 0;
200 next_slot:
201 leaf = path->nodes[0];
202 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
203 if (WARN_ON(del_nr > 0)) {
204 btrfs_print_leaf(leaf);
205 ret = -EINVAL;
206 break;
207 }
208 ret = btrfs_next_leaf(root, path);
209 if (ret < 0)
210 break;
211 if (ret > 0) {
212 ret = 0;
213 break;
214 }
215 leaf = path->nodes[0];
216 recow = 1;
217 }
218
219 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
220
221 if (key.objectid > ino)
222 break;
223 if (WARN_ON_ONCE(key.objectid < ino) ||
224 key.type < BTRFS_EXTENT_DATA_KEY) {
225 ASSERT(del_nr == 0);
226 path->slots[0]++;
227 goto next_slot;
228 }
229 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
230 break;
231
232 fi = btrfs_item_ptr(leaf, path->slots[0],
233 struct btrfs_file_extent_item);
234 extent_type = btrfs_file_extent_type(leaf, fi);
235
236 if (extent_type == BTRFS_FILE_EXTENT_REG ||
237 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
238 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
239 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
240 extent_offset = btrfs_file_extent_offset(leaf, fi);
241 extent_end = key.offset +
242 btrfs_file_extent_num_bytes(leaf, fi);
243 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
244 extent_end = key.offset +
245 btrfs_file_extent_ram_bytes(leaf, fi);
246 } else {
247 /* can't happen */
248 BUG();
249 }
250
		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if we hit an -ENOSPC condition
		 * while punching holes. So if we find one here, just make
		 * sure we delete it, otherwise we would insert a new file
		 * extent item with the same key (offset) as that 0 byte
		 * length file extent item in the call to
		 * btrfs_setup_item_for_insert() later in this function.
		 */
260 if (extent_end == key.offset && extent_end >= search_start) {
261 last_end = extent_end;
262 goto delete_extent_item;
263 }
264
265 if (extent_end <= search_start) {
266 path->slots[0]++;
267 goto next_slot;
268 }
269
270 found = 1;
271 search_start = max(key.offset, args->start);
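		/*
		 * If we got to this leaf through btrfs_next_leaf() (recow) or
		 * the search was done without the intent to modify the tree
		 * (modify_tree == 0), redo the search so we get a path that
		 * is safe to modify.
		 */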
272 if (recow || !modify_tree) {
273 modify_tree = -1;
274 btrfs_release_path(path);
275 continue;
276 }
277
278 /*
279 * | - range to drop - |
280 * | -------- extent -------- |
281 */
282 if (args->start > key.offset && args->end < extent_end) {
283 if (WARN_ON(del_nr > 0)) {
284 btrfs_print_leaf(leaf);
285 ret = -EINVAL;
286 break;
287 }
288 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
289 ret = -EOPNOTSUPP;
290 break;
291 }
292
293 memcpy(&new_key, &key, sizeof(new_key));
294 new_key.offset = args->start;
295 ret = btrfs_duplicate_item(trans, root, path,
296 &new_key);
297 if (ret == -EAGAIN) {
298 btrfs_release_path(path);
299 continue;
300 }
301 if (ret < 0)
302 break;
303
304 leaf = path->nodes[0];
305 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
306 struct btrfs_file_extent_item);
307 btrfs_set_file_extent_num_bytes(leaf, fi,
308 args->start - key.offset);
309
310 fi = btrfs_item_ptr(leaf, path->slots[0],
311 struct btrfs_file_extent_item);
312
313 extent_offset += args->start - key.offset;
314 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
315 btrfs_set_file_extent_num_bytes(leaf, fi,
316 extent_end - args->start);
317
318 if (update_refs && disk_bytenr > 0) {
319 struct btrfs_ref ref = {
320 .action = BTRFS_ADD_DELAYED_REF,
321 .bytenr = disk_bytenr,
322 .num_bytes = num_bytes,
323 .parent = 0,
324 .owning_root = btrfs_root_id(root),
325 .ref_root = btrfs_root_id(root),
326 };
327 btrfs_init_data_ref(&ref, new_key.objectid,
328 args->start - extent_offset,
329 0, false);
330 ret = btrfs_inc_extent_ref(trans, &ref);
331 if (ret) {
332 btrfs_abort_transaction(trans, ret);
333 break;
334 }
335 }
336 key.offset = args->start;
337 }
338 /*
339 * From here on out we will have actually dropped something, so
340 * last_end can be updated.
341 */
342 last_end = extent_end;
343
344 /*
345 * | ---- range to drop ----- |
346 * | -------- extent -------- |
347 */
348 if (args->start <= key.offset && args->end < extent_end) {
349 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
350 ret = -EOPNOTSUPP;
351 break;
352 }
353
354 memcpy(&new_key, &key, sizeof(new_key));
355 new_key.offset = args->end;
356 btrfs_set_item_key_safe(trans, path, &new_key);
357
358 extent_offset += args->end - key.offset;
359 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
360 btrfs_set_file_extent_num_bytes(leaf, fi,
361 extent_end - args->end);
362 if (update_refs && disk_bytenr > 0)
363 args->bytes_found += args->end - key.offset;
364 break;
365 }
366
367 search_start = extent_end;
368 /*
369 * | ---- range to drop ----- |
370 * | -------- extent -------- |
371 */
372 if (args->start > key.offset && args->end >= extent_end) {
373 if (WARN_ON(del_nr > 0)) {
374 btrfs_print_leaf(leaf);
375 ret = -EINVAL;
376 break;
377 }
378 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
379 ret = -EOPNOTSUPP;
380 break;
381 }
382
383 btrfs_set_file_extent_num_bytes(leaf, fi,
384 args->start - key.offset);
385 if (update_refs && disk_bytenr > 0)
386 args->bytes_found += extent_end - args->start;
387 if (args->end == extent_end)
388 break;
389
390 path->slots[0]++;
391 goto next_slot;
392 }
393
394 /*
395 * | ---- range to drop ----- |
396 * | ------ extent ------ |
397 */
398 if (args->start <= key.offset && args->end >= extent_end) {
399 delete_extent_item:
400 if (del_nr == 0) {
401 del_slot = path->slots[0];
402 del_nr = 1;
403 } else {
404 if (WARN_ON(del_slot + del_nr != path->slots[0])) {
405 btrfs_print_leaf(leaf);
406 ret = -EINVAL;
407 break;
408 }
409 del_nr++;
410 }
411
412 if (update_refs &&
413 extent_type == BTRFS_FILE_EXTENT_INLINE) {
414 args->bytes_found += extent_end - key.offset;
415 extent_end = ALIGN(extent_end,
416 fs_info->sectorsize);
417 } else if (update_refs && disk_bytenr > 0) {
418 struct btrfs_ref ref = {
419 .action = BTRFS_DROP_DELAYED_REF,
420 .bytenr = disk_bytenr,
421 .num_bytes = num_bytes,
422 .parent = 0,
423 .owning_root = btrfs_root_id(root),
424 .ref_root = btrfs_root_id(root),
425 };
426 btrfs_init_data_ref(&ref, key.objectid,
427 key.offset - extent_offset,
428 0, false);
429 ret = btrfs_free_extent(trans, &ref);
430 if (ret) {
431 btrfs_abort_transaction(trans, ret);
432 break;
433 }
434 args->bytes_found += extent_end - key.offset;
435 }
436
437 if (args->end == extent_end)
438 break;
439
440 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
441 path->slots[0]++;
442 goto next_slot;
443 }
444
445 ret = btrfs_del_items(trans, root, path, del_slot,
446 del_nr);
447 if (ret) {
448 btrfs_abort_transaction(trans, ret);
449 break;
450 }
451
452 del_nr = 0;
453 del_slot = 0;
454
455 btrfs_release_path(path);
456 continue;
457 }
458
459 BUG();
460 }
461
462 if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to the first slot, so that after the
		 * delete, if items are moved off from our leaf to its
		 * immediate left or right neighbor leaves, we end up with a
		 * correct and adjusted path->slots[0] for our insertion (if
		 * args->replace_extent).
		 */
469 path->slots[0] = del_slot;
470 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
471 if (ret)
472 btrfs_abort_transaction(trans, ret);
473 }
474
475 leaf = path->nodes[0];
476 /*
477 * If btrfs_del_items() was called, it might have deleted a leaf, in
478 * which case it unlocked our path, so check path->locks[0] matches a
479 * write lock.
480 */
481 if (!ret && args->replace_extent &&
482 path->locks[0] == BTRFS_WRITE_LOCK &&
483 btrfs_leaf_free_space(leaf) >=
484 sizeof(struct btrfs_item) + args->extent_item_size) {
485
486 key.objectid = ino;
487 key.type = BTRFS_EXTENT_DATA_KEY;
488 key.offset = args->start;
489 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
490 struct btrfs_key slot_key;
491
492 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
493 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
494 path->slots[0]++;
495 }
496 btrfs_setup_item_for_insert(trans, root, path, &key,
497 args->extent_item_size);
498 args->extent_inserted = true;
499 }
500
501 if (!args->path)
502 btrfs_free_path(path);
503 else if (!args->extent_inserted)
504 btrfs_release_path(path);
505 out:
506 args->drop_end = found ? min(args->end, last_end) : args->end;
507
508 return ret;
509 }
510
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
514 {
515 struct btrfs_file_extent_item *fi;
516 struct btrfs_key key;
517 u64 extent_end;
518
519 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
520 return 0;
521
522 btrfs_item_key_to_cpu(leaf, &key, slot);
523 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
524 return 0;
525
526 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
527 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
528 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
529 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
530 btrfs_file_extent_compression(leaf, fi) ||
531 btrfs_file_extent_encryption(leaf, fi) ||
532 btrfs_file_extent_other_encoding(leaf, fi))
533 return 0;
534
535 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
536 if ((*start && *start != key.offset) || (*end && *end != extent_end))
537 return 0;
538
539 *start = key.offset;
540 *end = extent_end;
541 return 1;
542 }
543
544 /*
545 * Mark extent in the range start - end as written.
546 *
547 * This changes extent type from 'pre-allocated' to 'regular'. If only
548 * part of extent is marked as written, the extent will be split into
549 * two or three.
550 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
553 {
554 struct btrfs_root *root = inode->root;
555 struct extent_buffer *leaf;
556 struct btrfs_path *path;
557 struct btrfs_file_extent_item *fi;
558 struct btrfs_ref ref = { 0 };
559 struct btrfs_key key;
560 struct btrfs_key new_key;
561 u64 bytenr;
562 u64 num_bytes;
563 u64 extent_end;
564 u64 orig_offset;
565 u64 other_start;
566 u64 other_end;
567 u64 split;
568 int del_nr = 0;
569 int del_slot = 0;
570 int recow;
571 int ret = 0;
572 u64 ino = btrfs_ino(inode);
573
574 path = btrfs_alloc_path();
575 if (!path)
576 return -ENOMEM;
577 again:
578 recow = 0;
579 split = start;
580 key.objectid = ino;
581 key.type = BTRFS_EXTENT_DATA_KEY;
582 key.offset = split;
583
584 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
585 if (ret < 0)
586 goto out;
587 if (ret > 0 && path->slots[0] > 0)
588 path->slots[0]--;
589
590 leaf = path->nodes[0];
591 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
592 if (key.objectid != ino ||
593 key.type != BTRFS_EXTENT_DATA_KEY) {
594 ret = -EINVAL;
595 btrfs_abort_transaction(trans, ret);
596 goto out;
597 }
598 fi = btrfs_item_ptr(leaf, path->slots[0],
599 struct btrfs_file_extent_item);
600 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
601 ret = -EINVAL;
602 btrfs_abort_transaction(trans, ret);
603 goto out;
604 }
605 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
606 if (key.offset > start || extent_end < end) {
607 ret = -EINVAL;
608 btrfs_abort_transaction(trans, ret);
609 goto out;
610 }
611
612 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
613 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
614 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
615 memcpy(&new_key, &key, sizeof(new_key));
616
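	/*
	 * The written range covers the front of the prealloc extent and the
	 * previous item is a regular extent ending right at 'start' inside
	 * the same physical extent. Grow the previous item to cover the
	 * written range and shrink this one to [end, extent_end).
	 */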
617 if (start == key.offset && end < extent_end) {
618 other_start = 0;
619 other_end = start;
620 if (extent_mergeable(leaf, path->slots[0] - 1,
621 ino, bytenr, orig_offset,
622 &other_start, &other_end)) {
623 new_key.offset = end;
624 btrfs_set_item_key_safe(trans, path, &new_key);
625 fi = btrfs_item_ptr(leaf, path->slots[0],
626 struct btrfs_file_extent_item);
627 btrfs_set_file_extent_generation(leaf, fi,
628 trans->transid);
629 btrfs_set_file_extent_num_bytes(leaf, fi,
630 extent_end - end);
631 btrfs_set_file_extent_offset(leaf, fi,
632 end - orig_offset);
633 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
634 struct btrfs_file_extent_item);
635 btrfs_set_file_extent_generation(leaf, fi,
636 trans->transid);
637 btrfs_set_file_extent_num_bytes(leaf, fi,
638 end - other_start);
639 goto out;
640 }
641 }
642
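	/*
	 * The written range covers the tail of the prealloc extent and the
	 * next item is a regular extent starting right at 'end' inside the
	 * same physical extent. Shrink this item to [key.offset, start) and
	 * grow the next one to cover the written range.
	 */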
643 if (start > key.offset && end == extent_end) {
644 other_start = end;
645 other_end = 0;
646 if (extent_mergeable(leaf, path->slots[0] + 1,
647 ino, bytenr, orig_offset,
648 &other_start, &other_end)) {
649 fi = btrfs_item_ptr(leaf, path->slots[0],
650 struct btrfs_file_extent_item);
651 btrfs_set_file_extent_num_bytes(leaf, fi,
652 start - key.offset);
653 btrfs_set_file_extent_generation(leaf, fi,
654 trans->transid);
655 path->slots[0]++;
656 new_key.offset = start;
657 btrfs_set_item_key_safe(trans, path, &new_key);
658
659 fi = btrfs_item_ptr(leaf, path->slots[0],
660 struct btrfs_file_extent_item);
661 btrfs_set_file_extent_generation(leaf, fi,
662 trans->transid);
663 btrfs_set_file_extent_num_bytes(leaf, fi,
664 other_end - start);
665 btrfs_set_file_extent_offset(leaf, fi,
666 start - orig_offset);
667 goto out;
668 }
669 }
670
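	/*
	 * The written range does not cover the whole prealloc extent and the
	 * quick merges above did not apply, so split the extent item until
	 * one item exactly covers [start, end). Each split duplicates the
	 * item and takes an extra reference on the underlying extent.
	 */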
671 while (start > key.offset || end < extent_end) {
672 if (key.offset == start)
673 split = end;
674
675 new_key.offset = split;
676 ret = btrfs_duplicate_item(trans, root, path, &new_key);
677 if (ret == -EAGAIN) {
678 btrfs_release_path(path);
679 goto again;
680 }
681 if (ret < 0) {
682 btrfs_abort_transaction(trans, ret);
683 goto out;
684 }
685
686 leaf = path->nodes[0];
687 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
688 struct btrfs_file_extent_item);
689 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
690 btrfs_set_file_extent_num_bytes(leaf, fi,
691 split - key.offset);
692
693 fi = btrfs_item_ptr(leaf, path->slots[0],
694 struct btrfs_file_extent_item);
695
696 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
697 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
698 btrfs_set_file_extent_num_bytes(leaf, fi,
699 extent_end - split);
700
701 ref.action = BTRFS_ADD_DELAYED_REF;
702 ref.bytenr = bytenr;
703 ref.num_bytes = num_bytes;
704 ref.parent = 0;
705 ref.owning_root = btrfs_root_id(root);
706 ref.ref_root = btrfs_root_id(root);
707 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
708 ret = btrfs_inc_extent_ref(trans, &ref);
709 if (ret) {
710 btrfs_abort_transaction(trans, ret);
711 goto out;
712 }
713
714 if (split == start) {
715 key.offset = start;
716 } else {
717 if (start != key.offset) {
718 ret = -EINVAL;
719 btrfs_abort_transaction(trans, ret);
720 goto out;
721 }
722 path->slots[0]--;
723 extent_end = end;
724 }
725 recow = 1;
726 }
727
728 other_start = end;
729 other_end = 0;
730
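	/*
	 * Prepare to drop one reference on the underlying extent for each of
	 * the merge cases below, where merging with a neighbouring item
	 * removes one file extent item pointing to it.
	 */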
731 ref.action = BTRFS_DROP_DELAYED_REF;
732 ref.bytenr = bytenr;
733 ref.num_bytes = num_bytes;
734 ref.parent = 0;
735 ref.owning_root = btrfs_root_id(root);
736 ref.ref_root = btrfs_root_id(root);
737 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
738 if (extent_mergeable(leaf, path->slots[0] + 1,
739 ino, bytenr, orig_offset,
740 &other_start, &other_end)) {
741 if (recow) {
742 btrfs_release_path(path);
743 goto again;
744 }
745 extent_end = other_end;
746 del_slot = path->slots[0] + 1;
747 del_nr++;
748 ret = btrfs_free_extent(trans, &ref);
749 if (ret) {
750 btrfs_abort_transaction(trans, ret);
751 goto out;
752 }
753 }
754 other_start = 0;
755 other_end = start;
756 if (extent_mergeable(leaf, path->slots[0] - 1,
757 ino, bytenr, orig_offset,
758 &other_start, &other_end)) {
759 if (recow) {
760 btrfs_release_path(path);
761 goto again;
762 }
763 key.offset = other_start;
764 del_slot = path->slots[0];
765 del_nr++;
766 ret = btrfs_free_extent(trans, &ref);
767 if (ret) {
768 btrfs_abort_transaction(trans, ret);
769 goto out;
770 }
771 }
772 if (del_nr == 0) {
773 fi = btrfs_item_ptr(leaf, path->slots[0],
774 struct btrfs_file_extent_item);
775 btrfs_set_file_extent_type(leaf, fi,
776 BTRFS_FILE_EXTENT_REG);
777 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
778 } else {
779 fi = btrfs_item_ptr(leaf, del_slot - 1,
780 struct btrfs_file_extent_item);
781 btrfs_set_file_extent_type(leaf, fi,
782 BTRFS_FILE_EXTENT_REG);
783 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
784 btrfs_set_file_extent_num_bytes(leaf, fi,
785 extent_end - key.offset);
786
787 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
788 if (ret < 0) {
789 btrfs_abort_transaction(trans, ret);
790 goto out;
791 }
792 }
793 out:
794 btrfs_free_path(path);
795 return ret;
796 }
797
798 /*
799 * On error return an unlocked folio and the error value
800 * On success return a locked folio and 0
801 */
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
				  u64 len, bool force_uptodate)
804 {
805 u64 clamp_start = max_t(u64, pos, folio_pos(folio));
806 u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
807 int ret = 0;
808
809 if (folio_test_uptodate(folio))
810 return 0;
811
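	/*
	 * If we are not forced to read the folio and the write covers it with
	 * page aligned boundaries, the upcoming copy will overwrite the whole
	 * folio anyway, so there is no need to read it in first.
	 */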
812 if (!force_uptodate &&
813 IS_ALIGNED(clamp_start, PAGE_SIZE) &&
814 IS_ALIGNED(clamp_end, PAGE_SIZE))
815 return 0;
816
817 ret = btrfs_read_folio(NULL, folio);
818 if (ret)
819 return ret;
820 folio_lock(folio);
821 if (!folio_test_uptodate(folio)) {
822 folio_unlock(folio);
823 return -EIO;
824 }
825
	/*
	 * Since btrfs_read_folio() will unlock the folio before it returns,
	 * there is a window where btrfs_release_folio() can be called to
	 * release the folio. Here we check both the inode mapping and the
	 * folio private flag to make sure the folio was not released.
	 *
	 * The private flag check is essential for subpage as we need to store
	 * an extra bitmap using folio private.
	 */
835 if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
836 folio_unlock(folio);
837 return -EAGAIN;
838 }
839 return 0;
840 }
841
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
843 {
844 gfp_t gfp;
845
846 gfp = btrfs_alloc_write_mask(inode->i_mapping);
847 if (nowait) {
848 gfp &= ~__GFP_DIRECT_RECLAIM;
849 gfp |= GFP_NOWAIT;
850 }
851
852 return gfp;
853 }
854
855 /*
856 * Get folio into the page cache and lock it.
857 */
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
				      loff_t pos, size_t write_bytes,
				      bool force_uptodate, bool nowait)
861 {
862 unsigned long index = pos >> PAGE_SHIFT;
863 gfp_t mask = get_prepare_gfp_flags(inode, nowait);
864 fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
865 struct folio *folio;
866 int ret = 0;
867
868 again:
869 folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
870 if (IS_ERR(folio)) {
871 if (nowait)
872 ret = -EAGAIN;
873 else
874 ret = PTR_ERR(folio);
875 return ret;
876 }
877 folio_wait_writeback(folio);
	/* Only page sized folios are supported for now. */
879 ASSERT(folio_order(folio) == 0);
880 ret = set_folio_extent_mapped(folio);
881 if (ret < 0) {
882 folio_unlock(folio);
883 folio_put(folio);
884 return ret;
885 }
886 ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
887 if (ret) {
888 /* The folio is already unlocked. */
889 folio_put(folio);
890 if (!nowait && ret == -EAGAIN) {
891 ret = 0;
892 goto again;
893 }
894 return ret;
895 }
896 *folio_ret = folio;
897 return 0;
898 }
899
/*
 * Lock the extent and properly wait for data=ordered extents to finish
 * before allowing the folio to be modified if needed.
 *
 * Return:
 * 1       - the extent is locked
 * 0       - the extent is not locked, and everything is OK
 * -EAGAIN - need to prepare the folio again
 */
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
				loff_t pos, size_t write_bytes,
				u64 *lockstart, u64 *lockend, bool nowait,
				struct extent_state **cached_state)
914 {
915 struct btrfs_fs_info *fs_info = inode->root->fs_info;
916 u64 start_pos;
917 u64 last_pos;
918 int ret = 0;
919
920 start_pos = round_down(pos, fs_info->sectorsize);
921 last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
922
923 if (start_pos < inode->vfs_inode.i_size) {
924 struct btrfs_ordered_extent *ordered;
925
926 if (nowait) {
927 if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
928 cached_state)) {
929 folio_unlock(folio);
930 folio_put(folio);
931 return -EAGAIN;
932 }
933 } else {
934 lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
935 }
936
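		/*
		 * If an ordered extent overlaps the range, unlock everything,
		 * wait for it to complete and return -EAGAIN so the caller
		 * prepares the folio again.
		 */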
937 ordered = btrfs_lookup_ordered_range(inode, start_pos,
938 last_pos - start_pos + 1);
939 if (ordered &&
940 ordered->file_offset + ordered->num_bytes > start_pos &&
941 ordered->file_offset <= last_pos) {
942 unlock_extent(&inode->io_tree, start_pos, last_pos,
943 cached_state);
944 folio_unlock(folio);
945 folio_put(folio);
946 btrfs_start_ordered_extent(ordered);
947 btrfs_put_ordered_extent(ordered);
948 return -EAGAIN;
949 }
950 if (ordered)
951 btrfs_put_ordered_extent(ordered);
952
953 *lockstart = start_pos;
954 *lockend = last_pos;
955 ret = 1;
956 }
957
	/*
	 * We should be called after prepare_one_folio(), which should have
	 * locked the folio.
	 */
962 WARN_ON(!folio_test_locked(folio));
963
964 return ret;
965 }
966
/*
 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
 *
 * @pos:         File offset.
 * @write_bytes: The length to write, will be updated to the nocow writeable
 *               range.
 *
 * This function will flush ordered extents in the range to ensure proper
 * nocow checks.
 *
 * Return:
 * > 0          If we can nocow, and updates @write_bytes.
 * 0            If we can't do a nocow write.
 * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
 *              root is in progress.
 * < 0          If an error happened.
 *
 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
 */
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
			   size_t *write_bytes, bool nowait)
988 {
989 struct btrfs_fs_info *fs_info = inode->root->fs_info;
990 struct btrfs_root *root = inode->root;
991 struct extent_state *cached_state = NULL;
992 u64 lockstart, lockend;
993 u64 num_bytes;
994 int ret;
995
996 if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
997 return 0;
998
999 if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1000 return -EAGAIN;
1001
1002 lockstart = round_down(pos, fs_info->sectorsize);
1003 lockend = round_up(pos + *write_bytes,
1004 fs_info->sectorsize) - 1;
1005 num_bytes = lockend - lockstart + 1;
1006
1007 if (nowait) {
1008 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1009 &cached_state)) {
1010 btrfs_drew_write_unlock(&root->snapshot_lock);
1011 return -EAGAIN;
1012 }
1013 } else {
1014 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1015 &cached_state);
1016 }
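	/*
	 * can_nocow_extent() checks whether the range is backed by an extent
	 * we are allowed to write into in place and, if so, updates num_bytes
	 * to the length that can actually be written without COW.
	 */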
1017 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1018 NULL, nowait);
1019 if (ret <= 0)
1020 btrfs_drew_write_unlock(&root->snapshot_lock);
1021 else
1022 *write_bytes = min_t(size_t, *write_bytes ,
1023 num_bytes - pos + lockstart);
1024 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1025
1026 return ret;
1027 }
1028
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1030 {
1031 btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1032 }
1033
int btrfs_write_check(struct kiocb *iocb, size_t count)
1035 {
1036 struct file *file = iocb->ki_filp;
1037 struct inode *inode = file_inode(file);
1038 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1039 loff_t pos = iocb->ki_pos;
1040 int ret;
1041 loff_t oldsize;
1042
	/*
	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
	 * prealloc flags, as without those flags we always have to COW. We will
	 * later check if we can really NOCOW into the target range (using
	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
	 */
1049 if ((iocb->ki_flags & IOCB_NOWAIT) &&
1050 !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1051 return -EAGAIN;
1052
1053 ret = file_remove_privs(file);
1054 if (ret)
1055 return ret;
1056
1057 /*
1058 * We reserve space for updating the inode when we reserve space for the
1059 * extent we are going to write, so we will enospc out there. We don't
1060 * need to start yet another transaction to update the inode as we will
1061 * update the inode when we finish writing whatever data we write.
1062 */
1063 if (!IS_NOCMTIME(inode)) {
1064 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1065 inode_inc_iversion(inode);
1066 }
1067
1068 oldsize = i_size_read(inode);
1069 if (pos > oldsize) {
1070 /* Expand hole size to cover write data, preventing empty gap */
1071 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1072
1073 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1074 if (ret)
1075 return ret;
1076 }
1077
1078 return 0;
1079 }
1080
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
1082 {
1083 struct file *file = iocb->ki_filp;
1084 loff_t pos;
1085 struct inode *inode = file_inode(file);
1086 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1087 struct extent_changeset *data_reserved = NULL;
1088 u64 release_bytes = 0;
1089 u64 lockstart;
1090 u64 lockend;
1091 size_t num_written = 0;
1092 ssize_t ret;
1093 loff_t old_isize;
1094 unsigned int ilock_flags = 0;
1095 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1096 unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1097 bool only_release_metadata = false;
1098
1099 if (nowait)
1100 ilock_flags |= BTRFS_ILOCK_TRY;
1101
1102 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1103 if (ret < 0)
1104 return ret;
1105
1106 /*
1107 * We can only trust the isize with inode lock held, or it can race with
1108 * other buffered writes and cause incorrect call of
1109 * pagecache_isize_extended() to overwrite existing data.
1110 */
1111 old_isize = i_size_read(inode);
1112
1113 ret = generic_write_checks(iocb, i);
1114 if (ret <= 0)
1115 goto out;
1116
1117 ret = btrfs_write_check(iocb, ret);
1118 if (ret < 0)
1119 goto out;
1120
1121 pos = iocb->ki_pos;
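	/*
	 * Copy loop: each iteration handles at most one folio. Reserve data
	 * and metadata space, prepare and lock the folio, lock the extent
	 * range, copy from the iov iterator and then mark the range as
	 * delalloc through btrfs_dirty_folio().
	 */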
1122 while (iov_iter_count(i) > 0) {
1123 struct extent_state *cached_state = NULL;
1124 size_t offset = offset_in_page(pos);
1125 size_t sector_offset;
1126 size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
1127 size_t reserve_bytes;
1128 size_t copied;
1129 size_t dirty_sectors;
1130 size_t num_sectors;
1131 struct folio *folio = NULL;
1132 int extents_locked;
1133 bool force_page_uptodate = false;
1134
		/*
		 * Fault in the source pages before locking the destination
		 * folio in prepare_one_folio(), to avoid a recursive lock.
		 */
1139 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1140 ret = -EFAULT;
1141 break;
1142 }
1143
1144 only_release_metadata = false;
1145 sector_offset = pos & (fs_info->sectorsize - 1);
1146
1147 extent_changeset_release(data_reserved);
1148 ret = btrfs_check_data_free_space(BTRFS_I(inode),
1149 &data_reserved, pos,
1150 write_bytes, nowait);
1151 if (ret < 0) {
1152 int can_nocow;
1153
1154 if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1155 ret = -EAGAIN;
1156 break;
1157 }
1158
1159 /*
1160 * If we don't have to COW at the offset, reserve
1161 * metadata only. write_bytes may get smaller than
1162 * requested here.
1163 */
1164 can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1165 &write_bytes, nowait);
1166 if (can_nocow < 0)
1167 ret = can_nocow;
1168 if (can_nocow > 0)
1169 ret = 0;
1170 if (ret)
1171 break;
1172 only_release_metadata = true;
1173 }
1174
1175 reserve_bytes = round_up(write_bytes + sector_offset,
1176 fs_info->sectorsize);
1177 WARN_ON(reserve_bytes == 0);
1178 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1179 reserve_bytes,
1180 reserve_bytes, nowait);
1181 if (ret) {
1182 if (!only_release_metadata)
1183 btrfs_free_reserved_data_space(BTRFS_I(inode),
1184 data_reserved, pos,
1185 write_bytes);
1186 else
1187 btrfs_check_nocow_unlock(BTRFS_I(inode));
1188
1189 if (nowait && ret == -ENOSPC)
1190 ret = -EAGAIN;
1191 break;
1192 }
1193
1194 release_bytes = reserve_bytes;
1195 again:
1196 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1197 if (ret) {
1198 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1199 break;
1200 }
1201
1202 ret = prepare_one_folio(inode, &folio, pos, write_bytes,
1203 force_page_uptodate, false);
1204 if (ret) {
1205 btrfs_delalloc_release_extents(BTRFS_I(inode),
1206 reserve_bytes);
1207 break;
1208 }
1209
1210 extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
1211 folio, pos, write_bytes, &lockstart,
1212 &lockend, nowait, &cached_state);
1213 if (extents_locked < 0) {
1214 if (!nowait && extents_locked == -EAGAIN)
1215 goto again;
1216
1217 btrfs_delalloc_release_extents(BTRFS_I(inode),
1218 reserve_bytes);
1219 ret = extents_locked;
1220 break;
1221 }
1222
1223 copied = copy_folio_from_iter_atomic(folio,
1224 offset_in_folio(folio, pos), write_bytes, i);
1225 flush_dcache_folio(folio);
1226
		/*
		 * If we get a partial write, we can end up with a partially
		 * uptodate folio. While we can handle a sector size smaller
		 * than the page size, a copy that is not sector aligned would
		 * cause a lot of complexity, so make sure that doesn't happen
		 * by forcing a retry of this copy.
		 */
1234 if (unlikely(copied < write_bytes)) {
1235 if (!folio_test_uptodate(folio)) {
1236 iov_iter_revert(i, copied);
1237 copied = 0;
1238 }
1239 }
1240
1241 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1242 dirty_sectors = round_up(copied + sector_offset,
1243 fs_info->sectorsize);
1244 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1245
1246 if (copied == 0) {
1247 force_page_uptodate = true;
1248 dirty_sectors = 0;
1249 } else {
1250 force_page_uptodate = false;
1251 }
1252
1253 if (num_sectors > dirty_sectors) {
1254 /* release everything except the sectors we dirtied */
1255 release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1256 if (only_release_metadata) {
1257 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1258 release_bytes, true);
1259 } else {
1260 u64 release_start = round_up(pos + copied,
1261 fs_info->sectorsize);
1262 btrfs_delalloc_release_space(BTRFS_I(inode),
1263 data_reserved, release_start,
1264 release_bytes, true);
1265 }
1266 }
1267
1268 release_bytes = round_up(copied + sector_offset,
1269 fs_info->sectorsize);
1270
1271 ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
1272 &cached_state, only_release_metadata);
1273
		/*
		 * If we have not locked the extent range, because the range's
		 * start offset is >= i_size, we might still have a non-NULL
		 * cached extent state, acquired while marking the extent range
		 * as delalloc through btrfs_dirty_folio(). Therefore free any
		 * possible cached extent state to avoid a memory leak.
		 */
1281 if (extents_locked)
1282 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1283 lockend, &cached_state);
1284 else
1285 free_extent_state(cached_state);
1286
1287 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1288 if (ret) {
1289 btrfs_drop_folio(fs_info, folio, pos, copied);
1290 break;
1291 }
1292
1293 release_bytes = 0;
1294 if (only_release_metadata)
1295 btrfs_check_nocow_unlock(BTRFS_I(inode));
1296
1297 btrfs_drop_folio(fs_info, folio, pos, copied);
1298
1299 cond_resched();
1300
1301 pos += copied;
1302 num_written += copied;
1303 }
1304
1305 if (release_bytes) {
1306 if (only_release_metadata) {
1307 btrfs_check_nocow_unlock(BTRFS_I(inode));
1308 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1309 release_bytes, true);
1310 } else {
1311 btrfs_delalloc_release_space(BTRFS_I(inode),
1312 data_reserved,
1313 round_down(pos, fs_info->sectorsize),
1314 release_bytes, true);
1315 }
1316 }
1317
1318 extent_changeset_free(data_reserved);
1319 if (num_written > 0) {
1320 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1321 iocb->ki_pos += num_written;
1322 }
1323 out:
1324 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1325 return num_written ? num_written : ret;
1326 }
1327
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
				   const struct btrfs_ioctl_encoded_io_args *encoded)
1330 {
1331 struct file *file = iocb->ki_filp;
1332 struct inode *inode = file_inode(file);
1333 loff_t count;
1334 ssize_t ret;
1335
1336 btrfs_inode_lock(BTRFS_I(inode), 0);
1337 count = encoded->len;
1338 ret = generic_write_checks_count(iocb, &count);
1339 if (ret == 0 && count != encoded->len) {
1340 /*
1341 * The write got truncated by generic_write_checks_count(). We
1342 * can't do a partial encoded write.
1343 */
1344 ret = -EFBIG;
1345 }
1346 if (ret || encoded->len == 0)
1347 goto out;
1348
1349 ret = btrfs_write_check(iocb, encoded->len);
1350 if (ret < 0)
1351 goto out;
1352
1353 ret = btrfs_do_encoded_write(iocb, from, encoded);
1354 out:
1355 btrfs_inode_unlock(BTRFS_I(inode), 0);
1356 return ret;
1357 }
1358
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
			    const struct btrfs_ioctl_encoded_io_args *encoded)
1361 {
1362 struct file *file = iocb->ki_filp;
1363 struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1364 ssize_t num_written, num_sync;
1365
1366 /*
1367 * If the fs flips readonly due to some impossible error, although we
1368 * have opened a file as writable, we have to stop this write operation
1369 * to ensure consistency.
1370 */
1371 if (BTRFS_FS_ERROR(inode->root->fs_info))
1372 return -EROFS;
1373
1374 if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1375 return -EOPNOTSUPP;
1376
1377 if (encoded) {
1378 num_written = btrfs_encoded_write(iocb, from, encoded);
1379 num_sync = encoded->len;
1380 } else if (iocb->ki_flags & IOCB_DIRECT) {
1381 num_written = btrfs_direct_write(iocb, from);
1382 num_sync = num_written;
1383 } else {
1384 num_written = btrfs_buffered_write(iocb, from);
1385 num_sync = num_written;
1386 }
1387
1388 btrfs_set_inode_last_sub_trans(inode);
1389
1390 if (num_sync > 0) {
1391 num_sync = generic_write_sync(iocb, num_sync);
1392 if (num_sync < 0)
1393 num_written = num_sync;
1394 }
1395
1396 return num_written;
1397 }
1398
static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1400 {
1401 return btrfs_do_write_iter(iocb, from, NULL);
1402 }
1403
int btrfs_release_file(struct inode *inode, struct file *filp)
1405 {
1406 struct btrfs_file_private *private = filp->private_data;
1407
1408 if (private) {
1409 kfree(private->filldir_buf);
1410 free_extent_state(private->llseek_cached_state);
1411 kfree(private);
1412 filp->private_data = NULL;
1413 }
1414
1415 /*
1416 * Set by setattr when we are about to truncate a file from a non-zero
1417 * size to a zero size. This tries to flush down new bytes that may
1418 * have been written if the application were using truncate to replace
1419 * a file in place.
1420 */
1421 if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1422 &BTRFS_I(inode)->runtime_flags))
1423 filemap_flush(inode->i_mapping);
1424 return 0;
1425 }
1426
static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1428 {
1429 int ret;
1430 struct blk_plug plug;
1431
	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible. Especially in the
	 * case of multiple disks using a raid profile, a large IO can be split
	 * into several segments of stripe length (currently 64K).
	 */
1438 blk_start_plug(&plug);
1439 ret = btrfs_fdatawrite_range(inode, start, end);
1440 blk_finish_plug(&plug);
1441
1442 return ret;
1443 }
1444
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1446 {
1447 struct btrfs_inode *inode = ctx->inode;
1448 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1449
1450 if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1451 list_empty(&ctx->ordered_extents))
1452 return true;
1453
	/*
	 * If we are doing a fast fsync we cannot bail out if the inode's
	 * last_trans is <= the last committed transaction, because we only
	 * update the last_trans of the inode during ordered extent completion,
	 * and for a fast fsync we don't wait for that, we only wait for the
	 * writeback to complete.
	 */
1461 if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1462 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1463 list_empty(&ctx->ordered_extents)))
1464 return true;
1465
1466 return false;
1467 }
1468
/*
 * fsync call for both files and directories. This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all the ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit. This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1481 {
1482 struct dentry *dentry = file_dentry(file);
1483 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1484 struct btrfs_root *root = inode->root;
1485 struct btrfs_fs_info *fs_info = root->fs_info;
1486 struct btrfs_trans_handle *trans;
1487 struct btrfs_log_ctx ctx;
1488 int ret = 0, err;
1489 u64 len;
1490 bool full_sync;
1491 bool skip_ilock = false;
1492
1493 if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1494 skip_ilock = true;
1495 current->journal_info = NULL;
1496 btrfs_assert_inode_locked(inode);
1497 }
1498
1499 trace_btrfs_sync_file(file, datasync);
1500
1501 btrfs_init_log_ctx(&ctx, inode);
1502
1503 /*
1504 * Always set the range to a full range, otherwise we can get into
1505 * several problems, from missing file extent items to represent holes
1506 * when not using the NO_HOLES feature, to log tree corruption due to
1507 * races between hole detection during logging and completion of ordered
1508 * extents outside the range, to missing checksums due to ordered extents
1509 * for which we flushed only a subset of their pages.
1510 */
1511 start = 0;
1512 end = LLONG_MAX;
1513 len = (u64)LLONG_MAX + 1;
1514
	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex. This way multiple tasks can flush dirty
	 * pages concurrently, which improves performance. See
	 * btrfs_wait_ordered_range() for an explanation of the ASYNC check.
	 */
1521 ret = start_ordered_ops(inode, start, end);
1522 if (ret)
1523 goto out;
1524
1525 if (skip_ilock)
1526 down_write(&inode->i_mmap_lock);
1527 else
1528 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1529
1530 atomic_inc(&root->log_batch);
1531
1532 /*
1533 * Before we acquired the inode's lock and the mmap lock, someone may
1534 * have dirtied more pages in the target range. We need to make sure
1535 * that writeback for any such pages does not start while we are logging
1536 * the inode, because if it does, any of the following might happen when
1537 * we are not doing a full inode sync:
1538 *
1539 * 1) We log an extent after its writeback finishes but before its
1540 * checksums are added to the csum tree, leading to -EIO errors
1541 * when attempting to read the extent after a log replay.
1542 *
1543 * 2) We can end up logging an extent before its writeback finishes.
1544 * Therefore after the log replay we will have a file extent item
1545 * pointing to an unwritten extent (and no data checksums as well).
1546 *
1547 * So trigger writeback for any eventual new dirty pages and then we
1548 * wait for all ordered extents to complete below.
1549 */
1550 ret = start_ordered_ops(inode, start, end);
1551 if (ret) {
1552 if (skip_ilock)
1553 up_write(&inode->i_mmap_lock);
1554 else
1555 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1556 goto out;
1557 }
1558
1559 /*
1560 * Always check for the full sync flag while holding the inode's lock,
1561 * to avoid races with other tasks. The flag must be either set all the
1562 * time during logging or always off all the time while logging.
1563 * We check the flag here after starting delalloc above, because when
1564 * running delalloc the full sync flag may be set if we need to drop
1565 * extra extent map ranges due to temporary memory allocation failures.
1566 */
1567 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1568
1569 /*
1570 * We have to do this here to avoid the priority inversion of waiting on
1571 * IO of a lower priority task while holding a transaction open.
1572 *
1573 * For a full fsync we wait for the ordered extents to complete while
1574 * for a fast fsync we wait just for writeback to complete, and then
1575 * attach the ordered extents to the transaction so that a transaction
1576 * commit waits for their completion, to avoid data loss if we fsync,
1577 * the current transaction commits before the ordered extents complete
1578 * and a power failure happens right after that.
1579 *
1580 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1581 * logical address recorded in the ordered extent may change. We need
1582 * to wait for the IO to stabilize the logical address.
1583 */
1584 if (full_sync || btrfs_is_zoned(fs_info)) {
1585 ret = btrfs_wait_ordered_range(inode, start, len);
1586 clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1587 } else {
1588 /*
1589 * Get our ordered extents as soon as possible to avoid doing
1590 * checksum lookups in the csum tree, and use instead the
1591 * checksums attached to the ordered extents.
1592 */
1593 btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1594 ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1595 if (ret)
1596 goto out_release_extents;
1597
1598 /*
1599 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1600 * starting and waiting for writeback, because for buffered IO
1601 * it may have been set during the end IO callback
1602 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1603 * case an error happened and we need to wait for ordered
1604 * extents to complete so that any extent maps that point to
1605 * unwritten locations are dropped and we don't log them.
1606 */
1607 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1608 ret = btrfs_wait_ordered_range(inode, start, len);
1609 }
1610
1611 if (ret)
1612 goto out_release_extents;
1613
1614 atomic_inc(&root->log_batch);
1615
1616 if (skip_inode_logging(&ctx)) {
1617 /*
1618 * We've had everything committed since the last time we were
1619 * modified so clear this flag in case it was set for whatever
1620 * reason, it's no longer relevant.
1621 */
1622 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
1630 ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1631 goto out_release_extents;
1632 }
1633
1634 btrfs_init_log_ctx_scratch_eb(&ctx);
1635
1636 /*
1637 * We use start here because we will need to wait on the IO to complete
1638 * in btrfs_sync_log, which could require joining a transaction (for
1639 * example checking cross references in the nocow path). If we use join
1640 * here we could get into a situation where we're waiting on IO to
1641 * happen that is blocked on a transaction trying to commit. With start
1642 * we inc the extwriter counter, so we wait for all extwriters to exit
1643 * before we start blocking joiners. This comment is to keep somebody
1644 * from thinking they are super smart and changing this to
1645 * btrfs_join_transaction *cough*Josef*cough*.
1646 */
1647 trans = btrfs_start_transaction(root, 0);
1648 if (IS_ERR(trans)) {
1649 ret = PTR_ERR(trans);
1650 goto out_release_extents;
1651 }
1652 trans->in_fsync = true;
1653
1654 ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1655 /*
1656 * Scratch eb no longer needed, release before syncing log or commit
1657 * transaction, to avoid holding unnecessary memory during such long
1658 * operations.
1659 */
1660 if (ctx.scratch_eb) {
1661 free_extent_buffer(ctx.scratch_eb);
1662 ctx.scratch_eb = NULL;
1663 }
1664 btrfs_release_log_ctx_extents(&ctx);
1665 if (ret < 0) {
1666 /* Fallthrough and commit/free transaction. */
1667 ret = BTRFS_LOG_FORCE_COMMIT;
1668 }
1669
	/*
	 * We've logged all the items and now have a consistent version of the
	 * file in the log. It is possible that someone will come in and
	 * modify the file, but that's fine because the log is consistent on
	 * disk, and we have references to all of the file's extents.
	 *
	 * It is possible that someone will come in and log the file again,
	 * but that will end up using the synchronization inside
	 * btrfs_sync_log to keep things safe.
	 */
1680 if (skip_ilock)
1681 up_write(&inode->i_mmap_lock);
1682 else
1683 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1684
1685 if (ret == BTRFS_NO_LOG_SYNC) {
1686 ret = btrfs_end_transaction(trans);
1687 goto out;
1688 }
1689
1690 /* We successfully logged the inode, attempt to sync the log. */
1691 if (!ret) {
1692 ret = btrfs_sync_log(trans, root, &ctx);
1693 if (!ret) {
1694 ret = btrfs_end_transaction(trans);
1695 goto out;
1696 }
1697 }
1698
1699 /*
1700 * At this point we need to commit the transaction because we had
1701 * btrfs_need_log_full_commit() or some other error.
1702 *
1703 * If we didn't do a full sync we have to stop the trans handle, wait on
1704 * the ordered extents, start it again and commit the transaction. If
1705 * we attempt to wait on the ordered extents here we could deadlock with
1706 * something like fallocate() that is holding the extent lock trying to
1707 * start a transaction while some other thread is trying to commit the
1708 * transaction while we (fsync) are currently holding the transaction
1709 * open.
1710 */
1711 if (!full_sync) {
1712 ret = btrfs_end_transaction(trans);
1713 if (ret)
1714 goto out;
1715 ret = btrfs_wait_ordered_range(inode, start, len);
1716 if (ret)
1717 goto out;
1718
1719 /*
1720 * This is safe to use here because we're only interested in
1721 * making sure the transaction that had the ordered extents is
1722 * committed. We aren't waiting on anything past this point,
1723 * we're purely getting the transaction and committing it.
1724 */
1725 trans = btrfs_attach_transaction_barrier(root);
1726 if (IS_ERR(trans)) {
1727 ret = PTR_ERR(trans);
1728
1729 /*
1730 * We committed the transaction and there's no currently
1731 * running transaction, this means everything we care
1732 * about made it to disk and we are done.
1733 */
1734 if (ret == -ENOENT)
1735 ret = 0;
1736 goto out;
1737 }
1738 }
1739
1740 ret = btrfs_commit_transaction(trans);
1741 out:
1742 free_extent_buffer(ctx.scratch_eb);
1743 ASSERT(list_empty(&ctx.list));
1744 ASSERT(list_empty(&ctx.conflict_inodes));
1745 err = file_check_and_advance_wb_err(file);
1746 if (!ret)
1747 ret = err;
1748 return ret > 0 ? -EIO : ret;
1749
1750 out_release_extents:
1751 btrfs_release_log_ctx_extents(&ctx);
1752 if (skip_ilock)
1753 up_write(&inode->i_mmap_lock);
1754 else
1755 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1756 goto out;
1757 }
1758
1759 /*
1760 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1761 * called from a page fault handler when a page is first dirtied. Hence we must
1762 * be careful to check for EOF conditions here. We set the page up correctly
1763 * for a written page which means we get ENOSPC checking when writing into
1764 * holes and correct delalloc and unwritten extent mapping on filesystems that
1765 * support these features.
1766 *
1767 * We are not allowed to take the i_mutex here so we have to play games to
1768 * protect against truncate races as the page could now be beyond EOF. Because
1769 * truncate_setsize() writes the inode size before removing pages, once we have
1770 * the page lock we can determine safely if the page is beyond EOF. If it is not
1771 * beyond EOF, then the page is guaranteed safe against truncation until we
1772 * unlock the page.
1773 */
static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1775 {
1776 struct page *page = vmf->page;
1777 struct folio *folio = page_folio(page);
1778 struct inode *inode = file_inode(vmf->vma->vm_file);
1779 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1780 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1781 struct btrfs_ordered_extent *ordered;
1782 struct extent_state *cached_state = NULL;
1783 struct extent_changeset *data_reserved = NULL;
1784 unsigned long zero_start;
1785 loff_t size;
1786 vm_fault_t ret;
1787 int ret2;
1788 int reserved = 0;
1789 u64 reserved_space;
1790 u64 page_start;
1791 u64 page_end;
1792 u64 end;
1793
1794 ASSERT(folio_order(folio) == 0);
1795
1796 reserved_space = PAGE_SIZE;
1797
1798 sb_start_pagefault(inode->i_sb);
1799 page_start = folio_pos(folio);
1800 page_end = page_start + folio_size(folio) - 1;
1801 end = page_end;
1802
1803 /*
1804 * Reserving delalloc space after obtaining the page lock can lead to
1805 * deadlock. For example, if a dirty page is locked by this function
1806 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1807 * dirty page write out, then the btrfs_writepages() function could
1808 * end up waiting indefinitely to get a lock on the page currently
1809 * being processed by btrfs_page_mkwrite() function.
1810 */
1811 ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
1812 page_start, reserved_space);
1813 if (!ret2) {
1814 ret2 = file_update_time(vmf->vma->vm_file);
1815 reserved = 1;
1816 }
1817 if (ret2) {
1818 ret = vmf_error(ret2);
1819 if (reserved)
1820 goto out;
1821 goto out_noreserve;
1822 }
1823
1824 /* Make the VM retry the fault. */
1825 ret = VM_FAULT_NOPAGE;
1826 again:
1827 down_read(&BTRFS_I(inode)->i_mmap_lock);
1828 folio_lock(folio);
1829 size = i_size_read(inode);
1830
1831 if ((folio->mapping != inode->i_mapping) ||
1832 (page_start >= size)) {
1833 /* Page got truncated out from underneath us. */
1834 goto out_unlock;
1835 }
1836 folio_wait_writeback(folio);
1837
1838 lock_extent(io_tree, page_start, page_end, &cached_state);
1839 ret2 = set_folio_extent_mapped(folio);
1840 if (ret2 < 0) {
1841 ret = vmf_error(ret2);
1842 unlock_extent(io_tree, page_start, page_end, &cached_state);
1843 goto out_unlock;
1844 }
1845
1846 /*
1847 * We can't set the delalloc bits if there are pending ordered
1848 * extents. Drop our locks and wait for them to finish.
1849 */
1850 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
1851 if (ordered) {
1852 unlock_extent(io_tree, page_start, page_end, &cached_state);
1853 folio_unlock(folio);
1854 up_read(&BTRFS_I(inode)->i_mmap_lock);
1855 btrfs_start_ordered_extent(ordered);
1856 btrfs_put_ordered_extent(ordered);
1857 goto again;
1858 }
1859
1860 if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
1861 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1862 if (reserved_space < PAGE_SIZE) {
1863 end = page_start + reserved_space - 1;
1864 btrfs_delalloc_release_space(BTRFS_I(inode),
1865 data_reserved, page_start,
1866 PAGE_SIZE - reserved_space, true);
1867 }
1868 }
1869
1870 /*
1871 * page_mkwrite gets called when the page is first dirtied after it's
1872 * faulted in, but write(2) can also dirty a page and set delalloc bits.
1873 * In that case, for space accounting reasons, we still need to clear any
1874 * delalloc bits within this page range, since we had to reserve data and
1875 * metadata space before lock_page() (see the comments above).
1876 */
1877 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
1878 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1879 EXTENT_DEFRAG, &cached_state);
1880
1881 ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
1882 &cached_state);
1883 if (ret2) {
1884 unlock_extent(io_tree, page_start, page_end, &cached_state);
1885 ret = VM_FAULT_SIGBUS;
1886 goto out_unlock;
1887 }
1888
1889 /* Page is wholly or partially inside EOF. */
1890 if (page_start + folio_size(folio) > size)
1891 zero_start = offset_in_folio(folio, size);
1892 else
1893 zero_start = PAGE_SIZE;
1894
1895 if (zero_start != PAGE_SIZE)
1896 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1897
1898 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
1899 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1900 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1901
1902 btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
1903
1904 unlock_extent(io_tree, page_start, page_end, &cached_state);
1905 up_read(&BTRFS_I(inode)->i_mmap_lock);
1906
1907 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
1908 sb_end_pagefault(inode->i_sb);
1909 extent_changeset_free(data_reserved);
1910 return VM_FAULT_LOCKED;
1911
1912 out_unlock:
1913 folio_unlock(folio);
1914 up_read(&BTRFS_I(inode)->i_mmap_lock);
1915 out:
1916 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
1917 btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
1918 reserved_space, (ret != 0));
1919 out_noreserve:
1920 sb_end_pagefault(inode->i_sb);
1921 extent_changeset_free(data_reserved);
1922 return ret;
1923 }
1924
1925 static const struct vm_operations_struct btrfs_file_vm_ops = {
1926 .fault = filemap_fault,
1927 .map_pages = filemap_map_pages,
1928 .page_mkwrite = btrfs_page_mkwrite,
1929 };
1930
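/*
 * mmap hook: we only need to hook page_mkwrite (via btrfs_file_vm_ops),
 * faults are served by the generic filemap handlers, so refuse mappings if
 * the address space has no read_folio operation.
 */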
1931 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1932 {
1933 struct address_space *mapping = filp->f_mapping;
1934
1935 if (!mapping->a_ops->read_folio)
1936 return -ENOEXEC;
1937
1938 file_accessed(filp);
1939 vma->vm_ops = &btrfs_file_vm_ops;
1940
1941 return 0;
1942 }
1943
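/*
 * Check whether the file extent item at @slot is a hole (a regular extent
 * with a zero disk bytenr) adjacent to the range [start, end), in which case
 * a new hole can be merged into it instead of inserting a new item.
 */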
1944 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
1945 int slot, u64 start, u64 end)
1946 {
1947 struct btrfs_file_extent_item *fi;
1948 struct btrfs_key key;
1949
1950 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1951 return 0;
1952
1953 btrfs_item_key_to_cpu(leaf, &key, slot);
1954 if (key.objectid != btrfs_ino(inode) ||
1955 key.type != BTRFS_EXTENT_DATA_KEY)
1956 return 0;
1957
1958 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1959
1960 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1961 return 0;
1962
1963 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1964 return 0;
1965
1966 if (key.offset == end)
1967 return 1;
1968 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1969 return 1;
1970 return 0;
1971 }
1972
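/*
 * Insert (or merge into a neighbouring hole) a file extent item representing
 * a hole for the range [offset, end) and update the extent map tree to match.
 * With the NO_HOLES feature no item is inserted, only the extent maps are
 * updated.
 */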
1973 static int fill_holes(struct btrfs_trans_handle *trans,
1974 struct btrfs_inode *inode,
1975 struct btrfs_path *path, u64 offset, u64 end)
1976 {
1977 struct btrfs_fs_info *fs_info = trans->fs_info;
1978 struct btrfs_root *root = inode->root;
1979 struct extent_buffer *leaf;
1980 struct btrfs_file_extent_item *fi;
1981 struct extent_map *hole_em;
1982 struct btrfs_key key;
1983 int ret;
1984
1985 if (btrfs_fs_incompat(fs_info, NO_HOLES))
1986 goto out;
1987
1988 key.objectid = btrfs_ino(inode);
1989 key.type = BTRFS_EXTENT_DATA_KEY;
1990 key.offset = offset;
1991
1992 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1993 if (ret <= 0) {
1994 /*
1995 * We should have dropped this offset, so if we find it then
1996 * something has gone horribly wrong.
1997 */
1998 if (ret == 0)
1999 ret = -EINVAL;
2000 return ret;
2001 }
2002
2003 leaf = path->nodes[0];
2004 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2005 u64 num_bytes;
2006
2007 path->slots[0]--;
2008 fi = btrfs_item_ptr(leaf, path->slots[0],
2009 struct btrfs_file_extent_item);
2010 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2011 end - offset;
2012 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2013 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2014 btrfs_set_file_extent_offset(leaf, fi, 0);
2015 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2016 goto out;
2017 }
2018
2019 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2020 u64 num_bytes;
2021
2022 key.offset = offset;
2023 btrfs_set_item_key_safe(trans, path, &key);
2024 fi = btrfs_item_ptr(leaf, path->slots[0],
2025 struct btrfs_file_extent_item);
2026 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2027 offset;
2028 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2029 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2030 btrfs_set_file_extent_offset(leaf, fi, 0);
2031 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2032 goto out;
2033 }
2034 btrfs_release_path(path);
2035
2036 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2037 end - offset);
2038 if (ret)
2039 return ret;
2040
2041 out:
2042 btrfs_release_path(path);
2043
2044 hole_em = alloc_extent_map();
2045 if (!hole_em) {
2046 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2047 btrfs_set_inode_full_sync(inode);
2048 } else {
2049 hole_em->start = offset;
2050 hole_em->len = end - offset;
2051 hole_em->ram_bytes = hole_em->len;
2052
2053 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2054 hole_em->disk_num_bytes = 0;
2055 hole_em->generation = trans->transid;
2056
2057 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2058 free_extent_map(hole_em);
2059 if (ret)
2060 btrfs_set_inode_full_sync(inode);
2061 }
2062
2063 return 0;
2064 }
2065
2066 /*
2067 * Find a hole extent on the given inode and change start/len to the end of
2068 * the hole extent (a hole/vacuum extent is one whose em->start <= start &&
2069 * em->start + em->len > start).
2070 * When a hole extent is found, return 1 and modify start/len.
2071 */
2072 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2073 {
2074 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2075 struct extent_map *em;
2076 int ret = 0;
2077
2078 em = btrfs_get_extent(inode, NULL,
2079 round_down(*start, fs_info->sectorsize),
2080 round_up(*len, fs_info->sectorsize));
2081 if (IS_ERR(em))
2082 return PTR_ERR(em);
2083
2084 /* Hole or vacuum extent (only exists in no-hole mode) */
2085 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2086 ret = 1;
2087 *len = em->start + em->len > *start + *len ?
2088 0 : *start + *len - em->start - em->len;
2089 *start = em->start + em->len;
2090 }
2091 free_extent_map(em);
2092 return ret;
2093 }
2094
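/*
 * Truncate the page cache for the range and lock it in the inode's io tree,
 * retrying until no pages remain in the range (readers can race with us and
 * bring pages back into the page cache).
 */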
2095 static void btrfs_punch_hole_lock_range(struct inode *inode,
2096 const u64 lockstart,
2097 const u64 lockend,
2098 struct extent_state **cached_state)
2099 {
2100 /*
2101 * For the subpage case, if the range is not aligned to the page boundary,
2102 * we could have pages at the leading/trailing part of the range.
2103 * This could lead to an endless loop since filemap_range_has_page()
2104 * would always return true.
2105 * So here we need to do extra page alignment for
2106 * filemap_range_has_page().
2107 */
2108 const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2109 const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2110
2111 while (1) {
2112 truncate_pagecache_range(inode, lockstart, lockend);
2113
2114 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2115 cached_state);
2116 /*
2117 * We can't have ordered extents in the range, nor dirty/writeback
2118 * pages, because we have locked the inode's VFS lock in exclusive
2119 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2120 * we have flushed all delalloc in the range and we have waited
2121 * for any ordered extents in the range to complete.
2122 * We can race with anyone reading pages from this range, so after
2123 * locking the range check if we have pages in the range, and if
2124 * we do, unlock the range and retry.
2125 */
2126 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2127 page_lockend))
2128 break;
2129
2130 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2131 cached_state);
2132 }
2133
2134 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2135 }
2136
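/*
 * Insert a file extent item described by @extent_info at its current
 * file_offset, covering @replace_len bytes. For non-hole extents also take a
 * reference on the underlying data extent: the reserved extent is claimed for
 * a newly allocated one, otherwise a delayed ref increment is queued for a
 * cloned one. With the NO_HOLES feature holes get no item at all.
 */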
2137 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2138 struct btrfs_inode *inode,
2139 struct btrfs_path *path,
2140 struct btrfs_replace_extent_info *extent_info,
2141 const u64 replace_len,
2142 const u64 bytes_to_drop)
2143 {
2144 struct btrfs_fs_info *fs_info = trans->fs_info;
2145 struct btrfs_root *root = inode->root;
2146 struct btrfs_file_extent_item *extent;
2147 struct extent_buffer *leaf;
2148 struct btrfs_key key;
2149 int slot;
2150 int ret;
2151
2152 if (replace_len == 0)
2153 return 0;
2154
2155 if (extent_info->disk_offset == 0 &&
2156 btrfs_fs_incompat(fs_info, NO_HOLES)) {
2157 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2158 return 0;
2159 }
2160
2161 key.objectid = btrfs_ino(inode);
2162 key.type = BTRFS_EXTENT_DATA_KEY;
2163 key.offset = extent_info->file_offset;
2164 ret = btrfs_insert_empty_item(trans, root, path, &key,
2165 sizeof(struct btrfs_file_extent_item));
2166 if (ret)
2167 return ret;
2168 leaf = path->nodes[0];
2169 slot = path->slots[0];
2170 write_extent_buffer(leaf, extent_info->extent_buf,
2171 btrfs_item_ptr_offset(leaf, slot),
2172 sizeof(struct btrfs_file_extent_item));
2173 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2174 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2175 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2176 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2177 if (extent_info->is_new_extent)
2178 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2179 btrfs_release_path(path);
2180
2181 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2182 replace_len);
2183 if (ret)
2184 return ret;
2185
2186 /* If it's a hole, nothing more needs to be done. */
2187 if (extent_info->disk_offset == 0) {
2188 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2189 return 0;
2190 }
2191
2192 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2193
2194 if (extent_info->is_new_extent && extent_info->insertions == 0) {
2195 key.objectid = extent_info->disk_offset;
2196 key.type = BTRFS_EXTENT_ITEM_KEY;
2197 key.offset = extent_info->disk_len;
2198 ret = btrfs_alloc_reserved_file_extent(trans, root,
2199 btrfs_ino(inode),
2200 extent_info->file_offset,
2201 extent_info->qgroup_reserved,
2202 &key);
2203 } else {
2204 struct btrfs_ref ref = {
2205 .action = BTRFS_ADD_DELAYED_REF,
2206 .bytenr = extent_info->disk_offset,
2207 .num_bytes = extent_info->disk_len,
2208 .owning_root = btrfs_root_id(root),
2209 .ref_root = btrfs_root_id(root),
2210 };
2211 u64 ref_offset;
2212
2213 ref_offset = extent_info->file_offset - extent_info->data_offset;
2214 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2215 ret = btrfs_inc_extent_ref(trans, &ref);
2216 }
2217
2218 extent_info->insertions++;
2219
2220 return ret;
2221 }
2222
2223 /*
2224 * The respective range must have been previously locked, as well as the inode.
2225 * The end offset is inclusive (last byte of the range).
2226 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2227 * the file range with an extent.
2228 * When not punching a hole, we don't want to end up in a state where we dropped
2229 * extents without inserting a new one, so we must abort the transaction to avoid
2230 * a corruption.
2231 */
2232 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2233 struct btrfs_path *path, const u64 start,
2234 const u64 end,
2235 struct btrfs_replace_extent_info *extent_info,
2236 struct btrfs_trans_handle **trans_out)
2237 {
2238 struct btrfs_drop_extents_args drop_args = { 0 };
2239 struct btrfs_root *root = inode->root;
2240 struct btrfs_fs_info *fs_info = root->fs_info;
2241 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2242 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2243 struct btrfs_trans_handle *trans = NULL;
2244 struct btrfs_block_rsv *rsv;
2245 unsigned int rsv_count;
2246 u64 cur_offset;
2247 u64 len = end - start;
2248 int ret = 0;
2249
2250 if (end <= start)
2251 return -EINVAL;
2252
2253 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2254 if (!rsv) {
2255 ret = -ENOMEM;
2256 goto out;
2257 }
2258 rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2259 rsv->failfast = true;
2260
2261 /*
2262 * 1 - update the inode
2263 * 1 - removing the extents in the range
2264 * 1 - adding the hole extent if no_holes isn't set or if we are
2265 * replacing the range with a new extent
2266 */
2267 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2268 rsv_count = 3;
2269 else
2270 rsv_count = 2;
2271
2272 trans = btrfs_start_transaction(root, rsv_count);
2273 if (IS_ERR(trans)) {
2274 ret = PTR_ERR(trans);
2275 trans = NULL;
2276 goto out_free;
2277 }
2278
2279 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2280 min_size, false);
2281 if (WARN_ON(ret))
2282 goto out_trans;
2283 trans->block_rsv = rsv;
2284
2285 cur_offset = start;
2286 drop_args.path = path;
2287 drop_args.end = end + 1;
2288 drop_args.drop_cache = true;
2289 while (cur_offset < end) {
2290 drop_args.start = cur_offset;
2291 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2292 /* If we are punching a hole decrement the inode's byte count */
2293 if (!extent_info)
2294 btrfs_update_inode_bytes(inode, 0,
2295 drop_args.bytes_found);
2296 if (ret != -ENOSPC) {
2297 /*
2298 * The only time we don't want to abort is if we are
2299 * attempting to clone a partial inline extent, in which
2300 * case we'll get EOPNOTSUPP. However if we aren't
2301 * cloning we need to abort no matter what, because if we
2302 * got EOPNOTSUPP via prealloc then we messed up and
2303 * need to abort.
2304 */
2305 if (ret &&
2306 (ret != -EOPNOTSUPP ||
2307 (extent_info && extent_info->is_new_extent)))
2308 btrfs_abort_transaction(trans, ret);
2309 break;
2310 }
2311
2312 trans->block_rsv = &fs_info->trans_block_rsv;
2313
2314 if (!extent_info && cur_offset < drop_args.drop_end &&
2315 cur_offset < ino_size) {
2316 ret = fill_holes(trans, inode, path, cur_offset,
2317 drop_args.drop_end);
2318 if (ret) {
2319 /*
2320 * If we failed then we didn't insert our hole
2321 * entries for the area we dropped, so now the
2322 * fs is corrupted, so we must abort the
2323 * transaction.
2324 */
2325 btrfs_abort_transaction(trans, ret);
2326 break;
2327 }
2328 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2329 /*
2330 * We are past the i_size here, but since we didn't
2331 * insert holes we need to clear the mapped area so we
2332 * know to not set disk_i_size in this area until a new
2333 * file extent is inserted here.
2334 */
2335 ret = btrfs_inode_clear_file_extent_range(inode,
2336 cur_offset,
2337 drop_args.drop_end - cur_offset);
2338 if (ret) {
2339 /*
2340 * We couldn't clear our area, so we could
2341 * presumably adjust up and corrupt the fs, so
2342 * we need to abort.
2343 */
2344 btrfs_abort_transaction(trans, ret);
2345 break;
2346 }
2347 }
2348
2349 if (extent_info &&
2350 drop_args.drop_end > extent_info->file_offset) {
2351 u64 replace_len = drop_args.drop_end -
2352 extent_info->file_offset;
2353
2354 ret = btrfs_insert_replace_extent(trans, inode, path,
2355 extent_info, replace_len,
2356 drop_args.bytes_found);
2357 if (ret) {
2358 btrfs_abort_transaction(trans, ret);
2359 break;
2360 }
2361 extent_info->data_len -= replace_len;
2362 extent_info->data_offset += replace_len;
2363 extent_info->file_offset += replace_len;
2364 }
2365
2366 /*
2367 * We are releasing our handle on the transaction, balance the
2368 * dirty pages of the btree inode and flush delayed items, and
2369 * then get a new transaction handle, which may now point to a
2370 * new transaction in case someone else may have committed the
2371 * transaction we used to replace/drop file extent items. So
2372 * bump the inode's iversion and update mtime and ctime except
2373 * if we are called from a dedupe context. This is because a
2374 * power failure/crash may happen after the transaction is
2375 * committed and before we finish replacing/dropping all the
2376 * file extent items we need.
2377 */
2378 inode_inc_iversion(&inode->vfs_inode);
2379
2380 if (!extent_info || extent_info->update_times)
2381 inode_set_mtime_to_ts(&inode->vfs_inode,
2382 inode_set_ctime_current(&inode->vfs_inode));
2383
2384 ret = btrfs_update_inode(trans, inode);
2385 if (ret)
2386 break;
2387
2388 btrfs_end_transaction(trans);
2389 btrfs_btree_balance_dirty(fs_info);
2390
2391 trans = btrfs_start_transaction(root, rsv_count);
2392 if (IS_ERR(trans)) {
2393 ret = PTR_ERR(trans);
2394 trans = NULL;
2395 break;
2396 }
2397
2398 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2399 rsv, min_size, false);
2400 if (WARN_ON(ret))
2401 break;
2402 trans->block_rsv = rsv;
2403
2404 cur_offset = drop_args.drop_end;
2405 len = end - cur_offset;
2406 if (!extent_info && len) {
2407 ret = find_first_non_hole(inode, &cur_offset, &len);
2408 if (unlikely(ret < 0))
2409 break;
2410 if (ret && !len) {
2411 ret = 0;
2412 break;
2413 }
2414 }
2415 }
2416
2417 /*
2418 * If we were cloning, force the next fsync to be a full one since we
2419 * replaced (or just dropped in the case of cloning holes when
2420 * NO_HOLES is enabled) file extent items and did not set up new extent
2421 * maps for the replacement extents (or holes).
2422 */
2423 if (extent_info && !extent_info->is_new_extent)
2424 btrfs_set_inode_full_sync(inode);
2425
2426 if (ret)
2427 goto out_trans;
2428
2429 trans->block_rsv = &fs_info->trans_block_rsv;
2430 /*
2431 * If we are using the NO_HOLES feature we might already have had a
2432 * hole that overlaps a part of the region [lockstart, lockend] and
2433 * ends at (or beyond) lockend. Since we have no file extent items to
2434 * represent holes, drop_end can be less than lockend and so we must
2435 * make sure we have an extent map representing the existing hole (the
2436 * call to __btrfs_drop_extents() might have dropped the existing extent
2437 * map representing the existing hole), otherwise the fast fsync path
2438 * will not record the existence of the hole region
2439 * [existing_hole_start, lockend].
2440 */
2441 if (drop_args.drop_end <= end)
2442 drop_args.drop_end = end + 1;
2443 /*
2444 * Don't insert file hole extent item if it's for a range beyond eof
2445 * (because it's useless) or if it represents a 0 bytes range (when
2446 * cur_offset == drop_end).
2447 */
2448 if (!extent_info && cur_offset < ino_size &&
2449 cur_offset < drop_args.drop_end) {
2450 ret = fill_holes(trans, inode, path, cur_offset,
2451 drop_args.drop_end);
2452 if (ret) {
2453 /* Same comment as above. */
2454 btrfs_abort_transaction(trans, ret);
2455 goto out_trans;
2456 }
2457 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2458 /* See the comment in the loop above for the reasoning here. */
2459 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2460 drop_args.drop_end - cur_offset);
2461 if (ret) {
2462 btrfs_abort_transaction(trans, ret);
2463 goto out_trans;
2464 }
2465
2466 }
2467 if (extent_info) {
2468 ret = btrfs_insert_replace_extent(trans, inode, path,
2469 extent_info, extent_info->data_len,
2470 drop_args.bytes_found);
2471 if (ret) {
2472 btrfs_abort_transaction(trans, ret);
2473 goto out_trans;
2474 }
2475 }
2476
2477 out_trans:
2478 if (!trans)
2479 goto out_free;
2480
2481 trans->block_rsv = &fs_info->trans_block_rsv;
2482 if (ret)
2483 btrfs_end_transaction(trans);
2484 else
2485 *trans_out = trans;
2486 out_free:
2487 btrfs_free_block_rsv(fs_info, rsv);
2488 out:
2489 return ret;
2490 }
2491
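/*
 * Punch a hole in the range [offset, offset + len): zero out any partial
 * blocks at the edges, drop the file extent items fully covered by the range
 * and, unless the NO_HOLES feature is enabled, insert hole extent items in
 * their place.
 */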
2492 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2493 {
2494 struct inode *inode = file_inode(file);
2495 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2496 struct btrfs_root *root = BTRFS_I(inode)->root;
2497 struct extent_state *cached_state = NULL;
2498 struct btrfs_path *path;
2499 struct btrfs_trans_handle *trans = NULL;
2500 u64 lockstart;
2501 u64 lockend;
2502 u64 tail_start;
2503 u64 tail_len;
2504 u64 orig_start = offset;
2505 int ret = 0;
2506 bool same_block;
2507 u64 ino_size;
2508 bool truncated_block = false;
2509 bool updated_inode = false;
2510
2511 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2512
2513 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2514 if (ret)
2515 goto out_only_mutex;
2516
2517 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2518 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2519 if (ret < 0)
2520 goto out_only_mutex;
2521 if (ret && !len) {
2522 /* Already in a large hole */
2523 ret = 0;
2524 goto out_only_mutex;
2525 }
2526
2527 ret = file_modified(file);
2528 if (ret)
2529 goto out_only_mutex;
2530
2531 lockstart = round_up(offset, fs_info->sectorsize);
2532 lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2533 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2534 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2535 /*
2536 * We needn't truncate any block which is beyond the end of the file
2537 * because we are sure there is no data there.
2538 */
2539 /*
2540 * Only do this if we are in the same block and we aren't doing the
2541 * entire block.
2542 */
2543 if (same_block && len < fs_info->sectorsize) {
2544 if (offset < ino_size) {
2545 truncated_block = true;
2546 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2547 0);
2548 } else {
2549 ret = 0;
2550 }
2551 goto out_only_mutex;
2552 }
2553
2554 /* zero back part of the first block */
2555 if (offset < ino_size) {
2556 truncated_block = true;
2557 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2558 if (ret) {
2559 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2560 return ret;
2561 }
2562 }
2563
2564 /* Check the aligned pages after the first unaligned page. If
2565 * offset != orig_start, the first unaligned page and several following
2566 * pages are already in holes and the extra check can be skipped.
2567 */
2568 if (offset == orig_start) {
2569 /* after truncate page, check hole again */
2570 len = offset + len - lockstart;
2571 offset = lockstart;
2572 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2573 if (ret < 0)
2574 goto out_only_mutex;
2575 if (ret && !len) {
2576 ret = 0;
2577 goto out_only_mutex;
2578 }
2579 lockstart = offset;
2580 }
2581
2582 /* Check the tail unaligned part is in a hole */
2583 tail_start = lockend + 1;
2584 tail_len = offset + len - tail_start;
2585 if (tail_len) {
2586 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2587 if (unlikely(ret < 0))
2588 goto out_only_mutex;
2589 if (!ret) {
2590 /* zero the front end of the last page */
2591 if (tail_start + tail_len < ino_size) {
2592 truncated_block = true;
2593 ret = btrfs_truncate_block(BTRFS_I(inode),
2594 tail_start + tail_len,
2595 0, 1);
2596 if (ret)
2597 goto out_only_mutex;
2598 }
2599 }
2600 }
2601
2602 if (lockend < lockstart) {
2603 ret = 0;
2604 goto out_only_mutex;
2605 }
2606
2607 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2608
2609 path = btrfs_alloc_path();
2610 if (!path) {
2611 ret = -ENOMEM;
2612 goto out;
2613 }
2614
2615 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2616 lockend, NULL, &trans);
2617 btrfs_free_path(path);
2618 if (ret)
2619 goto out;
2620
2621 ASSERT(trans != NULL);
2622 inode_inc_iversion(inode);
2623 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2624 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2625 updated_inode = true;
2626 btrfs_end_transaction(trans);
2627 btrfs_btree_balance_dirty(fs_info);
2628 out:
2629 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2630 &cached_state);
2631 out_only_mutex:
2632 if (!updated_inode && truncated_block && !ret) {
2633 /*
2634 * If we only end up zeroing part of a page, we still need to
2635 * update the inode item, so that all the time fields are
2636 * updated as well as the necessary btrfs inode in memory fields
2637 * for detecting, at fsync time, if the inode isn't yet in the
2638 * log tree or it's there but not up to date.
2639 */
2640 struct timespec64 now = inode_set_ctime_current(inode);
2641
2642 inode_inc_iversion(inode);
2643 inode_set_mtime_to_ts(inode, now);
2644 trans = btrfs_start_transaction(root, 1);
2645 if (IS_ERR(trans)) {
2646 ret = PTR_ERR(trans);
2647 } else {
2648 int ret2;
2649
2650 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2651 ret2 = btrfs_end_transaction(trans);
2652 if (!ret)
2653 ret = ret2;
2654 }
2655 }
2656 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2657 return ret;
2658 }
2659
2660 /* Helper structure to record which range is already reserved */
2661 struct falloc_range {
2662 struct list_head list;
2663 u64 start;
2664 u64 len;
2665 };
2666
2667 /*
2668 * Helper function to add falloc range
2669 *
2670 * Caller should have locked the larger range of extent containing
2671 * [start, len)
2672 */
2673 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2674 {
2675 struct falloc_range *range = NULL;
2676
2677 if (!list_empty(head)) {
2678 /*
2679 * As fallocate iterates by bytenr order, we only need to check
2680 * the last range.
2681 */
2682 range = list_last_entry(head, struct falloc_range, list);
2683 if (range->start + range->len == start) {
2684 range->len += len;
2685 return 0;
2686 }
2687 }
2688
2689 range = kmalloc(sizeof(*range), GFP_KERNEL);
2690 if (!range)
2691 return -ENOMEM;
2692 range->start = start;
2693 range->len = len;
2694 list_add_tail(&range->list, head);
2695 return 0;
2696 }
2697
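/*
 * Update i_size and the inode item after a fallocate that extends the file,
 * unless FALLOC_FL_KEEP_SIZE was requested or the file already covers @end.
 */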
2698 static int btrfs_fallocate_update_isize(struct inode *inode,
2699 const u64 end,
2700 const int mode)
2701 {
2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 int ret;
2705 int ret2;
2706
2707 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2708 return 0;
2709
2710 trans = btrfs_start_transaction(root, 1);
2711 if (IS_ERR(trans))
2712 return PTR_ERR(trans);
2713
2714 inode_set_ctime_current(inode);
2715 i_size_write(inode, end);
2716 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2717 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2718 ret2 = btrfs_end_transaction(trans);
2719
2720 return ret ? ret : ret2;
2721 }
2722
2723 enum {
2724 RANGE_BOUNDARY_WRITTEN_EXTENT,
2725 RANGE_BOUNDARY_PREALLOC_EXTENT,
2726 RANGE_BOUNDARY_HOLE,
2727 };
2728
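/*
 * Classify the block containing @offset as a written extent, a prealloc
 * extent or a hole. Used by btrfs_zero_range() to decide whether an unaligned
 * range boundary needs to be zeroed in place or covered by the allocation.
 */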
2729 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2730 u64 offset)
2731 {
2732 const u64 sectorsize = inode->root->fs_info->sectorsize;
2733 struct extent_map *em;
2734 int ret;
2735
2736 offset = round_down(offset, sectorsize);
2737 em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2738 if (IS_ERR(em))
2739 return PTR_ERR(em);
2740
2741 if (em->disk_bytenr == EXTENT_MAP_HOLE)
2742 ret = RANGE_BOUNDARY_HOLE;
2743 else if (em->flags & EXTENT_FLAG_PREALLOC)
2744 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2745 else
2746 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2747
2748 free_extent_map(em);
2749 return ret;
2750 }
2751
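/*
 * Implement FALLOC_FL_ZERO_RANGE: zero partial blocks at the range boundaries
 * and cover the rest of the range with a preallocated (unwritten) extent,
 * reusing any prealloc extent that already covers the start of the range.
 */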
2752 static int btrfs_zero_range(struct inode *inode,
2753 loff_t offset,
2754 loff_t len,
2755 const int mode)
2756 {
2757 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2758 struct extent_map *em;
2759 struct extent_changeset *data_reserved = NULL;
2760 int ret;
2761 u64 alloc_hint = 0;
2762 const u64 sectorsize = fs_info->sectorsize;
2763 u64 alloc_start = round_down(offset, sectorsize);
2764 u64 alloc_end = round_up(offset + len, sectorsize);
2765 u64 bytes_to_reserve = 0;
2766 bool space_reserved = false;
2767
2768 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
2769 alloc_end - alloc_start);
2770 if (IS_ERR(em)) {
2771 ret = PTR_ERR(em);
2772 goto out;
2773 }
2774
2775 /*
2776 * Avoid hole punching and extent allocation for some cases. More cases
2777 * could be considered, but they are unlikely to be common and we keep things
2778 * as simple as possible for now. Also, intentionally, if the target
2779 * range contains one or more prealloc extents together with regular
2780 * extents and holes, we drop all the existing extents and allocate a
2781 * new prealloc extent, so that we get a larger contiguous disk extent.
2782 */
2783 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2784 const u64 em_end = em->start + em->len;
2785
2786 if (em_end >= offset + len) {
2787 /*
2788 * The whole range is already a prealloc extent,
2789 * do nothing except updating the inode's i_size if
2790 * needed.
2791 */
2792 free_extent_map(em);
2793 ret = btrfs_fallocate_update_isize(inode, offset + len,
2794 mode);
2795 goto out;
2796 }
2797 /*
2798 * Part of the range is already a prealloc extent, so operate
2799 * only on the remaining part of the range.
2800 */
2801 alloc_start = em_end;
2802 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2803 len = offset + len - alloc_start;
2804 offset = alloc_start;
2805 alloc_hint = extent_map_block_start(em) + em->len;
2806 }
2807 free_extent_map(em);
2808
2809 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2810 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2811 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
2812 if (IS_ERR(em)) {
2813 ret = PTR_ERR(em);
2814 goto out;
2815 }
2816
2817 if (em->flags & EXTENT_FLAG_PREALLOC) {
2818 free_extent_map(em);
2819 ret = btrfs_fallocate_update_isize(inode, offset + len,
2820 mode);
2821 goto out;
2822 }
2823 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
2824 free_extent_map(em);
2825 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2826 0);
2827 if (!ret)
2828 ret = btrfs_fallocate_update_isize(inode,
2829 offset + len,
2830 mode);
2831 return ret;
2832 }
2833 free_extent_map(em);
2834 alloc_start = round_down(offset, sectorsize);
2835 alloc_end = alloc_start + sectorsize;
2836 goto reserve_space;
2837 }
2838
2839 alloc_start = round_up(offset, sectorsize);
2840 alloc_end = round_down(offset + len, sectorsize);
2841
2842 /*
2843 * For unaligned ranges, check the pages at the boundaries, they might
2844 * map to an extent, in which case we need to partially zero them, or
2845 * they might map to a hole, in which case we need our allocation range
2846 * to cover them.
2847 */
2848 if (!IS_ALIGNED(offset, sectorsize)) {
2849 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2850 offset);
2851 if (ret < 0)
2852 goto out;
2853 if (ret == RANGE_BOUNDARY_HOLE) {
2854 alloc_start = round_down(offset, sectorsize);
2855 ret = 0;
2856 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2857 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2858 if (ret)
2859 goto out;
2860 } else {
2861 ret = 0;
2862 }
2863 }
2864
2865 if (!IS_ALIGNED(offset + len, sectorsize)) {
2866 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2867 offset + len);
2868 if (ret < 0)
2869 goto out;
2870 if (ret == RANGE_BOUNDARY_HOLE) {
2871 alloc_end = round_up(offset + len, sectorsize);
2872 ret = 0;
2873 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2874 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2875 0, 1);
2876 if (ret)
2877 goto out;
2878 } else {
2879 ret = 0;
2880 }
2881 }
2882
2883 reserve_space:
2884 if (alloc_start < alloc_end) {
2885 struct extent_state *cached_state = NULL;
2886 const u64 lockstart = alloc_start;
2887 const u64 lockend = alloc_end - 1;
2888
2889 bytes_to_reserve = alloc_end - alloc_start;
2890 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2891 bytes_to_reserve);
2892 if (ret < 0)
2893 goto out;
2894 space_reserved = true;
2895 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2896 &cached_state);
2897 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2898 alloc_start, bytes_to_reserve);
2899 if (ret) {
2900 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
2901 lockend, &cached_state);
2902 goto out;
2903 }
2904 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
2905 alloc_end - alloc_start,
2906 fs_info->sectorsize,
2907 offset + len, &alloc_hint);
2908 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2909 &cached_state);
2910 /* btrfs_prealloc_file_range releases reserved space on error */
2911 if (ret) {
2912 space_reserved = false;
2913 goto out;
2914 }
2915 }
2916 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
2917 out:
2918 if (ret && space_reserved)
2919 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
2920 alloc_start, bytes_to_reserve);
2921 extent_changeset_free(data_reserved);
2922
2923 return ret;
2924 }
2925
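/*
 * fallocate entry point. Hole punching and zero range are handled by their
 * own helpers; for plain preallocation we walk the range, reserve data and
 * qgroup space for the holes found, and allocate prealloc extents to cover
 * them.
 */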
2926 static long btrfs_fallocate(struct file *file, int mode,
2927 loff_t offset, loff_t len)
2928 {
2929 struct inode *inode = file_inode(file);
2930 struct extent_state *cached_state = NULL;
2931 struct extent_changeset *data_reserved = NULL;
2932 struct falloc_range *range;
2933 struct falloc_range *tmp;
2934 LIST_HEAD(reserve_list);
2935 u64 cur_offset;
2936 u64 last_byte;
2937 u64 alloc_start;
2938 u64 alloc_end;
2939 u64 alloc_hint = 0;
2940 u64 locked_end;
2941 u64 actual_end = 0;
2942 u64 data_space_needed = 0;
2943 u64 data_space_reserved = 0;
2944 u64 qgroup_reserved = 0;
2945 struct extent_map *em;
2946 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
2947 int ret;
2948
2949 /* Do not allow fallocate in ZONED mode */
2950 if (btrfs_is_zoned(inode_to_fs_info(inode)))
2951 return -EOPNOTSUPP;
2952
2953 alloc_start = round_down(offset, blocksize);
2954 alloc_end = round_up(offset + len, blocksize);
2955 cur_offset = alloc_start;
2956
2957 /* Make sure we aren't being given some crap mode */
2958 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
2959 FALLOC_FL_ZERO_RANGE))
2960 return -EOPNOTSUPP;
2961
2962 if (mode & FALLOC_FL_PUNCH_HOLE)
2963 return btrfs_punch_hole(file, offset, len);
2964
2965 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2966
2967 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
2968 ret = inode_newsize_ok(inode, offset + len);
2969 if (ret)
2970 goto out;
2971 }
2972
2973 ret = file_modified(file);
2974 if (ret)
2975 goto out;
2976
2977 /*
2978 * TODO: Move these two operations after we have checked
2979 * accurate reserved space, or fallocate can still fail but
2980 * with page truncated or size expanded.
2981 *
2982 * But that's a minor problem and won't do much harm BTW.
2983 */
2984 if (alloc_start > inode->i_size) {
2985 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
2986 alloc_start);
2987 if (ret)
2988 goto out;
2989 } else if (offset + len > inode->i_size) {
2990 /*
2991 * If we are fallocating from the end of the file onward we
2992 * need to zero out the end of the block if i_size lands in the
2993 * middle of a block.
2994 */
2995 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
2996 if (ret)
2997 goto out;
2998 }
2999
3000 /*
3001 * We have locked the inode at the VFS level (in exclusive mode) and we
3002 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3003 * locking the file range, flush all delalloc in the range and wait for
3004 * all ordered extents in the range to complete. After this we can lock
3005 * the file range and, due to the previous locking we did, we know there
3006 * can't be more delalloc or ordered extents in the range.
3007 */
3008 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
3009 alloc_end - alloc_start);
3010 if (ret)
3011 goto out;
3012
3013 if (mode & FALLOC_FL_ZERO_RANGE) {
3014 ret = btrfs_zero_range(inode, offset, len, mode);
3015 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3016 return ret;
3017 }
3018
3019 locked_end = alloc_end - 1;
3020 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3021 &cached_state);
3022
3023 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3024
3025 /* First, check if we exceed the qgroup limit */
3026 while (cur_offset < alloc_end) {
3027 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3028 alloc_end - cur_offset);
3029 if (IS_ERR(em)) {
3030 ret = PTR_ERR(em);
3031 break;
3032 }
3033 last_byte = min(extent_map_end(em), alloc_end);
3034 actual_end = min_t(u64, extent_map_end(em), offset + len);
3035 last_byte = ALIGN(last_byte, blocksize);
3036 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
3037 (cur_offset >= inode->i_size &&
3038 !(em->flags & EXTENT_FLAG_PREALLOC))) {
3039 const u64 range_len = last_byte - cur_offset;
3040
3041 ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3042 if (ret < 0) {
3043 free_extent_map(em);
3044 break;
3045 }
3046 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3047 &data_reserved, cur_offset, range_len);
3048 if (ret < 0) {
3049 free_extent_map(em);
3050 break;
3051 }
3052 qgroup_reserved += range_len;
3053 data_space_needed += range_len;
3054 }
3055 free_extent_map(em);
3056 cur_offset = last_byte;
3057 }
3058
3059 if (!ret && data_space_needed > 0) {
3060 /*
3061 * We are safe to reserve space here as we can't have delalloc
3062 * in the range, see above.
3063 */
3064 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3065 data_space_needed);
3066 if (!ret)
3067 data_space_reserved = data_space_needed;
3068 }
3069
3070 /*
3071 * If ret is still 0, it means we're OK to fallocate.
3072 * Otherwise just clean up the list and exit.
3073 */
3074 list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3075 if (!ret) {
3076 ret = btrfs_prealloc_file_range(inode, mode,
3077 range->start,
3078 range->len, blocksize,
3079 offset + len, &alloc_hint);
3080 /*
3081 * btrfs_prealloc_file_range() releases space even
3082 * if it returns an error.
3083 */
3084 data_space_reserved -= range->len;
3085 qgroup_reserved -= range->len;
3086 } else if (data_space_reserved > 0) {
3087 btrfs_free_reserved_data_space(BTRFS_I(inode),
3088 data_reserved, range->start,
3089 range->len);
3090 data_space_reserved -= range->len;
3091 qgroup_reserved -= range->len;
3092 } else if (qgroup_reserved > 0) {
3093 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3094 range->start, range->len, NULL);
3095 qgroup_reserved -= range->len;
3096 }
3097 list_del(&range->list);
3098 kfree(range);
3099 }
3100 if (ret < 0)
3101 goto out_unlock;
3102
3103 /*
3104 * We didn't need to allocate any more space, but we still extended the
3105 * size of the file so we need to update i_size and the inode item.
3106 */
3107 ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3108 out_unlock:
3109 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3110 &cached_state);
3111 out:
3112 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3113 extent_changeset_free(data_reserved);
3114 return ret;
3115 }
3116
3117 /*
3118 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3119 * that has unflushed and/or flushing delalloc. There might be other adjacent
3120 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3121 * looping while it gets adjacent subranges, merging them together.
3122 */
3123 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3124 struct extent_state **cached_state,
3125 bool *search_io_tree,
3126 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3127 {
3128 u64 len = end + 1 - start;
3129 u64 delalloc_len = 0;
3130 struct btrfs_ordered_extent *oe;
3131 u64 oe_start;
3132 u64 oe_end;
3133
3134 /*
3135 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3136 * means we have delalloc (dirty pages) for which writeback has not
3137 * started yet.
3138 */
3139 if (*search_io_tree) {
3140 spin_lock(&inode->lock);
3141 if (inode->delalloc_bytes > 0) {
3142 spin_unlock(&inode->lock);
3143 *delalloc_start_ret = start;
3144 delalloc_len = count_range_bits(&inode->io_tree,
3145 delalloc_start_ret, end,
3146 len, EXTENT_DELALLOC, 1,
3147 cached_state);
3148 } else {
3149 spin_unlock(&inode->lock);
3150 }
3151 }
3152
3153 if (delalloc_len > 0) {
3154 /*
3155 * If delalloc was found then *delalloc_start_ret has a sector size
3156 * aligned value (rounded down).
3157 */
3158 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3159
3160 if (*delalloc_start_ret == start) {
3161 /* Delalloc for the whole range, nothing more to do. */
3162 if (*delalloc_end_ret == end)
3163 return true;
3164 /* Else trim our search range for ordered extents. */
3165 start = *delalloc_end_ret + 1;
3166 len = end + 1 - start;
3167 }
3168 } else {
3169 /* No delalloc, future calls don't need to search again. */
3170 *search_io_tree = false;
3171 }
3172
3173 /*
3174 * Now also check if there's any ordered extent in the range.
3175 * We do this because:
3176 *
3177 * 1) When delalloc is flushed, the file range is locked, we clear the
3178 * EXTENT_DELALLOC bit from the io tree and create an extent map and
3179 * an ordered extent for the write. So we might just have been called
3180 * after delalloc is flushed and before the ordered extent completes
3181 * and inserts the new file extent item in the subvolume's btree;
3182 *
3183 * 2) We may have an ordered extent created by flushing delalloc for a
3184 * subrange that starts before the subrange we found marked with
3185 * EXTENT_DELALLOC in the io tree.
3186 *
3187 * We could also use the extent map tree to find such delalloc that is
3188 * being flushed, but using the ordered extents tree is more efficient
3189 * because it's usually much smaller as ordered extents are removed from
3190 * the tree once they complete. With the extent maps, we may have them
3191 * in the extent map tree for a very long time, and they were either
3192 * created by previous writes or loaded by read operations.
3193 */
3194 oe = btrfs_lookup_first_ordered_range(inode, start, len);
3195 if (!oe)
3196 return (delalloc_len > 0);
3197
3198 /* The ordered extent may span beyond our search range. */
3199 oe_start = max(oe->file_offset, start);
3200 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3201
3202 btrfs_put_ordered_extent(oe);
3203
3204 /* Don't have unflushed delalloc, return the ordered extent range. */
3205 if (delalloc_len == 0) {
3206 *delalloc_start_ret = oe_start;
3207 *delalloc_end_ret = oe_end;
3208 return true;
3209 }
3210
3211 /*
3212 * We have both unflushed delalloc (io_tree) and an ordered extent.
3213 * If the ranges are adjacent, return a combined range, otherwise
3214 * return the leftmost range.
3215 */
3216 if (oe_start < *delalloc_start_ret) {
3217 if (oe_end < *delalloc_start_ret)
3218 *delalloc_end_ret = oe_end;
3219 *delalloc_start_ret = oe_start;
3220 } else if (*delalloc_end_ret + 1 == oe_start) {
3221 *delalloc_end_ret = oe_end;
3222 }
3223
3224 return true;
3225 }
3226
3227 /*
3228 * Check if there's delalloc in a given range.
3229 *
3230 * @inode: The inode.
3231 * @start: The start offset of the range. It does not need to be
3232 * sector size aligned.
3233 * @end: The end offset (inclusive value) of the search range.
3234 * It does not need to be sector size aligned.
3235 * @cached_state: Extent state record used for speeding up delalloc
3236 * searches in the inode's io_tree. Can be NULL.
3237 * @delalloc_start_ret: Output argument, set to the start offset of the
3238 * subrange found with delalloc (may not be sector size
3239 * aligned).
3240 * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
3241 * of the subrange found with delalloc.
3242 *
3243 * Returns true if a subrange with delalloc is found within the given range, and
3244 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3245 * end offsets of the subrange.
3246 */
3247 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3248 struct extent_state **cached_state,
3249 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3250 {
3251 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3252 u64 prev_delalloc_end = 0;
3253 bool search_io_tree = true;
3254 bool ret = false;
3255
3256 while (cur_offset <= end) {
3257 u64 delalloc_start;
3258 u64 delalloc_end;
3259 bool delalloc;
3260
3261 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3262 cached_state, &search_io_tree,
3263 &delalloc_start,
3264 &delalloc_end);
3265 if (!delalloc)
3266 break;
3267
3268 if (prev_delalloc_end == 0) {
3269 /* First subrange found. */
3270 *delalloc_start_ret = max(delalloc_start, start);
3271 *delalloc_end_ret = delalloc_end;
3272 ret = true;
3273 } else if (delalloc_start == prev_delalloc_end + 1) {
3274 /* Subrange adjacent to the previous one, merge them. */
3275 *delalloc_end_ret = delalloc_end;
3276 } else {
3277 /* Subrange not adjacent to the previous one, exit. */
3278 break;
3279 }
3280
3281 prev_delalloc_end = delalloc_end;
3282 cur_offset = delalloc_end + 1;
3283 cond_resched();
3284 }
3285
3286 return ret;
3287 }
3288
3289 /*
3290 * Check if there's a hole or delalloc range in a range representing a hole (or
3291 * prealloc extent) found in the inode's subvolume btree.
3292 *
3293 * @inode: The inode.
3294 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3295 * @start: Start offset of the hole region. It does not need to be sector
3296 * size aligned.
3297 * @end: End offset (inclusive value) of the hole region. It does not
3298 * need to be sector size aligned.
3299 * @start_ret: Return parameter, used to set the start of the subrange in the
3300 * hole that matches the search criteria (seek mode), if such
3301 * subrange is found (return value of the function is true).
3302 * The value returned here may not be sector size aligned.
3303 *
3304 * Returns true if a subrange matching the given seek mode is found, and if one
3305 * is found, it updates @start_ret with the start of the subrange.
3306 */
3307 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3308 struct extent_state **cached_state,
3309 u64 start, u64 end, u64 *start_ret)
3310 {
3311 u64 delalloc_start;
3312 u64 delalloc_end;
3313 bool delalloc;
3314
3315 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3316 &delalloc_start, &delalloc_end);
3317 if (delalloc && whence == SEEK_DATA) {
3318 *start_ret = delalloc_start;
3319 return true;
3320 }
3321
3322 if (delalloc && whence == SEEK_HOLE) {
3323 /*
3324 * We found delalloc but it starts after our start offset. So we
3325 * have a hole between our start offset and the delalloc start.
3326 */
3327 if (start < delalloc_start) {
3328 *start_ret = start;
3329 return true;
3330 }
3331 /*
3332 * Delalloc range starts at our start offset.
3333 * If the delalloc range's length is smaller than our range,
3334 * then it means we have a hole that starts where the delalloc
3335 * subrange ends.
3336 */
3337 if (delalloc_end < end) {
3338 *start_ret = delalloc_end + 1;
3339 return true;
3340 }
3341
3342 /* There's delalloc for the whole range. */
3343 return false;
3344 }
3345
3346 if (!delalloc && whence == SEEK_HOLE) {
3347 *start_ret = start;
3348 return true;
3349 }
3350
3351 /*
3352 * No delalloc in the range and we are seeking for data. The caller has
3353 * to iterate to the next extent item in the subvolume btree.
3354 */
3355 return false;
3356 }
3357
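/*
 * Walk the inode's file extent items to find the first offset at or after
 * @offset that matches the seek mode (SEEK_DATA or SEEK_HOLE), treating
 * prealloc extents and implicit holes as holes and taking delalloc into
 * account.
 */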
3358 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3359 {
3360 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3361 struct btrfs_file_private *private;
3362 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3363 struct extent_state *cached_state = NULL;
3364 struct extent_state **delalloc_cached_state;
3365 const loff_t i_size = i_size_read(&inode->vfs_inode);
3366 const u64 ino = btrfs_ino(inode);
3367 struct btrfs_root *root = inode->root;
3368 struct btrfs_path *path;
3369 struct btrfs_key key;
3370 u64 last_extent_end;
3371 u64 lockstart;
3372 u64 lockend;
3373 u64 start;
3374 int ret;
3375 bool found = false;
3376
3377 if (i_size == 0 || offset >= i_size)
3378 return -ENXIO;
3379
3380 /*
3381 * Quick path. If the inode has no prealloc extents and its number of
3382 * bytes used matches its i_size, then it can not have holes.
3383 */
3384 if (whence == SEEK_HOLE &&
3385 !(inode->flags & BTRFS_INODE_PREALLOC) &&
3386 inode_get_bytes(&inode->vfs_inode) == i_size)
3387 return i_size;
3388
3389 spin_lock(&inode->lock);
3390 private = file->private_data;
3391 spin_unlock(&inode->lock);
3392
3393 if (private && private->owner_task != current) {
3394 /*
3395 * Not allocated by us, don't use it as its cached state is used
3396 * by the task that allocated it, and we want neither to mess
3397 * with it nor to get incorrect results, because it reflects an
3398 * invalid state for the current task.
3399 */
3400 private = NULL;
3401 } else if (!private) {
3402 private = kzalloc(sizeof(*private), GFP_KERNEL);
3403 /*
3404 * No worries if memory allocation failed.
3405 * The private structure is used only for speeding up multiple
3406 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3407 * so everything will still be correct.
3408 */
3409 if (private) {
3410 bool free = false;
3411
3412 private->owner_task = current;
3413
3414 spin_lock(&inode->lock);
3415 if (file->private_data)
3416 free = true;
3417 else
3418 file->private_data = private;
3419 spin_unlock(&inode->lock);
3420
3421 if (free) {
3422 kfree(private);
3423 private = NULL;
3424 }
3425 }
3426 }
3427
3428 if (private)
3429 delalloc_cached_state = &private->llseek_cached_state;
3430 else
3431 delalloc_cached_state = NULL;
3432
3433 /*
3434 * offset can be negative, in which case we start finding DATA/HOLE from
3435 * the very start of the file.
3436 */
3437 start = max_t(loff_t, 0, offset);
3438
3439 lockstart = round_down(start, fs_info->sectorsize);
3440 lockend = round_up(i_size, fs_info->sectorsize);
3441 if (lockend <= lockstart)
3442 lockend = lockstart + fs_info->sectorsize;
3443 lockend--;
3444
3445 path = btrfs_alloc_path();
3446 if (!path)
3447 return -ENOMEM;
3448 path->reada = READA_FORWARD;
3449
3450 key.objectid = ino;
3451 key.type = BTRFS_EXTENT_DATA_KEY;
3452 key.offset = start;
3453
3454 last_extent_end = lockstart;
3455
3456 lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3457
3458 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3459 if (ret < 0) {
3460 goto out;
3461 } else if (ret > 0 && path->slots[0] > 0) {
3462 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3463 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3464 path->slots[0]--;
3465 }
3466
3467 while (start < i_size) {
3468 struct extent_buffer *leaf = path->nodes[0];
3469 struct btrfs_file_extent_item *extent;
3470 u64 extent_end;
3471 u8 type;
3472
3473 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3474 ret = btrfs_next_leaf(root, path);
3475 if (ret < 0)
3476 goto out;
3477 else if (ret > 0)
3478 break;
3479
3480 leaf = path->nodes[0];
3481 }
3482
3483 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3484 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3485 break;
3486
3487 extent_end = btrfs_file_extent_end(path);
3488
3489 /*
3490 * In the first iteration we may have a slot that points to an
3491 * extent that ends before our start offset, so skip it.
3492 */
3493 if (extent_end <= start) {
3494 path->slots[0]++;
3495 continue;
3496 }
3497
3498 /* We have an implicit hole; the NO_HOLES feature is likely enabled. */
3499 if (last_extent_end < key.offset) {
3500 u64 search_start = last_extent_end;
3501 u64 found_start;
3502
3503 /*
3504 * On the first iteration @start matches @offset and it's
3505 * within the hole.
3506 */
3507 if (start == offset)
3508 search_start = offset;
3509
3510 found = find_desired_extent_in_hole(inode, whence,
3511 delalloc_cached_state,
3512 search_start,
3513 key.offset - 1,
3514 &found_start);
3515 if (found) {
3516 start = found_start;
3517 break;
3518 }
3519 /*
3520 * We didn't find data or a hole (due to delalloc) in the
3521 * implicit hole range, so analyze the current extent item.
3522 */
3523 }
3524
3525 extent = btrfs_item_ptr(leaf, path->slots[0],
3526 struct btrfs_file_extent_item);
3527 type = btrfs_file_extent_type(leaf, extent);
3528
3529 /*
3530 * We can't access the extent's disk_bytenr field if this is an
3531 * inline extent, since that offset is where the inline extent
3532 * data itself starts.
3533 */
3534 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3535 (type == BTRFS_FILE_EXTENT_REG &&
3536 btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3537 /*
3538 * Explicit hole or prealloc extent: search for delalloc.
3539 * A prealloc extent is treated like a hole.
3540 */
3541 u64 search_start = key.offset;
3542 u64 found_start;
3543
3544 /*
3545 * On the first iteration @start matches @offset and it's
3546 * within the hole.
3547 */
3548 if (start == offset)
3549 search_start = offset;
3550
3551 found = find_desired_extent_in_hole(inode, whence,
3552 delalloc_cached_state,
3553 search_start,
3554 extent_end - 1,
3555 &found_start);
3556 if (found) {
3557 start = found_start;
3558 break;
3559 }
3560 /*
3561 * We didn't find data or a hole (due to delalloc) in the hole
3562 * or prealloc range, so move on and analyze the next file
3563 * extent item.
3564 */
3565 } else {
3566 /*
3567 * Found a regular or inline extent.
3568 * If we are seeking data, adjust the start offset and
3569 * stop: we're done.
3570 */
3571 if (whence == SEEK_DATA) {
3572 start = max_t(u64, key.offset, offset);
3573 found = true;
3574 break;
3575 }
3576 /*
3577 * Otherwise we are seeking a hole, so check the next file
3578 * extent item.
3579 */
3580 }
3581
3582 start = extent_end;
3583 last_extent_end = extent_end;
3584 path->slots[0]++;
3585 if (fatal_signal_pending(current)) {
3586 ret = -EINTR;
3587 goto out;
3588 }
3589 cond_resched();
3590 }
3591
3592 /* We have an implicit hole from the last extent found up to i_size. */
3593 if (!found && start < i_size) {
3594 found = find_desired_extent_in_hole(inode, whence,
3595 delalloc_cached_state, start,
3596 i_size - 1, &start);
3597 if (!found)
3598 start = i_size;
3599 }
3600
3601 out:
3602 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3603 btrfs_free_path(path);
3604
3605 if (ret < 0)
3606 return ret;
3607
3608 if (whence == SEEK_DATA && start >= i_size)
3609 return -ENXIO;
3610
3611 return min_t(loff_t, start, i_size);
3612 }
3613
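/*
 * llseek entry point. SEEK_DATA and SEEK_HOLE require searching the
 * subvolume tree and the delalloc state, so they are done under the
 * inode's shared lock; all other whence values are handled by
 * generic_file_llseek().
 */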
3614 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3615 {
3616 struct inode *inode = file->f_mapping->host;
3617
3618 switch (whence) {
3619 default:
3620 return generic_file_llseek(file, offset, whence);
3621 case SEEK_DATA:
3622 case SEEK_HOLE:
3623 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3624 offset = find_desired_extent(file, offset, whence);
3625 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3626 break;
3627 }
3628
3629 if (offset < 0)
3630 return offset;
3631
3632 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3633 }
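/*
 * For reference, a minimal userspace sketch of how SEEK_HOLE/SEEK_DATA
 * end up calling into btrfs_file_llseek() (illustrative only, not part
 * of the kernel sources; SEEK_HOLE needs _GNU_SOURCE with glibc):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		off_t hole;
 *		int fd;
 *
 *		if (argc < 2)
 *			return 1;
 *		fd = open(argv[1], O_RDONLY);
 *		if (fd < 0)
 *			return 1;
 *		// Find the first hole; a file with no holes reports i_size,
 *		// per lseek(2) semantics.
 *		hole = lseek(fd, 0, SEEK_HOLE);
 *		if (hole < 0)
 *			perror("lseek");
 *		else
 *			printf("first hole at %lld\n", (long long)hole);
 *		close(fd);
 *		return 0;
 *	}
 */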
3634
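/*
 * Open for regular files: advertise support for IOCB_NOWAIT and
 * O_DIRECT, and give fs-verity a chance to set up or reject the open
 * before the generic open checks run.
 */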
3635 static int btrfs_file_open(struct inode *inode, struct file *filp)
3636 {
3637 int ret;
3638
3639 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3640
3641 ret = fsverity_file_open(inode, filp);
3642 if (ret)
3643 return ret;
3644 return generic_file_open(inode, filp);
3645 }
3646
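/*
 * For O_DIRECT reads, try the direct path first. If it didn't read
 * everything that was asked for and we are not at EOF, fall back to a
 * buffered read for the remainder, passing along the number of bytes
 * already read.
 */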
3647 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3648 {
3649 ssize_t ret = 0;
3650
3651 if (iocb->ki_flags & IOCB_DIRECT) {
3652 ret = btrfs_direct_read(iocb, to);
3653 if (ret < 0 || !iov_iter_count(to) ||
3654 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3655 return ret;
3656 }
3657
3658 return filemap_read(iocb, to, ret);
3659 }
3660
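/*
 * File operations for regular btrfs files. FOP_BUFFER_RASYNC and
 * FOP_BUFFER_WASYNC indicate that buffered reads and writes support
 * IOCB_NOWAIT (async buffered I/O, e.g. from io_uring).
 */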
3661 const struct file_operations btrfs_file_operations = {
3662 .llseek = btrfs_file_llseek,
3663 .read_iter = btrfs_file_read_iter,
3664 .splice_read = filemap_splice_read,
3665 .write_iter = btrfs_file_write_iter,
3666 .splice_write = iter_file_splice_write,
3667 .mmap = btrfs_file_mmap,
3668 .open = btrfs_file_open,
3669 .release = btrfs_release_file,
3670 .get_unmapped_area = thp_get_unmapped_area,
3671 .fsync = btrfs_sync_file,
3672 .fallocate = btrfs_fallocate,
3673 .unlocked_ioctl = btrfs_ioctl,
3674 #ifdef CONFIG_COMPAT
3675 .compat_ioctl = btrfs_compat_ioctl,
3676 #endif
3677 .remap_file_range = btrfs_remap_file_range,
3678 .uring_cmd = btrfs_uring_cmd,
3679 .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3680 };
3681
3682 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3683 {
3684 struct address_space *mapping = inode->vfs_inode.i_mapping;
3685 int ret;
3686
3687 /*
3688 * With compression we find and lock a dirty page, clear its dirty
3689 * bit, set up an async extent and return immediately, with the
3690 * entire range locked but nothing actually marked as under
3691 * writeback yet. So we can't just filemap_write_and_wait_range()
3692 * and expect it to work, since it only kicks off a thread to do
3693 * the actual work. We need to call filemap_fdatawrite_range()
3694 * _again_, because it waits on the page lock, which isn't released
3695 * until after the pages have been marked as under writeback, and
3696 * from there we're good to go. We have to do this, otherwise we'll
3697 * miss the ordered extents and that results in badness. Please
3698 * Josef, do not think you know better and pull this out at some
3699 * point in the future, it is right and you are wrong.
3700 */
3701 ret = filemap_fdatawrite_range(mapping, start, end);
3702 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3703 ret = filemap_fdatawrite_range(mapping, start, end);
3704
3705 return ret;
3706 }
3707