xref: /linux/fs/btrfs/file.c (revision fd71def6d9abc5ae362fb9995d46049b7b0ed391)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/fs.h>
7  #include <linux/pagemap.h>
8  #include <linux/time.h>
9  #include <linux/init.h>
10  #include <linux/string.h>
11  #include <linux/backing-dev.h>
12  #include <linux/falloc.h>
13  #include <linux/writeback.h>
14  #include <linux/compat.h>
15  #include <linux/slab.h>
16  #include <linux/btrfs.h>
17  #include <linux/uio.h>
18  #include <linux/iversion.h>
19  #include <linux/fsverity.h>
20  #include "ctree.h"
21  #include "direct-io.h"
22  #include "disk-io.h"
23  #include "transaction.h"
24  #include "btrfs_inode.h"
25  #include "tree-log.h"
26  #include "locking.h"
27  #include "qgroup.h"
28  #include "compression.h"
29  #include "delalloc-space.h"
30  #include "reflink.h"
31  #include "subpage.h"
32  #include "fs.h"
33  #include "accessors.h"
34  #include "extent-tree.h"
35  #include "file-item.h"
36  #include "ioctl.h"
37  #include "file.h"
38  #include "super.h"
39  #include "print-tree.h"
40  
41  /*
42   * Unlock folio after btrfs_file_write() is done with it.
43   */
btrfs_drop_folio(struct btrfs_fs_info * fs_info,struct folio * folio,u64 pos,u64 copied)44  static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
45  			     u64 pos, u64 copied)
46  {
47  	u64 block_start = round_down(pos, fs_info->sectorsize);
48  	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
49  
50  	ASSERT(block_len <= U32_MAX);
51  	/*
52  	 * Folio checked is some magic around finding folios that have been
53  	 * modified without going through btrfs_dirty_folio().  Clear it here.
54  	 * There should be no need to mark the pages accessed as
55  	 * prepare_one_folio() should have marked them accessed in
56  	 * prepare_one_folio() via find_or_create_page()
57  	 */
58  	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
59  	folio_unlock(folio);
60  	folio_put(folio);
61  }
62  
63  /*
64   * After copy_folio_from_iter_atomic(), update the following things for delalloc:
65   * - Mark newly dirtied folio as DELALLOC in the io tree.
66   *   Used to advise which range is to be written back.
67   * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
68   * - Update inode size for past EOF write
69   */
btrfs_dirty_folio(struct btrfs_inode * inode,struct folio * folio,loff_t pos,size_t write_bytes,struct extent_state ** cached,bool noreserve)70  int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
71  		      size_t write_bytes, struct extent_state **cached, bool noreserve)
72  {
73  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
74  	int ret = 0;
75  	u64 num_bytes;
76  	u64 start_pos;
77  	u64 end_of_last_block;
78  	u64 end_pos = pos + write_bytes;
79  	loff_t isize = i_size_read(&inode->vfs_inode);
80  	unsigned int extra_bits = 0;
81  
82  	if (write_bytes == 0)
83  		return 0;
84  
85  	if (noreserve)
86  		extra_bits |= EXTENT_NORESERVE;
87  
88  	start_pos = round_down(pos, fs_info->sectorsize);
89  	num_bytes = round_up(write_bytes + pos - start_pos,
90  			     fs_info->sectorsize);
91  	ASSERT(num_bytes <= U32_MAX);
92  	ASSERT(folio_pos(folio) <= pos &&
93  	       folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
94  
95  	end_of_last_block = start_pos + num_bytes - 1;
96  
97  	/*
98  	 * The pages may have already been dirty, clear out old accounting so
99  	 * we can set things up properly
100  	 */
101  	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
102  			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
103  			 cached);
104  
105  	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
106  					extra_bits, cached);
107  	if (ret)
108  		return ret;
109  
110  	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
111  	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
112  	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
113  
114  	/*
115  	 * we've only changed i_size in ram, and we haven't updated
116  	 * the disk i_size.  There is no need to log the inode
117  	 * at this time.
118  	 */
119  	if (end_pos > isize)
120  		i_size_write(&inode->vfs_inode, end_pos);
121  	return 0;
122  }
123  
124  /*
125   * this is very complex, but the basic idea is to drop all extents
126   * in the range start - end.  hint_block is filled in with a block number
127   * that would be a good hint to the block allocator for this file.
128   *
129   * If an extent intersects the range but is not entirely inside the range
130   * it is either truncated or split.  Anything entirely inside the range
131   * is deleted from the tree.
132   *
133   * Note: the VFS' inode number of bytes is not updated, it's up to the caller
134   * to deal with that. We set the field 'bytes_found' of the arguments structure
135   * with the number of allocated bytes found in the target range, so that the
136   * caller can update the inode's number of bytes in an atomic way when
137   * replacing extents in a range to avoid races with stat(2).
138   */
btrfs_drop_extents(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_drop_extents_args * args)139  int btrfs_drop_extents(struct btrfs_trans_handle *trans,
140  		       struct btrfs_root *root, struct btrfs_inode *inode,
141  		       struct btrfs_drop_extents_args *args)
142  {
143  	struct btrfs_fs_info *fs_info = root->fs_info;
144  	struct extent_buffer *leaf;
145  	struct btrfs_file_extent_item *fi;
146  	struct btrfs_key key;
147  	struct btrfs_key new_key;
148  	u64 ino = btrfs_ino(inode);
149  	u64 search_start = args->start;
150  	u64 disk_bytenr = 0;
151  	u64 num_bytes = 0;
152  	u64 extent_offset = 0;
153  	u64 extent_end = 0;
154  	u64 last_end = args->start;
155  	int del_nr = 0;
156  	int del_slot = 0;
157  	int extent_type;
158  	int recow;
159  	int ret;
160  	int modify_tree = -1;
161  	int update_refs;
162  	int found = 0;
163  	struct btrfs_path *path = args->path;
164  
165  	args->bytes_found = 0;
166  	args->extent_inserted = false;
167  
168  	/* Must always have a path if ->replace_extent is true */
169  	ASSERT(!(args->replace_extent && !args->path));
170  
171  	if (!path) {
172  		path = btrfs_alloc_path();
173  		if (!path) {
174  			ret = -ENOMEM;
175  			goto out;
176  		}
177  	}
178  
179  	if (args->drop_cache)
180  		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
181  
182  	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
183  		modify_tree = 0;
184  
185  	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
186  	while (1) {
187  		recow = 0;
188  		ret = btrfs_lookup_file_extent(trans, root, path, ino,
189  					       search_start, modify_tree);
190  		if (ret < 0)
191  			break;
192  		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
193  			leaf = path->nodes[0];
194  			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
195  			if (key.objectid == ino &&
196  			    key.type == BTRFS_EXTENT_DATA_KEY)
197  				path->slots[0]--;
198  		}
199  		ret = 0;
200  next_slot:
201  		leaf = path->nodes[0];
202  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
203  			if (WARN_ON(del_nr > 0)) {
204  				btrfs_print_leaf(leaf);
205  				ret = -EINVAL;
206  				break;
207  			}
208  			ret = btrfs_next_leaf(root, path);
209  			if (ret < 0)
210  				break;
211  			if (ret > 0) {
212  				ret = 0;
213  				break;
214  			}
215  			leaf = path->nodes[0];
216  			recow = 1;
217  		}
218  
219  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
220  
221  		if (key.objectid > ino)
222  			break;
223  		if (WARN_ON_ONCE(key.objectid < ino) ||
224  		    key.type < BTRFS_EXTENT_DATA_KEY) {
225  			ASSERT(del_nr == 0);
226  			path->slots[0]++;
227  			goto next_slot;
228  		}
229  		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
230  			break;
231  
232  		fi = btrfs_item_ptr(leaf, path->slots[0],
233  				    struct btrfs_file_extent_item);
234  		extent_type = btrfs_file_extent_type(leaf, fi);
235  
236  		if (extent_type == BTRFS_FILE_EXTENT_REG ||
237  		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
238  			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
239  			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
240  			extent_offset = btrfs_file_extent_offset(leaf, fi);
241  			extent_end = key.offset +
242  				btrfs_file_extent_num_bytes(leaf, fi);
243  		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
244  			extent_end = key.offset +
245  				btrfs_file_extent_ram_bytes(leaf, fi);
246  		} else {
247  			/* can't happen */
248  			BUG();
249  		}
250  
251  		/*
252  		 * Don't skip extent items representing 0 byte lengths. They
253  		 * used to be created (bug) if while punching holes we hit
254  		 * -ENOSPC condition. So if we find one here, just ensure we
255  		 * delete it, otherwise we would insert a new file extent item
256  		 * with the same key (offset) as that 0 bytes length file
257  		 * extent item in the call to setup_items_for_insert() later
258  		 * in this function.
259  		 */
260  		if (extent_end == key.offset && extent_end >= search_start) {
261  			last_end = extent_end;
262  			goto delete_extent_item;
263  		}
264  
265  		if (extent_end <= search_start) {
266  			path->slots[0]++;
267  			goto next_slot;
268  		}
269  
270  		found = 1;
271  		search_start = max(key.offset, args->start);
272  		if (recow || !modify_tree) {
273  			modify_tree = -1;
274  			btrfs_release_path(path);
275  			continue;
276  		}
277  
278  		/*
279  		 *     | - range to drop - |
280  		 *  | -------- extent -------- |
281  		 */
282  		if (args->start > key.offset && args->end < extent_end) {
283  			if (WARN_ON(del_nr > 0)) {
284  				btrfs_print_leaf(leaf);
285  				ret = -EINVAL;
286  				break;
287  			}
288  			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
289  				ret = -EOPNOTSUPP;
290  				break;
291  			}
292  
293  			memcpy(&new_key, &key, sizeof(new_key));
294  			new_key.offset = args->start;
295  			ret = btrfs_duplicate_item(trans, root, path,
296  						   &new_key);
297  			if (ret == -EAGAIN) {
298  				btrfs_release_path(path);
299  				continue;
300  			}
301  			if (ret < 0)
302  				break;
303  
304  			leaf = path->nodes[0];
305  			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
306  					    struct btrfs_file_extent_item);
307  			btrfs_set_file_extent_num_bytes(leaf, fi,
308  							args->start - key.offset);
309  
310  			fi = btrfs_item_ptr(leaf, path->slots[0],
311  					    struct btrfs_file_extent_item);
312  
313  			extent_offset += args->start - key.offset;
314  			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
315  			btrfs_set_file_extent_num_bytes(leaf, fi,
316  							extent_end - args->start);
317  
318  			if (update_refs && disk_bytenr > 0) {
319  				struct btrfs_ref ref = {
320  					.action = BTRFS_ADD_DELAYED_REF,
321  					.bytenr = disk_bytenr,
322  					.num_bytes = num_bytes,
323  					.parent = 0,
324  					.owning_root = btrfs_root_id(root),
325  					.ref_root = btrfs_root_id(root),
326  				};
327  				btrfs_init_data_ref(&ref, new_key.objectid,
328  						    args->start - extent_offset,
329  						    0, false);
330  				ret = btrfs_inc_extent_ref(trans, &ref);
331  				if (ret) {
332  					btrfs_abort_transaction(trans, ret);
333  					break;
334  				}
335  			}
336  			key.offset = args->start;
337  		}
338  		/*
339  		 * From here on out we will have actually dropped something, so
340  		 * last_end can be updated.
341  		 */
342  		last_end = extent_end;
343  
344  		/*
345  		 *  | ---- range to drop ----- |
346  		 *      | -------- extent -------- |
347  		 */
348  		if (args->start <= key.offset && args->end < extent_end) {
349  			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
350  				ret = -EOPNOTSUPP;
351  				break;
352  			}
353  
354  			memcpy(&new_key, &key, sizeof(new_key));
355  			new_key.offset = args->end;
356  			btrfs_set_item_key_safe(trans, path, &new_key);
357  
358  			extent_offset += args->end - key.offset;
359  			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
360  			btrfs_set_file_extent_num_bytes(leaf, fi,
361  							extent_end - args->end);
362  			if (update_refs && disk_bytenr > 0)
363  				args->bytes_found += args->end - key.offset;
364  			break;
365  		}
366  
367  		search_start = extent_end;
368  		/*
369  		 *       | ---- range to drop ----- |
370  		 *  | -------- extent -------- |
371  		 */
372  		if (args->start > key.offset && args->end >= extent_end) {
373  			if (WARN_ON(del_nr > 0)) {
374  				btrfs_print_leaf(leaf);
375  				ret = -EINVAL;
376  				break;
377  			}
378  			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
379  				ret = -EOPNOTSUPP;
380  				break;
381  			}
382  
383  			btrfs_set_file_extent_num_bytes(leaf, fi,
384  							args->start - key.offset);
385  			if (update_refs && disk_bytenr > 0)
386  				args->bytes_found += extent_end - args->start;
387  			if (args->end == extent_end)
388  				break;
389  
390  			path->slots[0]++;
391  			goto next_slot;
392  		}
393  
394  		/*
395  		 *  | ---- range to drop ----- |
396  		 *    | ------ extent ------ |
397  		 */
398  		if (args->start <= key.offset && args->end >= extent_end) {
399  delete_extent_item:
400  			if (del_nr == 0) {
401  				del_slot = path->slots[0];
402  				del_nr = 1;
403  			} else {
404  				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
405  					btrfs_print_leaf(leaf);
406  					ret = -EINVAL;
407  					break;
408  				}
409  				del_nr++;
410  			}
411  
412  			if (update_refs &&
413  			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
414  				args->bytes_found += extent_end - key.offset;
415  				extent_end = ALIGN(extent_end,
416  						   fs_info->sectorsize);
417  			} else if (update_refs && disk_bytenr > 0) {
418  				struct btrfs_ref ref = {
419  					.action = BTRFS_DROP_DELAYED_REF,
420  					.bytenr = disk_bytenr,
421  					.num_bytes = num_bytes,
422  					.parent = 0,
423  					.owning_root = btrfs_root_id(root),
424  					.ref_root = btrfs_root_id(root),
425  				};
426  				btrfs_init_data_ref(&ref, key.objectid,
427  						    key.offset - extent_offset,
428  						    0, false);
429  				ret = btrfs_free_extent(trans, &ref);
430  				if (ret) {
431  					btrfs_abort_transaction(trans, ret);
432  					break;
433  				}
434  				args->bytes_found += extent_end - key.offset;
435  			}
436  
437  			if (args->end == extent_end)
438  				break;
439  
440  			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
441  				path->slots[0]++;
442  				goto next_slot;
443  			}
444  
445  			ret = btrfs_del_items(trans, root, path, del_slot,
446  					      del_nr);
447  			if (ret) {
448  				btrfs_abort_transaction(trans, ret);
449  				break;
450  			}
451  
452  			del_nr = 0;
453  			del_slot = 0;
454  
455  			btrfs_release_path(path);
456  			continue;
457  		}
458  
459  		BUG();
460  	}
461  
462  	if (!ret && del_nr > 0) {
463  		/*
464  		 * Set path->slots[0] to first slot, so that after the delete
465  		 * if items are move off from our leaf to its immediate left or
466  		 * right neighbor leafs, we end up with a correct and adjusted
467  		 * path->slots[0] for our insertion (if args->replace_extent).
468  		 */
469  		path->slots[0] = del_slot;
470  		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
471  		if (ret)
472  			btrfs_abort_transaction(trans, ret);
473  	}
474  
475  	leaf = path->nodes[0];
476  	/*
477  	 * If btrfs_del_items() was called, it might have deleted a leaf, in
478  	 * which case it unlocked our path, so check path->locks[0] matches a
479  	 * write lock.
480  	 */
481  	if (!ret && args->replace_extent &&
482  	    path->locks[0] == BTRFS_WRITE_LOCK &&
483  	    btrfs_leaf_free_space(leaf) >=
484  	    sizeof(struct btrfs_item) + args->extent_item_size) {
485  
486  		key.objectid = ino;
487  		key.type = BTRFS_EXTENT_DATA_KEY;
488  		key.offset = args->start;
489  		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
490  			struct btrfs_key slot_key;
491  
492  			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
493  			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
494  				path->slots[0]++;
495  		}
496  		btrfs_setup_item_for_insert(trans, root, path, &key,
497  					    args->extent_item_size);
498  		args->extent_inserted = true;
499  	}
500  
501  	if (!args->path)
502  		btrfs_free_path(path);
503  	else if (!args->extent_inserted)
504  		btrfs_release_path(path);
505  out:
506  	args->drop_end = found ? min(args->end, last_end) : args->end;
507  
508  	return ret;
509  }
510  
extent_mergeable(struct extent_buffer * leaf,int slot,u64 objectid,u64 bytenr,u64 orig_offset,u64 * start,u64 * end)511  static int extent_mergeable(struct extent_buffer *leaf, int slot,
512  			    u64 objectid, u64 bytenr, u64 orig_offset,
513  			    u64 *start, u64 *end)
514  {
515  	struct btrfs_file_extent_item *fi;
516  	struct btrfs_key key;
517  	u64 extent_end;
518  
519  	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
520  		return 0;
521  
522  	btrfs_item_key_to_cpu(leaf, &key, slot);
523  	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
524  		return 0;
525  
526  	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
527  	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
528  	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
529  	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
530  	    btrfs_file_extent_compression(leaf, fi) ||
531  	    btrfs_file_extent_encryption(leaf, fi) ||
532  	    btrfs_file_extent_other_encoding(leaf, fi))
533  		return 0;
534  
535  	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
536  	if ((*start && *start != key.offset) || (*end && *end != extent_end))
537  		return 0;
538  
539  	*start = key.offset;
540  	*end = extent_end;
541  	return 1;
542  }
543  
544  /*
545   * Mark extent in the range start - end as written.
546   *
547   * This changes extent type from 'pre-allocated' to 'regular'. If only
548   * part of extent is marked as written, the extent will be split into
549   * two or three.
550   */
btrfs_mark_extent_written(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,u64 start,u64 end)551  int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
552  			      struct btrfs_inode *inode, u64 start, u64 end)
553  {
554  	struct btrfs_root *root = inode->root;
555  	struct extent_buffer *leaf;
556  	struct btrfs_path *path;
557  	struct btrfs_file_extent_item *fi;
558  	struct btrfs_ref ref = { 0 };
559  	struct btrfs_key key;
560  	struct btrfs_key new_key;
561  	u64 bytenr;
562  	u64 num_bytes;
563  	u64 extent_end;
564  	u64 orig_offset;
565  	u64 other_start;
566  	u64 other_end;
567  	u64 split;
568  	int del_nr = 0;
569  	int del_slot = 0;
570  	int recow;
571  	int ret = 0;
572  	u64 ino = btrfs_ino(inode);
573  
574  	path = btrfs_alloc_path();
575  	if (!path)
576  		return -ENOMEM;
577  again:
578  	recow = 0;
579  	split = start;
580  	key.objectid = ino;
581  	key.type = BTRFS_EXTENT_DATA_KEY;
582  	key.offset = split;
583  
584  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
585  	if (ret < 0)
586  		goto out;
587  	if (ret > 0 && path->slots[0] > 0)
588  		path->slots[0]--;
589  
590  	leaf = path->nodes[0];
591  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
592  	if (key.objectid != ino ||
593  	    key.type != BTRFS_EXTENT_DATA_KEY) {
594  		ret = -EINVAL;
595  		btrfs_abort_transaction(trans, ret);
596  		goto out;
597  	}
598  	fi = btrfs_item_ptr(leaf, path->slots[0],
599  			    struct btrfs_file_extent_item);
600  	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
601  		ret = -EINVAL;
602  		btrfs_abort_transaction(trans, ret);
603  		goto out;
604  	}
605  	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
606  	if (key.offset > start || extent_end < end) {
607  		ret = -EINVAL;
608  		btrfs_abort_transaction(trans, ret);
609  		goto out;
610  	}
611  
612  	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
613  	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
614  	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
615  	memcpy(&new_key, &key, sizeof(new_key));
616  
617  	if (start == key.offset && end < extent_end) {
618  		other_start = 0;
619  		other_end = start;
620  		if (extent_mergeable(leaf, path->slots[0] - 1,
621  				     ino, bytenr, orig_offset,
622  				     &other_start, &other_end)) {
623  			new_key.offset = end;
624  			btrfs_set_item_key_safe(trans, path, &new_key);
625  			fi = btrfs_item_ptr(leaf, path->slots[0],
626  					    struct btrfs_file_extent_item);
627  			btrfs_set_file_extent_generation(leaf, fi,
628  							 trans->transid);
629  			btrfs_set_file_extent_num_bytes(leaf, fi,
630  							extent_end - end);
631  			btrfs_set_file_extent_offset(leaf, fi,
632  						     end - orig_offset);
633  			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
634  					    struct btrfs_file_extent_item);
635  			btrfs_set_file_extent_generation(leaf, fi,
636  							 trans->transid);
637  			btrfs_set_file_extent_num_bytes(leaf, fi,
638  							end - other_start);
639  			goto out;
640  		}
641  	}
642  
643  	if (start > key.offset && end == extent_end) {
644  		other_start = end;
645  		other_end = 0;
646  		if (extent_mergeable(leaf, path->slots[0] + 1,
647  				     ino, bytenr, orig_offset,
648  				     &other_start, &other_end)) {
649  			fi = btrfs_item_ptr(leaf, path->slots[0],
650  					    struct btrfs_file_extent_item);
651  			btrfs_set_file_extent_num_bytes(leaf, fi,
652  							start - key.offset);
653  			btrfs_set_file_extent_generation(leaf, fi,
654  							 trans->transid);
655  			path->slots[0]++;
656  			new_key.offset = start;
657  			btrfs_set_item_key_safe(trans, path, &new_key);
658  
659  			fi = btrfs_item_ptr(leaf, path->slots[0],
660  					    struct btrfs_file_extent_item);
661  			btrfs_set_file_extent_generation(leaf, fi,
662  							 trans->transid);
663  			btrfs_set_file_extent_num_bytes(leaf, fi,
664  							other_end - start);
665  			btrfs_set_file_extent_offset(leaf, fi,
666  						     start - orig_offset);
667  			goto out;
668  		}
669  	}
670  
671  	while (start > key.offset || end < extent_end) {
672  		if (key.offset == start)
673  			split = end;
674  
675  		new_key.offset = split;
676  		ret = btrfs_duplicate_item(trans, root, path, &new_key);
677  		if (ret == -EAGAIN) {
678  			btrfs_release_path(path);
679  			goto again;
680  		}
681  		if (ret < 0) {
682  			btrfs_abort_transaction(trans, ret);
683  			goto out;
684  		}
685  
686  		leaf = path->nodes[0];
687  		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
688  				    struct btrfs_file_extent_item);
689  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
690  		btrfs_set_file_extent_num_bytes(leaf, fi,
691  						split - key.offset);
692  
693  		fi = btrfs_item_ptr(leaf, path->slots[0],
694  				    struct btrfs_file_extent_item);
695  
696  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
697  		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
698  		btrfs_set_file_extent_num_bytes(leaf, fi,
699  						extent_end - split);
700  
701  		ref.action = BTRFS_ADD_DELAYED_REF;
702  		ref.bytenr = bytenr;
703  		ref.num_bytes = num_bytes;
704  		ref.parent = 0;
705  		ref.owning_root = btrfs_root_id(root);
706  		ref.ref_root = btrfs_root_id(root);
707  		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
708  		ret = btrfs_inc_extent_ref(trans, &ref);
709  		if (ret) {
710  			btrfs_abort_transaction(trans, ret);
711  			goto out;
712  		}
713  
714  		if (split == start) {
715  			key.offset = start;
716  		} else {
717  			if (start != key.offset) {
718  				ret = -EINVAL;
719  				btrfs_abort_transaction(trans, ret);
720  				goto out;
721  			}
722  			path->slots[0]--;
723  			extent_end = end;
724  		}
725  		recow = 1;
726  	}
727  
728  	other_start = end;
729  	other_end = 0;
730  
731  	ref.action = BTRFS_DROP_DELAYED_REF;
732  	ref.bytenr = bytenr;
733  	ref.num_bytes = num_bytes;
734  	ref.parent = 0;
735  	ref.owning_root = btrfs_root_id(root);
736  	ref.ref_root = btrfs_root_id(root);
737  	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
738  	if (extent_mergeable(leaf, path->slots[0] + 1,
739  			     ino, bytenr, orig_offset,
740  			     &other_start, &other_end)) {
741  		if (recow) {
742  			btrfs_release_path(path);
743  			goto again;
744  		}
745  		extent_end = other_end;
746  		del_slot = path->slots[0] + 1;
747  		del_nr++;
748  		ret = btrfs_free_extent(trans, &ref);
749  		if (ret) {
750  			btrfs_abort_transaction(trans, ret);
751  			goto out;
752  		}
753  	}
754  	other_start = 0;
755  	other_end = start;
756  	if (extent_mergeable(leaf, path->slots[0] - 1,
757  			     ino, bytenr, orig_offset,
758  			     &other_start, &other_end)) {
759  		if (recow) {
760  			btrfs_release_path(path);
761  			goto again;
762  		}
763  		key.offset = other_start;
764  		del_slot = path->slots[0];
765  		del_nr++;
766  		ret = btrfs_free_extent(trans, &ref);
767  		if (ret) {
768  			btrfs_abort_transaction(trans, ret);
769  			goto out;
770  		}
771  	}
772  	if (del_nr == 0) {
773  		fi = btrfs_item_ptr(leaf, path->slots[0],
774  			   struct btrfs_file_extent_item);
775  		btrfs_set_file_extent_type(leaf, fi,
776  					   BTRFS_FILE_EXTENT_REG);
777  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
778  	} else {
779  		fi = btrfs_item_ptr(leaf, del_slot - 1,
780  			   struct btrfs_file_extent_item);
781  		btrfs_set_file_extent_type(leaf, fi,
782  					   BTRFS_FILE_EXTENT_REG);
783  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
784  		btrfs_set_file_extent_num_bytes(leaf, fi,
785  						extent_end - key.offset);
786  
787  		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
788  		if (ret < 0) {
789  			btrfs_abort_transaction(trans, ret);
790  			goto out;
791  		}
792  	}
793  out:
794  	btrfs_free_path(path);
795  	return ret;
796  }
797  
798  /*
799   * On error return an unlocked folio and the error value
800   * On success return a locked folio and 0
801   */
prepare_uptodate_folio(struct inode * inode,struct folio * folio,u64 pos,u64 len,bool force_uptodate)802  static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
803  				  u64 len, bool force_uptodate)
804  {
805  	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
806  	u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
807  	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
808  	int ret = 0;
809  
810  	if (folio_test_uptodate(folio))
811  		return 0;
812  
813  	if (!force_uptodate &&
814  	    IS_ALIGNED(clamp_start, blocksize) &&
815  	    IS_ALIGNED(clamp_end, blocksize))
816  		return 0;
817  
818  	ret = btrfs_read_folio(NULL, folio);
819  	if (ret)
820  		return ret;
821  	folio_lock(folio);
822  	if (!folio_test_uptodate(folio)) {
823  		folio_unlock(folio);
824  		return -EIO;
825  	}
826  
827  	/*
828  	 * Since btrfs_read_folio() will unlock the folio before it returns,
829  	 * there is a window where btrfs_release_folio() can be called to
830  	 * release the page.  Here we check both inode mapping and page
831  	 * private to make sure the page was not released.
832  	 *
833  	 * The private flag check is essential for subpage as we need to store
834  	 * extra bitmap using folio private.
835  	 */
836  	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
837  		folio_unlock(folio);
838  		return -EAGAIN;
839  	}
840  	return 0;
841  }
842  
get_prepare_gfp_flags(struct inode * inode,bool nowait)843  static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
844  {
845  	gfp_t gfp;
846  
847  	gfp = btrfs_alloc_write_mask(inode->i_mapping);
848  	if (nowait) {
849  		gfp &= ~__GFP_DIRECT_RECLAIM;
850  		gfp |= GFP_NOWAIT;
851  	}
852  
853  	return gfp;
854  }
855  
856  /*
857   * Get folio into the page cache and lock it.
858   */
prepare_one_folio(struct inode * inode,struct folio ** folio_ret,loff_t pos,size_t write_bytes,bool force_uptodate,bool nowait)859  static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
860  				      loff_t pos, size_t write_bytes,
861  				      bool force_uptodate, bool nowait)
862  {
863  	unsigned long index = pos >> PAGE_SHIFT;
864  	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
865  	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
866  	struct folio *folio;
867  	int ret = 0;
868  
869  again:
870  	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
871  	if (IS_ERR(folio)) {
872  		if (nowait)
873  			ret = -EAGAIN;
874  		else
875  			ret = PTR_ERR(folio);
876  		return ret;
877  	}
878  	/* Only support page sized folio yet. */
879  	ASSERT(folio_order(folio) == 0);
880  	ret = set_folio_extent_mapped(folio);
881  	if (ret < 0) {
882  		folio_unlock(folio);
883  		folio_put(folio);
884  		return ret;
885  	}
886  	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
887  	if (ret) {
888  		/* The folio is already unlocked. */
889  		folio_put(folio);
890  		if (!nowait && ret == -EAGAIN) {
891  			ret = 0;
892  			goto again;
893  		}
894  		return ret;
895  	}
896  	*folio_ret = folio;
897  	return 0;
898  }
899  
900  /*
901   * Locks the extent and properly waits for data=ordered extents to finish
902   * before allowing the folios to be modified if need.
903   *
904   * Return:
905   * 1 - the extent is locked
906   * 0 - the extent is not locked, and everything is OK
907   * -EAGAIN - need to prepare the folios again
908   */
909  static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode * inode,struct folio * folio,loff_t pos,size_t write_bytes,u64 * lockstart,u64 * lockend,bool nowait,struct extent_state ** cached_state)910  lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
911  				loff_t pos, size_t write_bytes,
912  				u64 *lockstart, u64 *lockend, bool nowait,
913  				struct extent_state **cached_state)
914  {
915  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
916  	u64 start_pos;
917  	u64 last_pos;
918  	int ret = 0;
919  
920  	start_pos = round_down(pos, fs_info->sectorsize);
921  	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
922  
923  	if (start_pos < inode->vfs_inode.i_size) {
924  		struct btrfs_ordered_extent *ordered;
925  
926  		if (nowait) {
927  			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
928  					     cached_state)) {
929  				folio_unlock(folio);
930  				folio_put(folio);
931  				return -EAGAIN;
932  			}
933  		} else {
934  			lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
935  		}
936  
937  		ordered = btrfs_lookup_ordered_range(inode, start_pos,
938  						     last_pos - start_pos + 1);
939  		if (ordered &&
940  		    ordered->file_offset + ordered->num_bytes > start_pos &&
941  		    ordered->file_offset <= last_pos) {
942  			unlock_extent(&inode->io_tree, start_pos, last_pos,
943  				      cached_state);
944  			folio_unlock(folio);
945  			folio_put(folio);
946  			btrfs_start_ordered_extent(ordered);
947  			btrfs_put_ordered_extent(ordered);
948  			return -EAGAIN;
949  		}
950  		if (ordered)
951  			btrfs_put_ordered_extent(ordered);
952  
953  		*lockstart = start_pos;
954  		*lockend = last_pos;
955  		ret = 1;
956  	}
957  
958  	/*
959  	 * We should be called after prepare_one_folio() which should have locked
960  	 * all pages in the range.
961  	 */
962  	WARN_ON(!folio_test_locked(folio));
963  
964  	return ret;
965  }
966  
967  /*
968   * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
969   *
970   * @pos:         File offset.
971   * @write_bytes: The length to write, will be updated to the nocow writeable
972   *               range.
973   *
974   * This function will flush ordered extents in the range to ensure proper
975   * nocow checks.
976   *
977   * Return:
978   * > 0          If we can nocow, and updates @write_bytes.
979   *  0           If we can't do a nocow write.
980   * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
981   *              root is in progress.
982   * < 0          If an error happened.
983   *
984   * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
985   */
btrfs_check_nocow_lock(struct btrfs_inode * inode,loff_t pos,size_t * write_bytes,bool nowait)986  int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
987  			   size_t *write_bytes, bool nowait)
988  {
989  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
990  	struct btrfs_root *root = inode->root;
991  	struct extent_state *cached_state = NULL;
992  	u64 lockstart, lockend;
993  	u64 num_bytes;
994  	int ret;
995  
996  	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
997  		return 0;
998  
999  	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1000  		return -EAGAIN;
1001  
1002  	lockstart = round_down(pos, fs_info->sectorsize);
1003  	lockend = round_up(pos + *write_bytes,
1004  			   fs_info->sectorsize) - 1;
1005  	num_bytes = lockend - lockstart + 1;
1006  
1007  	if (nowait) {
1008  		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1009  						  &cached_state)) {
1010  			btrfs_drew_write_unlock(&root->snapshot_lock);
1011  			return -EAGAIN;
1012  		}
1013  	} else {
1014  		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1015  						   &cached_state);
1016  	}
1017  	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
1018  	if (ret <= 0)
1019  		btrfs_drew_write_unlock(&root->snapshot_lock);
1020  	else
1021  		*write_bytes = min_t(size_t, *write_bytes ,
1022  				     num_bytes - pos + lockstart);
1023  	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1024  
1025  	return ret;
1026  }
1027  
btrfs_check_nocow_unlock(struct btrfs_inode * inode)1028  void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1029  {
1030  	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1031  }
1032  
btrfs_write_check(struct kiocb * iocb,size_t count)1033  int btrfs_write_check(struct kiocb *iocb, size_t count)
1034  {
1035  	struct file *file = iocb->ki_filp;
1036  	struct inode *inode = file_inode(file);
1037  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1038  	loff_t pos = iocb->ki_pos;
1039  	int ret;
1040  	loff_t oldsize;
1041  
1042  	/*
1043  	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1044  	 * prealloc flags, as without those flags we always have to COW. We will
1045  	 * later check if we can really COW into the target range (using
1046  	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1047  	 */
1048  	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1049  	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1050  		return -EAGAIN;
1051  
1052  	ret = file_remove_privs(file);
1053  	if (ret)
1054  		return ret;
1055  
1056  	/*
1057  	 * We reserve space for updating the inode when we reserve space for the
1058  	 * extent we are going to write, so we will enospc out there.  We don't
1059  	 * need to start yet another transaction to update the inode as we will
1060  	 * update the inode when we finish writing whatever data we write.
1061  	 */
1062  	if (!IS_NOCMTIME(inode)) {
1063  		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1064  		inode_inc_iversion(inode);
1065  	}
1066  
1067  	oldsize = i_size_read(inode);
1068  	if (pos > oldsize) {
1069  		/* Expand hole size to cover write data, preventing empty gap */
1070  		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1071  
1072  		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1073  		if (ret)
1074  			return ret;
1075  	}
1076  
1077  	return 0;
1078  }
1079  
btrfs_buffered_write(struct kiocb * iocb,struct iov_iter * i)1080  ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
1081  {
1082  	struct file *file = iocb->ki_filp;
1083  	loff_t pos;
1084  	struct inode *inode = file_inode(file);
1085  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1086  	struct extent_changeset *data_reserved = NULL;
1087  	u64 release_bytes = 0;
1088  	u64 lockstart;
1089  	u64 lockend;
1090  	size_t num_written = 0;
1091  	ssize_t ret;
1092  	loff_t old_isize;
1093  	unsigned int ilock_flags = 0;
1094  	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1095  	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1096  	bool only_release_metadata = false;
1097  
1098  	if (nowait)
1099  		ilock_flags |= BTRFS_ILOCK_TRY;
1100  
1101  	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1102  	if (ret < 0)
1103  		return ret;
1104  
1105  	/*
1106  	 * We can only trust the isize with inode lock held, or it can race with
1107  	 * other buffered writes and cause incorrect call of
1108  	 * pagecache_isize_extended() to overwrite existing data.
1109  	 */
1110  	old_isize = i_size_read(inode);
1111  
1112  	ret = generic_write_checks(iocb, i);
1113  	if (ret <= 0)
1114  		goto out;
1115  
1116  	ret = btrfs_write_check(iocb, ret);
1117  	if (ret < 0)
1118  		goto out;
1119  
1120  	pos = iocb->ki_pos;
1121  	while (iov_iter_count(i) > 0) {
1122  		struct extent_state *cached_state = NULL;
1123  		size_t offset = offset_in_page(pos);
1124  		size_t sector_offset;
1125  		size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
1126  		size_t reserve_bytes;
1127  		size_t copied;
1128  		size_t dirty_sectors;
1129  		size_t num_sectors;
1130  		struct folio *folio = NULL;
1131  		int extents_locked;
1132  		bool force_page_uptodate = false;
1133  
1134  		/*
1135  		 * Fault pages before locking them in prepare_one_folio()
1136  		 * to avoid recursive lock
1137  		 */
1138  		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1139  			ret = -EFAULT;
1140  			break;
1141  		}
1142  
1143  		only_release_metadata = false;
1144  		sector_offset = pos & (fs_info->sectorsize - 1);
1145  
1146  		extent_changeset_release(data_reserved);
1147  		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1148  						  &data_reserved, pos,
1149  						  write_bytes, nowait);
1150  		if (ret < 0) {
1151  			int can_nocow;
1152  
1153  			if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1154  				ret = -EAGAIN;
1155  				break;
1156  			}
1157  
1158  			/*
1159  			 * If we don't have to COW at the offset, reserve
1160  			 * metadata only. write_bytes may get smaller than
1161  			 * requested here.
1162  			 */
1163  			can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1164  							   &write_bytes, nowait);
1165  			if (can_nocow < 0)
1166  				ret = can_nocow;
1167  			if (can_nocow > 0)
1168  				ret = 0;
1169  			if (ret)
1170  				break;
1171  			only_release_metadata = true;
1172  		}
1173  
1174  		reserve_bytes = round_up(write_bytes + sector_offset,
1175  					 fs_info->sectorsize);
1176  		WARN_ON(reserve_bytes == 0);
1177  		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1178  						      reserve_bytes,
1179  						      reserve_bytes, nowait);
1180  		if (ret) {
1181  			if (!only_release_metadata)
1182  				btrfs_free_reserved_data_space(BTRFS_I(inode),
1183  						data_reserved, pos,
1184  						write_bytes);
1185  			else
1186  				btrfs_check_nocow_unlock(BTRFS_I(inode));
1187  
1188  			if (nowait && ret == -ENOSPC)
1189  				ret = -EAGAIN;
1190  			break;
1191  		}
1192  
1193  		release_bytes = reserve_bytes;
1194  again:
1195  		ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1196  		if (ret) {
1197  			btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1198  			break;
1199  		}
1200  
1201  		ret = prepare_one_folio(inode, &folio, pos, write_bytes,
1202  					force_page_uptodate, false);
1203  		if (ret) {
1204  			btrfs_delalloc_release_extents(BTRFS_I(inode),
1205  						       reserve_bytes);
1206  			break;
1207  		}
1208  
1209  		extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
1210  						folio, pos, write_bytes, &lockstart,
1211  						&lockend, nowait, &cached_state);
1212  		if (extents_locked < 0) {
1213  			if (!nowait && extents_locked == -EAGAIN)
1214  				goto again;
1215  
1216  			btrfs_delalloc_release_extents(BTRFS_I(inode),
1217  						       reserve_bytes);
1218  			ret = extents_locked;
1219  			break;
1220  		}
1221  
1222  		copied = copy_folio_from_iter_atomic(folio,
1223  				offset_in_folio(folio, pos), write_bytes, i);
1224  		flush_dcache_folio(folio);
1225  
1226  		/*
1227  		 * If we get a partial write, we can end up with partially
1228  		 * uptodate page. Although if sector size < page size we can
1229  		 * handle it, but if it's not sector aligned it can cause
1230  		 * a lot of complexity, so make sure they don't happen by
1231  		 * forcing retry this copy.
1232  		 */
1233  		if (unlikely(copied < write_bytes)) {
1234  			if (!folio_test_uptodate(folio)) {
1235  				iov_iter_revert(i, copied);
1236  				copied = 0;
1237  			}
1238  		}
1239  
1240  		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1241  		dirty_sectors = round_up(copied + sector_offset,
1242  					fs_info->sectorsize);
1243  		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1244  
1245  		if (copied == 0) {
1246  			force_page_uptodate = true;
1247  			dirty_sectors = 0;
1248  		} else {
1249  			force_page_uptodate = false;
1250  		}
1251  
1252  		if (num_sectors > dirty_sectors) {
1253  			/* release everything except the sectors we dirtied */
1254  			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1255  			if (only_release_metadata) {
1256  				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1257  							release_bytes, true);
1258  			} else {
1259  				u64 release_start = round_up(pos + copied,
1260  							     fs_info->sectorsize);
1261  				btrfs_delalloc_release_space(BTRFS_I(inode),
1262  						data_reserved, release_start,
1263  						release_bytes, true);
1264  			}
1265  		}
1266  
1267  		release_bytes = round_up(copied + sector_offset,
1268  					fs_info->sectorsize);
1269  
1270  		ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
1271  					&cached_state, only_release_metadata);
1272  
1273  		/*
1274  		 * If we have not locked the extent range, because the range's
1275  		 * start offset is >= i_size, we might still have a non-NULL
1276  		 * cached extent state, acquired while marking the extent range
1277  		 * as delalloc through btrfs_dirty_page(). Therefore free any
1278  		 * possible cached extent state to avoid a memory leak.
1279  		 */
1280  		if (extents_locked)
1281  			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1282  				      lockend, &cached_state);
1283  		else
1284  			free_extent_state(cached_state);
1285  
1286  		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1287  		if (ret) {
1288  			btrfs_drop_folio(fs_info, folio, pos, copied);
1289  			break;
1290  		}
1291  
1292  		release_bytes = 0;
1293  		if (only_release_metadata)
1294  			btrfs_check_nocow_unlock(BTRFS_I(inode));
1295  
1296  		btrfs_drop_folio(fs_info, folio, pos, copied);
1297  
1298  		cond_resched();
1299  
1300  		pos += copied;
1301  		num_written += copied;
1302  	}
1303  
1304  	if (release_bytes) {
1305  		if (only_release_metadata) {
1306  			btrfs_check_nocow_unlock(BTRFS_I(inode));
1307  			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1308  					release_bytes, true);
1309  		} else {
1310  			btrfs_delalloc_release_space(BTRFS_I(inode),
1311  					data_reserved,
1312  					round_down(pos, fs_info->sectorsize),
1313  					release_bytes, true);
1314  		}
1315  	}
1316  
1317  	extent_changeset_free(data_reserved);
1318  	if (num_written > 0) {
1319  		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1320  		iocb->ki_pos += num_written;
1321  	}
1322  out:
1323  	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1324  	return num_written ? num_written : ret;
1325  }
1326  
btrfs_encoded_write(struct kiocb * iocb,struct iov_iter * from,const struct btrfs_ioctl_encoded_io_args * encoded)1327  static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1328  			const struct btrfs_ioctl_encoded_io_args *encoded)
1329  {
1330  	struct file *file = iocb->ki_filp;
1331  	struct inode *inode = file_inode(file);
1332  	loff_t count;
1333  	ssize_t ret;
1334  
1335  	btrfs_inode_lock(BTRFS_I(inode), 0);
1336  	count = encoded->len;
1337  	ret = generic_write_checks_count(iocb, &count);
1338  	if (ret == 0 && count != encoded->len) {
1339  		/*
1340  		 * The write got truncated by generic_write_checks_count(). We
1341  		 * can't do a partial encoded write.
1342  		 */
1343  		ret = -EFBIG;
1344  	}
1345  	if (ret || encoded->len == 0)
1346  		goto out;
1347  
1348  	ret = btrfs_write_check(iocb, encoded->len);
1349  	if (ret < 0)
1350  		goto out;
1351  
1352  	ret = btrfs_do_encoded_write(iocb, from, encoded);
1353  out:
1354  	btrfs_inode_unlock(BTRFS_I(inode), 0);
1355  	return ret;
1356  }
1357  
btrfs_do_write_iter(struct kiocb * iocb,struct iov_iter * from,const struct btrfs_ioctl_encoded_io_args * encoded)1358  ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1359  			    const struct btrfs_ioctl_encoded_io_args *encoded)
1360  {
1361  	struct file *file = iocb->ki_filp;
1362  	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1363  	ssize_t num_written, num_sync;
1364  
1365  	/*
1366  	 * If the fs flips readonly due to some impossible error, although we
1367  	 * have opened a file as writable, we have to stop this write operation
1368  	 * to ensure consistency.
1369  	 */
1370  	if (BTRFS_FS_ERROR(inode->root->fs_info))
1371  		return -EROFS;
1372  
1373  	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1374  		return -EOPNOTSUPP;
1375  
1376  	if (encoded) {
1377  		num_written = btrfs_encoded_write(iocb, from, encoded);
1378  		num_sync = encoded->len;
1379  	} else if (iocb->ki_flags & IOCB_DIRECT) {
1380  		num_written = btrfs_direct_write(iocb, from);
1381  		num_sync = num_written;
1382  	} else {
1383  		num_written = btrfs_buffered_write(iocb, from);
1384  		num_sync = num_written;
1385  	}
1386  
1387  	btrfs_set_inode_last_sub_trans(inode);
1388  
1389  	if (num_sync > 0) {
1390  		num_sync = generic_write_sync(iocb, num_sync);
1391  		if (num_sync < 0)
1392  			num_written = num_sync;
1393  	}
1394  
1395  	return num_written;
1396  }
1397  
btrfs_file_write_iter(struct kiocb * iocb,struct iov_iter * from)1398  static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1399  {
1400  	return btrfs_do_write_iter(iocb, from, NULL);
1401  }
1402  
btrfs_release_file(struct inode * inode,struct file * filp)1403  int btrfs_release_file(struct inode *inode, struct file *filp)
1404  {
1405  	struct btrfs_file_private *private = filp->private_data;
1406  
1407  	if (private) {
1408  		kfree(private->filldir_buf);
1409  		free_extent_state(private->llseek_cached_state);
1410  		kfree(private);
1411  		filp->private_data = NULL;
1412  	}
1413  
1414  	/*
1415  	 * Set by setattr when we are about to truncate a file from a non-zero
1416  	 * size to a zero size.  This tries to flush down new bytes that may
1417  	 * have been written if the application were using truncate to replace
1418  	 * a file in place.
1419  	 */
1420  	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1421  			       &BTRFS_I(inode)->runtime_flags))
1422  			filemap_flush(inode->i_mapping);
1423  	return 0;
1424  }
1425  
start_ordered_ops(struct btrfs_inode * inode,loff_t start,loff_t end)1426  static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1427  {
1428  	int ret;
1429  	struct blk_plug plug;
1430  
1431  	/*
1432  	 * This is only called in fsync, which would do synchronous writes, so
1433  	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1434  	 * multiple disks using raid profile, a large IO can be split to
1435  	 * several segments of stripe length (currently 64K).
1436  	 */
1437  	blk_start_plug(&plug);
1438  	ret = btrfs_fdatawrite_range(inode, start, end);
1439  	blk_finish_plug(&plug);
1440  
1441  	return ret;
1442  }
1443  
skip_inode_logging(const struct btrfs_log_ctx * ctx)1444  static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1445  {
1446  	struct btrfs_inode *inode = ctx->inode;
1447  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1448  
1449  	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1450  	    list_empty(&ctx->ordered_extents))
1451  		return true;
1452  
1453  	/*
1454  	 * If we are doing a fast fsync we can not bail out if the inode's
1455  	 * last_trans is <= then the last committed transaction, because we only
1456  	 * update the last_trans of the inode during ordered extent completion,
1457  	 * and for a fast fsync we don't wait for that, we only wait for the
1458  	 * writeback to complete.
1459  	 */
1460  	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1461  	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1462  	     list_empty(&ctx->ordered_extents)))
1463  		return true;
1464  
1465  	return false;
1466  }
1467  
1468  /*
1469   * fsync call for both files and directories.  This logs the inode into
1470   * the tree log instead of forcing full commits whenever possible.
1471   *
1472   * It needs to call filemap_fdatawait so that all ordered extent updates are
1473   * in the metadata btree are up to date for copying to the log.
1474   *
1475   * It drops the inode mutex before doing the tree log commit.  This is an
1476   * important optimization for directories because holding the mutex prevents
1477   * new operations on the dir while we write to disk.
1478   */
btrfs_sync_file(struct file * file,loff_t start,loff_t end,int datasync)1479  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1480  {
1481  	struct dentry *dentry = file_dentry(file);
1482  	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1483  	struct btrfs_root *root = inode->root;
1484  	struct btrfs_fs_info *fs_info = root->fs_info;
1485  	struct btrfs_trans_handle *trans;
1486  	struct btrfs_log_ctx ctx;
1487  	int ret = 0, err;
1488  	u64 len;
1489  	bool full_sync;
1490  	bool skip_ilock = false;
1491  
1492  	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1493  		skip_ilock = true;
1494  		current->journal_info = NULL;
1495  		btrfs_assert_inode_locked(inode);
1496  	}
1497  
1498  	trace_btrfs_sync_file(file, datasync);
1499  
1500  	btrfs_init_log_ctx(&ctx, inode);
1501  
1502  	/*
1503  	 * Always set the range to a full range, otherwise we can get into
1504  	 * several problems, from missing file extent items to represent holes
1505  	 * when not using the NO_HOLES feature, to log tree corruption due to
1506  	 * races between hole detection during logging and completion of ordered
1507  	 * extents outside the range, to missing checksums due to ordered extents
1508  	 * for which we flushed only a subset of their pages.
1509  	 */
1510  	start = 0;
1511  	end = LLONG_MAX;
1512  	len = (u64)LLONG_MAX + 1;
1513  
1514  	/*
1515  	 * We write the dirty pages in the range and wait until they complete
1516  	 * out of the ->i_mutex. If so, we can flush the dirty pages by
1517  	 * multi-task, and make the performance up.  See
1518  	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1519  	 */
1520  	ret = start_ordered_ops(inode, start, end);
1521  	if (ret)
1522  		goto out;
1523  
1524  	if (skip_ilock)
1525  		down_write(&inode->i_mmap_lock);
1526  	else
1527  		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1528  
1529  	atomic_inc(&root->log_batch);
1530  
1531  	/*
1532  	 * Before we acquired the inode's lock and the mmap lock, someone may
1533  	 * have dirtied more pages in the target range. We need to make sure
1534  	 * that writeback for any such pages does not start while we are logging
1535  	 * the inode, because if it does, any of the following might happen when
1536  	 * we are not doing a full inode sync:
1537  	 *
1538  	 * 1) We log an extent after its writeback finishes but before its
1539  	 *    checksums are added to the csum tree, leading to -EIO errors
1540  	 *    when attempting to read the extent after a log replay.
1541  	 *
1542  	 * 2) We can end up logging an extent before its writeback finishes.
1543  	 *    Therefore after the log replay we will have a file extent item
1544  	 *    pointing to an unwritten extent (and no data checksums as well).
1545  	 *
1546  	 * So trigger writeback for any eventual new dirty pages and then we
1547  	 * wait for all ordered extents to complete below.
1548  	 */
1549  	ret = start_ordered_ops(inode, start, end);
1550  	if (ret) {
1551  		if (skip_ilock)
1552  			up_write(&inode->i_mmap_lock);
1553  		else
1554  			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1555  		goto out;
1556  	}
1557  
1558  	/*
1559  	 * Always check for the full sync flag while holding the inode's lock,
1560  	 * to avoid races with other tasks. The flag must be either set all the
1561  	 * time during logging or always off all the time while logging.
1562  	 * We check the flag here after starting delalloc above, because when
1563  	 * running delalloc the full sync flag may be set if we need to drop
1564  	 * extra extent map ranges due to temporary memory allocation failures.
1565  	 */
1566  	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1567  
1568  	/*
1569  	 * We have to do this here to avoid the priority inversion of waiting on
1570  	 * IO of a lower priority task while holding a transaction open.
1571  	 *
1572  	 * For a full fsync we wait for the ordered extents to complete while
1573  	 * for a fast fsync we wait just for writeback to complete, and then
1574  	 * attach the ordered extents to the transaction so that a transaction
1575  	 * commit waits for their completion, to avoid data loss if we fsync,
1576  	 * the current transaction commits before the ordered extents complete
1577  	 * and a power failure happens right after that.
1578  	 *
1579  	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1580  	 * logical address recorded in the ordered extent may change. We need
1581  	 * to wait for the IO to stabilize the logical address.
1582  	 */
1583  	if (full_sync || btrfs_is_zoned(fs_info)) {
1584  		ret = btrfs_wait_ordered_range(inode, start, len);
1585  		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1586  	} else {
1587  		/*
1588  		 * Get our ordered extents as soon as possible to avoid doing
1589  		 * checksum lookups in the csum tree, and use instead the
1590  		 * checksums attached to the ordered extents.
1591  		 */
1592  		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1593  		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1594  		if (ret)
1595  			goto out_release_extents;
1596  
1597  		/*
1598  		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1599  		 * starting and waiting for writeback, because for buffered IO
1600  		 * it may have been set during the end IO callback
1601  		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1602  		 * case an error happened and we need to wait for ordered
1603  		 * extents to complete so that any extent maps that point to
1604  		 * unwritten locations are dropped and we don't log them.
1605  		 */
1606  		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1607  			ret = btrfs_wait_ordered_range(inode, start, len);
1608  	}
1609  
1610  	if (ret)
1611  		goto out_release_extents;
1612  
1613  	atomic_inc(&root->log_batch);
1614  
1615  	if (skip_inode_logging(&ctx)) {
1616  		/*
1617  		 * We've had everything committed since the last time we were
1618  		 * modified so clear this flag in case it was set for whatever
1619  		 * reason, it's no longer relevant.
1620  		 */
1621  		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1622  		/*
1623  		 * An ordered extent might have started before and completed
1624  		 * already with io errors, in which case the inode was not
1625  		 * updated and we end up here. So check the inode's mapping
1626  		 * for any errors that might have happened since we last
1627  		 * checked called fsync.
1628  		 */
1629  		ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1630  		goto out_release_extents;
1631  	}
1632  
1633  	btrfs_init_log_ctx_scratch_eb(&ctx);
1634  
1635  	/*
1636  	 * We use start here because we will need to wait on the IO to complete
1637  	 * in btrfs_sync_log, which could require joining a transaction (for
1638  	 * example checking cross references in the nocow path).  If we use join
1639  	 * here we could get into a situation where we're waiting on IO to
1640  	 * happen that is blocked on a transaction trying to commit.  With start
1641  	 * we inc the extwriter counter, so we wait for all extwriters to exit
1642  	 * before we start blocking joiners.  This comment is to keep somebody
1643  	 * from thinking they are super smart and changing this to
1644  	 * btrfs_join_transaction *cough*Josef*cough*.
1645  	 */
1646  	trans = btrfs_start_transaction(root, 0);
1647  	if (IS_ERR(trans)) {
1648  		ret = PTR_ERR(trans);
1649  		goto out_release_extents;
1650  	}
1651  	trans->in_fsync = true;
1652  
1653  	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1654  	/*
1655  	 * Scratch eb no longer needed, release before syncing log or commit
1656  	 * transaction, to avoid holding unnecessary memory during such long
1657  	 * operations.
1658  	 */
1659  	if (ctx.scratch_eb) {
1660  		free_extent_buffer(ctx.scratch_eb);
1661  		ctx.scratch_eb = NULL;
1662  	}
1663  	btrfs_release_log_ctx_extents(&ctx);
1664  	if (ret < 0) {
1665  		/* Fallthrough and commit/free transaction. */
1666  		ret = BTRFS_LOG_FORCE_COMMIT;
1667  	}
1668  
1669  	/* we've logged all the items and now have a consistent
1670  	 * version of the file in the log.  It is possible that
1671  	 * someone will come in and modify the file, but that's
1672  	 * fine because the log is consistent on disk, and we
1673  	 * have references to all of the file's extents
1674  	 *
1675  	 * It is possible that someone will come in and log the
1676  	 * file again, but that will end up using the synchronization
1677  	 * inside btrfs_sync_log to keep things safe.
1678  	 */
1679  	if (skip_ilock)
1680  		up_write(&inode->i_mmap_lock);
1681  	else
1682  		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1683  
1684  	if (ret == BTRFS_NO_LOG_SYNC) {
1685  		ret = btrfs_end_transaction(trans);
1686  		goto out;
1687  	}
1688  
1689  	/* We successfully logged the inode, attempt to sync the log. */
1690  	if (!ret) {
1691  		ret = btrfs_sync_log(trans, root, &ctx);
1692  		if (!ret) {
1693  			ret = btrfs_end_transaction(trans);
1694  			goto out;
1695  		}
1696  	}
1697  
1698  	/*
1699  	 * At this point we need to commit the transaction because we had
1700  	 * btrfs_need_log_full_commit() or some other error.
1701  	 *
1702  	 * If we didn't do a full sync we have to stop the trans handle, wait on
1703  	 * the ordered extents, start it again and commit the transaction.  If
1704  	 * we attempt to wait on the ordered extents here we could deadlock with
1705  	 * something like fallocate() that is holding the extent lock trying to
1706  	 * start a transaction while some other thread is trying to commit the
1707  	 * transaction while we (fsync) are currently holding the transaction
1708  	 * open.
1709  	 */
1710  	if (!full_sync) {
1711  		ret = btrfs_end_transaction(trans);
1712  		if (ret)
1713  			goto out;
1714  		ret = btrfs_wait_ordered_range(inode, start, len);
1715  		if (ret)
1716  			goto out;
1717  
1718  		/*
1719  		 * This is safe to use here because we're only interested in
1720  		 * making sure the transaction that had the ordered extents is
1721  		 * committed.  We aren't waiting on anything past this point,
1722  		 * we're purely getting the transaction and committing it.
1723  		 */
1724  		trans = btrfs_attach_transaction_barrier(root);
1725  		if (IS_ERR(trans)) {
1726  			ret = PTR_ERR(trans);
1727  
1728  			/*
1729  			 * We committed the transaction and there's no currently
1730  			 * running transaction, this means everything we care
1731  			 * about made it to disk and we are done.
1732  			 */
1733  			if (ret == -ENOENT)
1734  				ret = 0;
1735  			goto out;
1736  		}
1737  	}
1738  
1739  	ret = btrfs_commit_transaction(trans);
1740  out:
1741  	free_extent_buffer(ctx.scratch_eb);
1742  	ASSERT(list_empty(&ctx.list));
1743  	ASSERT(list_empty(&ctx.conflict_inodes));
1744  	err = file_check_and_advance_wb_err(file);
1745  	if (!ret)
1746  		ret = err;
1747  	return ret > 0 ? -EIO : ret;
1748  
1749  out_release_extents:
1750  	btrfs_release_log_ctx_extents(&ctx);
1751  	if (skip_ilock)
1752  		up_write(&inode->i_mmap_lock);
1753  	else
1754  		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1755  	goto out;
1756  }
1757  
1758  /*
1759   * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1760   * called from a page fault handler when a page is first dirtied. Hence we must
1761   * be careful to check for EOF conditions here. We set the page up correctly
1762   * for a written page which means we get ENOSPC checking when writing into
1763   * holes and correct delalloc and unwritten extent mapping on filesystems that
1764   * support these features.
1765   *
1766   * We are not allowed to take the i_mutex here so we have to play games to
1767   * protect against truncate races as the page could now be beyond EOF.  Because
1768   * truncate_setsize() writes the inode size before removing pages, once we have
1769   * the page lock we can determine safely if the page is beyond EOF. If it is not
1770   * beyond EOF, then the page is guaranteed safe against truncation until we
1771   * unlock the page.
1772   */
btrfs_page_mkwrite(struct vm_fault * vmf)1773  static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1774  {
1775  	struct page *page = vmf->page;
1776  	struct folio *folio = page_folio(page);
1777  	struct inode *inode = file_inode(vmf->vma->vm_file);
1778  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1779  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1780  	struct btrfs_ordered_extent *ordered;
1781  	struct extent_state *cached_state = NULL;
1782  	struct extent_changeset *data_reserved = NULL;
1783  	unsigned long zero_start;
1784  	loff_t size;
1785  	size_t fsize = folio_size(folio);
1786  	vm_fault_t ret;
1787  	int ret2;
1788  	int reserved = 0;
1789  	u64 reserved_space;
1790  	u64 page_start;
1791  	u64 page_end;
1792  	u64 end;
1793  
1794  	ASSERT(folio_order(folio) == 0);
1795  
1796  	reserved_space = fsize;
1797  
1798  	sb_start_pagefault(inode->i_sb);
1799  	page_start = folio_pos(folio);
1800  	page_end = page_start + folio_size(folio) - 1;
1801  	end = page_end;
1802  
1803  	/*
1804  	 * Reserving delalloc space after obtaining the page lock can lead to
1805  	 * deadlock. For example, if a dirty page is locked by this function
1806  	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1807  	 * dirty page write out, then the btrfs_writepages() function could
1808  	 * end up waiting indefinitely to get a lock on the page currently
1809  	 * being processed by btrfs_page_mkwrite() function.
1810  	 */
1811  	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
1812  					    page_start, reserved_space);
1813  	if (!ret2) {
1814  		ret2 = file_update_time(vmf->vma->vm_file);
1815  		reserved = 1;
1816  	}
1817  	if (ret2) {
1818  		ret = vmf_error(ret2);
1819  		if (reserved)
1820  			goto out;
1821  		goto out_noreserve;
1822  	}
1823  
1824  	/* Make the VM retry the fault. */
1825  	ret = VM_FAULT_NOPAGE;
1826  again:
1827  	down_read(&BTRFS_I(inode)->i_mmap_lock);
1828  	folio_lock(folio);
1829  	size = i_size_read(inode);
1830  
1831  	if ((folio->mapping != inode->i_mapping) ||
1832  	    (page_start >= size)) {
1833  		/* Page got truncated out from underneath us. */
1834  		goto out_unlock;
1835  	}
1836  	folio_wait_writeback(folio);
1837  
1838  	lock_extent(io_tree, page_start, page_end, &cached_state);
1839  	ret2 = set_folio_extent_mapped(folio);
1840  	if (ret2 < 0) {
1841  		ret = vmf_error(ret2);
1842  		unlock_extent(io_tree, page_start, page_end, &cached_state);
1843  		goto out_unlock;
1844  	}
1845  
1846  	/*
1847  	 * We can't set the delalloc bits if there are pending ordered
1848  	 * extents.  Drop our locks and wait for them to finish.
1849  	 */
1850  	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
1851  	if (ordered) {
1852  		unlock_extent(io_tree, page_start, page_end, &cached_state);
1853  		folio_unlock(folio);
1854  		up_read(&BTRFS_I(inode)->i_mmap_lock);
1855  		btrfs_start_ordered_extent(ordered);
1856  		btrfs_put_ordered_extent(ordered);
1857  		goto again;
1858  	}
1859  
1860  	if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
1861  		reserved_space = round_up(size - page_start, fs_info->sectorsize);
1862  		if (reserved_space < fsize) {
1863  			end = page_start + reserved_space - 1;
1864  			btrfs_delalloc_release_space(BTRFS_I(inode),
1865  					data_reserved, page_start,
1866  					fsize - reserved_space, true);
1867  		}
1868  	}
1869  
1870  	/*
1871  	 * page_mkwrite gets called when the page is firstly dirtied after it's
1872  	 * faulted in, but write(2) could also dirty a page and set delalloc
1873  	 * bits, thus in this case for space account reason, we still need to
1874  	 * clear any delalloc bits within this page range since we have to
1875  	 * reserve data&meta space before lock_page() (see above comments).
1876  	 */
1877  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
1878  			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1879  			  EXTENT_DEFRAG, &cached_state);
1880  
1881  	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
1882  					&cached_state);
1883  	if (ret2) {
1884  		unlock_extent(io_tree, page_start, page_end, &cached_state);
1885  		ret = VM_FAULT_SIGBUS;
1886  		goto out_unlock;
1887  	}
1888  
1889  	/* Page is wholly or partially inside EOF. */
1890  	if (page_start + folio_size(folio) > size)
1891  		zero_start = offset_in_folio(folio, size);
1892  	else
1893  		zero_start = fsize;
1894  
1895  	if (zero_start != fsize)
1896  		folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1897  
1898  	btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1899  	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1900  	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1901  
1902  	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
1903  
1904  	unlock_extent(io_tree, page_start, page_end, &cached_state);
1905  	up_read(&BTRFS_I(inode)->i_mmap_lock);
1906  
1907  	btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1908  	sb_end_pagefault(inode->i_sb);
1909  	extent_changeset_free(data_reserved);
1910  	return VM_FAULT_LOCKED;
1911  
1912  out_unlock:
1913  	folio_unlock(folio);
1914  	up_read(&BTRFS_I(inode)->i_mmap_lock);
1915  out:
1916  	btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1917  	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
1918  				     reserved_space, (ret != 0));
1919  out_noreserve:
1920  	sb_end_pagefault(inode->i_sb);
1921  	extent_changeset_free(data_reserved);
1922  	return ret;
1923  }
1924  
1925  static const struct vm_operations_struct btrfs_file_vm_ops = {
1926  	.fault		= filemap_fault,
1927  	.map_pages	= filemap_map_pages,
1928  	.page_mkwrite	= btrfs_page_mkwrite,
1929  };
1930  
btrfs_file_mmap(struct file * filp,struct vm_area_struct * vma)1931  static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
1932  {
1933  	struct address_space *mapping = filp->f_mapping;
1934  
1935  	if (!mapping->a_ops->read_folio)
1936  		return -ENOEXEC;
1937  
1938  	file_accessed(filp);
1939  	vma->vm_ops = &btrfs_file_vm_ops;
1940  
1941  	return 0;
1942  }
1943  
hole_mergeable(struct btrfs_inode * inode,struct extent_buffer * leaf,int slot,u64 start,u64 end)1944  static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
1945  			  int slot, u64 start, u64 end)
1946  {
1947  	struct btrfs_file_extent_item *fi;
1948  	struct btrfs_key key;
1949  
1950  	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1951  		return 0;
1952  
1953  	btrfs_item_key_to_cpu(leaf, &key, slot);
1954  	if (key.objectid != btrfs_ino(inode) ||
1955  	    key.type != BTRFS_EXTENT_DATA_KEY)
1956  		return 0;
1957  
1958  	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1959  
1960  	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1961  		return 0;
1962  
1963  	if (btrfs_file_extent_disk_bytenr(leaf, fi))
1964  		return 0;
1965  
1966  	if (key.offset == end)
1967  		return 1;
1968  	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1969  		return 1;
1970  	return 0;
1971  }
1972  
fill_holes(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,u64 offset,u64 end)1973  static int fill_holes(struct btrfs_trans_handle *trans,
1974  		struct btrfs_inode *inode,
1975  		struct btrfs_path *path, u64 offset, u64 end)
1976  {
1977  	struct btrfs_fs_info *fs_info = trans->fs_info;
1978  	struct btrfs_root *root = inode->root;
1979  	struct extent_buffer *leaf;
1980  	struct btrfs_file_extent_item *fi;
1981  	struct extent_map *hole_em;
1982  	struct btrfs_key key;
1983  	int ret;
1984  
1985  	if (btrfs_fs_incompat(fs_info, NO_HOLES))
1986  		goto out;
1987  
1988  	key.objectid = btrfs_ino(inode);
1989  	key.type = BTRFS_EXTENT_DATA_KEY;
1990  	key.offset = offset;
1991  
1992  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1993  	if (ret <= 0) {
1994  		/*
1995  		 * We should have dropped this offset, so if we find it then
1996  		 * something has gone horribly wrong.
1997  		 */
1998  		if (ret == 0)
1999  			ret = -EINVAL;
2000  		return ret;
2001  	}
2002  
2003  	leaf = path->nodes[0];
2004  	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2005  		u64 num_bytes;
2006  
2007  		path->slots[0]--;
2008  		fi = btrfs_item_ptr(leaf, path->slots[0],
2009  				    struct btrfs_file_extent_item);
2010  		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2011  			end - offset;
2012  		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2013  		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2014  		btrfs_set_file_extent_offset(leaf, fi, 0);
2015  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2016  		goto out;
2017  	}
2018  
2019  	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2020  		u64 num_bytes;
2021  
2022  		key.offset = offset;
2023  		btrfs_set_item_key_safe(trans, path, &key);
2024  		fi = btrfs_item_ptr(leaf, path->slots[0],
2025  				    struct btrfs_file_extent_item);
2026  		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2027  			offset;
2028  		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2029  		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2030  		btrfs_set_file_extent_offset(leaf, fi, 0);
2031  		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2032  		goto out;
2033  	}
2034  	btrfs_release_path(path);
2035  
2036  	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2037  				       end - offset);
2038  	if (ret)
2039  		return ret;
2040  
2041  out:
2042  	btrfs_release_path(path);
2043  
2044  	hole_em = alloc_extent_map();
2045  	if (!hole_em) {
2046  		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2047  		btrfs_set_inode_full_sync(inode);
2048  	} else {
2049  		hole_em->start = offset;
2050  		hole_em->len = end - offset;
2051  		hole_em->ram_bytes = hole_em->len;
2052  
2053  		hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2054  		hole_em->disk_num_bytes = 0;
2055  		hole_em->generation = trans->transid;
2056  
2057  		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2058  		free_extent_map(hole_em);
2059  		if (ret)
2060  			btrfs_set_inode_full_sync(inode);
2061  	}
2062  
2063  	return 0;
2064  }
2065  
2066  /*
2067   * Find a hole extent on given inode and change start/len to the end of hole
2068   * extent.(hole/vacuum extent whose em->start <= start &&
2069   *	   em->start + em->len > start)
2070   * When a hole extent is found, return 1 and modify start/len.
2071   */
find_first_non_hole(struct btrfs_inode * inode,u64 * start,u64 * len)2072  static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2073  {
2074  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2075  	struct extent_map *em;
2076  	int ret = 0;
2077  
2078  	em = btrfs_get_extent(inode, NULL,
2079  			      round_down(*start, fs_info->sectorsize),
2080  			      round_up(*len, fs_info->sectorsize));
2081  	if (IS_ERR(em))
2082  		return PTR_ERR(em);
2083  
2084  	/* Hole or vacuum extent(only exists in no-hole mode) */
2085  	if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2086  		ret = 1;
2087  		*len = em->start + em->len > *start + *len ?
2088  		       0 : *start + *len - em->start - em->len;
2089  		*start = em->start + em->len;
2090  	}
2091  	free_extent_map(em);
2092  	return ret;
2093  }
2094  
btrfs_punch_hole_lock_range(struct inode * inode,const u64 lockstart,const u64 lockend,struct extent_state ** cached_state)2095  static void btrfs_punch_hole_lock_range(struct inode *inode,
2096  					const u64 lockstart,
2097  					const u64 lockend,
2098  					struct extent_state **cached_state)
2099  {
2100  	/*
2101  	 * For subpage case, if the range is not at page boundary, we could
2102  	 * have pages at the leading/tailing part of the range.
2103  	 * This could lead to dead loop since filemap_range_has_page()
2104  	 * will always return true.
2105  	 * So here we need to do extra page alignment for
2106  	 * filemap_range_has_page().
2107  	 */
2108  	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2109  	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2110  
2111  	while (1) {
2112  		truncate_pagecache_range(inode, lockstart, lockend);
2113  
2114  		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2115  			    cached_state);
2116  		/*
2117  		 * We can't have ordered extents in the range, nor dirty/writeback
2118  		 * pages, because we have locked the inode's VFS lock in exclusive
2119  		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2120  		 * we have flushed all delalloc in the range and we have waited
2121  		 * for any ordered extents in the range to complete.
2122  		 * We can race with anyone reading pages from this range, so after
2123  		 * locking the range check if we have pages in the range, and if
2124  		 * we do, unlock the range and retry.
2125  		 */
2126  		if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2127  					    page_lockend))
2128  			break;
2129  
2130  		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2131  			      cached_state);
2132  	}
2133  
2134  	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2135  }
2136  
btrfs_insert_replace_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_replace_extent_info * extent_info,const u64 replace_len,const u64 bytes_to_drop)2137  static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2138  				     struct btrfs_inode *inode,
2139  				     struct btrfs_path *path,
2140  				     struct btrfs_replace_extent_info *extent_info,
2141  				     const u64 replace_len,
2142  				     const u64 bytes_to_drop)
2143  {
2144  	struct btrfs_fs_info *fs_info = trans->fs_info;
2145  	struct btrfs_root *root = inode->root;
2146  	struct btrfs_file_extent_item *extent;
2147  	struct extent_buffer *leaf;
2148  	struct btrfs_key key;
2149  	int slot;
2150  	int ret;
2151  
2152  	if (replace_len == 0)
2153  		return 0;
2154  
2155  	if (extent_info->disk_offset == 0 &&
2156  	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2157  		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2158  		return 0;
2159  	}
2160  
2161  	key.objectid = btrfs_ino(inode);
2162  	key.type = BTRFS_EXTENT_DATA_KEY;
2163  	key.offset = extent_info->file_offset;
2164  	ret = btrfs_insert_empty_item(trans, root, path, &key,
2165  				      sizeof(struct btrfs_file_extent_item));
2166  	if (ret)
2167  		return ret;
2168  	leaf = path->nodes[0];
2169  	slot = path->slots[0];
2170  	write_extent_buffer(leaf, extent_info->extent_buf,
2171  			    btrfs_item_ptr_offset(leaf, slot),
2172  			    sizeof(struct btrfs_file_extent_item));
2173  	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2174  	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2175  	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2176  	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2177  	if (extent_info->is_new_extent)
2178  		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2179  	btrfs_release_path(path);
2180  
2181  	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2182  						replace_len);
2183  	if (ret)
2184  		return ret;
2185  
2186  	/* If it's a hole, nothing more needs to be done. */
2187  	if (extent_info->disk_offset == 0) {
2188  		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2189  		return 0;
2190  	}
2191  
2192  	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2193  
2194  	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2195  		key.objectid = extent_info->disk_offset;
2196  		key.type = BTRFS_EXTENT_ITEM_KEY;
2197  		key.offset = extent_info->disk_len;
2198  		ret = btrfs_alloc_reserved_file_extent(trans, root,
2199  						       btrfs_ino(inode),
2200  						       extent_info->file_offset,
2201  						       extent_info->qgroup_reserved,
2202  						       &key);
2203  	} else {
2204  		struct btrfs_ref ref = {
2205  			.action = BTRFS_ADD_DELAYED_REF,
2206  			.bytenr = extent_info->disk_offset,
2207  			.num_bytes = extent_info->disk_len,
2208  			.owning_root = btrfs_root_id(root),
2209  			.ref_root = btrfs_root_id(root),
2210  		};
2211  		u64 ref_offset;
2212  
2213  		ref_offset = extent_info->file_offset - extent_info->data_offset;
2214  		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2215  		ret = btrfs_inc_extent_ref(trans, &ref);
2216  	}
2217  
2218  	extent_info->insertions++;
2219  
2220  	return ret;
2221  }
2222  
2223  /*
2224   * The respective range must have been previously locked, as well as the inode.
2225   * The end offset is inclusive (last byte of the range).
2226   * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2227   * the file range with an extent.
2228   * When not punching a hole, we don't want to end up in a state where we dropped
2229   * extents without inserting a new one, so we must abort the transaction to avoid
2230   * a corruption.
2231   */
btrfs_replace_file_extents(struct btrfs_inode * inode,struct btrfs_path * path,const u64 start,const u64 end,struct btrfs_replace_extent_info * extent_info,struct btrfs_trans_handle ** trans_out)2232  int btrfs_replace_file_extents(struct btrfs_inode *inode,
2233  			       struct btrfs_path *path, const u64 start,
2234  			       const u64 end,
2235  			       struct btrfs_replace_extent_info *extent_info,
2236  			       struct btrfs_trans_handle **trans_out)
2237  {
2238  	struct btrfs_drop_extents_args drop_args = { 0 };
2239  	struct btrfs_root *root = inode->root;
2240  	struct btrfs_fs_info *fs_info = root->fs_info;
2241  	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2242  	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2243  	struct btrfs_trans_handle *trans = NULL;
2244  	struct btrfs_block_rsv *rsv;
2245  	unsigned int rsv_count;
2246  	u64 cur_offset;
2247  	u64 len = end - start;
2248  	int ret = 0;
2249  
2250  	if (end <= start)
2251  		return -EINVAL;
2252  
2253  	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2254  	if (!rsv) {
2255  		ret = -ENOMEM;
2256  		goto out;
2257  	}
2258  	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2259  	rsv->failfast = true;
2260  
2261  	/*
2262  	 * 1 - update the inode
2263  	 * 1 - removing the extents in the range
2264  	 * 1 - adding the hole extent if no_holes isn't set or if we are
2265  	 *     replacing the range with a new extent
2266  	 */
2267  	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2268  		rsv_count = 3;
2269  	else
2270  		rsv_count = 2;
2271  
2272  	trans = btrfs_start_transaction(root, rsv_count);
2273  	if (IS_ERR(trans)) {
2274  		ret = PTR_ERR(trans);
2275  		trans = NULL;
2276  		goto out_free;
2277  	}
2278  
2279  	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2280  				      min_size, false);
2281  	if (WARN_ON(ret))
2282  		goto out_trans;
2283  	trans->block_rsv = rsv;
2284  
2285  	cur_offset = start;
2286  	drop_args.path = path;
2287  	drop_args.end = end + 1;
2288  	drop_args.drop_cache = true;
2289  	while (cur_offset < end) {
2290  		drop_args.start = cur_offset;
2291  		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2292  		/* If we are punching a hole decrement the inode's byte count */
2293  		if (!extent_info)
2294  			btrfs_update_inode_bytes(inode, 0,
2295  						 drop_args.bytes_found);
2296  		if (ret != -ENOSPC) {
2297  			/*
2298  			 * The only time we don't want to abort is if we are
2299  			 * attempting to clone a partial inline extent, in which
2300  			 * case we'll get EOPNOTSUPP.  However if we aren't
2301  			 * clone we need to abort no matter what, because if we
2302  			 * got EOPNOTSUPP via prealloc then we messed up and
2303  			 * need to abort.
2304  			 */
2305  			if (ret &&
2306  			    (ret != -EOPNOTSUPP ||
2307  			     (extent_info && extent_info->is_new_extent)))
2308  				btrfs_abort_transaction(trans, ret);
2309  			break;
2310  		}
2311  
2312  		trans->block_rsv = &fs_info->trans_block_rsv;
2313  
2314  		if (!extent_info && cur_offset < drop_args.drop_end &&
2315  		    cur_offset < ino_size) {
2316  			ret = fill_holes(trans, inode, path, cur_offset,
2317  					 drop_args.drop_end);
2318  			if (ret) {
2319  				/*
2320  				 * If we failed then we didn't insert our hole
2321  				 * entries for the area we dropped, so now the
2322  				 * fs is corrupted, so we must abort the
2323  				 * transaction.
2324  				 */
2325  				btrfs_abort_transaction(trans, ret);
2326  				break;
2327  			}
2328  		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2329  			/*
2330  			 * We are past the i_size here, but since we didn't
2331  			 * insert holes we need to clear the mapped area so we
2332  			 * know to not set disk_i_size in this area until a new
2333  			 * file extent is inserted here.
2334  			 */
2335  			ret = btrfs_inode_clear_file_extent_range(inode,
2336  					cur_offset,
2337  					drop_args.drop_end - cur_offset);
2338  			if (ret) {
2339  				/*
2340  				 * We couldn't clear our area, so we could
2341  				 * presumably adjust up and corrupt the fs, so
2342  				 * we need to abort.
2343  				 */
2344  				btrfs_abort_transaction(trans, ret);
2345  				break;
2346  			}
2347  		}
2348  
2349  		if (extent_info &&
2350  		    drop_args.drop_end > extent_info->file_offset) {
2351  			u64 replace_len = drop_args.drop_end -
2352  					  extent_info->file_offset;
2353  
2354  			ret = btrfs_insert_replace_extent(trans, inode,	path,
2355  					extent_info, replace_len,
2356  					drop_args.bytes_found);
2357  			if (ret) {
2358  				btrfs_abort_transaction(trans, ret);
2359  				break;
2360  			}
2361  			extent_info->data_len -= replace_len;
2362  			extent_info->data_offset += replace_len;
2363  			extent_info->file_offset += replace_len;
2364  		}
2365  
2366  		/*
2367  		 * We are releasing our handle on the transaction, balance the
2368  		 * dirty pages of the btree inode and flush delayed items, and
2369  		 * then get a new transaction handle, which may now point to a
2370  		 * new transaction in case someone else may have committed the
2371  		 * transaction we used to replace/drop file extent items. So
2372  		 * bump the inode's iversion and update mtime and ctime except
2373  		 * if we are called from a dedupe context. This is because a
2374  		 * power failure/crash may happen after the transaction is
2375  		 * committed and before we finish replacing/dropping all the
2376  		 * file extent items we need.
2377  		 */
2378  		inode_inc_iversion(&inode->vfs_inode);
2379  
2380  		if (!extent_info || extent_info->update_times)
2381  			inode_set_mtime_to_ts(&inode->vfs_inode,
2382  					      inode_set_ctime_current(&inode->vfs_inode));
2383  
2384  		ret = btrfs_update_inode(trans, inode);
2385  		if (ret)
2386  			break;
2387  
2388  		btrfs_end_transaction(trans);
2389  		btrfs_btree_balance_dirty(fs_info);
2390  
2391  		trans = btrfs_start_transaction(root, rsv_count);
2392  		if (IS_ERR(trans)) {
2393  			ret = PTR_ERR(trans);
2394  			trans = NULL;
2395  			break;
2396  		}
2397  
2398  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2399  					      rsv, min_size, false);
2400  		if (WARN_ON(ret))
2401  			break;
2402  		trans->block_rsv = rsv;
2403  
2404  		cur_offset = drop_args.drop_end;
2405  		len = end - cur_offset;
2406  		if (!extent_info && len) {
2407  			ret = find_first_non_hole(inode, &cur_offset, &len);
2408  			if (unlikely(ret < 0))
2409  				break;
2410  			if (ret && !len) {
2411  				ret = 0;
2412  				break;
2413  			}
2414  		}
2415  	}
2416  
2417  	/*
2418  	 * If we were cloning, force the next fsync to be a full one since we
2419  	 * we replaced (or just dropped in the case of cloning holes when
2420  	 * NO_HOLES is enabled) file extent items and did not setup new extent
2421  	 * maps for the replacement extents (or holes).
2422  	 */
2423  	if (extent_info && !extent_info->is_new_extent)
2424  		btrfs_set_inode_full_sync(inode);
2425  
2426  	if (ret)
2427  		goto out_trans;
2428  
2429  	trans->block_rsv = &fs_info->trans_block_rsv;
2430  	/*
2431  	 * If we are using the NO_HOLES feature we might have had already an
2432  	 * hole that overlaps a part of the region [lockstart, lockend] and
2433  	 * ends at (or beyond) lockend. Since we have no file extent items to
2434  	 * represent holes, drop_end can be less than lockend and so we must
2435  	 * make sure we have an extent map representing the existing hole (the
2436  	 * call to __btrfs_drop_extents() might have dropped the existing extent
2437  	 * map representing the existing hole), otherwise the fast fsync path
2438  	 * will not record the existence of the hole region
2439  	 * [existing_hole_start, lockend].
2440  	 */
2441  	if (drop_args.drop_end <= end)
2442  		drop_args.drop_end = end + 1;
2443  	/*
2444  	 * Don't insert file hole extent item if it's for a range beyond eof
2445  	 * (because it's useless) or if it represents a 0 bytes range (when
2446  	 * cur_offset == drop_end).
2447  	 */
2448  	if (!extent_info && cur_offset < ino_size &&
2449  	    cur_offset < drop_args.drop_end) {
2450  		ret = fill_holes(trans, inode, path, cur_offset,
2451  				 drop_args.drop_end);
2452  		if (ret) {
2453  			/* Same comment as above. */
2454  			btrfs_abort_transaction(trans, ret);
2455  			goto out_trans;
2456  		}
2457  	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2458  		/* See the comment in the loop above for the reasoning here. */
2459  		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2460  					drop_args.drop_end - cur_offset);
2461  		if (ret) {
2462  			btrfs_abort_transaction(trans, ret);
2463  			goto out_trans;
2464  		}
2465  
2466  	}
2467  	if (extent_info) {
2468  		ret = btrfs_insert_replace_extent(trans, inode, path,
2469  				extent_info, extent_info->data_len,
2470  				drop_args.bytes_found);
2471  		if (ret) {
2472  			btrfs_abort_transaction(trans, ret);
2473  			goto out_trans;
2474  		}
2475  	}
2476  
2477  out_trans:
2478  	if (!trans)
2479  		goto out_free;
2480  
2481  	trans->block_rsv = &fs_info->trans_block_rsv;
2482  	if (ret)
2483  		btrfs_end_transaction(trans);
2484  	else
2485  		*trans_out = trans;
2486  out_free:
2487  	btrfs_free_block_rsv(fs_info, rsv);
2488  out:
2489  	return ret;
2490  }
2491  
btrfs_punch_hole(struct file * file,loff_t offset,loff_t len)2492  static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2493  {
2494  	struct inode *inode = file_inode(file);
2495  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2496  	struct btrfs_root *root = BTRFS_I(inode)->root;
2497  	struct extent_state *cached_state = NULL;
2498  	struct btrfs_path *path;
2499  	struct btrfs_trans_handle *trans = NULL;
2500  	u64 lockstart;
2501  	u64 lockend;
2502  	u64 tail_start;
2503  	u64 tail_len;
2504  	u64 orig_start = offset;
2505  	int ret = 0;
2506  	bool same_block;
2507  	u64 ino_size;
2508  	bool truncated_block = false;
2509  	bool updated_inode = false;
2510  
2511  	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2512  
2513  	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2514  	if (ret)
2515  		goto out_only_mutex;
2516  
2517  	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2518  	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2519  	if (ret < 0)
2520  		goto out_only_mutex;
2521  	if (ret && !len) {
2522  		/* Already in a large hole */
2523  		ret = 0;
2524  		goto out_only_mutex;
2525  	}
2526  
2527  	ret = file_modified(file);
2528  	if (ret)
2529  		goto out_only_mutex;
2530  
2531  	lockstart = round_up(offset, fs_info->sectorsize);
2532  	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2533  	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2534  		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2535  	/*
2536  	 * We needn't truncate any block which is beyond the end of the file
2537  	 * because we are sure there is no data there.
2538  	 */
2539  	/*
2540  	 * Only do this if we are in the same block and we aren't doing the
2541  	 * entire block.
2542  	 */
2543  	if (same_block && len < fs_info->sectorsize) {
2544  		if (offset < ino_size) {
2545  			truncated_block = true;
2546  			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2547  						   0);
2548  		} else {
2549  			ret = 0;
2550  		}
2551  		goto out_only_mutex;
2552  	}
2553  
2554  	/* zero back part of the first block */
2555  	if (offset < ino_size) {
2556  		truncated_block = true;
2557  		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2558  		if (ret) {
2559  			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2560  			return ret;
2561  		}
2562  	}
2563  
2564  	/* Check the aligned pages after the first unaligned page,
2565  	 * if offset != orig_start, which means the first unaligned page
2566  	 * including several following pages are already in holes,
2567  	 * the extra check can be skipped */
2568  	if (offset == orig_start) {
2569  		/* after truncate page, check hole again */
2570  		len = offset + len - lockstart;
2571  		offset = lockstart;
2572  		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2573  		if (ret < 0)
2574  			goto out_only_mutex;
2575  		if (ret && !len) {
2576  			ret = 0;
2577  			goto out_only_mutex;
2578  		}
2579  		lockstart = offset;
2580  	}
2581  
2582  	/* Check the tail unaligned part is in a hole */
2583  	tail_start = lockend + 1;
2584  	tail_len = offset + len - tail_start;
2585  	if (tail_len) {
2586  		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2587  		if (unlikely(ret < 0))
2588  			goto out_only_mutex;
2589  		if (!ret) {
2590  			/* zero the front end of the last page */
2591  			if (tail_start + tail_len < ino_size) {
2592  				truncated_block = true;
2593  				ret = btrfs_truncate_block(BTRFS_I(inode),
2594  							tail_start + tail_len,
2595  							0, 1);
2596  				if (ret)
2597  					goto out_only_mutex;
2598  			}
2599  		}
2600  	}
2601  
2602  	if (lockend < lockstart) {
2603  		ret = 0;
2604  		goto out_only_mutex;
2605  	}
2606  
2607  	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2608  
2609  	path = btrfs_alloc_path();
2610  	if (!path) {
2611  		ret = -ENOMEM;
2612  		goto out;
2613  	}
2614  
2615  	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2616  					 lockend, NULL, &trans);
2617  	btrfs_free_path(path);
2618  	if (ret)
2619  		goto out;
2620  
2621  	ASSERT(trans != NULL);
2622  	inode_inc_iversion(inode);
2623  	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2624  	ret = btrfs_update_inode(trans, BTRFS_I(inode));
2625  	updated_inode = true;
2626  	btrfs_end_transaction(trans);
2627  	btrfs_btree_balance_dirty(fs_info);
2628  out:
2629  	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2630  		      &cached_state);
2631  out_only_mutex:
2632  	if (!updated_inode && truncated_block && !ret) {
2633  		/*
2634  		 * If we only end up zeroing part of a page, we still need to
2635  		 * update the inode item, so that all the time fields are
2636  		 * updated as well as the necessary btrfs inode in memory fields
2637  		 * for detecting, at fsync time, if the inode isn't yet in the
2638  		 * log tree or it's there but not up to date.
2639  		 */
2640  		struct timespec64 now = inode_set_ctime_current(inode);
2641  
2642  		inode_inc_iversion(inode);
2643  		inode_set_mtime_to_ts(inode, now);
2644  		trans = btrfs_start_transaction(root, 1);
2645  		if (IS_ERR(trans)) {
2646  			ret = PTR_ERR(trans);
2647  		} else {
2648  			int ret2;
2649  
2650  			ret = btrfs_update_inode(trans, BTRFS_I(inode));
2651  			ret2 = btrfs_end_transaction(trans);
2652  			if (!ret)
2653  				ret = ret2;
2654  		}
2655  	}
2656  	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2657  	return ret;
2658  }
2659  
2660  /* Helper structure to record which range is already reserved */
2661  struct falloc_range {
2662  	struct list_head list;
2663  	u64 start;
2664  	u64 len;
2665  };
2666  
2667  /*
2668   * Helper function to add falloc range
2669   *
2670   * Caller should have locked the larger range of extent containing
2671   * [start, len)
2672   */
add_falloc_range(struct list_head * head,u64 start,u64 len)2673  static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2674  {
2675  	struct falloc_range *range = NULL;
2676  
2677  	if (!list_empty(head)) {
2678  		/*
2679  		 * As fallocate iterates by bytenr order, we only need to check
2680  		 * the last range.
2681  		 */
2682  		range = list_last_entry(head, struct falloc_range, list);
2683  		if (range->start + range->len == start) {
2684  			range->len += len;
2685  			return 0;
2686  		}
2687  	}
2688  
2689  	range = kmalloc(sizeof(*range), GFP_KERNEL);
2690  	if (!range)
2691  		return -ENOMEM;
2692  	range->start = start;
2693  	range->len = len;
2694  	list_add_tail(&range->list, head);
2695  	return 0;
2696  }
2697  
btrfs_fallocate_update_isize(struct inode * inode,const u64 end,const int mode)2698  static int btrfs_fallocate_update_isize(struct inode *inode,
2699  					const u64 end,
2700  					const int mode)
2701  {
2702  	struct btrfs_trans_handle *trans;
2703  	struct btrfs_root *root = BTRFS_I(inode)->root;
2704  	int ret;
2705  	int ret2;
2706  
2707  	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2708  		return 0;
2709  
2710  	trans = btrfs_start_transaction(root, 1);
2711  	if (IS_ERR(trans))
2712  		return PTR_ERR(trans);
2713  
2714  	inode_set_ctime_current(inode);
2715  	i_size_write(inode, end);
2716  	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2717  	ret = btrfs_update_inode(trans, BTRFS_I(inode));
2718  	ret2 = btrfs_end_transaction(trans);
2719  
2720  	return ret ? ret : ret2;
2721  }
2722  
2723  enum {
2724  	RANGE_BOUNDARY_WRITTEN_EXTENT,
2725  	RANGE_BOUNDARY_PREALLOC_EXTENT,
2726  	RANGE_BOUNDARY_HOLE,
2727  };
2728  
btrfs_zero_range_check_range_boundary(struct btrfs_inode * inode,u64 offset)2729  static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2730  						 u64 offset)
2731  {
2732  	const u64 sectorsize = inode->root->fs_info->sectorsize;
2733  	struct extent_map *em;
2734  	int ret;
2735  
2736  	offset = round_down(offset, sectorsize);
2737  	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2738  	if (IS_ERR(em))
2739  		return PTR_ERR(em);
2740  
2741  	if (em->disk_bytenr == EXTENT_MAP_HOLE)
2742  		ret = RANGE_BOUNDARY_HOLE;
2743  	else if (em->flags & EXTENT_FLAG_PREALLOC)
2744  		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2745  	else
2746  		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2747  
2748  	free_extent_map(em);
2749  	return ret;
2750  }
2751  
btrfs_zero_range(struct inode * inode,loff_t offset,loff_t len,const int mode)2752  static int btrfs_zero_range(struct inode *inode,
2753  			    loff_t offset,
2754  			    loff_t len,
2755  			    const int mode)
2756  {
2757  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2758  	struct extent_map *em;
2759  	struct extent_changeset *data_reserved = NULL;
2760  	int ret;
2761  	u64 alloc_hint = 0;
2762  	const u64 sectorsize = fs_info->sectorsize;
2763  	u64 alloc_start = round_down(offset, sectorsize);
2764  	u64 alloc_end = round_up(offset + len, sectorsize);
2765  	u64 bytes_to_reserve = 0;
2766  	bool space_reserved = false;
2767  
2768  	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
2769  			      alloc_end - alloc_start);
2770  	if (IS_ERR(em)) {
2771  		ret = PTR_ERR(em);
2772  		goto out;
2773  	}
2774  
2775  	/*
2776  	 * Avoid hole punching and extent allocation for some cases. More cases
2777  	 * could be considered, but these are unlikely common and we keep things
2778  	 * as simple as possible for now. Also, intentionally, if the target
2779  	 * range contains one or more prealloc extents together with regular
2780  	 * extents and holes, we drop all the existing extents and allocate a
2781  	 * new prealloc extent, so that we get a larger contiguous disk extent.
2782  	 */
2783  	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2784  		const u64 em_end = em->start + em->len;
2785  
2786  		if (em_end >= offset + len) {
2787  			/*
2788  			 * The whole range is already a prealloc extent,
2789  			 * do nothing except updating the inode's i_size if
2790  			 * needed.
2791  			 */
2792  			free_extent_map(em);
2793  			ret = btrfs_fallocate_update_isize(inode, offset + len,
2794  							   mode);
2795  			goto out;
2796  		}
2797  		/*
2798  		 * Part of the range is already a prealloc extent, so operate
2799  		 * only on the remaining part of the range.
2800  		 */
2801  		alloc_start = em_end;
2802  		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2803  		len = offset + len - alloc_start;
2804  		offset = alloc_start;
2805  		alloc_hint = extent_map_block_start(em) + em->len;
2806  	}
2807  	free_extent_map(em);
2808  
2809  	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2810  	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2811  		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
2812  		if (IS_ERR(em)) {
2813  			ret = PTR_ERR(em);
2814  			goto out;
2815  		}
2816  
2817  		if (em->flags & EXTENT_FLAG_PREALLOC) {
2818  			free_extent_map(em);
2819  			ret = btrfs_fallocate_update_isize(inode, offset + len,
2820  							   mode);
2821  			goto out;
2822  		}
2823  		if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
2824  			free_extent_map(em);
2825  			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2826  						   0);
2827  			if (!ret)
2828  				ret = btrfs_fallocate_update_isize(inode,
2829  								   offset + len,
2830  								   mode);
2831  			return ret;
2832  		}
2833  		free_extent_map(em);
2834  		alloc_start = round_down(offset, sectorsize);
2835  		alloc_end = alloc_start + sectorsize;
2836  		goto reserve_space;
2837  	}
2838  
2839  	alloc_start = round_up(offset, sectorsize);
2840  	alloc_end = round_down(offset + len, sectorsize);
2841  
2842  	/*
2843  	 * For unaligned ranges, check the pages at the boundaries, they might
2844  	 * map to an extent, in which case we need to partially zero them, or
2845  	 * they might map to a hole, in which case we need our allocation range
2846  	 * to cover them.
2847  	 */
2848  	if (!IS_ALIGNED(offset, sectorsize)) {
2849  		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2850  							    offset);
2851  		if (ret < 0)
2852  			goto out;
2853  		if (ret == RANGE_BOUNDARY_HOLE) {
2854  			alloc_start = round_down(offset, sectorsize);
2855  			ret = 0;
2856  		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2857  			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2858  			if (ret)
2859  				goto out;
2860  		} else {
2861  			ret = 0;
2862  		}
2863  	}
2864  
2865  	if (!IS_ALIGNED(offset + len, sectorsize)) {
2866  		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2867  							    offset + len);
2868  		if (ret < 0)
2869  			goto out;
2870  		if (ret == RANGE_BOUNDARY_HOLE) {
2871  			alloc_end = round_up(offset + len, sectorsize);
2872  			ret = 0;
2873  		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2874  			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2875  						   0, 1);
2876  			if (ret)
2877  				goto out;
2878  		} else {
2879  			ret = 0;
2880  		}
2881  	}
2882  
2883  reserve_space:
2884  	if (alloc_start < alloc_end) {
2885  		struct extent_state *cached_state = NULL;
2886  		const u64 lockstart = alloc_start;
2887  		const u64 lockend = alloc_end - 1;
2888  
2889  		bytes_to_reserve = alloc_end - alloc_start;
2890  		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2891  						      bytes_to_reserve);
2892  		if (ret < 0)
2893  			goto out;
2894  		space_reserved = true;
2895  		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2896  					    &cached_state);
2897  		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2898  						alloc_start, bytes_to_reserve);
2899  		if (ret) {
2900  			unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
2901  				      lockend, &cached_state);
2902  			goto out;
2903  		}
2904  		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
2905  						alloc_end - alloc_start,
2906  						fs_info->sectorsize,
2907  						offset + len, &alloc_hint);
2908  		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2909  			      &cached_state);
2910  		/* btrfs_prealloc_file_range releases reserved space on error */
2911  		if (ret) {
2912  			space_reserved = false;
2913  			goto out;
2914  		}
2915  	}
2916  	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
2917   out:
2918  	if (ret && space_reserved)
2919  		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
2920  					       alloc_start, bytes_to_reserve);
2921  	extent_changeset_free(data_reserved);
2922  
2923  	return ret;
2924  }
2925  
btrfs_fallocate(struct file * file,int mode,loff_t offset,loff_t len)2926  static long btrfs_fallocate(struct file *file, int mode,
2927  			    loff_t offset, loff_t len)
2928  {
2929  	struct inode *inode = file_inode(file);
2930  	struct extent_state *cached_state = NULL;
2931  	struct extent_changeset *data_reserved = NULL;
2932  	struct falloc_range *range;
2933  	struct falloc_range *tmp;
2934  	LIST_HEAD(reserve_list);
2935  	u64 cur_offset;
2936  	u64 last_byte;
2937  	u64 alloc_start;
2938  	u64 alloc_end;
2939  	u64 alloc_hint = 0;
2940  	u64 locked_end;
2941  	u64 actual_end = 0;
2942  	u64 data_space_needed = 0;
2943  	u64 data_space_reserved = 0;
2944  	u64 qgroup_reserved = 0;
2945  	struct extent_map *em;
2946  	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
2947  	int ret;
2948  
2949  	/* Do not allow fallocate in ZONED mode */
2950  	if (btrfs_is_zoned(inode_to_fs_info(inode)))
2951  		return -EOPNOTSUPP;
2952  
2953  	alloc_start = round_down(offset, blocksize);
2954  	alloc_end = round_up(offset + len, blocksize);
2955  	cur_offset = alloc_start;
2956  
2957  	/* Make sure we aren't being give some crap mode */
2958  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
2959  		     FALLOC_FL_ZERO_RANGE))
2960  		return -EOPNOTSUPP;
2961  
2962  	if (mode & FALLOC_FL_PUNCH_HOLE)
2963  		return btrfs_punch_hole(file, offset, len);
2964  
2965  	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2966  
2967  	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
2968  		ret = inode_newsize_ok(inode, offset + len);
2969  		if (ret)
2970  			goto out;
2971  	}
2972  
2973  	ret = file_modified(file);
2974  	if (ret)
2975  		goto out;
2976  
2977  	/*
2978  	 * TODO: Move these two operations after we have checked
2979  	 * accurate reserved space, or fallocate can still fail but
2980  	 * with page truncated or size expanded.
2981  	 *
2982  	 * But that's a minor problem and won't do much harm BTW.
2983  	 */
2984  	if (alloc_start > inode->i_size) {
2985  		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
2986  					alloc_start);
2987  		if (ret)
2988  			goto out;
2989  	} else if (offset + len > inode->i_size) {
2990  		/*
2991  		 * If we are fallocating from the end of the file onward we
2992  		 * need to zero out the end of the block if i_size lands in the
2993  		 * middle of a block.
2994  		 */
2995  		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
2996  		if (ret)
2997  			goto out;
2998  	}
2999  
3000  	/*
3001  	 * We have locked the inode at the VFS level (in exclusive mode) and we
3002  	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3003  	 * locking the file range, flush all dealloc in the range and wait for
3004  	 * all ordered extents in the range to complete. After this we can lock
3005  	 * the file range and, due to the previous locking we did, we know there
3006  	 * can't be more delalloc or ordered extents in the range.
3007  	 */
3008  	ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
3009  				       alloc_end - alloc_start);
3010  	if (ret)
3011  		goto out;
3012  
3013  	if (mode & FALLOC_FL_ZERO_RANGE) {
3014  		ret = btrfs_zero_range(inode, offset, len, mode);
3015  		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3016  		return ret;
3017  	}
3018  
3019  	locked_end = alloc_end - 1;
3020  	lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3021  		    &cached_state);
3022  
3023  	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3024  
3025  	/* First, check if we exceed the qgroup limit */
3026  	while (cur_offset < alloc_end) {
3027  		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3028  				      alloc_end - cur_offset);
3029  		if (IS_ERR(em)) {
3030  			ret = PTR_ERR(em);
3031  			break;
3032  		}
3033  		last_byte = min(extent_map_end(em), alloc_end);
3034  		actual_end = min_t(u64, extent_map_end(em), offset + len);
3035  		last_byte = ALIGN(last_byte, blocksize);
3036  		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
3037  		    (cur_offset >= inode->i_size &&
3038  		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
3039  			const u64 range_len = last_byte - cur_offset;
3040  
3041  			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3042  			if (ret < 0) {
3043  				free_extent_map(em);
3044  				break;
3045  			}
3046  			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3047  					&data_reserved, cur_offset, range_len);
3048  			if (ret < 0) {
3049  				free_extent_map(em);
3050  				break;
3051  			}
3052  			qgroup_reserved += range_len;
3053  			data_space_needed += range_len;
3054  		}
3055  		free_extent_map(em);
3056  		cur_offset = last_byte;
3057  	}
3058  
3059  	if (!ret && data_space_needed > 0) {
3060  		/*
3061  		 * We are safe to reserve space here as we can't have delalloc
3062  		 * in the range, see above.
3063  		 */
3064  		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3065  						      data_space_needed);
3066  		if (!ret)
3067  			data_space_reserved = data_space_needed;
3068  	}
3069  
3070  	/*
3071  	 * If ret is still 0, means we're OK to fallocate.
3072  	 * Or just cleanup the list and exit.
3073  	 */
3074  	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3075  		if (!ret) {
3076  			ret = btrfs_prealloc_file_range(inode, mode,
3077  					range->start,
3078  					range->len, blocksize,
3079  					offset + len, &alloc_hint);
3080  			/*
3081  			 * btrfs_prealloc_file_range() releases space even
3082  			 * if it returns an error.
3083  			 */
3084  			data_space_reserved -= range->len;
3085  			qgroup_reserved -= range->len;
3086  		} else if (data_space_reserved > 0) {
3087  			btrfs_free_reserved_data_space(BTRFS_I(inode),
3088  					       data_reserved, range->start,
3089  					       range->len);
3090  			data_space_reserved -= range->len;
3091  			qgroup_reserved -= range->len;
3092  		} else if (qgroup_reserved > 0) {
3093  			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3094  					       range->start, range->len, NULL);
3095  			qgroup_reserved -= range->len;
3096  		}
3097  		list_del(&range->list);
3098  		kfree(range);
3099  	}
3100  	if (ret < 0)
3101  		goto out_unlock;
3102  
3103  	/*
3104  	 * We didn't need to allocate any more space, but we still extended the
3105  	 * size of the file so we need to update i_size and the inode item.
3106  	 */
3107  	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3108  out_unlock:
3109  	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3110  		      &cached_state);
3111  out:
3112  	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3113  	extent_changeset_free(data_reserved);
3114  	return ret;
3115  }
3116  
3117  /*
3118   * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3119   * that has unflushed and/or flushing delalloc. There might be other adjacent
3120   * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3121   * looping while it gets adjacent subranges, and merging them together.
3122   */
find_delalloc_subrange(struct btrfs_inode * inode,u64 start,u64 end,struct extent_state ** cached_state,bool * search_io_tree,u64 * delalloc_start_ret,u64 * delalloc_end_ret)3123  static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3124  				   struct extent_state **cached_state,
3125  				   bool *search_io_tree,
3126  				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3127  {
3128  	u64 len = end + 1 - start;
3129  	u64 delalloc_len = 0;
3130  	struct btrfs_ordered_extent *oe;
3131  	u64 oe_start;
3132  	u64 oe_end;
3133  
3134  	/*
3135  	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3136  	 * means we have delalloc (dirty pages) for which writeback has not
3137  	 * started yet.
3138  	 */
3139  	if (*search_io_tree) {
3140  		spin_lock(&inode->lock);
3141  		if (inode->delalloc_bytes > 0) {
3142  			spin_unlock(&inode->lock);
3143  			*delalloc_start_ret = start;
3144  			delalloc_len = count_range_bits(&inode->io_tree,
3145  							delalloc_start_ret, end,
3146  							len, EXTENT_DELALLOC, 1,
3147  							cached_state);
3148  		} else {
3149  			spin_unlock(&inode->lock);
3150  		}
3151  	}
3152  
3153  	if (delalloc_len > 0) {
3154  		/*
3155  		 * If delalloc was found then *delalloc_start_ret has a sector size
3156  		 * aligned value (rounded down).
3157  		 */
3158  		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3159  
3160  		if (*delalloc_start_ret == start) {
3161  			/* Delalloc for the whole range, nothing more to do. */
3162  			if (*delalloc_end_ret == end)
3163  				return true;
3164  			/* Else trim our search range for ordered extents. */
3165  			start = *delalloc_end_ret + 1;
3166  			len = end + 1 - start;
3167  		}
3168  	} else {
3169  		/* No delalloc, future calls don't need to search again. */
3170  		*search_io_tree = false;
3171  	}
3172  
3173  	/*
3174  	 * Now also check if there's any ordered extent in the range.
3175  	 * We do this because:
3176  	 *
3177  	 * 1) When delalloc is flushed, the file range is locked, we clear the
3178  	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3179  	 *    an ordered extent for the write. So we might just have been called
3180  	 *    after delalloc is flushed and before the ordered extent completes
3181  	 *    and inserts the new file extent item in the subvolume's btree;
3182  	 *
3183  	 * 2) We may have an ordered extent created by flushing delalloc for a
3184  	 *    subrange that starts before the subrange we found marked with
3185  	 *    EXTENT_DELALLOC in the io tree.
3186  	 *
3187  	 * We could also use the extent map tree to find such delalloc that is
3188  	 * being flushed, but using the ordered extents tree is more efficient
3189  	 * because it's usually much smaller as ordered extents are removed from
3190  	 * the tree once they complete. With the extent maps, we mau have them
3191  	 * in the extent map tree for a very long time, and they were either
3192  	 * created by previous writes or loaded by read operations.
3193  	 */
3194  	oe = btrfs_lookup_first_ordered_range(inode, start, len);
3195  	if (!oe)
3196  		return (delalloc_len > 0);
3197  
3198  	/* The ordered extent may span beyond our search range. */
3199  	oe_start = max(oe->file_offset, start);
3200  	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3201  
3202  	btrfs_put_ordered_extent(oe);
3203  
3204  	/* Don't have unflushed delalloc, return the ordered extent range. */
3205  	if (delalloc_len == 0) {
3206  		*delalloc_start_ret = oe_start;
3207  		*delalloc_end_ret = oe_end;
3208  		return true;
3209  	}
3210  
3211  	/*
3212  	 * We have both unflushed delalloc (io_tree) and an ordered extent.
3213  	 * If the ranges are adjacent returned a combined range, otherwise
3214  	 * return the leftmost range.
3215  	 */
3216  	if (oe_start < *delalloc_start_ret) {
3217  		if (oe_end < *delalloc_start_ret)
3218  			*delalloc_end_ret = oe_end;
3219  		*delalloc_start_ret = oe_start;
3220  	} else if (*delalloc_end_ret + 1 == oe_start) {
3221  		*delalloc_end_ret = oe_end;
3222  	}
3223  
3224  	return true;
3225  }
3226  
3227  /*
3228   * Check if there's delalloc in a given range.
3229   *
3230   * @inode:               The inode.
3231   * @start:               The start offset of the range. It does not need to be
3232   *                       sector size aligned.
3233   * @end:                 The end offset (inclusive value) of the search range.
3234   *                       It does not need to be sector size aligned.
3235   * @cached_state:        Extent state record used for speeding up delalloc
3236   *                       searches in the inode's io_tree. Can be NULL.
3237   * @delalloc_start_ret:  Output argument, set to the start offset of the
3238   *                       subrange found with delalloc (may not be sector size
3239   *                       aligned).
3240   * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3241   *                       of the subrange found with delalloc.
3242   *
3243   * Returns true if a subrange with delalloc is found within the given range, and
3244   * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3245   * end offsets of the subrange.
3246   */
btrfs_find_delalloc_in_range(struct btrfs_inode * inode,u64 start,u64 end,struct extent_state ** cached_state,u64 * delalloc_start_ret,u64 * delalloc_end_ret)3247  bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3248  				  struct extent_state **cached_state,
3249  				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3250  {
3251  	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3252  	u64 prev_delalloc_end = 0;
3253  	bool search_io_tree = true;
3254  	bool ret = false;
3255  
3256  	while (cur_offset <= end) {
3257  		u64 delalloc_start;
3258  		u64 delalloc_end;
3259  		bool delalloc;
3260  
3261  		delalloc = find_delalloc_subrange(inode, cur_offset, end,
3262  						  cached_state, &search_io_tree,
3263  						  &delalloc_start,
3264  						  &delalloc_end);
3265  		if (!delalloc)
3266  			break;
3267  
3268  		if (prev_delalloc_end == 0) {
3269  			/* First subrange found. */
3270  			*delalloc_start_ret = max(delalloc_start, start);
3271  			*delalloc_end_ret = delalloc_end;
3272  			ret = true;
3273  		} else if (delalloc_start == prev_delalloc_end + 1) {
3274  			/* Subrange adjacent to the previous one, merge them. */
3275  			*delalloc_end_ret = delalloc_end;
3276  		} else {
3277  			/* Subrange not adjacent to the previous one, exit. */
3278  			break;
3279  		}
3280  
3281  		prev_delalloc_end = delalloc_end;
3282  		cur_offset = delalloc_end + 1;
3283  		cond_resched();
3284  	}
3285  
3286  	return ret;
3287  }
3288  
3289  /*
3290   * Check if there's a hole or delalloc range in a range representing a hole (or
3291   * prealloc extent) found in the inode's subvolume btree.
3292   *
3293   * @inode:      The inode.
3294   * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3295   * @start:      Start offset of the hole region. It does not need to be sector
3296   *              size aligned.
3297   * @end:        End offset (inclusive value) of the hole region. It does not
3298   *              need to be sector size aligned.
3299   * @start_ret:  Return parameter, used to set the start of the subrange in the
3300   *              hole that matches the search criteria (seek mode), if such
3301   *              subrange is found (return value of the function is true).
3302   *              The value returned here may not be sector size aligned.
3303   *
3304   * Returns true if a subrange matching the given seek mode is found, and if one
3305   * is found, it updates @start_ret with the start of the subrange.
3306   */
find_desired_extent_in_hole(struct btrfs_inode * inode,int whence,struct extent_state ** cached_state,u64 start,u64 end,u64 * start_ret)3307  static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3308  					struct extent_state **cached_state,
3309  					u64 start, u64 end, u64 *start_ret)
3310  {
3311  	u64 delalloc_start;
3312  	u64 delalloc_end;
3313  	bool delalloc;
3314  
3315  	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3316  						&delalloc_start, &delalloc_end);
3317  	if (delalloc && whence == SEEK_DATA) {
3318  		*start_ret = delalloc_start;
3319  		return true;
3320  	}
3321  
3322  	if (delalloc && whence == SEEK_HOLE) {
3323  		/*
3324  		 * We found delalloc but it starts after out start offset. So we
3325  		 * have a hole between our start offset and the delalloc start.
3326  		 */
3327  		if (start < delalloc_start) {
3328  			*start_ret = start;
3329  			return true;
3330  		}
3331  		/*
3332  		 * Delalloc range starts at our start offset.
3333  		 * If the delalloc range's length is smaller than our range,
3334  		 * then it means we have a hole that starts where the delalloc
3335  		 * subrange ends.
3336  		 */
3337  		if (delalloc_end < end) {
3338  			*start_ret = delalloc_end + 1;
3339  			return true;
3340  		}
3341  
3342  		/* There's delalloc for the whole range. */
3343  		return false;
3344  	}
3345  
3346  	if (!delalloc && whence == SEEK_HOLE) {
3347  		*start_ret = start;
3348  		return true;
3349  	}
3350  
3351  	/*
3352  	 * No delalloc in the range and we are seeking for data. The caller has
3353  	 * to iterate to the next extent item in the subvolume btree.
3354  	 */
3355  	return false;
3356  }
3357  
find_desired_extent(struct file * file,loff_t offset,int whence)3358  static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3359  {
3360  	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3361  	struct btrfs_file_private *private;
3362  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3363  	struct extent_state *cached_state = NULL;
3364  	struct extent_state **delalloc_cached_state;
3365  	const loff_t i_size = i_size_read(&inode->vfs_inode);
3366  	const u64 ino = btrfs_ino(inode);
3367  	struct btrfs_root *root = inode->root;
3368  	struct btrfs_path *path;
3369  	struct btrfs_key key;
3370  	u64 last_extent_end;
3371  	u64 lockstart;
3372  	u64 lockend;
3373  	u64 start;
3374  	int ret;
3375  	bool found = false;
3376  
3377  	if (i_size == 0 || offset >= i_size)
3378  		return -ENXIO;
3379  
3380  	/*
3381  	 * Quick path. If the inode has no prealloc extents and its number of
3382  	 * bytes used matches its i_size, then it can not have holes.
3383  	 */
3384  	if (whence == SEEK_HOLE &&
3385  	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
3386  	    inode_get_bytes(&inode->vfs_inode) == i_size)
3387  		return i_size;
3388  
3389  	spin_lock(&inode->lock);
3390  	private = file->private_data;
3391  	spin_unlock(&inode->lock);
3392  
3393  	if (private && private->owner_task != current) {
3394  		/*
3395  		 * Not allocated by us, don't use it as its cached state is used
3396  		 * by the task that allocated it and we don't want neither to
3397  		 * mess with it nor get incorrect results because it reflects an
3398  		 * invalid state for the current task.
3399  		 */
3400  		private = NULL;
3401  	} else if (!private) {
3402  		private = kzalloc(sizeof(*private), GFP_KERNEL);
3403  		/*
3404  		 * No worries if memory allocation failed.
3405  		 * The private structure is used only for speeding up multiple
3406  		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3407  		 * so everything will still be correct.
3408  		 */
3409  		if (private) {
3410  			bool free = false;
3411  
3412  			private->owner_task = current;
3413  
3414  			spin_lock(&inode->lock);
3415  			if (file->private_data)
3416  				free = true;
3417  			else
3418  				file->private_data = private;
3419  			spin_unlock(&inode->lock);
3420  
3421  			if (free) {
3422  				kfree(private);
3423  				private = NULL;
3424  			}
3425  		}
3426  	}
3427  
3428  	if (private)
3429  		delalloc_cached_state = &private->llseek_cached_state;
3430  	else
3431  		delalloc_cached_state = NULL;
3432  
3433  	/*
3434  	 * offset can be negative, in this case we start finding DATA/HOLE from
3435  	 * the very start of the file.
3436  	 */
3437  	start = max_t(loff_t, 0, offset);
3438  
3439  	lockstart = round_down(start, fs_info->sectorsize);
3440  	lockend = round_up(i_size, fs_info->sectorsize);
3441  	if (lockend <= lockstart)
3442  		lockend = lockstart + fs_info->sectorsize;
3443  	lockend--;
3444  
3445  	path = btrfs_alloc_path();
3446  	if (!path)
3447  		return -ENOMEM;
3448  	path->reada = READA_FORWARD;
3449  
3450  	key.objectid = ino;
3451  	key.type = BTRFS_EXTENT_DATA_KEY;
3452  	key.offset = start;
3453  
3454  	last_extent_end = lockstart;
3455  
3456  	lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3457  
3458  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3459  	if (ret < 0) {
3460  		goto out;
3461  	} else if (ret > 0 && path->slots[0] > 0) {
3462  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3463  		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3464  			path->slots[0]--;
3465  	}
3466  
3467  	while (start < i_size) {
3468  		struct extent_buffer *leaf = path->nodes[0];
3469  		struct btrfs_file_extent_item *extent;
3470  		u64 extent_end;
3471  		u8 type;
3472  
3473  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3474  			ret = btrfs_next_leaf(root, path);
3475  			if (ret < 0)
3476  				goto out;
3477  			else if (ret > 0)
3478  				break;
3479  
3480  			leaf = path->nodes[0];
3481  		}
3482  
3483  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3484  		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3485  			break;
3486  
3487  		extent_end = btrfs_file_extent_end(path);
3488  
3489  		/*
3490  		 * In the first iteration we may have a slot that points to an
3491  		 * extent that ends before our start offset, so skip it.
3492  		 */
3493  		if (extent_end <= start) {
3494  			path->slots[0]++;
3495  			continue;
3496  		}
3497  
3498  		/* We have an implicit hole, NO_HOLES feature is likely set. */
3499  		if (last_extent_end < key.offset) {
3500  			u64 search_start = last_extent_end;
3501  			u64 found_start;
3502  
3503  			/*
3504  			 * First iteration, @start matches @offset and it's
3505  			 * within the hole.
3506  			 */
3507  			if (start == offset)
3508  				search_start = offset;
3509  
3510  			found = find_desired_extent_in_hole(inode, whence,
3511  							    delalloc_cached_state,
3512  							    search_start,
3513  							    key.offset - 1,
3514  							    &found_start);
3515  			if (found) {
3516  				start = found_start;
3517  				break;
3518  			}
3519  			/*
3520  			 * Didn't find data or a hole (due to delalloc) in the
3521  			 * implicit hole range, so need to analyze the extent.
3522  			 */
3523  		}
3524  
3525  		extent = btrfs_item_ptr(leaf, path->slots[0],
3526  					struct btrfs_file_extent_item);
3527  		type = btrfs_file_extent_type(leaf, extent);
3528  
3529  		/*
3530  		 * Can't access the extent's disk_bytenr field if this is an
3531  		 * inline extent, since at that offset, it's where the extent
3532  		 * data starts.
3533  		 */
3534  		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3535  		    (type == BTRFS_FILE_EXTENT_REG &&
3536  		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3537  			/*
3538  			 * Explicit hole or prealloc extent, search for delalloc.
3539  			 * A prealloc extent is treated like a hole.
3540  			 */
3541  			u64 search_start = key.offset;
3542  			u64 found_start;
3543  
3544  			/*
3545  			 * First iteration, @start matches @offset and it's
3546  			 * within the hole.
3547  			 */
3548  			if (start == offset)
3549  				search_start = offset;
3550  
3551  			found = find_desired_extent_in_hole(inode, whence,
3552  							    delalloc_cached_state,
3553  							    search_start,
3554  							    extent_end - 1,
3555  							    &found_start);
3556  			if (found) {
3557  				start = found_start;
3558  				break;
3559  			}
3560  			/*
3561  			 * Didn't find data or a hole (due to delalloc) in the
3562  			 * implicit hole range, so need to analyze the next
3563  			 * extent item.
3564  			 */
3565  		} else {
3566  			/*
3567  			 * Found a regular or inline extent.
3568  			 * If we are seeking for data, adjust the start offset
3569  			 * and stop, we're done.
3570  			 */
3571  			if (whence == SEEK_DATA) {
3572  				start = max_t(u64, key.offset, offset);
3573  				found = true;
3574  				break;
3575  			}
3576  			/*
3577  			 * Else, we are seeking for a hole, check the next file
3578  			 * extent item.
3579  			 */
3580  		}
3581  
3582  		start = extent_end;
3583  		last_extent_end = extent_end;
3584  		path->slots[0]++;
3585  		if (fatal_signal_pending(current)) {
3586  			ret = -EINTR;
3587  			goto out;
3588  		}
3589  		cond_resched();
3590  	}
3591  
3592  	/* We have an implicit hole from the last extent found up to i_size. */
3593  	if (!found && start < i_size) {
3594  		found = find_desired_extent_in_hole(inode, whence,
3595  						    delalloc_cached_state, start,
3596  						    i_size - 1, &start);
3597  		if (!found)
3598  			start = i_size;
3599  	}
3600  
3601  out:
3602  	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3603  	btrfs_free_path(path);
3604  
3605  	if (ret < 0)
3606  		return ret;
3607  
3608  	if (whence == SEEK_DATA && start >= i_size)
3609  		return -ENXIO;
3610  
3611  	return min_t(loff_t, start, i_size);
3612  }
3613  
btrfs_file_llseek(struct file * file,loff_t offset,int whence)3614  static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3615  {
3616  	struct inode *inode = file->f_mapping->host;
3617  
3618  	switch (whence) {
3619  	default:
3620  		return generic_file_llseek(file, offset, whence);
3621  	case SEEK_DATA:
3622  	case SEEK_HOLE:
3623  		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3624  		offset = find_desired_extent(file, offset, whence);
3625  		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3626  		break;
3627  	}
3628  
3629  	if (offset < 0)
3630  		return offset;
3631  
3632  	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3633  }
3634  
btrfs_file_open(struct inode * inode,struct file * filp)3635  static int btrfs_file_open(struct inode *inode, struct file *filp)
3636  {
3637  	int ret;
3638  
3639  	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3640  
3641  	ret = fsverity_file_open(inode, filp);
3642  	if (ret)
3643  		return ret;
3644  	return generic_file_open(inode, filp);
3645  }
3646  
btrfs_file_read_iter(struct kiocb * iocb,struct iov_iter * to)3647  static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3648  {
3649  	ssize_t ret = 0;
3650  
3651  	if (iocb->ki_flags & IOCB_DIRECT) {
3652  		ret = btrfs_direct_read(iocb, to);
3653  		if (ret < 0 || !iov_iter_count(to) ||
3654  		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3655  			return ret;
3656  	}
3657  
3658  	return filemap_read(iocb, to, ret);
3659  }
3660  
3661  const struct file_operations btrfs_file_operations = {
3662  	.llseek		= btrfs_file_llseek,
3663  	.read_iter      = btrfs_file_read_iter,
3664  	.splice_read	= filemap_splice_read,
3665  	.write_iter	= btrfs_file_write_iter,
3666  	.splice_write	= iter_file_splice_write,
3667  	.mmap		= btrfs_file_mmap,
3668  	.open		= btrfs_file_open,
3669  	.release	= btrfs_release_file,
3670  	.get_unmapped_area = thp_get_unmapped_area,
3671  	.fsync		= btrfs_sync_file,
3672  	.fallocate	= btrfs_fallocate,
3673  	.unlocked_ioctl	= btrfs_ioctl,
3674  #ifdef CONFIG_COMPAT
3675  	.compat_ioctl	= btrfs_compat_ioctl,
3676  #endif
3677  	.remap_file_range = btrfs_remap_file_range,
3678  	.uring_cmd	= btrfs_uring_cmd,
3679  	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3680  };
3681  
btrfs_fdatawrite_range(struct btrfs_inode * inode,loff_t start,loff_t end)3682  int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3683  {
3684  	struct address_space *mapping = inode->vfs_inode.i_mapping;
3685  	int ret;
3686  
3687  	/*
3688  	 * So with compression we will find and lock a dirty page and clear the
3689  	 * first one as dirty, setup an async extent, and immediately return
3690  	 * with the entire range locked but with nobody actually marked with
3691  	 * writeback.  So we can't just filemap_write_and_wait_range() and
3692  	 * expect it to work since it will just kick off a thread to do the
3693  	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3694  	 * since it will wait on the page lock, which won't be unlocked until
3695  	 * after the pages have been marked as writeback and so we're good to go
3696  	 * from there.  We have to do this otherwise we'll miss the ordered
3697  	 * extents and that results in badness.  Please Josef, do not think you
3698  	 * know better and pull this out at some point in the future, it is
3699  	 * right and you are wrong.
3700  	 */
3701  	ret = filemap_fdatawrite_range(mapping, start, end);
3702  	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3703  		ret = filemap_fdatawrite_range(mapping, start, end);
3704  
3705  	return ret;
3706  }
3707