xref: /linux/fs/btrfs/file.c (revision 79bdd8846317f3dea26c53d75700045f62265557)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/filelock.h>
14 #include <linux/writeback.h>
15 #include <linux/compat.h>
16 #include <linux/slab.h>
17 #include <linux/btrfs.h>
18 #include <linux/uio.h>
19 #include <linux/iversion.h>
20 #include <linux/fsverity.h>
21 #include "ctree.h"
22 #include "direct-io.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "btrfs_inode.h"
26 #include "tree-log.h"
27 #include "locking.h"
28 #include "qgroup.h"
29 #include "compression.h"
30 #include "delalloc-space.h"
31 #include "reflink.h"
32 #include "subpage.h"
33 #include "fs.h"
34 #include "accessors.h"
35 #include "extent-tree.h"
36 #include "file-item.h"
37 #include "ioctl.h"
38 #include "file.h"
39 #include "super.h"
40 #include "print-tree.h"
41 
42 /*
43  * Unlock folio after btrfs_file_write() is done with it.
44  */
45 static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
46 			     u64 pos, u64 copied)
47 {
48 	u64 block_start = round_down(pos, fs_info->sectorsize);
49 	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
50 
51 	ASSERT(block_len <= U32_MAX);
52 	folio_unlock(folio);
53 	folio_put(folio);
54 }
55 
56 /*
57  * After copy_folio_from_iter_atomic(), update the following things for delalloc:
58  * - Mark newly dirtied folio as DELALLOC in the io tree.
59  *   Used to advise which range is to be written back.
60  * - Mark modified folio as Uptodate/Dirty
61  * - Update inode size for past EOF write
62  */
63 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
64 		      size_t write_bytes, struct extent_state **cached, bool noreserve)
65 {
66 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
67 	int ret = 0;
68 	u64 num_bytes;
69 	u64 start_pos;
70 	u64 end_of_last_block;
71 	const u64 end_pos = pos + write_bytes;
72 	loff_t isize = i_size_read(&inode->vfs_inode);
73 	unsigned int extra_bits = 0;
74 
75 	if (write_bytes == 0)
76 		return 0;
77 
78 	if (noreserve)
79 		extra_bits |= EXTENT_NORESERVE;
80 
81 	start_pos = round_down(pos, fs_info->sectorsize);
82 	num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize);
83 	ASSERT(num_bytes <= U32_MAX);
84 	ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos);
85 
86 	end_of_last_block = start_pos + num_bytes - 1;
87 
88 	ret = btrfs_reset_extent_delalloc(inode, start_pos, end_of_last_block,
89 					  extra_bits, cached);
90 	if (ret)
91 		return ret;
92 
93 	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
94 	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
95 
96 	/*
97 	 * we've only changed i_size in ram, and we haven't updated
98 	 * the disk i_size.  There is no need to log the inode
99 	 * at this time.
100 	 */
101 	if (end_pos > isize)
102 		i_size_write(&inode->vfs_inode, end_pos);
103 	return 0;
104 }
105 
/*
 * Drop all file extent items of @inode in @root that overlap the range
 * [args->start, args->end).
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.  Inline extents can only be deleted as a whole:
 * if one would have to be truncated or split, -EOPNOTSUPP is returned.
 *
 * Fields of @args that are read here:
 * - start, end:        the range to drop, end is exclusive.
 * - path:              optional path to use; when NULL a temporary one is
 *                      allocated and freed before returning.
 * - drop_cache:        when set, the extent maps of the range are dropped
 *                      first.
 * - replace_extent:    when set (requires ->path), try to reserve room for a
 *                      new item of ->extent_item_size bytes keyed at ->start
 *                      once the old extents are gone.
 *
 * Fields of @args that are written here:
 * - bytes_found:       reset to 0 and then increased by the number of bytes
 *                      dropped from inline extents and from extents with a
 *                      non-zero disk_bytenr; nothing is counted when @root is
 *                      the log tree.
 * - extent_inserted:   true if room for the replacement item was reserved, in
 *                      which case the path is left pointing at it instead of
 *                      being released.
 * - drop_end:          min(args->end, end of the last extent handled) when an
 *                      overlapping extent was found, otherwise args->end.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 *
 * Return 0 on success or a negative errno: -ENOMEM if no path could be
 * allocated, -EOPNOTSUPP for a partial drop of an inline extent, -EINVAL if
 * the pending deletion state is inconsistent, or whatever error one of the
 * tree operations returned.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	/* del_slot/del_nr describe a run of adjacent items pending deletion. */
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	bool found = false;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	/*
	 * The range starts at or beyond the on-disk i_size and no replacement
	 * item is wanted: do the first lookup with modify_tree == 0.  It is
	 * switched back to -1 below as soon as an overlapping extent is found.
	 */
	if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
		modify_tree = 0;

	/* Extent references are left alone when operating on the log tree. */
	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		/*
		 * No exact match on the first search: step back to the previous
		 * item if it is a file extent item of this inode, as it may
		 * cover the start of the range.
		 */
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Pending deletions must not span over to another leaf. */
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			/* We moved to another leaf, force a new search below. */
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		/* Skip items sorting before this inode's file extent items. */
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		/* Extent ends before the part of the range still to process. */
		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = true;
		search_start = max(key.offset, args->start);
		/*
		 * Either we crossed into another leaf or the lookup was done
		 * with modify_tree == 0: search again with modify_tree == -1
		 * before changing anything.
		 */
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end < extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			/*
			 * Split the item at args->start: the original keeps
			 * the head, the duplicate (keyed at args->start) gets
			 * the rest and is trimmed by the next case below.
			 */
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);

			/* Two items now point at the data extent, add a ref. */
			if (update_refs && disk_bytenr > 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, new_key.objectid,
						    args->start - extent_offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			/* Cut the front off: rekey the item to args->end. */
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			/* The extent reaches past the range, nothing left to do. */
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			if (WARN_ON(del_nr > 0)) {
				btrfs_print_leaf(leaf);
				ret = -EINVAL;
				break;
			}
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			/* Cut the tail off: shrink the item to end at args->start. */
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			/*
			 * Queue the item for deletion.  Queued items must be
			 * adjacent so they can be removed with a single
			 * btrfs_del_items() call.
			 */
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				if (WARN_ON(del_slot + del_nr != path->slots[0])) {
					btrfs_print_leaf(leaf);
					ret = -EINVAL;
					break;
				}
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				/* Drop this item's reference on the data extent. */
				struct btrfs_ref ref = {
					.action = BTRFS_DROP_DELAYED_REF,
					.bytenr = disk_bytenr,
					.num_bytes = num_bytes,
					.parent = 0,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key.objectid,
						    key.offset - extent_offset,
						    0, false);
				ret = btrfs_free_extent(trans, &ref);
				if (unlikely(ret)) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			/*
			 * That was the last item of the leaf: flush the queued
			 * deletions now and search again from search_start.
			 */
			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		/* The four cases above cover every possible overlap. */
		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are move off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		/*
		 * The leaf is still write locked and has enough free space:
		 * set up the slot for the replacement item right away.
		 */
		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		/*
		 * Nothing is queued for deletion and the current slot holds a
		 * smaller key: the new item has to go after it.
		 */
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	/*
	 * Free our temporary path.  A caller supplied path is released unless
	 * it now points at the reserved replacement item.
	 */
	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}
492 
493 static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
494 			     u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
495 {
496 	struct btrfs_file_extent_item *fi;
497 	struct btrfs_key key;
498 	u64 extent_end;
499 
500 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
501 		return false;
502 
503 	btrfs_item_key_to_cpu(leaf, &key, slot);
504 	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
505 		return false;
506 
507 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
508 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
509 	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
510 	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
511 	    btrfs_file_extent_compression(leaf, fi) ||
512 	    btrfs_file_extent_encryption(leaf, fi) ||
513 	    btrfs_file_extent_other_encoding(leaf, fi))
514 		return false;
515 
516 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
517 	if ((*start && *start != key.offset) || (*end && *end != extent_end))
518 		return false;
519 
520 	*start = key.offset;
521 	*end = extent_end;
522 	return true;
523 }
524 
/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 *
 * The range [@start, @end) must be fully covered by a single preallocated
 * file extent item of @inode; otherwise the transaction is aborted and
 * -EINVAL is returned.  Where possible the written range is merged with
 * adjacent regular extents that reference the same data extent (see
 * extent_mergeable()), instead of leaving separate items behind.
 *
 * Return 0 on success or a negative errno.  Apart from a failed path
 * allocation (-ENOMEM) and a failed initial search, every error path visible
 * here aborts the transaction before returning.
 *
 * NOTE(review): @path is declared with BTRFS_PATH_AUTO_FREE(), presumably so
 * that it is freed automatically on every return - confirm against the macro.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		return ret;
	/* No exact match: the covering item, if any, is the previous one. */
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	/* Validate that the item found is a prealloc extent covering the range. */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (unlikely(key.offset > start || extent_end < end)) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	/* File offset that corresponds to offset 0 of the data extent. */
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	/*
	 * The range is the head of the prealloc extent and the previous item
	 * is a mergeable regular extent ending at @start: move the boundary
	 * between the two items to @end instead of splitting.
	 */
	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			goto mark_dirty;
		}
	}

	/*
	 * The range is the tail of the prealloc extent and the next item is a
	 * mergeable regular extent starting at @end: move the boundary between
	 * the two items to @start instead of splitting.
	 */
	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			goto mark_dirty;
		}
	}

	/*
	 * Split the prealloc item until the current item covers exactly
	 * [start, end): first at @start (if the item begins earlier), then at
	 * @end (if it extends further).  Each split duplicates the item and
	 * adds a reference on the data extent for the new item.
	 */
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(path);
			goto again;
		}
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		/* Previous slot: the part before @split. */
		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		/* Current slot: the part from @split onwards. */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_root_id(root);
		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}

		if (split == start) {
			/* Keep working on the second half, it starts at @start. */
			key.offset = start;
		} else {
			if (unlikely(start != key.offset)) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			/* Split at @end: go back to the first half. */
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

	other_start = end;
	other_end = 0;

	/*
	 * Prepare a reference drop: each neighbour merged below removes one
	 * item pointing at the data extent.
	 */
	ref.action = BTRFS_DROP_DELAYED_REF;
	ref.bytenr = bytenr;
	ref.num_bytes = num_bytes;
	ref.parent = 0;
	ref.owning_root = btrfs_root_id(root);
	ref.ref_root = btrfs_root_id(root);
	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
	/* Can the next item be merged into the written range? */
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		/*
		 * A split happened in this pass: release the path and start
		 * over from the search before merging.
		 */
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	other_start = 0;
	other_end = start;
	/* Can the previous item be merged with the written range? */
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	if (del_nr == 0) {
		/* No merge: simply flip the item's type to regular. */
		fi = btrfs_item_ptr(leaf, path->slots[0],
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	} else {
		/*
		 * The item right before the deleted ones survives: make it a
		 * regular extent spanning the whole merged range and delete
		 * the now redundant items.
		 */
		fi = btrfs_item_ptr(leaf, del_slot - 1,
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

mark_dirty:
	/* Record [start, end) in the inode's file extent range tracking. */
	ret = btrfs_inode_set_file_extent_range(inode, start, end - start);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}
781 
782 /*
783  * On error return an unlocked folio and the error value
784  * On success return a locked folio and 0
785  */
786 static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
787 				  u64 len)
788 {
789 	u64 clamp_start = max_t(u64, pos, folio_pos(folio));
790 	u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
791 	const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
792 	int ret = 0;
793 
794 	if (folio_test_uptodate(folio))
795 		return 0;
796 
797 	if (IS_ALIGNED(clamp_start, blocksize) &&
798 	    IS_ALIGNED(clamp_end, blocksize))
799 		return 0;
800 
801 	ret = btrfs_read_folio(NULL, folio);
802 	if (ret)
803 		return ret;
804 	folio_lock(folio);
805 	if (unlikely(!folio_test_uptodate(folio))) {
806 		folio_unlock(folio);
807 		return -EIO;
808 	}
809 
810 	/*
811 	 * Since btrfs_read_folio() will unlock the folio before it returns,
812 	 * there is a window where btrfs_release_folio() can be called to
813 	 * release the page.  Here we check both inode mapping and page
814 	 * private to make sure the page was not released.
815 	 *
816 	 * The private flag check is essential for subpage as we need to store
817 	 * extra bitmap using folio private.
818 	 */
819 	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
820 		folio_unlock(folio);
821 		return -EAGAIN;
822 	}
823 	return 0;
824 }
825 
826 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
827 {
828 	gfp_t gfp;
829 
830 	gfp = btrfs_alloc_write_mask(inode->i_mapping);
831 	if (nowait) {
832 		gfp &= ~__GFP_DIRECT_RECLAIM;
833 		gfp |= GFP_NOWAIT;
834 	}
835 
836 	return gfp;
837 }
838 
839 /*
840  * Get folio into the page cache and lock it.
841  */
842 static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
843 				      loff_t pos, size_t write_bytes,
844 				      bool nowait)
845 {
846 	const pgoff_t index = pos >> PAGE_SHIFT;
847 	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
848 	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
849 			  fgf_set_order(write_bytes);
850 	struct folio *folio;
851 	int ret;
852 
853 again:
854 	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
855 	if (IS_ERR(folio))
856 		return PTR_ERR(folio);
857 
858 	ret = set_folio_extent_mapped(folio);
859 	if (ret < 0) {
860 		folio_unlock(folio);
861 		folio_put(folio);
862 		return ret;
863 	}
864 	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
865 	if (ret) {
866 		/* The folio is already unlocked. */
867 		folio_put(folio);
868 		if (!nowait && ret == -EAGAIN)
869 			goto again;
870 		return ret;
871 	}
872 	*folio_ret = folio;
873 	return 0;
874 }
875 
876 /*
877  * Locks the extent and properly waits for data=ordered extents to finish
878  * before allowing the folios to be modified if need.
879  *
880  * Return:
881  * 1 - the extent is locked
882  * 0 - the extent is not locked, and everything is OK
883  * -EAGAIN - need to prepare the folios again
884  */
885 static noinline int
886 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
887 				loff_t pos, size_t write_bytes,
888 				u64 *lockstart, u64 *lockend, bool nowait,
889 				struct extent_state **cached_state)
890 {
891 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
892 	u64 start_pos;
893 	u64 last_pos;
894 	int ret = 0;
895 
896 	start_pos = round_down(pos, fs_info->sectorsize);
897 	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
898 
899 	if (start_pos < inode->vfs_inode.i_size) {
900 		struct btrfs_ordered_extent *ordered;
901 
902 		if (nowait) {
903 			if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
904 						   last_pos, cached_state)) {
905 				folio_unlock(folio);
906 				folio_put(folio);
907 				return -EAGAIN;
908 			}
909 		} else {
910 			btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
911 					  cached_state);
912 		}
913 
914 		ordered = btrfs_lookup_ordered_range(inode, start_pos,
915 						     last_pos - start_pos + 1);
916 		if (ordered &&
917 		    ordered->file_offset + ordered->num_bytes > start_pos &&
918 		    ordered->file_offset <= last_pos) {
919 			btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
920 					    cached_state);
921 			folio_unlock(folio);
922 			folio_put(folio);
923 			btrfs_start_ordered_extent(ordered);
924 			btrfs_put_ordered_extent(ordered);
925 			return -EAGAIN;
926 		}
927 		if (ordered)
928 			btrfs_put_ordered_extent(ordered);
929 
930 		*lockstart = start_pos;
931 		*lockend = last_pos;
932 		ret = 1;
933 	}
934 
935 	/*
936 	 * We should be called after prepare_one_folio() which should have locked
937 	 * all pages in the range.
938 	 */
939 	WARN_ON(!folio_test_locked(folio));
940 
941 	return ret;
942 }
943 
944 /*
945  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
946  *
947  * @pos:         File offset.
948  * @write_bytes: The length to write, will be updated to the nocow writeable
949  *               range.
950  * @nowait:      Indicate if we can block or not (non-blocking IO context).
951  *
952  * This function will flush ordered extents in the range to ensure proper
953  * nocow checks.
954  *
955  * Return:
956  * > 0          If we can nocow, and updates @write_bytes.
957  *  0           If we can't do a nocow write.
958  * -EAGAIN      If we can't do a nocow write because snapshotting of the inode's
959  *              root is in progress or because we are in a non-blocking IO
960  *              context and need to block (@nowait is true).
961  * < 0          If an error happened.
962  *
963  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
964  */
965 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
966 			   size_t *write_bytes, bool nowait)
967 {
968 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
969 	struct btrfs_root *root = inode->root;
970 	struct extent_state *cached_state = NULL;
971 	u64 lockstart, lockend;
972 	u64 cur_offset;
973 	int ret = 0;
974 
975 	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
976 		return 0;
977 
978 	if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
979 		return -EAGAIN;
980 
981 	lockstart = round_down(pos, fs_info->sectorsize);
982 	lockend = round_up(pos + *write_bytes,
983 			   fs_info->sectorsize) - 1;
984 
985 	if (nowait) {
986 		if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
987 						  &cached_state)) {
988 			btrfs_drew_write_unlock(&root->snapshot_lock);
989 			return -EAGAIN;
990 		}
991 	} else {
992 		btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
993 						   &cached_state);
994 	}
995 
996 	cur_offset = lockstart;
997 	while (cur_offset < lockend) {
998 		u64 num_bytes = lockend - cur_offset + 1;
999 
1000 		ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
1001 		if (ret <= 0) {
1002 			/*
1003 			 * If cur_offset == lockstart it means we haven't found
1004 			 * any extent against which we can NOCOW, so unlock the
1005 			 * snapshot lock.
1006 			 */
1007 			if (cur_offset == lockstart)
1008 				btrfs_drew_write_unlock(&root->snapshot_lock);
1009 			break;
1010 		}
1011 		cur_offset += num_bytes;
1012 	}
1013 
1014 	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1015 
1016 	/*
1017 	 * cur_offset > lockstart means there's at least a partial range we can
1018 	 * NOCOW, and that range can cover one or more extents.
1019 	 */
1020 	if (cur_offset > lockstart) {
1021 		*write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
1022 		return 1;
1023 	}
1024 
1025 	return ret;
1026 }
1027 
1028 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1029 {
1030 	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1031 }
1032 
1033 int btrfs_write_check(struct kiocb *iocb, size_t count)
1034 {
1035 	struct file *file = iocb->ki_filp;
1036 	struct inode *inode = file_inode(file);
1037 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1038 	loff_t pos = iocb->ki_pos;
1039 	int ret;
1040 	loff_t oldsize;
1041 
1042 	/*
1043 	 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1044 	 * prealloc flags, as without those flags we always have to COW. We will
1045 	 * later check if we can really COW into the target range (using
1046 	 * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1047 	 */
1048 	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1049 	    !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1050 		return -EAGAIN;
1051 
1052 	ret = file_remove_privs(file);
1053 	if (ret)
1054 		return ret;
1055 
1056 	/*
1057 	 * We reserve space for updating the inode when we reserve space for the
1058 	 * extent we are going to write, so we will enospc out there.  We don't
1059 	 * need to start yet another transaction to update the inode as we will
1060 	 * update the inode when we finish writing whatever data we write.
1061 	 */
1062 	if (!IS_NOCMTIME(inode)) {
1063 		inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1064 		inode_inc_iversion(inode);
1065 	}
1066 
1067 	oldsize = i_size_read(inode);
1068 	if (pos > oldsize) {
1069 		/* Expand hole size to cover write data, preventing empty gap */
1070 		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1071 
1072 		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1073 		if (ret)
1074 			return ret;
1075 	}
1076 
1077 	return 0;
1078 }
1079 
1080 static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
1081 			  u64 start, u64 len, bool only_release_metadata)
1082 {
1083 	if (len == 0)
1084 		return;
1085 
1086 	if (only_release_metadata) {
1087 		btrfs_check_nocow_unlock(inode);
1088 		btrfs_delalloc_release_metadata(inode, len, true);
1089 	} else {
1090 		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1091 
1092 		btrfs_delalloc_release_space(inode, data_reserved,
1093 					     round_down(start, fs_info->sectorsize),
1094 					     len, true);
1095 	}
1096 }
1097 
1098 /*
1099  * Reserve data and metadata space for this buffered write range.
1100  *
1101  * Return >0 for the number of bytes reserved, which is always block aligned.
1102  * Return <0 for error.
1103  */
1104 static ssize_t reserve_space(struct btrfs_inode *inode,
1105 			     struct extent_changeset **data_reserved,
1106 			     u64 start, size_t *len, bool nowait,
1107 			     bool *only_release_metadata)
1108 {
1109 	const struct btrfs_fs_info *fs_info = inode->root->fs_info;
1110 	const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
1111 	size_t reserve_bytes;
1112 	int ret;
1113 
1114 	ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
1115 	if (ret < 0) {
1116 		int can_nocow;
1117 
1118 		if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
1119 			return -EAGAIN;
1120 
1121 		/*
1122 		 * If we don't have to COW at the offset, reserve metadata only.
1123 		 * write_bytes may get smaller than requested here.
1124 		 */
1125 		can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
1126 		if (can_nocow < 0)
1127 			ret = can_nocow;
1128 		if (can_nocow > 0)
1129 			ret = 0;
1130 		if (ret)
1131 			return ret;
1132 		*only_release_metadata = true;
1133 	}
1134 
1135 	reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
1136 	WARN_ON(reserve_bytes == 0);
1137 	ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
1138 					      reserve_bytes, nowait);
1139 	if (ret) {
1140 		if (!*only_release_metadata)
1141 			btrfs_free_reserved_data_space(inode, *data_reserved,
1142 						       start, *len);
1143 		else
1144 			btrfs_check_nocow_unlock(inode);
1145 
1146 		if (nowait && ret == -ENOSPC)
1147 			ret = -EAGAIN;
1148 		return ret;
1149 	}
1150 	return reserve_bytes;
1151 }
1152 
1153 /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
1154 static void shrink_reserved_space(struct btrfs_inode *inode,
1155 				  struct extent_changeset *data_reserved,
1156 				  u64 reserved_start, u64 reserved_len,
1157 				  u64 new_len, bool only_release_metadata)
1158 {
1159 	const u64 diff = reserved_len - new_len;
1160 
1161 	ASSERT(new_len <= reserved_len);
1162 	btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
1163 	if (only_release_metadata)
1164 		btrfs_delalloc_release_metadata(inode, diff, true);
1165 	else
1166 		btrfs_delalloc_release_space(inode, data_reserved,
1167 					     reserved_start + new_len, diff, true);
1168 }
1169 
1170 /* Calculate the maximum amount of bytes we can write into one folio. */
1171 static size_t calc_write_bytes(const struct btrfs_inode *inode,
1172 			       const struct iov_iter *iter, u64 start)
1173 {
1174 	const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
1175 
1176 	return min(max_folio_size - (start & (max_folio_size - 1)),
1177 		   iov_iter_count(iter));
1178 }
1179 
/*
 * Do the heavy-lifting work to copy one range into one folio of the page cache.
 *
 * @inode:         inode being written to
 * @iter:          source of the data, advanced by the number of bytes copied
 * @data_reserved: changeset used for the data space reservation, it is
 *                 released (emptied) at the start of every call
 * @start:         file offset the copy starts at
 * @nowait:        passed down to the reservation, folio preparation and
 *                 extent locking steps, and selects BDP_ASYNC for dirty page
 *                 balancing
 *
 * Every exit path releases the space reservation that is not needed for the
 * bytes actually copied.
 *
 * Return > 0 in case we copied all bytes or just some of them.
 * Return 0 if no bytes were copied, in which case the caller should retry.
 * Return <0 on error.
 */
static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
			  struct extent_changeset **data_reserved, u64 start,
			  bool nowait)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	size_t write_bytes = calc_write_bytes(inode, iter, start);
	size_t copied;
	const u64 reserved_start = round_down(start, fs_info->sectorsize);
	u64 reserved_len;
	struct folio *folio = NULL;
	int extents_locked;
	u64 lockstart;
	u64 lockend;
	/* Set by reserve_space() when only metadata was reserved (NOCOW write). */
	bool only_release_metadata = false;
	const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
	int ret;

	/*
	 * Fault all pages before locking them in prepare_one_folio() to avoid
	 * recursive lock.
	 */
	if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
		return -EFAULT;
	extent_changeset_release(*data_reserved);
	/* Note that reserve_space() may reduce write_bytes. */
	ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
			    &only_release_metadata);
	if (ret < 0)
		return ret;
	reserved_len = ret;
	/* Write range must be inside the reserved range. */
	ASSERT(reserved_start <= start, "reserved_start=%llu start=%llu",
	       reserved_start, start);
	ASSERT(start + write_bytes <= reserved_start + reserved_len,
	       "start=%llu write_bytes=%zu reserved_start=%llu reserved_len=%llu",
	       start, write_bytes, reserved_start, reserved_len);

again:
	ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
						    bdp_flags);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
	if (ret) {
		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}

	/*
	 * The reserved range goes beyond the current folio, shrink the reserved
	 * space to the folio boundary.
	 */
	if (reserved_start + reserved_len > folio_next_pos(folio)) {
		const u64 last_block = folio_next_pos(folio);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		write_bytes = last_block - start;
		reserved_len = last_block - reserved_start;
	}

	extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
							 write_bytes, &lockstart,
							 &lockend, nowait,
							 &cached_state);
	if (extents_locked < 0) {
		/*
		 * A blocking write simply retries on -EAGAIN.
		 * NOTE(review): no folio unlock/put is done here, presumably
		 * lock_and_cleanup_extent_if_need() already released the folio
		 * on failure - confirm against its implementation.
		 */
		if (!nowait && extents_locked == -EAGAIN)
			goto again;

		btrfs_delalloc_release_extents(inode, reserved_len);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return extents_locked;
	}

	copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
					     write_bytes, iter);
	flush_dcache_folio(folio);

	if (unlikely(copied < write_bytes)) {
		u64 last_block;

		/*
		 * The original write range doesn't need an uptodate folio as
		 * the range is block aligned. But now a short copy happened.
		 * We cannot handle it without an uptodate folio.
		 *
		 * So just revert the range and we will retry.
		 */
		if (!folio_test_uptodate(folio)) {
			iov_iter_revert(iter, copied);
			copied = 0;
		}

		/* No copied bytes, unlock, release reserved space and exit. */
		if (copied == 0) {
			if (extents_locked)
				btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
						    &cached_state);
			else
				btrfs_free_extent_state(cached_state);
			btrfs_delalloc_release_extents(inode, reserved_len);
			release_space(inode, *data_reserved, reserved_start, reserved_len,
				      only_release_metadata);
			btrfs_drop_folio(fs_info, folio, start, copied);
			return 0;
		}

		/* Release the reserved space beyond the last block. */
		last_block = round_up(start + copied, fs_info->sectorsize);

		shrink_reserved_space(inode, *data_reserved, reserved_start,
				      reserved_len, last_block - reserved_start,
				      only_release_metadata);
		reserved_len = last_block - reserved_start;
	}

	ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
				only_release_metadata);
	/*
	 * If we have not locked the extent range, because the range's start
	 * offset is >= i_size, we might still have a non-NULL cached extent
	 * state, acquired while marking the extent range as delalloc through
	 * btrfs_dirty_folio(). Therefore free any possible cached extent state
	 * to avoid a memory leak.
	 */
	if (extents_locked)
		btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	else
		btrfs_free_extent_state(cached_state);

	btrfs_delalloc_release_extents(inode, reserved_len);
	if (ret) {
		btrfs_drop_folio(fs_info, folio, start, copied);
		release_space(inode, *data_reserved, reserved_start, reserved_len,
			      only_release_metadata);
		return ret;
	}
	/*
	 * Success: the reservation stays in place for the dirtied range, only
	 * the NOCOW lock taken through reserve_space() has to be dropped.
	 */
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);

	btrfs_drop_folio(fs_info, folio, start, copied);
	return copied;
}
1339 
1340 ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1341 {
1342 	struct file *file = iocb->ki_filp;
1343 	loff_t pos;
1344 	struct inode *inode = file_inode(file);
1345 	struct extent_changeset *data_reserved = NULL;
1346 	size_t num_written = 0;
1347 	ssize_t ret;
1348 	loff_t old_isize;
1349 	unsigned int ilock_flags = 0;
1350 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1351 
1352 	if (nowait)
1353 		ilock_flags |= BTRFS_ILOCK_TRY;
1354 
1355 	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1356 	if (ret < 0)
1357 		return ret;
1358 
1359 	/*
1360 	 * We can only trust the isize with inode lock held, or it can race with
1361 	 * other buffered writes and cause incorrect call of
1362 	 * pagecache_isize_extended() to overwrite existing data.
1363 	 */
1364 	old_isize = i_size_read(inode);
1365 
1366 	ret = generic_write_checks(iocb, iter);
1367 	if (ret <= 0)
1368 		goto out;
1369 
1370 	ret = btrfs_write_check(iocb, ret);
1371 	if (ret < 0)
1372 		goto out;
1373 
1374 	pos = iocb->ki_pos;
1375 	while (iov_iter_count(iter) > 0) {
1376 		ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
1377 		if (ret < 0)
1378 			break;
1379 		pos += ret;
1380 		num_written += ret;
1381 		cond_resched();
1382 	}
1383 
1384 	extent_changeset_free(data_reserved);
1385 	if (num_written > 0) {
1386 		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1387 		iocb->ki_pos += num_written;
1388 	}
1389 out:
1390 	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1391 	return num_written ? num_written : ret;
1392 }
1393 
1394 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1395 			const struct btrfs_ioctl_encoded_io_args *encoded)
1396 {
1397 	struct file *file = iocb->ki_filp;
1398 	struct inode *inode = file_inode(file);
1399 	loff_t count;
1400 	ssize_t ret;
1401 
1402 	btrfs_inode_lock(BTRFS_I(inode), 0);
1403 	count = encoded->len;
1404 	ret = generic_write_checks_count(iocb, &count);
1405 	if (ret == 0 && count != encoded->len) {
1406 		/*
1407 		 * The write got truncated by generic_write_checks_count(). We
1408 		 * can't do a partial encoded write.
1409 		 */
1410 		ret = -EFBIG;
1411 	}
1412 	if (ret || encoded->len == 0)
1413 		goto out;
1414 
1415 	ret = btrfs_write_check(iocb, encoded->len);
1416 	if (ret < 0)
1417 		goto out;
1418 
1419 	ret = btrfs_do_encoded_write(iocb, from, encoded);
1420 out:
1421 	btrfs_inode_unlock(BTRFS_I(inode), 0);
1422 	return ret;
1423 }
1424 
1425 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1426 			    const struct btrfs_ioctl_encoded_io_args *encoded)
1427 {
1428 	struct file *file = iocb->ki_filp;
1429 	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1430 	ssize_t num_written, num_sync;
1431 
1432 	if (btrfs_is_shutdown(inode->root->fs_info))
1433 		return -EIO;
1434 	/*
1435 	 * If the fs flips readonly due to some impossible error, although we
1436 	 * have opened a file as writable, we have to stop this write operation
1437 	 * to ensure consistency.
1438 	 */
1439 	if (unlikely(BTRFS_FS_ERROR(inode->root->fs_info)))
1440 		return -EROFS;
1441 
1442 	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1443 		return -EOPNOTSUPP;
1444 
1445 	if (encoded) {
1446 		num_written = btrfs_encoded_write(iocb, from, encoded);
1447 		num_sync = encoded->len;
1448 	} else if (iocb->ki_flags & IOCB_DIRECT) {
1449 		num_written = btrfs_direct_write(iocb, from);
1450 		num_sync = num_written;
1451 	} else {
1452 		num_written = btrfs_buffered_write(iocb, from);
1453 		num_sync = num_written;
1454 	}
1455 
1456 	btrfs_set_inode_last_sub_trans(inode);
1457 
1458 	if (num_sync > 0) {
1459 		num_sync = generic_write_sync(iocb, num_sync);
1460 		if (num_sync < 0)
1461 			num_written = num_sync;
1462 	}
1463 
1464 	return num_written;
1465 }
1466 
1467 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1468 {
1469 	return btrfs_do_write_iter(iocb, from, NULL);
1470 }
1471 
1472 int btrfs_release_file(struct inode *inode, struct file *filp)
1473 {
1474 	struct btrfs_file_private *private = filp->private_data;
1475 
1476 	if (private) {
1477 		kfree(private->filldir_buf);
1478 		btrfs_free_extent_state(private->llseek_cached_state);
1479 		kfree(private);
1480 		filp->private_data = NULL;
1481 	}
1482 
1483 	/*
1484 	 * Set by setattr when we are about to truncate a file from a non-zero
1485 	 * size to a zero size.  This tries to flush down new bytes that may
1486 	 * have been written if the application were using truncate to replace
1487 	 * a file in place.
1488 	 */
1489 	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1490 			       &BTRFS_I(inode)->runtime_flags))
1491 			filemap_flush(inode->i_mapping);
1492 	return 0;
1493 }
1494 
1495 static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1496 {
1497 	int ret;
1498 	struct blk_plug plug;
1499 
1500 	/*
1501 	 * This is only called in fsync, which would do synchronous writes, so
1502 	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1503 	 * multiple disks using raid profile, a large IO can be split to
1504 	 * several segments of stripe length (currently 64K).
1505 	 */
1506 	blk_start_plug(&plug);
1507 	ret = btrfs_fdatawrite_range(inode, start, end);
1508 	blk_finish_plug(&plug);
1509 
1510 	return ret;
1511 }
1512 
1513 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1514 {
1515 	struct btrfs_inode *inode = ctx->inode;
1516 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1517 
1518 	if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1519 	    list_empty(&ctx->ordered_extents))
1520 		return true;
1521 
1522 	/*
1523 	 * If we are doing a fast fsync we can not bail out if the inode's
1524 	 * last_trans is <= then the last committed transaction, because we only
1525 	 * update the last_trans of the inode during ordered extent completion,
1526 	 * and for a fast fsync we don't wait for that, we only wait for the
1527 	 * writeback to complete.
1528 	 */
1529 	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1530 	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1531 	     list_empty(&ctx->ordered_extents)))
1532 		return true;
1533 
1534 	return false;
1535 }
1536 
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to wait for writeback so that all ordered extent updates in the
 * metadata btree are up to date for copying to the log.
 *
 * It drops the inode lock before doing the tree log commit.  This is an
 * important optimization for directories because holding the lock prevents
 * new operations on the dir while we write to disk.
 *
 * @file:     file being synced
 * @start:    ignored, the whole file range is always synced (see below)
 * @end:      ignored, the whole file range is always synced (see below)
 * @datasync: only passed to the entry tracepoint
 *
 * Return 0 on success or a negative errno. A writeback error reported by
 * file_check_and_advance_wb_err() is returned if nothing else failed.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;
	bool full_sync;
	bool skip_ilock = false;

	/*
	 * The stub marks a call made with the inode lock already held (this is
	 * asserted below), so only the mmap lock is taken and released by this
	 * function. The stub is cleared so it is not seen again.
	 */
	if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
		skip_ilock = true;
		current->journal_info = NULL;
		btrfs_assert_inode_locked(inode);
	}

	trace_btrfs_sync_file_enter(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * Always set the range to a full range, otherwise we can get into
	 * several problems, from missing file extent items to represent holes
	 * when not using the NO_HOLES feature, to log tree corruption due to
	 * races between hole detection during logging and completion of ordered
	 * extents outside the range, to missing checksums due to ordered extents
	 * for which we flushed only a subset of their pages.
	 */
	start = 0;
	end = LLONG_MAX;
	len = (u64)LLONG_MAX + 1;

	/*
	 * We start writing the dirty pages in the range before taking the
	 * inode's lock, so that multiple tasks can flush dirty pages
	 * concurrently, which improves performance.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	if (skip_ilock)
		down_write(&inode->i_mmap_lock);
	else
		btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock and the mmap lock, someone may
	 * have dirtied more pages in the target range. We need to make sure
	 * that writeback for any such pages does not start while we are logging
	 * the inode, because if it does, any of the following might happen when
	 * we are not doing a full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		if (skip_ilock)
			up_write(&inode->i_mmap_lock);
		else
			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
		goto out;
	}

	/*
	 * Always check for the full sync flag while holding the inode's lock,
	 * to avoid races with other tasks. The flag must be either set all the
	 * time during logging or always off all the time while logging.
	 * We check the flag here after starting delalloc above, because when
	 * running delalloc the full sync flag may be set if we need to drop
	 * extra extent map ranges due to temporary memory allocation failures.
	 */
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 *
	 * For a full fsync we wait for the ordered extents to complete while
	 * for a fast fsync we wait just for writeback to complete, and then
	 * attach the ordered extents to the transaction so that a transaction
	 * commit waits for their completion, to avoid data loss if we fsync,
	 * the current transaction commits before the ordered extents complete
	 * and a power failure happens right after that.
	 *
	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
	 * logical address recorded in the ordered extent may change. We need
	 * to wait for the IO to stabilize the logical address.
	 */
	if (full_sync || btrfs_is_zoned(fs_info)) {
		ret = btrfs_wait_ordered_range(inode, start, len);
		clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
	} else {
		/*
		 * Get our ordered extents as soon as possible to avoid doing
		 * checksum lookups in the csum tree, and use instead the
		 * checksums attached to the ordered extents.
		 */
		btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
		ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
		if (ret)
			goto out_release_extents;

		/*
		 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
		 * starting and waiting for writeback, because for buffered IO
		 * it may have been set during the end IO callback
		 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
		 * case an error happened and we need to wait for ordered
		 * extents to complete so that any extent maps that point to
		 * unwritten locations are dropped and we don't log them.
		 */
		if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
			ret = btrfs_wait_ordered_range(inode, start, len);
	}

	if (ret)
		goto out_release_extents;

	atomic_inc(&root->log_batch);

	if (skip_inode_logging(&ctx)) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
		goto out_release_extents;
	}

	btrfs_init_log_ctx_scratch_eb(&ctx);

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path).  If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit.  With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners.  This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release_extents;
	}
	trans->in_fsync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
	/*
	 * Scratch eb no longer needed, release before syncing log or commit
	 * transaction, to avoid holding unnecessary memory during such long
	 * operations.
	 */
	if (ctx.scratch_eb) {
		free_extent_buffer(ctx.scratch_eb);
		ctx.scratch_eb = NULL;
	}
	btrfs_release_log_ctx_extents(&ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = BTRFS_LOG_FORCE_COMMIT;
	}

	/*
	 * We've logged all the items and now have a consistent version of the
	 * file in the log.  It is possible that someone will come in and
	 * modify the file, but that's fine because the log is consistent on
	 * disk, and we have references to all of the file's extents.
	 *
	 * It is possible that someone will come in and log the file again, but
	 * that will end up using the synchronization inside btrfs_sync_log to
	 * keep things safe.
	 */
	if (skip_ilock)
		up_write(&inode->i_mmap_lock);
	else
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

	/* Nothing needs to be synced, just end the transaction. */
	if (ret == BTRFS_NO_LOG_SYNC) {
		ret = btrfs_end_transaction(trans);
		goto out;
	}

	/* We successfully logged the inode, attempt to sync the log. */
	if (!ret) {
		ret = btrfs_sync_log(trans, root, &ctx);
		if (!ret) {
			ret = btrfs_end_transaction(trans);
			goto out;
		}
	}

	/*
	 * At this point we need to commit the transaction because we had
	 * btrfs_need_log_full_commit() or some other error.
	 *
	 * If we didn't do a full sync we have to stop the trans handle, wait on
	 * the ordered extents, start it again and commit the transaction.  If
	 * we attempt to wait on the ordered extents here we could deadlock with
	 * something like fallocate() that is holding the extent lock trying to
	 * start a transaction while some other thread is trying to commit the
	 * transaction while we (fsync) are currently holding the transaction
	 * open.
	 */
	if (!full_sync) {
		ret = btrfs_end_transaction(trans);
		if (ret)
			goto out;
		ret = btrfs_wait_ordered_range(inode, start, len);
		if (ret)
			goto out;

		/*
		 * This is safe to use here because we're only interested in
		 * making sure the transaction that had the ordered extents is
		 * committed.  We aren't waiting on anything past this point,
		 * we're purely getting the transaction and committing it.
		 */
		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);

			/*
			 * We committed the transaction and there's no currently
			 * running transaction, this means everything we care
			 * about made it to disk and we are done.
			 */
			if (ret == -ENOENT)
				ret = 0;
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	/* Common exit, no inode or mmap lock is held when we get here. */
	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.list));
	ASSERT(list_empty(&ctx.conflict_inodes));
	ASSERT(ret <= 0, "ret=%d", ret);
	/*
	 * Ordered extents might have started and completed before this fsync,
	 * so check for any io errors and advance the writeback error sequence.
	 */
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
	trace_btrfs_sync_file_exit(file, ret);

	return ret;

out_release_extents:
	/* Exit taken while the lock is still held: drop extents and unlock. */
	btrfs_release_log_ctx_extents(&ctx);
	if (skip_ilock)
		up_write(&inode->i_mmap_lock);
	else
		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
	goto out;
}
1825 
1826 /*
1827  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1828  * called from a page fault handler when a page is first dirtied. Hence we must
1829  * be careful to check for EOF conditions here. We set the page up correctly
1830  * for a written page which means we get ENOSPC checking when writing into
1831  * holes and correct delalloc and unwritten extent mapping on filesystems that
1832  * support these features.
1833  *
1834  * We are not allowed to take the i_mutex here so we have to play games to
1835  * protect against truncate races as the page could now be beyond EOF.  Because
1836  * truncate_setsize() writes the inode size before removing pages, once we have
1837  * the page lock we can determine safely if the page is beyond EOF. If it is not
1838  * beyond EOF, then the page is guaranteed safe against truncation until we
1839  * unlock the page.
1840  */
1841 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1842 {
1843 	struct page *page = vmf->page;
1844 	struct folio *folio = page_folio(page);
1845 	struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
1846 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1847 	struct extent_io_tree *io_tree = &inode->io_tree;
1848 	struct btrfs_ordered_extent *ordered;
1849 	struct extent_state *cached_state = NULL;
1850 	struct extent_changeset *data_reserved = NULL;
1851 	unsigned long zero_start;
1852 	loff_t size;
1853 	size_t fsize = folio_size(folio);
1854 	int ret;
	/* Set once the NOCOW fallback is used: only metadata is reserved then. */
1855 	bool only_release_metadata = false;
1856 	u64 reserved_space;
1857 	u64 page_start;
1858 	u64 page_end;
	/* Inclusive end of the range marked delalloc/dirty; may shrink to EOF. */
1859 	u64 end;
1860 
	/* Reserve for the whole folio first; trimmed below if it straddles EOF. */
1861 	reserved_space = fsize;
1862 
1863 	sb_start_pagefault(inode->vfs_inode.i_sb);
1864 	page_start = folio_pos(folio);
1865 	page_end = page_start + folio_size(folio) - 1;
1866 	end = page_end;
1867 
1868 	/*
1869 	 * Reserving delalloc space after obtaining the page lock can lead to
1870 	 * deadlock. For example, if a dirty page is locked by this function
1871 	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1872 	 * dirty page write out, then the btrfs_writepages() function could
1873 	 * end up waiting indefinitely to get a lock on the page currently
1874 	 * being processed by btrfs_page_mkwrite() function.
1875 	 */
1876 	ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
1877 					  reserved_space, false);
1878 	if (ret < 0) {
1879 		size_t write_bytes = reserved_space;
1880 
		/*
		 * No data space: try the NOCOW fallback. A result <= 0 means
		 * it is not usable, so fail with the reservation error in ret.
		 */
1881 		if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
1882 			goto out_noreserve;
1883 
1884 		only_release_metadata = true;
1885 
1886 		/*
1887 		 * Can't write the whole range, there may be shared extents or
1888 		 * holes in the range, bail out with @only_release_metadata set
1889 		 * to true so that we unlock the nocow lock before returning the
1890 		 * error.
1891 		 */
1892 		if (write_bytes < reserved_space)
1893 			goto out_noreserve;
1894 	}
1895 	ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
1896 					      reserved_space, false);
1897 	if (ret < 0) {
		/* Undo the data reservation, if one was actually taken. */
1898 		if (!only_release_metadata)
1899 			btrfs_free_reserved_data_space(inode, data_reserved,
1900 						       page_start, reserved_space);
1901 		goto out_noreserve;
1902 	}
1903 
	/* From here on both reservations are held; "out" releases them. */
1904 	ret = file_update_time(vmf->vma->vm_file);
1905 	if (ret < 0)
1906 		goto out;
	/* Retry point after waiting for an ordered extent in the range. */
1907 again:
1908 	down_read(&inode->i_mmap_lock);
1909 	folio_lock(folio);
1910 	size = i_size_read(&inode->vfs_inode);
1911 
1912 	if ((folio->mapping != inode->vfs_inode.i_mapping) ||
1913 	    (page_start >= size)) {
1914 		/* Page got truncated out from underneath us. */
		/* ret is not negative here, so VM_FAULT_NOPAGE is returned. */
1915 		goto out_unlock;
1916 	}
1917 	folio_wait_writeback(folio);
1918 
1919 	btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
1920 	ret = set_folio_extent_mapped(folio);
1921 	if (ret < 0) {
1922 		btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1923 		goto out_unlock;
1924 	}
1925 
1926 	/*
1927 	 * We can't set the delalloc bits if there are pending ordered
1928 	 * extents.  Drop our locks and wait for them to finish.
1929 	 */
1930 	ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
1931 	if (ordered) {
		/* Locks are dropped in the reverse order they were taken. */
1932 		btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1933 		folio_unlock(folio);
1934 		up_read(&inode->i_mmap_lock);
1935 		btrfs_start_ordered_extent(ordered);
1936 		btrfs_put_ordered_extent(ordered);
1937 		goto again;
1938 	}
1939 
	/*
	 * The folio holds the last byte of the file: only the blocks up to
	 * the block-aligned i_size are needed, so give back the reservation
	 * for the remainder and shrink @end accordingly.
	 */
1940 	if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
1941 		reserved_space = round_up(size - page_start, fs_info->sectorsize);
1942 		if (reserved_space < fsize) {
1943 			const u64 to_free = fsize - reserved_space;
1944 
1945 			end = page_start + reserved_space - 1;
1946 			if (only_release_metadata)
1947 				btrfs_delalloc_release_metadata(inode, to_free, true);
1948 			else
1949 				btrfs_delalloc_release_space(inode, data_reserved,
1950 							     end + 1, to_free, true);
1951 		}
1952 	}
1953 
1954 	ret = btrfs_reset_extent_delalloc(inode, page_start, end, 0, &cached_state);
1955 	if (ret < 0) {
1956 		btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1957 		goto out_unlock;
1958 	}
1959 
1960 	/* Page is wholly or partially inside EOF. */
1961 	if (page_start + folio_size(folio) > size)
1962 		zero_start = offset_in_folio(folio, size);
1963 	else
1964 		zero_start = fsize;
1965 
	/* Zero the part of the folio that lies beyond i_size. */
1966 	if (zero_start != fsize)
1967 		folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1968 
1969 	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1970 	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1971 
1972 	btrfs_set_inode_last_sub_trans(inode);
1973 
	/* NOCOW fallback: no data space was reserved for this range. */
1974 	if (only_release_metadata)
1975 		btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
1976 				     &cached_state);
1977 
1978 	btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
1979 	up_read(&inode->i_mmap_lock);
1980 
1981 	btrfs_delalloc_release_extents(inode, fsize);
1982 	if (only_release_metadata)
1983 		btrfs_check_nocow_unlock(inode);
1984 	sb_end_pagefault(inode->vfs_inode.i_sb);
1985 	extent_changeset_free(data_reserved);
	/* Success: the folio is intentionally left locked for the caller. */
1986 	return VM_FAULT_LOCKED;
1987 
	/* Error/retry unwinding, from the most to the least state held. */
1988 out_unlock:
1989 	folio_unlock(folio);
1990 	up_read(&inode->i_mmap_lock);
1991 out:
	/* reserved_space may already have been trimmed to EOF above. */
1992 	btrfs_delalloc_release_extents(inode, fsize);
1993 	if (only_release_metadata)
1994 		btrfs_delalloc_release_metadata(inode, reserved_space, true);
1995 	else
1996 		btrfs_delalloc_release_space(inode, data_reserved, page_start,
1997 					     reserved_space, true);
1998 out_noreserve:
1999 	if (only_release_metadata)
2000 		btrfs_check_nocow_unlock(inode);
2001 
2002 	sb_end_pagefault(inode->vfs_inode.i_sb);
2003 
2004 	extent_changeset_free(data_reserved);
2005 
2006 	if (ret < 0)
2007 		return vmf_error(ret);
2008 
2009 	/* Make the VM retry the fault. */
2010 	return VM_FAULT_NOPAGE;
2011 }
2012 
/*
 * VMA operations installed by btrfs_file_mmap_prepare(): .fault and
 * .map_pages use the generic filemap helpers, .page_mkwrite is handled by
 * btrfs_page_mkwrite().
 */
2013 static const struct vm_operations_struct btrfs_file_vm_ops = {
2014 	.fault		= filemap_fault,
2015 	.map_pages	= filemap_map_pages,
2016 	.page_mkwrite	= btrfs_page_mkwrite,
2017 };
2018 
2019 static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
2020 {
2021 	struct file *filp = desc->file;
2022 	struct address_space *mapping = filp->f_mapping;
2023 
2024 	if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))
2025 		return -EIO;
2026 	if (!mapping->a_ops->read_folio)
2027 		return -ENOEXEC;
2028 
2029 	file_accessed(filp);
2030 	desc->vm_ops = &btrfs_file_vm_ops;
2031 
2032 	return 0;
2033 }
2034 
2035 static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2036 			   int slot, u64 start, u64 end)
2037 {
2038 	struct btrfs_file_extent_item *fi;
2039 	struct btrfs_key key;
2040 
2041 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2042 		return false;
2043 
2044 	btrfs_item_key_to_cpu(leaf, &key, slot);
2045 	if (key.objectid != btrfs_ino(inode) ||
2046 	    key.type != BTRFS_EXTENT_DATA_KEY)
2047 		return false;
2048 
2049 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2050 
2051 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2052 		return false;
2053 
2054 	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2055 		return false;
2056 
2057 	if (key.offset == end)
2058 		return true;
2059 	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2060 		return true;
2061 	return false;
2062 }
2063 
/*
 * Record the hole [@offset, @end) (@end exclusive) for @inode.
 *
 * Unless the NO_HOLES feature is enabled, the hole is represented on disk:
 * an adjacent hole item before and/or after the range (see hole_mergeable())
 * is grown to cover it, merging both neighbours into one item when both
 * qualify, otherwise a new hole extent item is inserted.
 *
 * In every successful case a hole extent map for the range is then installed;
 * if the map cannot be allocated or inserted, the inode is flagged for a full
 * sync instead.
 *
 * Return 0 on success, -EINVAL if an item already exists at @offset, or
 * another negative errno from the tree search, item deletion or insertion.
 */
2064 static int fill_holes(struct btrfs_trans_handle *trans,
2065 		struct btrfs_inode *inode,
2066 		struct btrfs_path *path, u64 offset, u64 end)
2067 {
2068 	struct btrfs_fs_info *fs_info = trans->fs_info;
2069 	struct btrfs_root *root = inode->root;
2070 	struct extent_buffer *leaf;
2071 	struct btrfs_file_extent_item *fi;
2072 	struct extent_map *hole_em;
2073 	struct btrfs_key key;
	/* Slot of the hole item to grow, -1 if none was found. */
2074 	int modify_slot = -1;
	/* Slot of a second hole item folded into modify_slot, -1 if none. */
2075 	int del_slot = -1;
	/* True when the grown item is the next one and its key must move. */
2076 	bool update_offset = false;
2077 	u64 num_bytes = 0;
2078 	int ret;
2079 
	/* With NO_HOLES no item is written, only the extent map is updated. */
2080 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2081 		goto out;
2082 
2083 	key.objectid = btrfs_ino(inode);
2084 	key.type = BTRFS_EXTENT_DATA_KEY;
2085 	key.offset = offset;
2086 
	/*
	 * NOTE(review): ins_len is -1, presumably because the merge path below
	 * may delete an item - confirm against btrfs_search_slot().
	 */
2087 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2088 	if (ret <= 0) {
2089 		/*
2090 		 * We should have dropped this offset, so if we find it then
2091 		 * something has gone horribly wrong.
2092 		 */
2093 		if (ret == 0)
2094 			ret = -EINVAL;
2095 		return ret;
2096 	}
2097 
2098 	leaf = path->nodes[0];
	/* Previous item is an adjacent hole: extend it over our range. */
2099 	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2100 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
2101 				    struct btrfs_file_extent_item);
2102 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2103 			end - offset;
2104 		modify_slot = path->slots[0] - 1;
2105 	}
	/*
	 * Next item is an adjacent hole: either fold it into the previous one
	 * (and delete it afterwards), or grow it backwards to start at @offset.
	 */
2106 	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2107 		fi = btrfs_item_ptr(leaf, path->slots[0],
2108 				    struct btrfs_file_extent_item);
2109 		if (modify_slot != -1) {
2110 			num_bytes += btrfs_file_extent_num_bytes(leaf, fi);
2111 			del_slot = path->slots[0];
2112 		} else {
2113 			num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2114 				end - offset;
2115 			modify_slot = path->slots[0];
2116 			update_offset = true;
2117 		}
2118 	}
	/* Apply the new length (and key) to the item chosen above. */
2119 	if (modify_slot >= 0) {
2120 		fi = btrfs_item_ptr(leaf, modify_slot,
2121 				    struct btrfs_file_extent_item);
2122 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2123 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2124 		if (update_offset) {
			/* Only set when modify_slot == path->slots[0]. */
2125 			key.offset = offset;
2126 			btrfs_set_item_key_safe(trans, path, &key);
2127 		}
2128 		btrfs_set_file_extent_offset(leaf, fi, 0);
2129 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2130 		if (del_slot >= 0) {
2131 			ret = btrfs_del_items(trans, root, path, del_slot, 1);
2132 			if (ret) {
2133 				btrfs_abort_transaction(trans, ret);
2134 				btrfs_release_path(path);
2135 				return ret;
2136 			}
2137 		}
2138 		goto out;
2139 	}
	/* No mergeable neighbour: insert a brand new hole item. */
2140 	btrfs_release_path(path);
2141 
2142 	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2143 				       end - offset);
2144 	if (ret)
2145 		return ret;
2146 
2147 out:
2148 	btrfs_release_path(path);
2149 
	/* Mirror the hole in the in-memory extent map tree. */
2150 	hole_em = btrfs_alloc_extent_map();
2151 	if (!hole_em) {
		/* No memory: drop maps in the range and force a full sync. */
2152 		btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2153 		btrfs_set_inode_full_sync(inode);
2154 	} else {
2155 		hole_em->start = offset;
2156 		hole_em->len = end - offset;
2157 		hole_em->ram_bytes = hole_em->len;
2158 
2159 		hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2160 		hole_em->disk_num_bytes = 0;
2161 		hole_em->generation = trans->transid;
2162 
2163 		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2164 		btrfs_free_extent_map(hole_em);
		/* A failed map insertion is not fatal, it only costs a full sync. */
2165 		if (ret)
2166 			btrfs_set_inode_full_sync(inode);
2167 	}
2168 
2169 	return 0;
2170 }
2171 
2172 /*
2173  * Find a hole extent on given inode and change start/len to the end of hole
2174  * extent.(hole/vacuum extent whose em->start <= start &&
2175  *	   em->start + em->len > start)
2176  * When a hole extent is found, return 1 and modify start/len.
2177  */
2178 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2179 {
2180 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2181 	struct extent_map *em;
2182 	int ret = 0;
2183 
2184 	em = btrfs_get_extent(inode, NULL,
2185 			      round_down(*start, fs_info->sectorsize),
2186 			      round_up(*len, fs_info->sectorsize));
2187 	if (IS_ERR(em))
2188 		return PTR_ERR(em);
2189 
2190 	/* Hole or vacuum extent(only exists in no-hole mode) */
2191 	if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2192 		const u64 em_end = btrfs_extent_map_end(em);
2193 
2194 		ret = 1;
2195 		*len = (em_end > *start + *len) ? 0 : (*start + *len - em_end);
2196 		*start = em_end;
2197 	}
2198 	btrfs_free_extent_map(em);
2199 	return ret;
2200 }
2201 
2202 /*
2203  * Check if there is any folio fully inside the range.
2204  *
2205  * We cannot utilize filemap_range_has_page() in a filemap with large folios
2206  * as we can hit the following false positive:
2207  *
2208  *        start                            end
2209  *        |                                |
2210  *  |//|//|//|//|  |  |  |  |  |  |  |  |//|//|
2211  *   \         /                         \   /
2212  *    Folio A                            Folio B
2213  *
2214  * That large folio A and B cover the start and end indexes.
2215  * In that case filemap_range_has_page() will always return true, but the above
2216  * case is fine for btrfs_punch_hole_lock_range() usage.
2217  *
2218  * So here we only ensure that no other folio is in the range, excluding the
2219  * head/tail large folio.
 *
 * @start and @end are byte offsets, @end being inclusive.
 *
 * Return true if such a folio was found in the page cache, false otherwise
 * (including when the page-aligned inner range is empty).
2220  */
2221 static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
2222 {
2223 	struct folio_batch fbatch;
2224 	bool ret = false;
2225 	/*
2226 	 * For subpage case, if the range is not at page boundary, we could
2227 	 * have pages at the leading/trailing part of the range.
2228 	 * This could lead to dead loop since filemap_range_has_page()
2229 	 * will always return true.
2230 	 * So here we need to do extra page alignment for
2231 	 * filemap_range_has_page().
2232 	 *
2233 	 * And do not decrease page_lockend right now, as it can be 0.
2234 	 */
2235 	const u64 page_lockstart = round_up(start, PAGE_SIZE);
2236 	const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
2237 	const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
	/* Inclusive index of the last page fully inside the range. */
2238 	const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
2239 	pgoff_t tmp = start_index;
2240 	int found_folios;
2241 
2242 	/* The same page or adjacent pages. */
2243 	if (page_lockend <= page_lockstart)
2244 		return false;
2245 
	/*
	 * NOTE(review): only a single batch of folios is examined; if every
	 * folio of that batch is skipped below, later folios in the range are
	 * never looked at - confirm this is acceptable.
	 */
2246 	folio_batch_init(&fbatch);
2247 	found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
2248 	for (int i = 0; i < found_folios; i++) {
2249 		struct folio *folio = fbatch.folios[i];
2250 
2251 		/* A large folio begins before the start. Not a target. */
2252 		if (folio->index < start_index)
2253 			continue;
2254 		/* A large folio extends beyond the end. Not a target. */
		/*
		 * NOTE(review): end_index is inclusive, so assuming
		 * folio_next_index() returns the index just past the folio, a
		 * folio that ends exactly at end_index is skipped here too -
		 * verify whether "> end_index + 1" was intended.
		 */
2255 		if (folio_next_index(folio) > end_index)
2256 			continue;
2257 		/* A folio doesn't cover the head/tail index. Found a target. */
2258 		ret = true;
2259 		break;
2260 	}
2261 	folio_batch_release(&fbatch);
2262 	return ret;
2263 }
2264 
2265 static void btrfs_punch_hole_lock_range(struct inode *inode,
2266 					const u64 lockstart, const u64 lockend,
2267 					struct extent_state **cached_state)
2268 {
2269 	while (1) {
2270 		truncate_pagecache_range(inode, lockstart, lockend);
2271 
2272 		btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2273 				  cached_state);
2274 		/*
2275 		 * We can't have ordered extents in the range, nor dirty/writeback
2276 		 * pages, because we have locked the inode's VFS lock in exclusive
2277 		 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2278 		 * we have flushed all delalloc in the range and we have waited
2279 		 * for any ordered extents in the range to complete.
2280 		 * We can race with anyone reading pages from this range, so after
2281 		 * locking the range check if we have pages in the range, and if
2282 		 * we do, unlock the range and retry.
2283 		 */
2284 		if (!check_range_has_page(inode, lockstart, lockend))
2285 			break;
2286 
2287 		btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2288 				    cached_state);
2289 	}
2290 
2291 	btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2292 }
2293 
/*
 * Insert a file extent item of @replace_len bytes at extent_info->file_offset,
 * using extent_info->extent_buf as the template for the item, and update the
 * inode's byte accounting (@bytes_to_drop is subtracted).
 *
 * Nothing is done when @replace_len is 0. When the replacement is a hole
 * (extent_info->disk_offset == 0) and NO_HOLES is enabled, no item is
 * inserted and only the byte accounting is adjusted.
 *
 * For a non-hole extent a reference on the data extent is added as well: the
 * first insertion of a new extent goes through
 * btrfs_alloc_reserved_file_extent(), every other case adds a delayed ref.
 *
 * Return 0 on success or a negative errno.
 */
2294 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2295 				     struct btrfs_inode *inode,
2296 				     struct btrfs_path *path,
2297 				     struct btrfs_replace_extent_info *extent_info,
2298 				     const u64 replace_len,
2299 				     const u64 bytes_to_drop)
2300 {
2301 	struct btrfs_fs_info *fs_info = trans->fs_info;
2302 	struct btrfs_root *root = inode->root;
2303 	struct btrfs_file_extent_item *extent;
2304 	struct extent_buffer *leaf;
2305 	struct btrfs_key key;
2306 	int slot;
2307 	int ret;
2308 
2309 	if (replace_len == 0)
2310 		return 0;
2311 
	/* Hole with NO_HOLES: no item to write, only fix the accounting. */
2312 	if (extent_info->disk_offset == 0 &&
2313 	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2314 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2315 		return 0;
2316 	}
2317 
2318 	key.objectid = btrfs_ino(inode);
2319 	key.type = BTRFS_EXTENT_DATA_KEY;
2320 	key.offset = extent_info->file_offset;
2321 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2322 				      sizeof(struct btrfs_file_extent_item));
2323 	if (ret)
2324 		return ret;
2325 	leaf = path->nodes[0];
2326 	slot = path->slots[0];
	/* Copy the template item, then patch the fields specific to this piece. */
2327 	write_extent_buffer(leaf, extent_info->extent_buf,
2328 			    btrfs_item_ptr_offset(leaf, slot),
2329 			    sizeof(struct btrfs_file_extent_item));
2330 	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2331 	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2332 	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2333 	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2334 	if (extent_info->is_new_extent)
2335 		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2336 	btrfs_release_path(path);
2337 
2338 	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2339 						replace_len);
2340 	if (ret)
2341 		return ret;
2342 
2343 	/* If it's a hole, nothing more needs to be done. */
2344 	if (extent_info->disk_offset == 0) {
2345 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2346 		return 0;
2347 	}
2348 
2349 	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2350 
	/* Only the first piece of a new extent creates the extent item. */
2351 	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2352 		key.objectid = extent_info->disk_offset;
2353 		key.type = BTRFS_EXTENT_ITEM_KEY;
2354 		key.offset = extent_info->disk_len;
2355 		ret = btrfs_alloc_reserved_file_extent(trans, root,
2356 						       btrfs_ino(inode),
2357 						       extent_info->file_offset,
2358 						       extent_info->qgroup_reserved,
2359 						       &key);
2360 	} else {
2361 		struct btrfs_ref ref = {
2362 			.action = BTRFS_ADD_DELAYED_REF,
2363 			.bytenr = extent_info->disk_offset,
2364 			.num_bytes = extent_info->disk_len,
2365 			.owning_root = btrfs_root_id(root),
2366 			.ref_root = btrfs_root_id(root),
2367 		};
2368 		u64 ref_offset;
2369 
2370 		ref_offset = extent_info->file_offset - extent_info->data_offset;
2371 		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2372 		ret = btrfs_inc_extent_ref(trans, &ref);
2373 	}
2374 
	/* Counted even when the reference update above failed. */
2375 	extent_info->insertions++;
2376 
2377 	return ret;
2378 }
2379 
2380 /*
2381  * The respective range must have been previously locked, as well as the inode.
2382  * The end offset is inclusive (last byte of the range).
2383  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2384  * the file range with an extent.
2385  * When not punching a hole, we don't want to end up in a state where we dropped
2386  * extents without inserting a new one, so we must abort the transaction to avoid
2387  * a corruption.
 *
 * The work is split over as many transactions as needed: whenever dropping
 * extents stops with -ENOSPC, what was dropped so far is finalized, the
 * transaction is ended and a new one is started for the rest.
 *
 * Return 0 on success, with the still running transaction handle stored in
 * *@trans_out for the caller to end. On failure a negative errno is returned
 * (-EINVAL if @end <= @start), any running transaction has been ended here
 * and *@trans_out is left untouched.
2388  */
2389 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2390 			       struct btrfs_path *path, const u64 start,
2391 			       const u64 end,
2392 			       struct btrfs_replace_extent_info *extent_info,
2393 			       struct btrfs_trans_handle **trans_out)
2394 {
2395 	struct btrfs_drop_extents_args drop_args = { 0 };
2396 	struct btrfs_root *root = inode->root;
2397 	struct btrfs_fs_info *fs_info = root->fs_info;
2398 	const u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2399 	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2400 	struct btrfs_trans_handle *trans = NULL;
	/* Temporary reserve used while dropping extents, refilled per transaction. */
2401 	struct btrfs_block_rsv rsv;
2402 	unsigned int rsv_count;
2403 	u64 cur_offset;
2404 	u64 len = end - start;
2405 	int ret = 0;
2406 
2407 	if (end <= start)
2408 		return -EINVAL;
2409 
2410 	btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
2411 	rsv.size = min_size;
2412 	rsv.failfast = true;
2413 
2414 	/*
2415 	 * 1 - update the inode
2416 	 * 1 - removing the extents in the range
2417 	 * 1 - adding the hole extent if no_holes isn't set or if we are
2418 	 *     replacing the range with a new extent
2419 	 */
2420 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2421 		rsv_count = 3;
2422 	else
2423 		rsv_count = 2;
2424 
2425 	trans = btrfs_start_transaction(root, rsv_count);
2426 	if (IS_ERR(trans)) {
2427 		ret = PTR_ERR(trans);
2428 		trans = NULL;
2429 		goto out_release;
2430 	}
2431 
	/* Move min_size from the transaction's reserve into our own. */
2432 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
2433 				      min_size, false);
2434 	if (WARN_ON(ret))
2435 		goto out_trans;
2436 	trans->block_rsv = &rsv;
2437 
2438 	cur_offset = start;
2439 	drop_args.path = path;
	/* drop_args.end is exclusive while @end is inclusive. */
2440 	drop_args.end = end + 1;
2441 	drop_args.drop_cache = true;
2442 	while (cur_offset < end) {
2443 		drop_args.start = cur_offset;
2444 		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2445 		/* If we are punching a hole decrement the inode's byte count */
2446 		if (!extent_info)
2447 			btrfs_update_inode_bytes(inode, 0,
2448 						 drop_args.bytes_found);
		/*
		 * Success or any error other than -ENOSPC ends the loop. Only
		 * -ENOSPC continues below, to finalize the part that was
		 * dropped and retry the rest in a new transaction.
		 */
2449 		if (ret != -ENOSPC) {
2450 			/*
2451 			 * The only time we don't want to abort is if we are
2452 			 * attempting to clone a partial inline extent, in which
2453 			 * case we'll get EOPNOTSUPP.  However if we aren't
2454 			 * clone we need to abort no matter what, because if we
2455 			 * got EOPNOTSUPP via prealloc then we messed up and
2456 			 * need to abort.
2457 			 */
2458 			if (unlikely(ret &&
2459 				     (ret != -EOPNOTSUPP ||
2460 				      (extent_info && extent_info->is_new_extent))))
2461 				btrfs_abort_transaction(trans, ret);
2462 			break;
2463 		}
2464 
		/* The follow-up work is charged to the transaction's reserve. */
2465 		trans->block_rsv = &fs_info->trans_block_rsv;
2466 
2467 		if (!extent_info && cur_offset < drop_args.drop_end &&
2468 		    cur_offset < ino_size) {
2469 			ret = fill_holes(trans, inode, path, cur_offset,
2470 					 drop_args.drop_end);
2471 			if (unlikely(ret)) {
2472 				/*
2473 				 * If we failed then we didn't insert our hole
2474 				 * entries for the area we dropped, so now the
2475 				 * fs is corrupted, so we must abort the
2476 				 * transaction.
2477 				 */
2478 				btrfs_abort_transaction(trans, ret);
2479 				break;
2480 			}
2481 		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2482 			/*
2483 			 * We are past the i_size here, but since we didn't
2484 			 * insert holes we need to clear the mapped area so we
2485 			 * know to not set disk_i_size in this area until a new
2486 			 * file extent is inserted here.
2487 			 */
2488 			ret = btrfs_inode_clear_file_extent_range(inode,
2489 					cur_offset,
2490 					drop_args.drop_end - cur_offset);
2491 			if (unlikely(ret)) {
2492 				/*
2493 				 * We couldn't clear our area, so we could
2494 				 * presumably adjust up and corrupt the fs, so
2495 				 * we need to abort.
2496 				 */
2497 				btrfs_abort_transaction(trans, ret);
2498 				break;
2499 			}
2500 		}
2501 
		/*
		 * Replacing: insert the piece of the new extent covering what
		 * was dropped so far and advance @extent_info past it.
		 */
2502 		if (extent_info &&
2503 		    drop_args.drop_end > extent_info->file_offset) {
2504 			u64 replace_len = drop_args.drop_end -
2505 					  extent_info->file_offset;
2506 
2507 			ret = btrfs_insert_replace_extent(trans, inode,	path,
2508 					extent_info, replace_len,
2509 					drop_args.bytes_found);
2510 			if (unlikely(ret)) {
2511 				btrfs_abort_transaction(trans, ret);
2512 				break;
2513 			}
2514 			extent_info->data_len -= replace_len;
2515 			extent_info->data_offset += replace_len;
2516 			extent_info->file_offset += replace_len;
2517 		}
2518 
2519 		/*
2520 		 * We are releasing our handle on the transaction, balance the
2521 		 * dirty pages of the btree inode and flush delayed items, and
2522 		 * then get a new transaction handle, which may now point to a
2523 		 * new transaction in case someone else may have committed the
2524 		 * transaction we used to replace/drop file extent items. So
2525 		 * bump the inode's iversion and update mtime and ctime except
2526 		 * if we are called from a dedupe context. This is because a
2527 		 * power failure/crash may happen after the transaction is
2528 		 * committed and before we finish replacing/dropping all the
2529 		 * file extent items we need.
2530 		 */
2531 		inode_inc_iversion(&inode->vfs_inode);
2532 
2533 		if (!extent_info || extent_info->update_times)
2534 			inode_set_mtime_to_ts(&inode->vfs_inode,
2535 					      inode_set_ctime_current(&inode->vfs_inode));
2536 
2537 		ret = btrfs_update_inode(trans, inode);
2538 		if (ret)
2539 			break;
2540 
2541 		btrfs_end_transaction(trans);
2542 		btrfs_btree_balance_dirty(fs_info);
2543 
2544 		trans = btrfs_start_transaction(root, rsv_count);
2545 		if (IS_ERR(trans)) {
2546 			ret = PTR_ERR(trans);
			/* Tells out_trans that there is no handle to end. */
2547 			trans = NULL;
2548 			break;
2549 		}
2550 
2551 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2552 					      &rsv, min_size, false);
2553 		if (WARN_ON(ret))
2554 			break;
2555 		trans->block_rsv = &rsv;
2556 
		/* Resume after what was dropped, skipping any existing hole. */
2557 		cur_offset = drop_args.drop_end;
2558 		len = end - cur_offset;
2559 		if (!extent_info && len) {
2560 			ret = find_first_non_hole(inode, &cur_offset, &len);
2561 			if (unlikely(ret < 0))
2562 				break;
			/* The rest of the range is already a hole: done. */
2563 			if (ret && !len) {
2564 				ret = 0;
2565 				break;
2566 			}
2567 		}
2568 	}
2569 
2570 	/*
2571 	 * If we were cloning, force the next fsync to be a full one since we
2572 	 * replaced (or just dropped in the case of cloning holes when
2573 	 * NO_HOLES is enabled) file extent items and did not setup new extent
2574 	 * maps for the replacement extents (or holes).
2575 	 */
2576 	if (extent_info && !extent_info->is_new_extent)
2577 		btrfs_set_inode_full_sync(inode);
2578 
2579 	if (ret)
2580 		goto out_trans;
2581 
2582 	trans->block_rsv = &fs_info->trans_block_rsv;
2583 	/*
2584 	 * If we are using the NO_HOLES feature we might have had already an
2585 	 * hole that overlaps a part of the region [lockstart, lockend] and
2586 	 * ends at (or beyond) lockend. Since we have no file extent items to
2587 	 * represent holes, drop_end can be less than lockend and so we must
2588 	 * make sure we have an extent map representing the existing hole (the
2589 	 * call to __btrfs_drop_extents() might have dropped the existing extent
2590 	 * map representing the existing hole), otherwise the fast fsync path
2591 	 * will not record the existence of the hole region
2592 	 * [existing_hole_start, lockend].
2593 	 */
2594 	if (drop_args.drop_end <= end)
2595 		drop_args.drop_end = end + 1;
2596 	/*
2597 	 * Don't insert file hole extent item if it's for a range beyond eof
2598 	 * (because it's useless) or if it represents a 0 bytes range (when
2599 	 * cur_offset == drop_end).
2600 	 */
2601 	if (!extent_info && cur_offset < ino_size &&
2602 	    cur_offset < drop_args.drop_end) {
2603 		ret = fill_holes(trans, inode, path, cur_offset,
2604 				 drop_args.drop_end);
2605 		if (unlikely(ret)) {
2606 			/* Same comment as above. */
2607 			btrfs_abort_transaction(trans, ret);
2608 			goto out_trans;
2609 		}
2610 	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2611 		/* See the comment in the loop above for the reasoning here. */
2612 		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2613 					drop_args.drop_end - cur_offset);
2614 		if (unlikely(ret)) {
2615 			btrfs_abort_transaction(trans, ret);
2616 			goto out_trans;
2617 		}
2618 
2619 	}
	/* Insert whatever is left of the replacement extent. */
2620 	if (extent_info) {
2621 		ret = btrfs_insert_replace_extent(trans, inode, path,
2622 				extent_info, extent_info->data_len,
2623 				drop_args.bytes_found);
2624 		if (unlikely(ret)) {
2625 			btrfs_abort_transaction(trans, ret);
2626 			goto out_trans;
2627 		}
2628 	}
2629 
2630 out_trans:
2631 	if (!trans)
2632 		goto out_release;
2633 
	/* Never leave the handle pointing at our on-stack reserve. */
2634 	trans->block_rsv = &fs_info->trans_block_rsv;
2635 	if (ret)
2636 		btrfs_end_transaction(trans);
2637 	else
2638 		*trans_out = trans;
2639 out_release:
2640 	btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
2641 	return ret;
2642 }
2643 
/*
 * Punch a hole in @file covering the byte range [@offset, @offset + @len).
 *
 * Parts of the range that are already holes are skipped. Partial blocks at
 * the head and tail of the range are zeroed with btrfs_truncate_block(),
 * while the block-aligned middle [lockstart, lockend] is locked and has its
 * file extents dropped via btrfs_replace_file_extents(). The inode item is
 * updated whenever something was dropped or zeroed.
 *
 * Return 0 on success or a negative errno.
 */
2644 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2645 {
2646 	struct inode *inode = file_inode(file);
2647 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2648 	struct btrfs_root *root = BTRFS_I(inode)->root;
2649 	struct extent_state *cached_state = NULL;
2650 	struct btrfs_path *path;
2651 	struct btrfs_trans_handle *trans = NULL;
2652 	u64 lockstart;
2653 	u64 lockend;
2654 	u64 tail_start;
2655 	u64 tail_len;
	/* The range as requested, before any leading hole is skipped. */
2656 	const u64 orig_start = offset;
2657 	const u64 orig_end = offset + len - 1;
2658 	int ret = 0;
2659 	bool same_block;
2660 	u64 ino_size;
	/* Set when btrfs_truncate_block() was called at least once. */
2661 	bool truncated_block = false;
	/* Set when the inode item was updated in the main path. */
2662 	bool updated_inode = false;
2663 
2664 	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2665 
2666 	ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2667 	if (ret)
2668 		goto out_only_mutex;
2669 
2670 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
	/* Skip a hole at the start of the range, if there is one. */
2671 	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2672 	if (ret < 0)
2673 		goto out_only_mutex;
2674 	if (ret && !len) {
2675 		/* Already in a large hole */
2676 		ret = 0;
2677 		goto out_only_mutex;
2678 	}
2679 
2680 	ret = file_modified(file);
2681 	if (ret)
2682 		goto out_only_mutex;
2683 
	/* Block-aligned inner range; lockend is inclusive. */
2684 	lockstart = round_up(offset, fs_info->sectorsize);
2685 	lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2686 	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2687 		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2688 	/*
2689 	 * Only do this if we are in the same block and we aren't doing the
2690 	 * entire block.
2691 	 */
2692 	if (same_block && len < fs_info->sectorsize) {
2693 		if (offset < ino_size) {
2694 			truncated_block = true;
2695 			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
2696 						   orig_start, orig_end);
2697 		} else {
2698 			ret = 0;
2699 		}
2700 		goto out_only_mutex;
2701 	}
2702 
2703 	/* zero back part of the first block */
2704 	if (offset < ino_size) {
2705 		truncated_block = true;
2706 		ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
2707 		if (ret) {
			/* No inode update is wanted on error, so return directly. */
2708 			btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2709 			return ret;
2710 		}
2711 	}
2712 
2713 	/* Check the aligned pages after the first unaligned page,
2714 	 * if offset != orig_start, which means the first unaligned page
2715 	 * including several following pages are already in holes,
2716 	 * the extra check can be skipped */
2717 	if (offset == orig_start) {
2718 		/* after truncate page, check hole again */
2719 		len = offset + len - lockstart;
2720 		offset = lockstart;
2721 		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2722 		if (ret < 0)
2723 			goto out_only_mutex;
2724 		if (ret && !len) {
2725 			ret = 0;
2726 			goto out_only_mutex;
2727 		}
2728 		lockstart = offset;
2729 	}
2730 
2731 	/* Check the tail unaligned part is in a hole */
2732 	tail_start = lockend + 1;
2733 	tail_len = offset + len - tail_start;
2734 	if (tail_len) {
2735 		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2736 		if (unlikely(ret < 0))
2737 			goto out_only_mutex;
2738 		if (!ret) {
2739 			/* zero the front end of the last page */
2740 			if (tail_start + tail_len < ino_size) {
2741 				truncated_block = true;
2742 				ret = btrfs_truncate_block(BTRFS_I(inode),
2743 							tail_start + tail_len - 1,
2744 							orig_start, orig_end);
2745 				if (ret)
2746 					goto out_only_mutex;
2747 			}
2748 		}
2749 	}
2750 
	/* No whole block left to drop, only partial blocks were zeroed. */
2751 	if (lockend < lockstart) {
2752 		ret = 0;
2753 		goto out_only_mutex;
2754 	}
2755 
2756 	btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2757 
2758 	path = btrfs_alloc_path();
2759 	if (!path) {
2760 		ret = -ENOMEM;
2761 		goto out;
2762 	}
2763 
2764 	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2765 					 lockend, NULL, &trans);
2766 	btrfs_free_path(path);
2767 	if (ret)
2768 		goto out;
2769 
2770 	ASSERT(trans != NULL);
2771 	inode_inc_iversion(inode);
2772 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2773 	ret = btrfs_update_inode(trans, BTRFS_I(inode));
2774 	updated_inode = true;
	/*
	 * NOTE(review): the return value of btrfs_end_transaction() is not
	 * checked here, unlike in the fallback path below - confirm this is
	 * intended.
	 */
2775 	btrfs_end_transaction(trans);
2776 	btrfs_btree_balance_dirty(fs_info);
	/* Reached only with the extent range [lockstart, lockend] locked. */
2777 out:
2778 	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2779 			    &cached_state);
2780 out_only_mutex:
2781 	if (!updated_inode && truncated_block && !ret) {
2782 		/*
2783 		 * If we only end up zeroing part of a page, we still need to
2784 		 * update the inode item, so that all the time fields are
2785 		 * updated as well as the necessary btrfs inode in memory fields
2786 		 * for detecting, at fsync time, if the inode isn't yet in the
2787 		 * log tree or it's there but not up to date.
2788 		 */
2789 		struct timespec64 now = inode_set_ctime_current(inode);
2790 
2791 		inode_inc_iversion(inode);
2792 		inode_set_mtime_to_ts(inode, now);
2793 		trans = btrfs_start_transaction(root, 1);
2794 		if (IS_ERR(trans)) {
2795 			ret = PTR_ERR(trans);
2796 		} else {
2797 			int ret2;
2798 
			/* An update error takes precedence over an end error. */
2799 			ret = btrfs_update_inode(trans, BTRFS_I(inode));
2800 			ret2 = btrfs_end_transaction(trans);
2801 			if (!ret)
2802 				ret = ret2;
2803 		}
2804 	}
2805 	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2806 	return ret;
2807 }
2808 
2809 /* Helper structure to record which range is already reserved */
2810 struct falloc_range {
	/* Link in the list of ranges built by add_falloc_range(). */
2811 	struct list_head list;
	/* First byte of the range. */
2812 	u64 start;
	/* Length of the range in bytes; the range ends at start + len. */
2813 	u64 len;
2814 };
2815 
2816 /*
2817  * Helper function to add falloc range
2818  *
2819  * Caller should have locked the larger range of extent containing
2820  * [start, len)
2821  */
2822 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2823 {
2824 	struct falloc_range *range = NULL;
2825 
2826 	if (!list_empty(head)) {
2827 		/*
2828 		 * As fallocate iterates by bytenr order, we only need to check
2829 		 * the last range.
2830 		 */
2831 		range = list_last_entry(head, struct falloc_range, list);
2832 		if (range->start + range->len == start) {
2833 			range->len += len;
2834 			return 0;
2835 		}
2836 	}
2837 
2838 	range = kmalloc_obj(*range);
2839 	if (!range)
2840 		return -ENOMEM;
2841 	range->start = start;
2842 	range->len = len;
2843 	list_add_tail(&range->list, head);
2844 	return 0;
2845 }
2846 
2847 static int btrfs_fallocate_update_isize(struct inode *inode,
2848 					const u64 end,
2849 					const int mode)
2850 {
2851 	struct btrfs_trans_handle *trans;
2852 	struct btrfs_root *root = BTRFS_I(inode)->root;
2853 	u64 range_start;
2854 	u64 range_end;
2855 	int ret;
2856 	int ret2;
2857 
2858 	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2859 		return 0;
2860 
2861 	range_start = round_down(i_size_read(inode), root->fs_info->sectorsize);
2862 	range_end = round_up(end, root->fs_info->sectorsize);
2863 
2864 	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start,
2865 						range_end - range_start);
2866 	if (ret)
2867 		return ret;
2868 
2869 	trans = btrfs_start_transaction(root, 1);
2870 	if (IS_ERR(trans))
2871 		return PTR_ERR(trans);
2872 
2873 	inode_set_ctime_current(inode);
2874 	i_size_write(inode, end);
2875 	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2876 	ret = btrfs_update_inode(trans, BTRFS_I(inode));
2877 	ret2 = btrfs_end_transaction(trans);
2878 
2879 	return ret ? ret : ret2;
2880 }
2881 
/*
 * Classification of the block found at a zero range boundary, as returned by
 * btrfs_zero_range_check_range_boundary().
 */
enum {
	/* The block maps to an extent that is neither a hole nor prealloc. */
	RANGE_BOUNDARY_WRITTEN_EXTENT,
	/* The block maps to an extent flagged with EXTENT_FLAG_PREALLOC. */
	RANGE_BOUNDARY_PREALLOC_EXTENT,
	/* The block maps to a hole (disk_bytenr is EXTENT_MAP_HOLE). */
	RANGE_BOUNDARY_HOLE,
};
2887 
2888 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2889 						 u64 offset)
2890 {
2891 	const u64 sectorsize = inode->root->fs_info->sectorsize;
2892 	struct extent_map *em;
2893 	int ret;
2894 
2895 	offset = round_down(offset, sectorsize);
2896 	em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2897 	if (IS_ERR(em))
2898 		return PTR_ERR(em);
2899 
2900 	if (em->disk_bytenr == EXTENT_MAP_HOLE)
2901 		ret = RANGE_BOUNDARY_HOLE;
2902 	else if (em->flags & EXTENT_FLAG_PREALLOC)
2903 		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2904 	else
2905 		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2906 
2907 	btrfs_free_extent_map(em);
2908 	return ret;
2909 }
2910 
/*
 * Zero the byte range [offset, offset + len) of a file (the
 * FALLOC_FL_ZERO_RANGE case of btrfs_fallocate()).
 *
 * Parts of the range already covered by a prealloc extent are left alone.
 * Boundary blocks that are only partially covered by the range and map to a
 * written extent are zeroed with btrfs_truncate_block(). The remaining block
 * aligned range is handed to btrfs_prealloc_file_range(). Finally i_size is
 * updated through btrfs_fallocate_update_isize(), which honours
 * FALLOC_FL_KEEP_SIZE in @mode.
 *
 * Returns 0 on success, otherwise the error returned by the failing helper.
 */
static int btrfs_zero_range(struct inode *inode,
			    loff_t offset,
			    loff_t len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = fs_info->sectorsize;
	/*
	 * Keep the range as requested by the caller, since offset and len may
	 * be adjusted below when the range starts inside a prealloc extent.
	 */
	const u64 orig_start = offset;
	const u64 orig_end = offset + len - 1;
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
			      alloc_end - alloc_start);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
		const u64 em_end = btrfs_extent_map_end(em);

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			btrfs_free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = btrfs_extent_map_block_start(em) + em->len;
	}
	btrfs_free_extent_map(em);

	/* The (remaining) range starts and ends within the same block. */
	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		/* The block is already prealloc, only i_size may need an update. */
		if (em->flags & EXTENT_FLAG_PREALLOC) {
			btrfs_free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * The range covers only part of a block that is not a hole, so
		 * zero that part of the block instead of allocating an extent.
		 */
		if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
			btrfs_free_extent_map(em);
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
						   orig_start, orig_end);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			/* Nothing was reserved yet, so there is nothing to clean up. */
			return ret;
		}
		btrfs_free_extent_map(em);
		/* Allocate exactly the one block that contains the range. */
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	/*
	 * By default allocate only the blocks fully covered by the range. The
	 * boundary blocks are added to the allocation range below if they turn
	 * out to be holes.
	 */
	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset,
						   orig_start, orig_end);
			if (ret)
				goto out;
		} else {
			/* Prealloc extent, nothing to do for this block. */
			ret = 0;
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
							    offset + len);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
						   orig_start, orig_end);
			if (ret)
				goto out;
		} else {
			/* Prealloc extent, nothing to do for this block. */
			ret = 0;
		}
	}

reserve_space:
	/* The allocation range may be empty if only boundary blocks were zeroed. */
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					    &cached_state);
		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret) {
			btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
					    lockend, &cached_state);
			goto out;
		}
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						fs_info->sectorsize,
						offset + len, &alloc_hint);
		btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				    &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
		if (ret) {
			space_reserved = false;
			goto out;
		}
	}
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
 out:
	/* Release the reservation only if it is still held on an error path. */
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}
3087 
/*
 * Preallocate, zero or punch a hole in the byte range [offset, offset + len)
 * of a file.
 *
 * Only the FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE and FALLOC_FL_ZERO_RANGE
 * mode bits are accepted. Hole punching is delegated to btrfs_punch_hole() and
 * zeroing to btrfs_zero_range(). For plain preallocation, the sub-ranges that
 * are holes (or non-prealloc extents at or beyond i_size) are collected in a
 * list while reserving qgroup space, data space is then reserved for all of
 * them at once, and each sub-range is preallocated with
 * btrfs_prealloc_file_range().
 *
 * Returns 0 on success, -EIO if the filesystem is shut down, -EOPNOTSUPP on
 * zoned filesystems or for unsupported mode bits, or another negative errno
 * from the failing helper.
 */
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	LIST_HEAD(reserve_list);
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	u64 data_space_needed = 0;
	u64 data_space_reserved = 0;
	u64 qgroup_reserved = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
	int ret;

	if (btrfs_is_shutdown(inode_to_fs_info(inode)))
		return -EIO;

	/* Do not allow fallocate in ZONED mode */
	if (btrfs_is_zoned(inode_to_fs_info(inode)))
		return -EOPNOTSUPP;

	/* Block aligned range that fully covers [offset, offset + len). */
	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
	cur_offset = alloc_start;

	/* Reject any mode bits we do not support. */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(file, offset, len);

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);

	/* Validate the new file size if this operation is going to extend it. */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}

	ret = file_modified(file);
	if (ret)
		goto out;

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else if (offset + len > inode->i_size) {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
		 */
		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
					   inode->i_size, (u64)-1);
		if (ret)
			goto out;
	}

	/*
	 * We have locked the inode at the VFS level (in exclusive mode) and we
	 * have locked the i_mmap_lock lock (in exclusive mode). Now before
	 * locking the file range, flush all delalloc in the range and wait for
	 * all ordered extents in the range to complete. After this we can lock
	 * the file range and, due to the previous locking we did, we know there
	 * can't be more delalloc or ordered extents in the range.
	 */
	ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	/* Zero range does its own range locking and space reservation. */
	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
		return ret;
	}

	locked_end = alloc_end - 1;
	btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			  &cached_state);

	btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);

	/* First, check if we exceed the qgroup limit */
	while (cur_offset < alloc_end) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
				      alloc_end - cur_offset);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}
		last_byte = min(btrfs_extent_map_end(em), alloc_end);
		/*
		 * Overwritten on every iteration, so after the loop this is
		 * based on the last extent map that was processed.
		 */
		actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		/*
		 * Space is needed for holes and for non-prealloc extents that
		 * start at or beyond i_size.
		 */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !(em->flags & EXTENT_FLAG_PREALLOC))) {
			const u64 range_len = last_byte - cur_offset;

			ret = add_falloc_range(&reserve_list, cur_offset, range_len);
			if (ret < 0) {
				btrfs_free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
					&data_reserved, cur_offset, range_len);
			if (ret < 0) {
				btrfs_free_extent_map(em);
				break;
			}
			qgroup_reserved += range_len;
			data_space_needed += range_len;
		}
		btrfs_free_extent_map(em);
		cur_offset = last_byte;
	}

	if (!ret && data_space_needed > 0) {
		/*
		 * We are safe to reserve space here as we can't have delalloc
		 * in the range, see above.
		 */
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      data_space_needed);
		if (!ret)
			data_space_reserved = data_space_needed;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret) {
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, blocksize,
					offset + len, &alloc_hint);
			/*
			 * btrfs_prealloc_file_range() releases space even
			 * if it returns an error.
			 */
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (data_space_reserved > 0) {
			/* Data space is still reserved, release it for this range. */
			btrfs_free_reserved_data_space(BTRFS_I(inode),
					       data_reserved, range->start,
					       range->len);
			data_space_reserved -= range->len;
			qgroup_reserved -= range->len;
		} else if (qgroup_reserved > 0) {
			/* Only the qgroup reservation was made, release just that. */
			btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
					       range->start, range->len, NULL);
			qgroup_reserved -= range->len;
		}
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
	btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			    &cached_state);
out:
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
	extent_changeset_free(data_reserved);
	return ret;
}
3282 
/*
 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
 * that has unflushed and/or flushing delalloc. There might be other adjacent
 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
 * looping while it gets adjacent subranges, and merging them together.
 *
 * @inode:               The inode.
 * @start:               Start offset of the search range.
 * @end:                 End offset (inclusive value) of the search range.
 * @cached_state:        Extent state record passed to the io tree search.
 * @search_io_tree:      In/out flag. If true, the io tree is searched for
 *                       EXTENT_DELALLOC. It is set to false when no delalloc
 *                       is found, so that later calls skip the search.
 * @delalloc_start_ret:  Output argument, start offset of the subrange found.
 * @delalloc_end_ret:    Output argument, end offset (inclusive value) of the
 *                       subrange found.
 *
 * Returns true if a subrange with delalloc or an ordered extent was found, in
 * which case both output arguments are set. Returns false otherwise.
 */
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
				   struct extent_state **cached_state,
				   bool *search_io_tree,
				   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
	u64 len = end + 1 - start;
	u64 delalloc_len = 0;
	struct btrfs_ordered_extent *oe;
	u64 oe_start;
	u64 oe_end;

	/*
	 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
	 * means we have delalloc (dirty pages) for which writeback has not
	 * started yet.
	 */
	if (*search_io_tree) {
		/* Skip the io tree search if the inode has no delalloc at all. */
		spin_lock(&inode->lock);
		if (inode->delalloc_bytes > 0) {
			spin_unlock(&inode->lock);
			*delalloc_start_ret = start;
			delalloc_len = btrfs_count_range_bits(&inode->io_tree,
							      delalloc_start_ret, end,
							      len, EXTENT_DELALLOC,
							      true, cached_state);
		} else {
			spin_unlock(&inode->lock);
		}
	}

	if (delalloc_len > 0) {
		/*
		 * If delalloc was found then *delalloc_start_ret has a sector size
		 * aligned value (rounded down).
		 */
		*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;

		if (*delalloc_start_ret == start) {
			/* Delalloc for the whole range, nothing more to do. */
			if (*delalloc_end_ret == end)
				return true;
			/* Else trim our search range for ordered extents. */
			start = *delalloc_end_ret + 1;
			len = end + 1 - start;
		}
	} else {
		/* No delalloc, future calls don't need to search again. */
		*search_io_tree = false;
	}

	/*
	 * Now also check if there's any ordered extent in the range.
	 * We do this because:
	 *
	 * 1) When delalloc is flushed, the file range is locked, we clear the
	 *    EXTENT_DELALLOC bit from the io tree and create an extent map and
	 *    an ordered extent for the write. So we might just have been called
	 *    after delalloc is flushed and before the ordered extent completes
	 *    and inserts the new file extent item in the subvolume's btree;
	 *
	 * 2) We may have an ordered extent created by flushing delalloc for a
	 *    subrange that starts before the subrange we found marked with
	 *    EXTENT_DELALLOC in the io tree.
	 *
	 * We could also use the extent map tree to find such delalloc that is
	 * being flushed, but using the ordered extents tree is more efficient
	 * because it's usually much smaller as ordered extents are removed from
	 * the tree once they complete. With the extent maps, we may have them
	 * in the extent map tree for a very long time, and they were either
	 * created by previous writes or loaded by read operations.
	 */
	oe = btrfs_lookup_first_ordered_range(inode, start, len);
	if (!oe)
		return (delalloc_len > 0);

	/* The ordered extent may span beyond our search range. */
	oe_start = max(oe->file_offset, start);
	oe_end = min(oe->file_offset + oe->num_bytes - 1, end);

	btrfs_put_ordered_extent(oe);

	/* Don't have unflushed delalloc, return the ordered extent range. */
	if (delalloc_len == 0) {
		*delalloc_start_ret = oe_start;
		*delalloc_end_ret = oe_end;
		return true;
	}

	/*
	 * We have both unflushed delalloc (io_tree) and an ordered extent.
	 * If the ranges are adjacent return a combined range, otherwise
	 * return the leftmost range.
	 */
	if (oe_start < *delalloc_start_ret) {
		/* Ordered extent ends before the delalloc range, return only it. */
		if (oe_end < *delalloc_start_ret)
			*delalloc_end_ret = oe_end;
		*delalloc_start_ret = oe_start;
	} else if (*delalloc_end_ret + 1 == oe_start) {
		*delalloc_end_ret = oe_end;
	}

	return true;
}
3392 
3393 /*
3394  * Check if there's delalloc in a given range.
3395  *
3396  * @inode:               The inode.
3397  * @start:               The start offset of the range. It does not need to be
3398  *                       sector size aligned.
3399  * @end:                 The end offset (inclusive value) of the search range.
3400  *                       It does not need to be sector size aligned.
3401  * @cached_state:        Extent state record used for speeding up delalloc
3402  *                       searches in the inode's io_tree. Can be NULL.
3403  * @delalloc_start_ret:  Output argument, set to the start offset of the
3404  *                       subrange found with delalloc (may not be sector size
3405  *                       aligned).
3406  * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3407  *                       of the subrange found with delalloc.
3408  *
3409  * Returns true if a subrange with delalloc is found within the given range, and
3410  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3411  * end offsets of the subrange.
3412  */
3413 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3414 				  struct extent_state **cached_state,
3415 				  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3416 {
3417 	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3418 	u64 prev_delalloc_end = 0;
3419 	bool search_io_tree = true;
3420 	bool ret = false;
3421 
3422 	while (cur_offset <= end) {
3423 		u64 delalloc_start;
3424 		u64 delalloc_end;
3425 		bool delalloc;
3426 
3427 		delalloc = find_delalloc_subrange(inode, cur_offset, end,
3428 						  cached_state, &search_io_tree,
3429 						  &delalloc_start,
3430 						  &delalloc_end);
3431 		if (!delalloc)
3432 			break;
3433 
3434 		if (prev_delalloc_end == 0) {
3435 			/* First subrange found. */
3436 			*delalloc_start_ret = max(delalloc_start, start);
3437 			*delalloc_end_ret = delalloc_end;
3438 			ret = true;
3439 		} else if (delalloc_start == prev_delalloc_end + 1) {
3440 			/* Subrange adjacent to the previous one, merge them. */
3441 			*delalloc_end_ret = delalloc_end;
3442 		} else {
3443 			/* Subrange not adjacent to the previous one, exit. */
3444 			break;
3445 		}
3446 
3447 		prev_delalloc_end = delalloc_end;
3448 		cur_offset = delalloc_end + 1;
3449 		cond_resched();
3450 	}
3451 
3452 	return ret;
3453 }
3454 
3455 /*
3456  * Check if there's a hole or delalloc range in a range representing a hole (or
3457  * prealloc extent) found in the inode's subvolume btree.
3458  *
3459  * @inode:      The inode.
3460  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3461  * @start:      Start offset of the hole region. It does not need to be sector
3462  *              size aligned.
3463  * @end:        End offset (inclusive value) of the hole region. It does not
3464  *              need to be sector size aligned.
3465  * @start_ret:  Return parameter, used to set the start of the subrange in the
3466  *              hole that matches the search criteria (seek mode), if such
3467  *              subrange is found (return value of the function is true).
3468  *              The value returned here may not be sector size aligned.
3469  *
3470  * Returns true if a subrange matching the given seek mode is found, and if one
3471  * is found, it updates @start_ret with the start of the subrange.
3472  */
3473 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3474 					struct extent_state **cached_state,
3475 					u64 start, u64 end, u64 *start_ret)
3476 {
3477 	u64 delalloc_start;
3478 	u64 delalloc_end;
3479 	bool delalloc;
3480 
3481 	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3482 						&delalloc_start, &delalloc_end);
3483 	if (delalloc && whence == SEEK_DATA) {
3484 		*start_ret = delalloc_start;
3485 		return true;
3486 	}
3487 
3488 	if (delalloc && whence == SEEK_HOLE) {
3489 		/*
3490 		 * We found delalloc but it starts after out start offset. So we
3491 		 * have a hole between our start offset and the delalloc start.
3492 		 */
3493 		if (start < delalloc_start) {
3494 			*start_ret = start;
3495 			return true;
3496 		}
3497 		/*
3498 		 * Delalloc range starts at our start offset.
3499 		 * If the delalloc range's length is smaller than our range,
3500 		 * then it means we have a hole that starts where the delalloc
3501 		 * subrange ends.
3502 		 */
3503 		if (delalloc_end < end) {
3504 			*start_ret = delalloc_end + 1;
3505 			return true;
3506 		}
3507 
3508 		/* There's delalloc for the whole range. */
3509 		return false;
3510 	}
3511 
3512 	if (!delalloc && whence == SEEK_HOLE) {
3513 		*start_ret = start;
3514 		return true;
3515 	}
3516 
3517 	/*
3518 	 * No delalloc in the range and we are seeking for data. The caller has
3519 	 * to iterate to the next extent item in the subvolume btree.
3520 	 */
3521 	return false;
3522 }
3523 
/*
 * Find the offset of the next data (SEEK_DATA) or hole (SEEK_HOLE) at or after
 * @offset in a file.
 *
 * The file extent items of the inode are iterated in the subvolume btree, with
 * the file range from @offset up to i_size locked in the inode's io tree.
 * Holes and prealloc extents are additionally checked for delalloc through
 * find_desired_extent_in_hole().
 *
 * Returns the offset found (never greater than i_size), -ENXIO if @offset is
 * at or beyond i_size or if no data was found when seeking for data, -ENOMEM
 * if a path could not be allocated, -EINTR if a fatal signal is pending, or
 * another negative errno from the btree search.
 */
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
	struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
	struct btrfs_file_private *private;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	struct extent_state **delalloc_cached_state;
	const loff_t i_size = i_size_read(&inode->vfs_inode);
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	u64 last_extent_end;
	u64 lockstart;
	u64 lockend;
	u64 start;
	int ret;
	bool found = false;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;

	/*
	 * Quick path. If the inode has no prealloc extents and its number of
	 * bytes used matches its i_size, then it can not have holes.
	 */
	if (whence == SEEK_HOLE &&
	    !(inode->flags & BTRFS_INODE_PREALLOC) &&
	    inode_get_bytes(&inode->vfs_inode) == i_size)
		return i_size;

	spin_lock(&inode->lock);
	private = file->private_data;
	spin_unlock(&inode->lock);

	if (private && private->owner_task != current) {
		/*
		 * Not allocated by us, don't use it as its cached state is used
		 * by the task that allocated it and we don't want neither to
		 * mess with it nor get incorrect results because it reflects an
		 * invalid state for the current task.
		 */
		private = NULL;
	} else if (!private) {
		private = kzalloc_obj(*private);
		/*
		 * No worries if memory allocation failed.
		 * The private structure is used only for speeding up multiple
		 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
		 * so everything will still be correct.
		 */
		if (private) {
			bool free = false;

			private->owner_task = current;

			/*
			 * Someone else may have set the private data while we
			 * were allocating, in which case ours is discarded.
			 */
			spin_lock(&inode->lock);
			if (file->private_data)
				free = true;
			else
				file->private_data = private;
			spin_unlock(&inode->lock);

			if (free) {
				kfree(private);
				private = NULL;
			}
		}
	}

	if (private)
		delalloc_cached_state = &private->llseek_cached_state;
	else
		delalloc_cached_state = NULL;

	/*
	 * offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, offset);

	/* Lock at least one block, from the block of @start up to i_size. */
	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size, fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	last_extent_end = lockstart;

	btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0 && path->slots[0] > 0) {
		/*
		 * No exact match. The previous item may be a file extent item
		 * of our inode that covers @start, so step back to it.
		 */
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (start < i_size) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *extent;
		u64 extent_end;
		u8 type;

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;

			leaf = path->nodes[0];
		}

		/* Stop once we are past the file extent items of our inode. */
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * In the first iteration we may have a slot that points to an
		 * extent that ends before our start offset, so skip it.
		 */
		if (extent_end <= start) {
			path->slots[0]++;
			continue;
		}

		/* We have an implicit hole, NO_HOLES feature is likely set. */
		if (last_extent_end < key.offset) {
			u64 search_start = last_extent_end;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    key.offset - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the extent.
			 */
		}

		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, extent);

		/*
		 * Can't access the extent's disk_bytenr field if this is an
		 * inline extent, since at that offset, it's where the extent
		 * data starts.
		 */
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    (type == BTRFS_FILE_EXTENT_REG &&
		     btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
			/*
			 * Explicit hole or prealloc extent, search for delalloc.
			 * A prealloc extent is treated like a hole.
			 */
			u64 search_start = key.offset;
			u64 found_start;

			/*
			 * First iteration, @start matches @offset and it's
			 * within the hole.
			 */
			if (start == offset)
				search_start = offset;

			found = find_desired_extent_in_hole(inode, whence,
							    delalloc_cached_state,
							    search_start,
							    extent_end - 1,
							    &found_start);
			if (found) {
				start = found_start;
				break;
			}
			/*
			 * Didn't find data or a hole (due to delalloc) in the
			 * implicit hole range, so need to analyze the next
			 * extent item.
			 */
		} else {
			/*
			 * Found a regular or inline extent.
			 * If we are seeking for data, adjust the start offset
			 * and stop, we're done.
			 */
			if (whence == SEEK_DATA) {
				start = max_t(u64, key.offset, offset);
				found = true;
				break;
			}
			/*
			 * Else, we are seeking for a hole, check the next file
			 * extent item.
			 */
		}

		/* Nothing found in this extent, move on to the next item. */
		start = extent_end;
		last_extent_end = extent_end;
		path->slots[0]++;
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();
	}

	/* We have an implicit hole from the last extent found up to i_size. */
	if (!found && start < i_size) {
		found = find_desired_extent_in_hole(inode, whence,
						    delalloc_cached_state, start,
						    i_size - 1, &start);
		if (!found)
			start = i_size;
	}

out:
	btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
	btrfs_free_path(path);

	if (ret < 0)
		return ret;

	/* No data between the requested offset and the end of the file. */
	if (whence == SEEK_DATA && start >= i_size)
		return -ENXIO;

	return min_t(loff_t, start, i_size);
}
3779 
3780 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3781 {
3782 	struct inode *inode = file->f_mapping->host;
3783 
3784 	switch (whence) {
3785 	default:
3786 		return generic_file_llseek(file, offset, whence);
3787 	case SEEK_DATA:
3788 	case SEEK_HOLE:
3789 		btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3790 		offset = find_desired_extent(file, offset, whence);
3791 		btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3792 		break;
3793 	}
3794 
3795 	if (offset < 0)
3796 		return offset;
3797 
3798 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3799 }
3800 
3801 static int btrfs_file_open(struct inode *inode, struct file *filp)
3802 {
3803 	int ret;
3804 
3805 	if (btrfs_is_shutdown(inode_to_fs_info(inode)))
3806 		return -EIO;
3807 
3808 	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3809 
3810 	ret = fsverity_file_open(inode, filp);
3811 	if (ret)
3812 		return ret;
3813 	return generic_file_open(inode, filp);
3814 }
3815 
3816 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3817 {
3818 	ssize_t ret = 0;
3819 
3820 	if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))
3821 		return -EIO;
3822 
3823 	if (iocb->ki_flags & IOCB_DIRECT) {
3824 		ret = btrfs_direct_read(iocb, to);
3825 		if (ret < 0 || !iov_iter_count(to) ||
3826 		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3827 			return ret;
3828 	}
3829 
3830 	return filemap_read(iocb, to, ret);
3831 }
3832 
3833 static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos,
3834 				      struct pipe_inode_info *pipe,
3835 				      size_t len, unsigned int flags)
3836 {
3837 	if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))
3838 		return -EIO;
3839 
3840 	return filemap_splice_read(in, ppos, pipe, len, flags);
3841 }
3842 
3843 const struct file_operations btrfs_file_operations = {
3844 	.llseek		= btrfs_file_llseek,
3845 	.read_iter      = btrfs_file_read_iter,
3846 	.splice_read	= btrfs_file_splice_read,
3847 	.write_iter	= btrfs_file_write_iter,
3848 	.splice_write	= iter_file_splice_write,
3849 	.mmap_prepare	= btrfs_file_mmap_prepare,
3850 	.open		= btrfs_file_open,
3851 	.release	= btrfs_release_file,
3852 	.get_unmapped_area = thp_get_unmapped_area,
3853 	.fsync		= btrfs_sync_file,
3854 	.fallocate	= btrfs_fallocate,
3855 	.unlocked_ioctl	= btrfs_ioctl,
3856 #ifdef CONFIG_COMPAT
3857 	.compat_ioctl	= btrfs_compat_ioctl,
3858 #endif
3859 	.remap_file_range = btrfs_remap_file_range,
3860 	.uring_cmd	= btrfs_uring_cmd,
3861 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3862 	.setlease	= generic_setlease,
3863 };
3864 
3865 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3866 {
3867 	struct address_space *mapping = inode->vfs_inode.i_mapping;
3868 	int ret;
3869 
3870 	/*
3871 	 * So with compression we will find and lock a dirty page and clear the
3872 	 * first one as dirty, setup an async extent, and immediately return
3873 	 * with the entire range locked but with nobody actually marked with
3874 	 * writeback.  So we can't just filemap_write_and_wait_range() and
3875 	 * expect it to work since it will just kick off a thread to do the
3876 	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3877 	 * since it will wait on the page lock, which won't be unlocked until
3878 	 * after the pages have been marked as writeback and so we're good to go
3879 	 * from there.  We have to do this otherwise we'll miss the ordered
3880 	 * extents and that results in badness.  Please Josef, do not think you
3881 	 * know better and pull this out at some point in the future, it is
3882 	 * right and you are wrong.
3883 	 */
3884 	ret = filemap_fdatawrite_range(mapping, start, end);
3885 	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3886 		ret = filemap_fdatawrite_range(mapping, start, end);
3887 
3888 	return ret;
3889 }
3890