xref: /linux/fs/btrfs/fiemap.c (revision 352af6a011d586ff042db4b2d1f7421875eb8a14)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "backref.h"
4 #include "btrfs_inode.h"
5 #include "fiemap.h"
6 #include "file.h"
7 #include "file-item.h"
8 
/*
 * One extent that is ready to be copied to the fiemap buffer. The four fields
 * are handed, in this order, to fiemap_fill_next_extent() when the entries
 * are flushed at flush_fiemap_cache().
 */
9 struct btrfs_fiemap_entry {
10 	u64 offset;	/* File offset where the extent starts. */
11 	u64 phys;	/* Physical address reported for the extent. */
12 	u64 len;	/* Length of the extent, in bytes. */
13 	u32 flags;	/* FIEMAP_EXTENT_* flags of the extent. */
14 };
15 
16 /*
17  * Tell the caller of emit_fiemap_extent() that it needs to unlock the file
18  * range in the inode's io tree, release the subvolume tree search path, flush
19  * the fiemap cache, and then relock the file range and re-search the tree.
20  * The value here is something negative that can't be confused with a valid
21  * errno value and is different from 1, because 1 is also a return value from
22  * fiemap_fill_next_extent() and is often used to mean that some btree search
23  * did not find a key. So make it some distinct negative value.
24  */
25 #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
26 
27 /*
28  * Used to:
29  *
30  * - Cache the next entry to be emitted to the fiemap buffer, so that we can
31  *   merge extents that are contiguous and can be grouped as a single one;
32  *
33  * - Store extents ready to be written to the fiemap buffer in an intermediary
34  *   buffer. This intermediary buffer is to ensure that in case the fiemap
35  *   buffer is memory mapped to the fiemap target file, we don't deadlock
36  *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
37  *   an extent range in order to prevent races with delalloc flushing and
38  *   ordered extent completion, which is needed in order to reliably detect
39  *   delalloc in holes and prealloc extents. And this can lead to a deadlock
40  *   if the fiemap buffer is memory mapped to the file we are running fiemap
41  *   against (a silly, useless in practice scenario, but possible) because
42  *   btrfs_page_mkwrite() will try to lock the same extent range.
43  */
44 struct fiemap_cache {
45 	/* An array of ready fiemap entries, allocated at extent_fiemap(). */
46 	struct btrfs_fiemap_entry *entries;
47 	/* Capacity of the entries array (number of slots allocated). */
48 	int entries_size;
49 	/*
	 * Index of the next entry in the entries array to write to, which is
	 * also the number of entries currently stored in the array.
	 */
50 	int entries_pos;
51 	/*
52 	 * Once the entries array is full, this indicates what's the offset for
53 	 * the next file extent item we must search for in the inode's subvolume
54 	 * tree after unlocking the extent range in the inode's io tree and
55 	 * releasing the search path.
56 	 */
57 	u64 next_search_offset;
58 	/*
59 	 * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
60 	 * to count ourselves emitted extents and stop instead of relying on
61 	 * fiemap_fill_next_extent() because we buffer ready fiemap entries at
62 	 * the @entries array, and we want to stop as soon as we hit the max
63 	 * amount of extents to map, not just to save time but also to make the
64 	 * logic at extent_fiemap() simpler.
65 	 */
66 	unsigned int extents_mapped;
67 	/* Fields for the cached extent (unsubmitted, not ready, extent). */
68 	u64 offset;	/* File offset where the cached extent starts. */
69 	u64 phys;	/* Physical address of the cached extent. */
70 	u64 len;	/* Length of the cached extent, grows when merging. */
71 	u32 flags;	/* FIEMAP_EXTENT_* flags of the cached extent. */
72 	bool cached;	/* True while the four fields above hold an extent. */
73 };
74 
75 static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
76 			      struct fiemap_cache *cache)
77 {
78 	for (int i = 0; i < cache->entries_pos; i++) {
79 		struct btrfs_fiemap_entry *entry = &cache->entries[i];
80 		int ret;
81 
82 		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
83 					      entry->phys, entry->len,
84 					      entry->flags);
85 		/*
86 		 * Ignore 1 (reached max entries) because we keep track of that
87 		 * ourselves in emit_fiemap_extent().
88 		 */
89 		if (ret < 0)
90 			return ret;
91 	}
92 	cache->entries_pos = 0;
93 
94 	return 0;
95 }
96 
97 /*
98  * Helper to submit fiemap extent.
99  *
100  * Will try to merge current fiemap extent specified by @offset, @phys,
101  * @len and @flags with cached one.
102  * And only when we fails to merge, cached one will be submitted as
103  * fiemap extent.
104  *
105  * Return value is the same as fiemap_fill_next_extent().
106  */
107 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
108 				struct fiemap_cache *cache,
109 				u64 offset, u64 phys, u64 len, u32 flags)
110 {
111 	struct btrfs_fiemap_entry *entry;
112 	u64 cache_end;
113 
114 	/* Set at the end of extent_fiemap(). */
115 	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
116 
117 	if (!cache->cached)
118 		goto assign;
119 
120 	/*
121 	 * When iterating the extents of the inode, at extent_fiemap(), we may
122 	 * find an extent that starts at an offset behind the end offset of the
123 	 * previous extent we processed. This happens if fiemap is called
124 	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
125 	 * after we had to unlock the file range, release the search path, emit
126 	 * the fiemap extents stored in the buffer (cache->entries array) and
127 	 * the lock the remainder of the range and re-search the btree.
128 	 *
129 	 * For example we are in leaf X processing its last item, which is the
130 	 * file extent item for file range [512K, 1M[, and after
131 	 * btrfs_next_leaf() releases the path, there's an ordered extent that
132 	 * completes for the file range [768K, 2M[, and that results in trimming
133 	 * the file extent item so that it now corresponds to the file range
134 	 * [512K, 768K[ and a new file extent item is inserted for the file
135 	 * range [768K, 2M[, which may end up as the last item of leaf X or as
136 	 * the first item of the next leaf - in either case btrfs_next_leaf()
137 	 * will leave us with a path pointing to the new extent item, for the
138 	 * file range [768K, 2M[, since that's the first key that follows the
139 	 * last one we processed. So in order not to report overlapping extents
140 	 * to user space, we trim the length of the previously cached extent and
141 	 * emit it.
142 	 *
143 	 * Upon calling btrfs_next_leaf() we may also find an extent with an
144 	 * offset smaller than or equals to cache->offset, and this happens
145 	 * when we had a hole or prealloc extent with several delalloc ranges in
146 	 * it, but after btrfs_next_leaf() released the path, delalloc was
147 	 * flushed and the resulting ordered extents were completed, so we can
148 	 * now have found a file extent item for an offset that is smaller than
149 	 * or equals to what we have in cache->offset. We deal with this as
150 	 * described below.
151 	 */
152 	cache_end = cache->offset + cache->len;
153 	if (cache_end > offset) {
154 		if (offset == cache->offset) {
155 			/*
156 			 * We cached a dealloc range (found in the io tree) for
157 			 * a hole or prealloc extent and we have now found a
158 			 * file extent item for the same offset. What we have
159 			 * now is more recent and up to date, so discard what
160 			 * we had in the cache and use what we have just found.
161 			 */
162 			goto assign;
163 		} else if (offset > cache->offset) {
164 			/*
165 			 * The extent range we previously found ends after the
166 			 * offset of the file extent item we found and that
167 			 * offset falls somewhere in the middle of that previous
168 			 * extent range. So adjust the range we previously found
169 			 * to end at the offset of the file extent item we have
170 			 * just found, since this extent is more up to date.
171 			 * Emit that adjusted range and cache the file extent
172 			 * item we have just found. This corresponds to the case
173 			 * where a previously found file extent item was split
174 			 * due to an ordered extent completing.
175 			 */
176 			cache->len = offset - cache->offset;
177 			goto emit;
178 		} else {
179 			const u64 range_end = offset + len;
180 
181 			/*
182 			 * The offset of the file extent item we have just found
183 			 * is behind the cached offset. This means we were
184 			 * processing a hole or prealloc extent for which we
185 			 * have found delalloc ranges (in the io tree), so what
186 			 * we have in the cache is the last delalloc range we
187 			 * found while the file extent item we found can be
188 			 * either for a whole delalloc range we previously
189 			 * emitted or only a part of that range.
190 			 *
191 			 * We have two cases here:
192 			 *
193 			 * 1) The file extent item's range ends at or behind the
194 			 *    cached extent's end. In this case just ignore the
195 			 *    current file extent item because we don't want to
196 			 *    overlap with previous ranges that may have been
197 			 *    emitted already;
198 			 *
199 			 * 2) The file extent item starts behind the currently
200 			 *    cached extent but its end offset goes beyond the
201 			 *    end offset of the cached extent. We don't want to
202 			 *    overlap with a previous range that may have been
203 			 *    emitted already, so we emit the currently cached
204 			 *    extent and then partially store the current file
205 			 *    extent item's range in the cache, for the subrange
206 			 *    going the cached extent's end to the end of the
207 			 *    file extent item.
208 			 */
209 			if (range_end <= cache_end)
210 				return 0;
211 
212 			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
213 				phys += cache_end - offset;
214 
215 			offset = cache_end;
216 			len = range_end - cache_end;
217 			goto emit;
218 		}
219 	}
220 
221 	/*
222 	 * Only merges fiemap extents if
223 	 * 1) Their logical addresses are continuous
224 	 *
225 	 * 2) Their physical addresses are continuous
226 	 *    So truly compressed (physical size smaller than logical size)
227 	 *    extents won't get merged with each other
228 	 *
229 	 * 3) Share same flags
230 	 */
231 	if (cache->offset + cache->len  == offset &&
232 	    cache->phys + cache->len == phys  &&
233 	    cache->flags == flags) {
234 		cache->len += len;
235 		return 0;
236 	}
237 
238 emit:
239 	/* Not mergeable, need to submit cached one */
240 
241 	if (cache->entries_pos == cache->entries_size) {
242 		/*
243 		 * We will need to research for the end offset of the last
244 		 * stored extent and not from the current offset, because after
245 		 * unlocking the range and releasing the path, if there's a hole
246 		 * between that end offset and this current offset, a new extent
247 		 * may have been inserted due to a new write, so we don't want
248 		 * to miss it.
249 		 */
250 		entry = &cache->entries[cache->entries_size - 1];
251 		cache->next_search_offset = entry->offset + entry->len;
252 		cache->cached = false;
253 
254 		return BTRFS_FIEMAP_FLUSH_CACHE;
255 	}
256 
257 	entry = &cache->entries[cache->entries_pos];
258 	entry->offset = cache->offset;
259 	entry->phys = cache->phys;
260 	entry->len = cache->len;
261 	entry->flags = cache->flags;
262 	cache->entries_pos++;
263 	cache->extents_mapped++;
264 
265 	if (cache->extents_mapped == fieinfo->fi_extents_max) {
266 		cache->cached = false;
267 		return 1;
268 	}
269 assign:
270 	cache->cached = true;
271 	cache->offset = offset;
272 	cache->phys = phys;
273 	cache->len = len;
274 	cache->flags = flags;
275 
276 	return 0;
277 }
278 
279 /*
280  * Emit last fiemap cache
281  *
282  * The last fiemap cache may still be cached in the following case:
283  * 0		      4k		    8k
284  * |<- Fiemap range ->|
285  * |<------------  First extent ----------->|
286  *
287  * In this case, the first extent range will be cached but not emitted.
288  * So we must emit it before ending extent_fiemap().
289  */
290 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
291 				  struct fiemap_cache *cache)
292 {
293 	int ret;
294 
295 	if (!cache->cached)
296 		return 0;
297 
298 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
299 				      cache->len, cache->flags);
300 	cache->cached = false;
301 	if (ret > 0)
302 		ret = 0;
303 	return ret;
304 }
305 
306 static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
307 {
308 	struct extent_buffer *clone = path->nodes[0];
309 	struct btrfs_key key;
310 	int slot;
311 	int ret;
312 
313 	path->slots[0]++;
314 	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
315 		return 0;
316 
317 	/*
318 	 * Add a temporary extra ref to an already cloned extent buffer to
319 	 * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
320 	 * the cost of allocating a new one.
321 	 */
322 	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
323 	refcount_inc(&clone->refs);
324 
325 	ret = btrfs_next_leaf(inode->root, path);
326 	if (ret != 0)
327 		goto out;
328 
329 	/*
330 	 * Don't bother with cloning if there are no more file extent items for
331 	 * our inode.
332 	 */
333 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
334 	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
335 		ret = 1;
336 		goto out;
337 	}
338 
339 	/*
340 	 * Important to preserve the start field, for the optimizations when
341 	 * checking if extents are shared (see extent_fiemap()).
342 	 *
343 	 * We must set ->start before calling copy_extent_buffer_full().  If we
344 	 * are on sub-pagesize blocksize, we use ->start to determine the offset
345 	 * into the folio where our eb exists, and if we update ->start after
346 	 * the fact then any subsequent reads of the eb may read from a
347 	 * different offset in the folio than where we originally copied into.
348 	 */
349 	clone->start = path->nodes[0]->start;
350 	/* See the comment at fiemap_search_slot() about why we clone. */
351 	copy_extent_buffer_full(clone, path->nodes[0]);
352 
353 	slot = path->slots[0];
354 	btrfs_release_path(path);
355 	path->nodes[0] = clone;
356 	path->slots[0] = slot;
357 out:
358 	if (ret)
359 		free_extent_buffer(clone);
360 
361 	return ret;
362 }
363 
364 /*
365  * Search for the first file extent item that starts at a given file offset or
366  * the one that starts immediately before that offset.
367  * Returns: 0 on success, < 0 on error, 1 if not found.
368  */
369 static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
370 			      u64 file_offset)
371 {
372 	const u64 ino = btrfs_ino(inode);
373 	struct btrfs_root *root = inode->root;
374 	struct extent_buffer *clone;
375 	struct btrfs_key key;
376 	int slot;
377 	int ret;
378 
379 	key.objectid = ino;
380 	key.type = BTRFS_EXTENT_DATA_KEY;
381 	key.offset = file_offset;
382 
383 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
384 	if (ret < 0)
385 		return ret;
386 
387 	if (ret > 0 && path->slots[0] > 0) {
388 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
389 		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
390 			path->slots[0]--;
391 	}
392 
393 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
394 		ret = btrfs_next_leaf(root, path);
395 		if (ret != 0)
396 			return ret;
397 
398 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
399 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
400 			return 1;
401 	}
402 
403 	/*
404 	 * We clone the leaf and use it during fiemap. This is because while
405 	 * using the leaf we do expensive things like checking if an extent is
406 	 * shared, which can take a long time. In order to prevent blocking
407 	 * other tasks for too long, we use a clone of the leaf. We have locked
408 	 * the file range in the inode's io tree, so we know none of our file
409 	 * extent items can change. This way we avoid blocking other tasks that
410 	 * want to insert items for other inodes in the same leaf or b+tree
411 	 * rebalance operations (triggered for example when someone is trying
412 	 * to push items into this leaf when trying to insert an item in a
413 	 * neighbour leaf).
414 	 * We also need the private clone because holding a read lock on an
415 	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
416 	 * when we check if extents are shared, as backref walking may need to
417 	 * lock the same leaf we are processing.
418 	 */
419 	clone = btrfs_clone_extent_buffer(path->nodes[0]);
420 	if (!clone)
421 		return -ENOMEM;
422 
423 	slot = path->slots[0];
424 	btrfs_release_path(path);
425 	path->nodes[0] = clone;
426 	path->slots[0] = slot;
427 
428 	return 0;
429 }
430 
431 /*
432  * Process a range which is a hole or a prealloc extent in the inode's subvolume
433  * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
434  * extent. The end offset (@end) is inclusive.
435  */
436 static int fiemap_process_hole(struct btrfs_inode *inode,
437 			       struct fiemap_extent_info *fieinfo,
438 			       struct fiemap_cache *cache,
439 			       struct extent_state **delalloc_cached_state,
440 			       struct btrfs_backref_share_check_ctx *backref_ctx,
441 			       u64 disk_bytenr, u64 extent_offset,
442 			       u64 extent_gen,
443 			       u64 start, u64 end)
444 {
445 	const u64 i_size = i_size_read(&inode->vfs_inode);
446 	u64 cur_offset = start;
447 	u64 last_delalloc_end = 0;
448 	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
449 	bool checked_extent_shared = false;
450 	int ret;
451 
452 	/*
453 	 * There can be no delalloc past i_size, so don't waste time looking for
454 	 * it beyond i_size.
455 	 */
456 	while (cur_offset < end && cur_offset < i_size) {
457 		u64 delalloc_start;
458 		u64 delalloc_end;
459 		u64 prealloc_start;
460 		u64 prealloc_len = 0;
461 		bool delalloc;
462 
463 		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
464 							delalloc_cached_state,
465 							&delalloc_start,
466 							&delalloc_end);
467 		if (!delalloc)
468 			break;
469 
470 		/*
471 		 * If this is a prealloc extent we have to report every section
472 		 * of it that has no delalloc.
473 		 */
474 		if (disk_bytenr != 0) {
475 			if (last_delalloc_end == 0) {
476 				prealloc_start = start;
477 				prealloc_len = delalloc_start - start;
478 			} else {
479 				prealloc_start = last_delalloc_end + 1;
480 				prealloc_len = delalloc_start - prealloc_start;
481 			}
482 		}
483 
484 		if (prealloc_len > 0) {
485 			if (!checked_extent_shared && fieinfo->fi_extents_max) {
486 				ret = btrfs_is_data_extent_shared(inode,
487 								  disk_bytenr,
488 								  extent_gen,
489 								  backref_ctx);
490 				if (ret < 0)
491 					return ret;
492 				else if (ret > 0)
493 					prealloc_flags |= FIEMAP_EXTENT_SHARED;
494 
495 				checked_extent_shared = true;
496 			}
497 			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
498 						 disk_bytenr + extent_offset,
499 						 prealloc_len, prealloc_flags);
500 			if (ret)
501 				return ret;
502 			extent_offset += prealloc_len;
503 		}
504 
505 		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
506 					 delalloc_end + 1 - delalloc_start,
507 					 FIEMAP_EXTENT_DELALLOC |
508 					 FIEMAP_EXTENT_UNKNOWN);
509 		if (ret)
510 			return ret;
511 
512 		last_delalloc_end = delalloc_end;
513 		cur_offset = delalloc_end + 1;
514 		extent_offset += cur_offset - delalloc_start;
515 		cond_resched();
516 	}
517 
518 	/*
519 	 * Either we found no delalloc for the whole prealloc extent or we have
520 	 * a prealloc extent that spans i_size or starts at or after i_size.
521 	 */
522 	if (disk_bytenr != 0 && last_delalloc_end < end) {
523 		u64 prealloc_start;
524 		u64 prealloc_len;
525 
526 		if (last_delalloc_end == 0) {
527 			prealloc_start = start;
528 			prealloc_len = end + 1 - start;
529 		} else {
530 			prealloc_start = last_delalloc_end + 1;
531 			prealloc_len = end + 1 - prealloc_start;
532 		}
533 
534 		if (!checked_extent_shared && fieinfo->fi_extents_max) {
535 			ret = btrfs_is_data_extent_shared(inode,
536 							  disk_bytenr,
537 							  extent_gen,
538 							  backref_ctx);
539 			if (ret < 0)
540 				return ret;
541 			else if (ret > 0)
542 				prealloc_flags |= FIEMAP_EXTENT_SHARED;
543 		}
544 		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
545 					 disk_bytenr + extent_offset,
546 					 prealloc_len, prealloc_flags);
547 		if (ret)
548 			return ret;
549 	}
550 
551 	return 0;
552 }
553 
554 static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
555 					  struct btrfs_path *path,
556 					  u64 *last_extent_end_ret)
557 {
558 	const u64 ino = btrfs_ino(inode);
559 	struct btrfs_root *root = inode->root;
560 	struct extent_buffer *leaf;
561 	struct btrfs_file_extent_item *ei;
562 	struct btrfs_key key;
563 	u64 disk_bytenr;
564 	int ret;
565 
566 	/*
567 	 * Lookup the last file extent. We're not using i_size here because
568 	 * there might be preallocation past i_size.
569 	 */
570 	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
571 	/* There can't be a file extent item at offset (u64)-1 */
572 	ASSERT(ret != 0);
573 	if (ret < 0)
574 		return ret;
575 
576 	/*
577 	 * For a non-existing key, btrfs_search_slot() always leaves us at a
578 	 * slot > 0, except if the btree is empty, which is impossible because
579 	 * at least it has the inode item for this inode and all the items for
580 	 * the root inode 256.
581 	 */
582 	ASSERT(path->slots[0] > 0);
583 	path->slots[0]--;
584 	leaf = path->nodes[0];
585 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
586 	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
587 		/* No file extent items in the subvolume tree. */
588 		*last_extent_end_ret = 0;
589 		return 0;
590 	}
591 
592 	/*
593 	 * For an inline extent, the disk_bytenr is where inline data starts at,
594 	 * so first check if we have an inline extent item before checking if we
595 	 * have an implicit hole (disk_bytenr == 0).
596 	 */
597 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
598 	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
599 		*last_extent_end_ret = btrfs_file_extent_end(path);
600 		return 0;
601 	}
602 
603 	/*
604 	 * Find the last file extent item that is not a hole (when NO_HOLES is
605 	 * not enabled). This should take at most 2 iterations in the worst
606 	 * case: we have one hole file extent item at slot 0 of a leaf and
607 	 * another hole file extent item as the last item in the previous leaf.
608 	 * This is because we merge file extent items that represent holes.
609 	 */
610 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
611 	while (disk_bytenr == 0) {
612 		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
613 		if (ret < 0) {
614 			return ret;
615 		} else if (ret > 0) {
616 			/* No file extent items that are not holes. */
617 			*last_extent_end_ret = 0;
618 			return 0;
619 		}
620 		leaf = path->nodes[0];
621 		ei = btrfs_item_ptr(leaf, path->slots[0],
622 				    struct btrfs_file_extent_item);
623 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
624 	}
625 
626 	*last_extent_end_ret = btrfs_file_extent_end(path);
627 	return 0;
628 }
629 
630 static int extent_fiemap(struct btrfs_inode *inode,
631 			 struct fiemap_extent_info *fieinfo,
632 			 u64 start, u64 len)
633 {
634 	const u64 ino = btrfs_ino(inode);
635 	struct extent_state *cached_state = NULL;
636 	struct extent_state *delalloc_cached_state = NULL;
637 	BTRFS_PATH_AUTO_FREE(path);
638 	struct fiemap_cache cache = { 0 };
639 	struct btrfs_backref_share_check_ctx *backref_ctx;
640 	u64 last_extent_end = 0;
641 	u64 prev_extent_end;
642 	u64 range_start;
643 	u64 range_end;
644 	const u64 sectorsize = inode->root->fs_info->sectorsize;
645 	bool stopped = false;
646 	int ret;
647 
648 	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
649 	cache.entries = kmalloc_array(cache.entries_size,
650 				      sizeof(struct btrfs_fiemap_entry),
651 				      GFP_KERNEL);
652 	backref_ctx = btrfs_alloc_backref_share_check_ctx();
653 	path = btrfs_alloc_path();
654 	if (!cache.entries || !backref_ctx || !path) {
655 		ret = -ENOMEM;
656 		goto out;
657 	}
658 
659 restart:
660 	range_start = round_down(start, sectorsize);
661 	range_end = round_up(start + len, sectorsize);
662 	prev_extent_end = range_start;
663 
664 	btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
665 
666 	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
667 	if (ret < 0)
668 		goto out_unlock;
669 	btrfs_release_path(path);
670 
671 	path->reada = READA_FORWARD;
672 	ret = fiemap_search_slot(inode, path, range_start);
673 	if (ret < 0) {
674 		goto out_unlock;
675 	} else if (ret > 0) {
676 		/*
677 		 * No file extent item found, but we may have delalloc between
678 		 * the current offset and i_size. So check for that.
679 		 */
680 		ret = 0;
681 		goto check_eof_delalloc;
682 	}
683 
684 	while (prev_extent_end < range_end) {
685 		struct extent_buffer *leaf = path->nodes[0];
686 		struct btrfs_file_extent_item *ei;
687 		struct btrfs_key key;
688 		u64 extent_end;
689 		u64 extent_len;
690 		u64 extent_offset = 0;
691 		u64 extent_gen;
692 		u64 disk_bytenr = 0;
693 		u64 flags = 0;
694 		int extent_type;
695 		u8 compression;
696 
697 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
698 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
699 			break;
700 
701 		extent_end = btrfs_file_extent_end(path);
702 
703 		/*
704 		 * The first iteration can leave us at an extent item that ends
705 		 * before our range's start. Move to the next item.
706 		 */
707 		if (extent_end <= range_start)
708 			goto next_item;
709 
710 		backref_ctx->curr_leaf_bytenr = leaf->start;
711 
712 		/* We have in implicit hole (NO_HOLES feature enabled). */
713 		if (prev_extent_end < key.offset) {
714 			const u64 hole_end = min(key.offset, range_end) - 1;
715 
716 			ret = fiemap_process_hole(inode, fieinfo, &cache,
717 						  &delalloc_cached_state,
718 						  backref_ctx, 0, 0, 0,
719 						  prev_extent_end, hole_end);
720 			if (ret < 0) {
721 				goto out_unlock;
722 			} else if (ret > 0) {
723 				/* fiemap_fill_next_extent() told us to stop. */
724 				stopped = true;
725 				break;
726 			}
727 
728 			/* We've reached the end of the fiemap range, stop. */
729 			if (key.offset >= range_end) {
730 				stopped = true;
731 				break;
732 			}
733 		}
734 
735 		extent_len = extent_end - key.offset;
736 		ei = btrfs_item_ptr(leaf, path->slots[0],
737 				    struct btrfs_file_extent_item);
738 		compression = btrfs_file_extent_compression(leaf, ei);
739 		extent_type = btrfs_file_extent_type(leaf, ei);
740 		extent_gen = btrfs_file_extent_generation(leaf, ei);
741 
742 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
743 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
744 			if (compression == BTRFS_COMPRESS_NONE)
745 				extent_offset = btrfs_file_extent_offset(leaf, ei);
746 		}
747 
748 		if (compression != BTRFS_COMPRESS_NONE)
749 			flags |= FIEMAP_EXTENT_ENCODED;
750 
751 		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
752 			flags |= FIEMAP_EXTENT_DATA_INLINE;
753 			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
754 			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
755 						 extent_len, flags);
756 		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
757 			ret = fiemap_process_hole(inode, fieinfo, &cache,
758 						  &delalloc_cached_state,
759 						  backref_ctx,
760 						  disk_bytenr, extent_offset,
761 						  extent_gen, key.offset,
762 						  extent_end - 1);
763 		} else if (disk_bytenr == 0) {
764 			/* We have an explicit hole. */
765 			ret = fiemap_process_hole(inode, fieinfo, &cache,
766 						  &delalloc_cached_state,
767 						  backref_ctx, 0, 0, 0,
768 						  key.offset, extent_end - 1);
769 		} else {
770 			/* We have a regular extent. */
771 			if (fieinfo->fi_extents_max) {
772 				ret = btrfs_is_data_extent_shared(inode,
773 								  disk_bytenr,
774 								  extent_gen,
775 								  backref_ctx);
776 				if (ret < 0)
777 					goto out_unlock;
778 				else if (ret > 0)
779 					flags |= FIEMAP_EXTENT_SHARED;
780 			}
781 
782 			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
783 						 disk_bytenr + extent_offset,
784 						 extent_len, flags);
785 		}
786 
787 		if (ret < 0) {
788 			goto out_unlock;
789 		} else if (ret > 0) {
790 			/* emit_fiemap_extent() told us to stop. */
791 			stopped = true;
792 			break;
793 		}
794 
795 		prev_extent_end = extent_end;
796 next_item:
797 		if (fatal_signal_pending(current)) {
798 			ret = -EINTR;
799 			goto out_unlock;
800 		}
801 
802 		ret = fiemap_next_leaf_item(inode, path);
803 		if (ret < 0) {
804 			goto out_unlock;
805 		} else if (ret > 0) {
806 			/* No more file extent items for this inode. */
807 			break;
808 		}
809 		cond_resched();
810 	}
811 
812 check_eof_delalloc:
813 	if (!stopped && prev_extent_end < range_end) {
814 		ret = fiemap_process_hole(inode, fieinfo, &cache,
815 					  &delalloc_cached_state, backref_ctx,
816 					  0, 0, 0, prev_extent_end, range_end - 1);
817 		if (ret < 0)
818 			goto out_unlock;
819 		prev_extent_end = range_end;
820 	}
821 
822 	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
823 		const u64 i_size = i_size_read(&inode->vfs_inode);
824 
825 		if (prev_extent_end < i_size) {
826 			u64 delalloc_start;
827 			u64 delalloc_end;
828 			bool delalloc;
829 
830 			delalloc = btrfs_find_delalloc_in_range(inode,
831 								prev_extent_end,
832 								i_size - 1,
833 								&delalloc_cached_state,
834 								&delalloc_start,
835 								&delalloc_end);
836 			if (!delalloc)
837 				cache.flags |= FIEMAP_EXTENT_LAST;
838 		} else {
839 			cache.flags |= FIEMAP_EXTENT_LAST;
840 		}
841 	}
842 
843 out_unlock:
844 	btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
845 
846 	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
847 		btrfs_release_path(path);
848 		ret = flush_fiemap_cache(fieinfo, &cache);
849 		if (ret)
850 			goto out;
851 		len -= cache.next_search_offset - start;
852 		start = cache.next_search_offset;
853 		goto restart;
854 	} else if (ret < 0) {
855 		goto out;
856 	}
857 
858 	/*
859 	 * Must free the path before emitting to the fiemap buffer because we
860 	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
861 	 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
862 	 * waiting for an ordered extent that in order to complete needs to
863 	 * modify that leaf, therefore leading to a deadlock.
864 	 */
865 	btrfs_free_path(path);
866 	path = NULL;
867 
868 	ret = flush_fiemap_cache(fieinfo, &cache);
869 	if (ret)
870 		goto out;
871 
872 	ret = emit_last_fiemap_cache(fieinfo, &cache);
873 out:
874 	btrfs_free_extent_state(delalloc_cached_state);
875 	kfree(cache.entries);
876 	btrfs_free_backref_share_ctx(backref_ctx);
877 	return ret;
878 }
879 
880 int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
881 		 u64 start, u64 len)
882 {
883 	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
884 	int ret;
885 
886 	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
887 	if (ret)
888 		return ret;
889 
890 	/*
891 	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
892 	 * file range (0 to LLONG_MAX), but that is not enough if we have
893 	 * compression enabled. The first filemap_fdatawrite_range() only kicks
894 	 * in the compression of data (in an async thread) and will return
895 	 * before the compression is done and writeback is started. A second
896 	 * filemap_fdatawrite_range() is needed to wait for the compression to
897 	 * complete and writeback to start. We also need to wait for ordered
898 	 * extents to complete, because our fiemap implementation uses mainly
899 	 * file extent items to list the extents, searching for extent maps
900 	 * only for file ranges with holes or prealloc extents to figure out
901 	 * if we have delalloc in those ranges.
902 	 */
903 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
904 		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
905 		if (ret)
906 			return ret;
907 	}
908 
909 	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
910 
911 	/*
912 	 * We did an initial flush to avoid holding the inode's lock while
913 	 * triggering writeback and waiting for the completion of IO and ordered
914 	 * extents. Now after we locked the inode we do it again, because it's
915 	 * possible a new write may have happened in between those two steps.
916 	 */
917 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
918 		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
919 		if (ret) {
920 			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
921 			return ret;
922 		}
923 	}
924 
925 	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
926 	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
927 
928 	return ret;
929 }
930