// SPDX-License-Identifier: GPL-2.0

#include "backref.h"
#include "btrfs_inode.h"
#include "fiemap.h"
#include "file.h"
#include "file-item.h"

struct btrfs_fiemap_entry {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
};

/*
 * Indicate to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range in the inode's io tree, release the subvolume tree search path,
 * flush the fiemap cache, relock the file range and re-search the subvolume
 * tree. The value here is something negative that can't be confused with a
 * valid errno value and different from 1 because that's also a return value
 * from fiemap_fill_next_extent() and also it's often used to mean some btree
 * search did not find a key, so make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
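/*
 * Note: with MAX_ERRNO being 4095 (include/linux/err.h), this evaluates
 * to -4096.
 */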

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
	/* An array of ready fiemap entries. */
	struct btrfs_fiemap_entry *entries;
	/* Number of entries in the entries array. */
	int entries_size;
	/* Index of the next entry in the entries array to write to. */
	int entries_pos;
	/*
	 * Once the entries array is full, this indicates the offset of the
	 * next file extent item we must search for in the inode's subvolume
	 * tree after unlocking the extent range in the inode's io tree and
	 * releasing the search path.
	 */
	u64 next_search_offset;
	/*
	 * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
	 * to count emitted extents ourselves and stop instead of relying on
	 * fiemap_fill_next_extent() because we buffer ready fiemap entries in
	 * the @entries array, and we want to stop as soon as we hit the max
	 * amount of extents to map, not just to save time but also to make the
	 * logic at extent_fiemap() simpler.
	 */
	unsigned int extents_mapped;
	/* Fields for the cached extent (the unsubmitted, not yet ready, extent). */
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};

static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	for (int i = 0; i < cache->entries_pos; i++) {
		struct btrfs_fiemap_entry *entry = &cache->entries[i];
		int ret;

		ret = fiemap_fill_next_extent(fieinfo, entry->offset,
					      entry->phys, entry->len,
					      entry->flags);
		/*
		 * Ignore 1 (reached max entries) because we keep track of that
		 * ourselves in emit_fiemap_extent().
		 */
		if (ret < 0)
			return ret;
	}
	cache->entries_pos = 0;

	return 0;
}

/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent, specified by @offset, @phys,
 * @len and @flags, with the cached one.
 * Only when merging fails is the cached extent submitted as a fiemap extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache,
			      u64 offset, u64 phys, u64 len, u32 flags)
{
	struct btrfs_fiemap_entry *entry;
	u64 cache_end;

	/* Set at the end of extent_fiemap(). */
	ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

	if (!cache->cached)
		goto assign;

	/*
	 * When iterating the extents of the inode, at extent_fiemap(), we may
	 * find an extent that starts at an offset behind the end offset of the
	 * previous extent we processed. This happens if fiemap is called
	 * without FIEMAP_FLAG_SYNC and there are ordered extents completing
	 * after we had to unlock the file range, release the search path, emit
	 * the fiemap extents stored in the buffer (cache->entries array) and
	 * then lock the remainder of the range and re-search the btree.
	 *
	 * For example we are in leaf X processing its last item, which is the
	 * file extent item for file range [512K, 1M[, and after
	 * btrfs_next_leaf() releases the path, there's an ordered extent that
	 * completes for the file range [768K, 2M[, and that results in trimming
	 * the file extent item so that it now corresponds to the file range
	 * [512K, 768K[ and a new file extent item is inserted for the file
	 * range [768K, 2M[, which may end up as the last item of leaf X or as
	 * the first item of the next leaf - in either case btrfs_next_leaf()
	 * will leave us with a path pointing to the new extent item, for the
	 * file range [768K, 2M[, since that's the first key that follows the
	 * last one we processed. So in order not to report overlapping extents
	 * to user space, we trim the length of the previously cached extent and
	 * emit it.
	 *
	 * Upon calling btrfs_next_leaf() we may also find an extent with an
	 * offset smaller than or equal to cache->offset, and this happens
	 * when we had a hole or prealloc extent with several delalloc ranges in
	 * it, but after btrfs_next_leaf() released the path, delalloc was
	 * flushed and the resulting ordered extents were completed, so we can
	 * now have found a file extent item for an offset that is smaller than
	 * or equal to what we have in cache->offset. We deal with this as
	 * described below.
	 */
	cache_end = cache->offset + cache->len;
	if (cache_end > offset) {
		if (offset == cache->offset) {
			/*
			 * We cached a delalloc range (found in the io tree) for
			 * a hole or prealloc extent and we have now found a
			 * file extent item for the same offset. What we have
			 * now is more recent and up to date, so discard what
			 * we had in the cache and use what we have just found.
			 */
			goto assign;
		} else if (offset > cache->offset) {
			/*
			 * The extent range we previously found ends after the
			 * offset of the file extent item we found and that
			 * offset falls somewhere in the middle of that previous
			 * extent range. So adjust the range we previously found
			 * to end at the offset of the file extent item we have
			 * just found, since this extent is more up to date.
			 * Emit that adjusted range and cache the file extent
			 * item we have just found. This corresponds to the case
			 * where a previously found file extent item was split
			 * due to an ordered extent completing.
			 */
			cache->len = offset - cache->offset;
			goto emit;
		} else {
			const u64 range_end = offset + len;

			/*
			 * The offset of the file extent item we have just found
			 * is behind the cached offset. This means we were
			 * processing a hole or prealloc extent for which we
			 * have found delalloc ranges (in the io tree), so what
			 * we have in the cache is the last delalloc range we
			 * found while the file extent item we found can be
			 * either for a whole delalloc range we previously
			 * emitted or only a part of that range.
			 *
			 * We have two cases here:
			 *
			 * 1) The file extent item's range ends at or behind the
			 *    cached extent's end. In this case just ignore the
			 *    current file extent item because we don't want to
			 *    overlap with previous ranges that may have been
			 *    emitted already;
			 *
			 * 2) The file extent item starts behind the currently
			 *    cached extent but its end offset goes beyond the
			 *    end offset of the cached extent. We don't want to
			 *    overlap with a previous range that may have been
			 *    emitted already, so we emit the currently cached
			 *    extent and then partially store the current file
			 *    extent item's range in the cache, for the subrange
			 *    going from the cached extent's end to the end of
			 *    the file extent item.
			 */
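			/*
			 * Example (illustrative): with a cached delalloc range
			 * [4M, 5M[ and a file extent item found for [3M, 6M[,
			 * case 2 applies: the cached range is emitted as is and
			 * the cache is updated to cover only [5M, 6M[.
			 */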
			if (range_end <= cache_end)
				return 0;

			if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
				phys += cache_end - offset;

			offset = cache_end;
			len = range_end - cache_end;
			goto emit;
		}
	}

	/*
	 * Only merge fiemap extents if:
	 *
	 * 1) Their logical addresses are contiguous;
	 *
	 * 2) Their physical addresses are contiguous, so truly compressed
	 *    (physical size smaller than logical size) extents won't get
	 *    merged with each other;
	 *
	 * 3) They share the same flags.
	 */
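	/*
	 * Example (illustrative): a cached extent for file range [0, 128K[ at
	 * physical offset 1M merges with a new extent for [128K, 192K[ at
	 * physical offset 1M + 128K that has the same flags, growing the
	 * cached entry to cover [0, 192K[.
	 */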
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    cache->flags == flags) {
		cache->len += len;
		return 0;
	}

emit:
	/* Not mergeable, need to submit the cached one. */

	if (cache->entries_pos == cache->entries_size) {
		/*
		 * We will need to re-search starting from the end offset of the
		 * last stored extent and not from the current offset, because
		 * after unlocking the range and releasing the path, if there's
		 * a hole between that end offset and this current offset, a new
		 * extent may have been inserted due to a new write, so we don't
		 * want to miss it.
		 */
		entry = &cache->entries[cache->entries_size - 1];
		cache->next_search_offset = entry->offset + entry->len;
		cache->cached = false;

		return BTRFS_FIEMAP_FLUSH_CACHE;
	}

	entry = &cache->entries[cache->entries_pos];
	entry->offset = cache->offset;
	entry->phys = cache->phys;
	entry->len = cache->len;
	entry->flags = cache->flags;
	cache->entries_pos++;
	cache->extents_mapped++;

	if (cache->extents_mapped == fieinfo->fi_extents_max) {
		cache->cached = false;
		return 1;
	}
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;

	return 0;
}

/*
 * Emit the last fiemap cache.
 *
 * The last fiemap extent may still be cached, and not yet emitted, in the
 * following case:
 * 0                  4k                      8k
 * |<- Fiemap range ->|
 * |<------------ First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct extent_buffer *clone = path->nodes[0];
	struct btrfs_key key;
	int slot;
	int ret;

	path->slots[0]++;
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
		return 0;

	/*
	 * Add a temporary extra ref to an already cloned extent buffer to
	 * prevent btrfs_next_leaf() from freeing it. We want to reuse it to
	 * avoid the cost of allocating a new one.
	 */
	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
	atomic_inc(&clone->refs);

	ret = btrfs_next_leaf(inode->root, path);
	if (ret != 0)
		goto out;

	/*
	 * Don't bother with cloning if there are no more file extent items for
	 * our inode.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = 1;
		goto out;
	}

	/*
	 * Important to preserve the start field, for the optimizations when
	 * checking if extents are shared (see extent_fiemap()).
	 *
	 * We must set ->start before calling copy_extent_buffer_full(). If we
	 * are on a sub-pagesize blocksize, we use ->start to determine the
	 * offset into the folio where our eb exists, and if we update ->start
	 * after the fact then any subsequent reads of the eb may read from a
	 * different offset in the folio than the one we originally copied into.
	 */
	clone->start = path->nodes[0]->start;
	/* See the comment at fiemap_search_slot() about why we clone. */
	copy_extent_buffer_full(clone, path->nodes[0]);

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;
out:
	if (ret)
		free_extent_buffer(clone);

	return ret;
}

/*
 * Search for the first file extent item that starts at a given file offset or
 * the one that starts immediately before that offset.
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
			      u64 file_offset)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *clone;
	struct btrfs_key key;
	int slot;
	int ret;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = file_offset;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret != 0)
			return ret;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			return 1;
	}

	/*
	 * We clone the leaf and use it during fiemap. This is because while
	 * using the leaf we do expensive things like checking if an extent is
	 * shared, which can take a long time. In order to prevent blocking
	 * other tasks for too long, we use a clone of the leaf. We have locked
	 * the file range in the inode's io tree, so we know none of our file
	 * extent items can change. This way we avoid blocking other tasks that
	 * want to insert items for other inodes in the same leaf or b+tree
	 * rebalance operations (triggered for example when someone is trying
	 * to push items into this leaf when trying to insert an item in a
	 * neighbour leaf).
	 * We also need the private clone because holding a read lock on an
	 * extent buffer of the subvolume's b+tree will make lockdep unhappy
	 * when we check if extents are shared, as backref walking may need to
	 * lock the same leaf we are processing.
	 */
	clone = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!clone)
		return -ENOMEM;

	slot = path->slots[0];
	btrfs_release_path(path);
	path->nodes[0] = clone;
	path->slots[0] = slot;

	return 0;
}

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 */
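/*
 * Example (illustrative, assuming i_size covers the range): for a 1M prealloc
 * extent at file offset 0 with a single delalloc range at [256K, 512K[, this
 * reports three fiemap extents: prealloc [0, 256K[, delalloc [256K, 512K[ and
 * prealloc [512K, 1M[.
 */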
static int fiemap_process_hole(struct btrfs_inode *inode,
			       struct fiemap_extent_info *fieinfo,
			       struct fiemap_cache *cache,
			       struct extent_state **delalloc_cached_state,
			       struct btrfs_backref_share_check_ctx *backref_ctx,
			       u64 disk_bytenr, u64 extent_offset,
			       u64 extent_gen,
			       u64 start, u64 end)
{
	const u64 i_size = i_size_read(&inode->vfs_inode);
	u64 cur_offset = start;
	u64 last_delalloc_end = 0;
	u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
	bool checked_extent_shared = false;
	int ret;

	/*
	 * There can be no delalloc past i_size, so don't waste time looking for
	 * it beyond i_size.
	 */
	while (cur_offset < end && cur_offset < i_size) {
		u64 delalloc_start;
		u64 delalloc_end;
		u64 prealloc_start;
		u64 prealloc_len = 0;
		bool delalloc;

		delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
							delalloc_cached_state,
							&delalloc_start,
							&delalloc_end);
		if (!delalloc)
			break;

		/*
		 * If this is a prealloc extent we have to report every section
		 * of it that has no delalloc.
		 */
		if (disk_bytenr != 0) {
			if (last_delalloc_end == 0) {
				prealloc_start = start;
				prealloc_len = delalloc_start - start;
			} else {
				prealloc_start = last_delalloc_end + 1;
				prealloc_len = delalloc_start - prealloc_start;
			}
		}

		if (prealloc_len > 0) {
			if (!checked_extent_shared && fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					return ret;
				else if (ret > 0)
					prealloc_flags |= FIEMAP_EXTENT_SHARED;

				checked_extent_shared = true;
			}
			ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
						 disk_bytenr + extent_offset,
						 prealloc_len, prealloc_flags);
			if (ret)
				return ret;
			extent_offset += prealloc_len;
		}

		ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
					 delalloc_end + 1 - delalloc_start,
					 FIEMAP_EXTENT_DELALLOC |
					 FIEMAP_EXTENT_UNKNOWN);
		if (ret)
			return ret;

		last_delalloc_end = delalloc_end;
		cur_offset = delalloc_end + 1;
		extent_offset += cur_offset - delalloc_start;
		cond_resched();
	}

	/*
	 * Either we found no delalloc for the whole prealloc extent or we have
	 * a prealloc extent that spans i_size or starts at or after i_size.
	 */
	if (disk_bytenr != 0 && last_delalloc_end < end) {
		u64 prealloc_start;
		u64 prealloc_len;

		if (last_delalloc_end == 0) {
			prealloc_start = start;
			prealloc_len = end + 1 - start;
		} else {
			prealloc_start = last_delalloc_end + 1;
			prealloc_len = end + 1 - prealloc_start;
		}

		if (!checked_extent_shared && fieinfo->fi_extents_max) {
			ret = btrfs_is_data_extent_shared(inode,
							  disk_bytenr,
							  extent_gen,
							  backref_ctx);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				prealloc_flags |= FIEMAP_EXTENT_SHARED;
		}
		ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
					 disk_bytenr + extent_offset,
					 prealloc_len, prealloc_flags);
		if (ret)
			return ret;
	}

	return 0;
}

static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
					  struct btrfs_path *path,
					  u64 *last_extent_end_ret)
{
	const u64 ino = btrfs_ino(inode);
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	u64 disk_bytenr;
	int ret;

	/*
	 * Lookup the last file extent. We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
	/* There can't be a file extent item at offset (u64)-1 */
	ASSERT(ret != 0);
	if (ret < 0)
		return ret;

	/*
	 * For a non-existing key, btrfs_search_slot() always leaves us at a
	 * slot > 0, except if the btree is empty, which is impossible because
	 * at least it has the inode item for this inode and all the items for
	 * the root inode 256.
	 */
	ASSERT(path->slots[0] > 0);
	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
		/* No file extent items in the subvolume tree. */
		*last_extent_end_ret = 0;
		return 0;
	}

	/*
	 * For an inline extent, the disk_bytenr is where the inline data starts,
	 * so first check if we have an inline extent item before checking if we
	 * have an implicit hole (disk_bytenr == 0).
	 */
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
		*last_extent_end_ret = btrfs_file_extent_end(path);
		return 0;
	}

	/*
	 * Find the last file extent item that is not a hole (when NO_HOLES is
	 * not enabled). This should take at most 2 iterations in the worst
	 * case: we have one hole file extent item at slot 0 of a leaf and
	 * another hole file extent item as the last item in the previous leaf.
	 * This is because we merge file extent items that represent holes.
	 */
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	while (disk_bytenr == 0) {
		ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
		if (ret < 0) {
			return ret;
		} else if (ret > 0) {
			/* No file extent items that are not holes. */
			*last_extent_end_ret = 0;
			return 0;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	}

	*last_extent_end_ret = btrfs_file_extent_end(path);
	return 0;
}

static int extent_fiemap(struct btrfs_inode *inode,
			 struct fiemap_extent_info *fieinfo,
			 u64 start, u64 len)
{
	const u64 ino = btrfs_ino(inode);
	struct extent_state *cached_state = NULL;
	struct extent_state *delalloc_cached_state = NULL;
	struct btrfs_path *path;
	struct fiemap_cache cache = { 0 };
	struct btrfs_backref_share_check_ctx *backref_ctx;
	u64 last_extent_end = 0;
	u64 prev_extent_end;
	u64 range_start;
	u64 range_end;
	const u64 sectorsize = inode->root->fs_info->sectorsize;
	bool stopped = false;
	int ret;

	cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
	cache.entries = kmalloc_array(cache.entries_size,
				      sizeof(struct btrfs_fiemap_entry),
				      GFP_KERNEL);
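	/*
	 * Illustrative sizing note: with 4K pages and the padded 32-byte
	 * btrfs_fiemap_entry of a typical 64-bit build, this buffers 128 ready
	 * extents between flushes of the cache to the fiemap buffer.
	 */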
	backref_ctx = btrfs_alloc_backref_share_check_ctx();
	path = btrfs_alloc_path();
	if (!cache.entries || !backref_ctx || !path) {
		ret = -ENOMEM;
		goto out;
	}

restart:
	range_start = round_down(start, sectorsize);
	range_end = round_up(start + len, sectorsize);
	prev_extent_end = range_start;

	lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
	if (ret < 0)
		goto out_unlock;
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	ret = fiemap_search_slot(inode, path, range_start);
	if (ret < 0) {
		goto out_unlock;
	} else if (ret > 0) {
		/*
		 * No file extent item found, but we may have delalloc between
		 * the current offset and i_size. So check for that.
		 */
		ret = 0;
		goto check_eof_delalloc;
	}

	while (prev_extent_end < range_end) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_file_extent_item *ei;
		struct btrfs_key key;
		u64 extent_end;
		u64 extent_len;
		u64 extent_offset = 0;
		u64 extent_gen;
		u64 disk_bytenr = 0;
		u64 flags = 0;
		int extent_type;
		u8 compression;

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		extent_end = btrfs_file_extent_end(path);

		/*
		 * The first iteration can leave us at an extent item that ends
		 * before our range's start. Move to the next item.
		 */
		if (extent_end <= range_start)
			goto next_item;

		backref_ctx->curr_leaf_bytenr = leaf->start;

		/* We have an implicit hole (NO_HOLES feature enabled). */
		if (prev_extent_end < key.offset) {
			const u64 hole_end = min(key.offset, range_end) - 1;

			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  prev_extent_end, hole_end);
			if (ret < 0) {
				goto out_unlock;
			} else if (ret > 0) {
				/* emit_fiemap_extent() told us to stop. */
				stopped = true;
				break;
			}

			/* We've reached the end of the fiemap range, stop. */
			if (key.offset >= range_end) {
				stopped = true;
				break;
			}
		}

		extent_len = extent_end - key.offset;
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		compression = btrfs_file_extent_compression(leaf, ei);
		extent_type = btrfs_file_extent_type(leaf, ei);
		extent_gen = btrfs_file_extent_generation(leaf, ei);

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
			if (compression == BTRFS_COMPRESS_NONE)
				extent_offset = btrfs_file_extent_offset(leaf, ei);
		}

		if (compression != BTRFS_COMPRESS_NONE)
			flags |= FIEMAP_EXTENT_ENCODED;

		if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			flags |= FIEMAP_EXTENT_DATA_INLINE;
			flags |= FIEMAP_EXTENT_NOT_ALIGNED;
			ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
						 extent_len, flags);
		} else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx,
						  disk_bytenr, extent_offset,
						  extent_gen, key.offset,
						  extent_end - 1);
		} else if (disk_bytenr == 0) {
			/* We have an explicit hole. */
			ret = fiemap_process_hole(inode, fieinfo, &cache,
						  &delalloc_cached_state,
						  backref_ctx, 0, 0, 0,
						  key.offset, extent_end - 1);
		} else {
			/* We have a regular extent. */
			if (fieinfo->fi_extents_max) {
				ret = btrfs_is_data_extent_shared(inode,
								  disk_bytenr,
								  extent_gen,
								  backref_ctx);
				if (ret < 0)
					goto out_unlock;
				else if (ret > 0)
					flags |= FIEMAP_EXTENT_SHARED;
			}

			ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
						 disk_bytenr + extent_offset,
						 extent_len, flags);
		}

		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* emit_fiemap_extent() told us to stop. */
			stopped = true;
			break;
		}

		prev_extent_end = extent_end;
next_item:
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out_unlock;
		}

		ret = fiemap_next_leaf_item(inode, path);
		if (ret < 0) {
			goto out_unlock;
		} else if (ret > 0) {
			/* No more file extent items for this inode. */
			break;
		}
		cond_resched();
	}

check_eof_delalloc:
	if (!stopped && prev_extent_end < range_end) {
		ret = fiemap_process_hole(inode, fieinfo, &cache,
					  &delalloc_cached_state, backref_ctx,
					  0, 0, 0, prev_extent_end, range_end - 1);
		if (ret < 0)
			goto out_unlock;
		prev_extent_end = range_end;
	}

	if (cache.cached && cache.offset + cache.len >= last_extent_end) {
		const u64 i_size = i_size_read(&inode->vfs_inode);

		if (prev_extent_end < i_size) {
			u64 delalloc_start;
			u64 delalloc_end;
			bool delalloc;

			delalloc = btrfs_find_delalloc_in_range(inode,
								prev_extent_end,
								i_size - 1,
								&delalloc_cached_state,
								&delalloc_start,
								&delalloc_end);
			if (!delalloc)
				cache.flags |= FIEMAP_EXTENT_LAST;
		} else {
			cache.flags |= FIEMAP_EXTENT_LAST;
		}
	}

out_unlock:
	unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

	if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
		btrfs_release_path(path);
		ret = flush_fiemap_cache(fieinfo, &cache);
		if (ret)
			goto out;
		len -= cache.next_search_offset - start;
		start = cache.next_search_offset;
		goto restart;
	} else if (ret < 0) {
		goto out;
	}

	/*
	 * Must free the path before emitting to the fiemap buffer because we
	 * may have a non-cloned leaf and if the fiemap buffer is memory mapped
	 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
	 * waiting for an ordered extent that needs to modify that leaf in order
	 * to complete, therefore leading to a deadlock.
	 */
	btrfs_free_path(path);
	path = NULL;

	ret = flush_fiemap_cache(fieinfo, &cache);
	if (ret)
		goto out;

	ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
	free_extent_state(delalloc_cached_state);
	kfree(cache.entries);
	btrfs_free_backref_share_ctx(backref_ctx);
	btrfs_free_path(path);
	return ret;
}

int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of data (in an async thread) and will return
	 * before the compression is done and writeback is started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);

	/*
	 * We did an initial flush to avoid holding the inode's lock while
	 * triggering writeback and waiting for the completion of IO and ordered
	 * extents. Now after we locked the inode we do it again, because it's
	 * possible a new write may have happened in between those two steps.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
		if (ret) {
			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
			return ret;
		}
	}

	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);

	return ret;
}
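
/*
 * Illustrative userspace sketch (not part of this file; assumes the usual
 * libc headers plus <linux/fiemap.h> and <linux/fs.h>) of how the path above
 * is exercised, with room for up to 32 extents and a sync of dirty data first:
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;
 *	fm->fm_extent_count = 32;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 */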