1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "backref.h" 4 #include "btrfs_inode.h" 5 #include "fiemap.h" 6 #include "file.h" 7 #include "file-item.h" 8 9 struct btrfs_fiemap_entry { 10 u64 offset; 11 u64 phys; 12 u64 len; 13 u32 flags; 14 }; 15 16 /* 17 * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file 18 * range from the inode's io tree, unlock the subvolume tree search path, flush 19 * the fiemap cache and relock the file range and research the subvolume tree. 20 * The value here is something negative that can't be confused with a valid 21 * errno value and different from 1 because that's also a return value from 22 * fiemap_fill_next_extent() and also it's often used to mean some btree search 23 * did not find a key, so make it some distinct negative value. 24 */ 25 #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) 26 27 /* 28 * Used to: 29 * 30 * - Cache the next entry to be emitted to the fiemap buffer, so that we can 31 * merge extents that are contiguous and can be grouped as a single one; 32 * 33 * - Store extents ready to be written to the fiemap buffer in an intermediary 34 * buffer. This intermediary buffer is to ensure that in case the fiemap 35 * buffer is memory mapped to the fiemap target file, we don't deadlock 36 * during btrfs_page_mkwrite(). This is because during fiemap we are locking 37 * an extent range in order to prevent races with delalloc flushing and 38 * ordered extent completion, which is needed in order to reliably detect 39 * delalloc in holes and prealloc extents. And this can lead to a deadlock 40 * if the fiemap buffer is memory mapped to the file we are running fiemap 41 * against (a silly, useless in practice scenario, but possible) because 42 * btrfs_page_mkwrite() will try to lock the same extent range. 43 */ 44 struct fiemap_cache { 45 /* An array of ready fiemap entries. */ 46 struct btrfs_fiemap_entry *entries; 47 /* Number of entries in the entries array. */ 48 int entries_size; 49 /* Index of the next entry in the entries array to write to. */ 50 int entries_pos; 51 /* 52 * Once the entries array is full, this indicates what's the offset for 53 * the next file extent item we must search for in the inode's subvolume 54 * tree after unlocking the extent range in the inode's io tree and 55 * releasing the search path. 56 */ 57 u64 next_search_offset; 58 /* 59 * This matches struct fiemap_extent_info::fi_mapped_extents, we use it 60 * to count ourselves emitted extents and stop instead of relying on 61 * fiemap_fill_next_extent() because we buffer ready fiemap entries at 62 * the @entries array, and we want to stop as soon as we hit the max 63 * amount of extents to map, not just to save time but also to make the 64 * logic at extent_fiemap() simpler. 65 */ 66 unsigned int extents_mapped; 67 /* Fields for the cached extent (unsubmitted, not ready, extent). */ 68 u64 offset; 69 u64 phys; 70 u64 len; 71 u32 flags; 72 bool cached; 73 }; 74 75 static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, 76 struct fiemap_cache *cache) 77 { 78 for (int i = 0; i < cache->entries_pos; i++) { 79 struct btrfs_fiemap_entry *entry = &cache->entries[i]; 80 int ret; 81 82 ret = fiemap_fill_next_extent(fieinfo, entry->offset, 83 entry->phys, entry->len, 84 entry->flags); 85 /* 86 * Ignore 1 (reached max entries) because we keep track of that 87 * ourselves in emit_fiemap_extent(). 88 */ 89 if (ret < 0) 90 return ret; 91 } 92 cache->entries_pos = 0; 93 94 return 0; 95 } 96 97 /* 98 * Helper to submit fiemap extent. 99 * 100 * Will try to merge current fiemap extent specified by @offset, @phys, 101 * @len and @flags with cached one. 102 * And only when we fails to merge, cached one will be submitted as 103 * fiemap extent. 104 * 105 * Return value is the same as fiemap_fill_next_extent(). 106 */ 107 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 108 struct fiemap_cache *cache, 109 u64 offset, u64 phys, u64 len, u32 flags) 110 { 111 struct btrfs_fiemap_entry *entry; 112 u64 cache_end; 113 114 /* Set at the end of extent_fiemap(). */ 115 ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); 116 117 if (!cache->cached) 118 goto assign; 119 120 /* 121 * When iterating the extents of the inode, at extent_fiemap(), we may 122 * find an extent that starts at an offset behind the end offset of the 123 * previous extent we processed. This happens if fiemap is called 124 * without FIEMAP_FLAG_SYNC and there are ordered extents completing 125 * after we had to unlock the file range, release the search path, emit 126 * the fiemap extents stored in the buffer (cache->entries array) and 127 * the lock the remainder of the range and re-search the btree. 128 * 129 * For example we are in leaf X processing its last item, which is the 130 * file extent item for file range [512K, 1M[, and after 131 * btrfs_next_leaf() releases the path, there's an ordered extent that 132 * completes for the file range [768K, 2M[, and that results in trimming 133 * the file extent item so that it now corresponds to the file range 134 * [512K, 768K[ and a new file extent item is inserted for the file 135 * range [768K, 2M[, which may end up as the last item of leaf X or as 136 * the first item of the next leaf - in either case btrfs_next_leaf() 137 * will leave us with a path pointing to the new extent item, for the 138 * file range [768K, 2M[, since that's the first key that follows the 139 * last one we processed. So in order not to report overlapping extents 140 * to user space, we trim the length of the previously cached extent and 141 * emit it. 142 * 143 * Upon calling btrfs_next_leaf() we may also find an extent with an 144 * offset smaller than or equals to cache->offset, and this happens 145 * when we had a hole or prealloc extent with several delalloc ranges in 146 * it, but after btrfs_next_leaf() released the path, delalloc was 147 * flushed and the resulting ordered extents were completed, so we can 148 * now have found a file extent item for an offset that is smaller than 149 * or equals to what we have in cache->offset. We deal with this as 150 * described below. 151 */ 152 cache_end = cache->offset + cache->len; 153 if (cache_end > offset) { 154 if (offset == cache->offset) { 155 /* 156 * We cached a dealloc range (found in the io tree) for 157 * a hole or prealloc extent and we have now found a 158 * file extent item for the same offset. What we have 159 * now is more recent and up to date, so discard what 160 * we had in the cache and use what we have just found. 161 */ 162 goto assign; 163 } else if (offset > cache->offset) { 164 /* 165 * The extent range we previously found ends after the 166 * offset of the file extent item we found and that 167 * offset falls somewhere in the middle of that previous 168 * extent range. So adjust the range we previously found 169 * to end at the offset of the file extent item we have 170 * just found, since this extent is more up to date. 171 * Emit that adjusted range and cache the file extent 172 * item we have just found. This corresponds to the case 173 * where a previously found file extent item was split 174 * due to an ordered extent completing. 175 */ 176 cache->len = offset - cache->offset; 177 goto emit; 178 } else { 179 const u64 range_end = offset + len; 180 181 /* 182 * The offset of the file extent item we have just found 183 * is behind the cached offset. This means we were 184 * processing a hole or prealloc extent for which we 185 * have found delalloc ranges (in the io tree), so what 186 * we have in the cache is the last delalloc range we 187 * found while the file extent item we found can be 188 * either for a whole delalloc range we previously 189 * emmitted or only a part of that range. 190 * 191 * We have two cases here: 192 * 193 * 1) The file extent item's range ends at or behind the 194 * cached extent's end. In this case just ignore the 195 * current file extent item because we don't want to 196 * overlap with previous ranges that may have been 197 * emmitted already; 198 * 199 * 2) The file extent item starts behind the currently 200 * cached extent but its end offset goes beyond the 201 * end offset of the cached extent. We don't want to 202 * overlap with a previous range that may have been 203 * emmitted already, so we emit the currently cached 204 * extent and then partially store the current file 205 * extent item's range in the cache, for the subrange 206 * going the cached extent's end to the end of the 207 * file extent item. 208 */ 209 if (range_end <= cache_end) 210 return 0; 211 212 if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) 213 phys += cache_end - offset; 214 215 offset = cache_end; 216 len = range_end - cache_end; 217 goto emit; 218 } 219 } 220 221 /* 222 * Only merges fiemap extents if 223 * 1) Their logical addresses are continuous 224 * 225 * 2) Their physical addresses are continuous 226 * So truly compressed (physical size smaller than logical size) 227 * extents won't get merged with each other 228 * 229 * 3) Share same flags 230 */ 231 if (cache->offset + cache->len == offset && 232 cache->phys + cache->len == phys && 233 cache->flags == flags) { 234 cache->len += len; 235 return 0; 236 } 237 238 emit: 239 /* Not mergeable, need to submit cached one */ 240 241 if (cache->entries_pos == cache->entries_size) { 242 /* 243 * We will need to research for the end offset of the last 244 * stored extent and not from the current offset, because after 245 * unlocking the range and releasing the path, if there's a hole 246 * between that end offset and this current offset, a new extent 247 * may have been inserted due to a new write, so we don't want 248 * to miss it. 249 */ 250 entry = &cache->entries[cache->entries_size - 1]; 251 cache->next_search_offset = entry->offset + entry->len; 252 cache->cached = false; 253 254 return BTRFS_FIEMAP_FLUSH_CACHE; 255 } 256 257 entry = &cache->entries[cache->entries_pos]; 258 entry->offset = cache->offset; 259 entry->phys = cache->phys; 260 entry->len = cache->len; 261 entry->flags = cache->flags; 262 cache->entries_pos++; 263 cache->extents_mapped++; 264 265 if (cache->extents_mapped == fieinfo->fi_extents_max) { 266 cache->cached = false; 267 return 1; 268 } 269 assign: 270 cache->cached = true; 271 cache->offset = offset; 272 cache->phys = phys; 273 cache->len = len; 274 cache->flags = flags; 275 276 return 0; 277 } 278 279 /* 280 * Emit last fiemap cache 281 * 282 * The last fiemap cache may still be cached in the following case: 283 * 0 4k 8k 284 * |<- Fiemap range ->| 285 * |<------------ First extent ----------->| 286 * 287 * In this case, the first extent range will be cached but not emitted. 288 * So we must emit it before ending extent_fiemap(). 289 */ 290 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, 291 struct fiemap_cache *cache) 292 { 293 int ret; 294 295 if (!cache->cached) 296 return 0; 297 298 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 299 cache->len, cache->flags); 300 cache->cached = false; 301 if (ret > 0) 302 ret = 0; 303 return ret; 304 } 305 306 static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) 307 { 308 struct extent_buffer *clone = path->nodes[0]; 309 struct btrfs_key key; 310 int slot; 311 int ret; 312 313 path->slots[0]++; 314 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) 315 return 0; 316 317 /* 318 * Add a temporary extra ref to an already cloned extent buffer to 319 * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid 320 * the cost of allocating a new one. 321 */ 322 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); 323 atomic_inc(&clone->refs); 324 325 ret = btrfs_next_leaf(inode->root, path); 326 if (ret != 0) 327 goto out; 328 329 /* 330 * Don't bother with cloning if there are no more file extent items for 331 * our inode. 332 */ 333 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 334 if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { 335 ret = 1; 336 goto out; 337 } 338 339 /* 340 * Important to preserve the start field, for the optimizations when 341 * checking if extents are shared (see extent_fiemap()). 342 * 343 * We must set ->start before calling copy_extent_buffer_full(). If we 344 * are on sub-pagesize blocksize, we use ->start to determine the offset 345 * into the folio where our eb exists, and if we update ->start after 346 * the fact then any subsequent reads of the eb may read from a 347 * different offset in the folio than where we originally copied into. 348 */ 349 clone->start = path->nodes[0]->start; 350 /* See the comment at fiemap_search_slot() about why we clone. */ 351 copy_extent_buffer_full(clone, path->nodes[0]); 352 353 slot = path->slots[0]; 354 btrfs_release_path(path); 355 path->nodes[0] = clone; 356 path->slots[0] = slot; 357 out: 358 if (ret) 359 free_extent_buffer(clone); 360 361 return ret; 362 } 363 364 /* 365 * Search for the first file extent item that starts at a given file offset or 366 * the one that starts immediately before that offset. 367 * Returns: 0 on success, < 0 on error, 1 if not found. 368 */ 369 static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, 370 u64 file_offset) 371 { 372 const u64 ino = btrfs_ino(inode); 373 struct btrfs_root *root = inode->root; 374 struct extent_buffer *clone; 375 struct btrfs_key key; 376 int slot; 377 int ret; 378 379 key.objectid = ino; 380 key.type = BTRFS_EXTENT_DATA_KEY; 381 key.offset = file_offset; 382 383 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 384 if (ret < 0) 385 return ret; 386 387 if (ret > 0 && path->slots[0] > 0) { 388 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 389 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) 390 path->slots[0]--; 391 } 392 393 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 394 ret = btrfs_next_leaf(root, path); 395 if (ret != 0) 396 return ret; 397 398 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 399 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 400 return 1; 401 } 402 403 /* 404 * We clone the leaf and use it during fiemap. This is because while 405 * using the leaf we do expensive things like checking if an extent is 406 * shared, which can take a long time. In order to prevent blocking 407 * other tasks for too long, we use a clone of the leaf. We have locked 408 * the file range in the inode's io tree, so we know none of our file 409 * extent items can change. This way we avoid blocking other tasks that 410 * want to insert items for other inodes in the same leaf or b+tree 411 * rebalance operations (triggered for example when someone is trying 412 * to push items into this leaf when trying to insert an item in a 413 * neighbour leaf). 414 * We also need the private clone because holding a read lock on an 415 * extent buffer of the subvolume's b+tree will make lockdep unhappy 416 * when we check if extents are shared, as backref walking may need to 417 * lock the same leaf we are processing. 418 */ 419 clone = btrfs_clone_extent_buffer(path->nodes[0]); 420 if (!clone) 421 return -ENOMEM; 422 423 slot = path->slots[0]; 424 btrfs_release_path(path); 425 path->nodes[0] = clone; 426 path->slots[0] = slot; 427 428 return 0; 429 } 430 431 /* 432 * Process a range which is a hole or a prealloc extent in the inode's subvolume 433 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc 434 * extent. The end offset (@end) is inclusive. 435 */ 436 static int fiemap_process_hole(struct btrfs_inode *inode, 437 struct fiemap_extent_info *fieinfo, 438 struct fiemap_cache *cache, 439 struct extent_state **delalloc_cached_state, 440 struct btrfs_backref_share_check_ctx *backref_ctx, 441 u64 disk_bytenr, u64 extent_offset, 442 u64 extent_gen, 443 u64 start, u64 end) 444 { 445 const u64 i_size = i_size_read(&inode->vfs_inode); 446 u64 cur_offset = start; 447 u64 last_delalloc_end = 0; 448 u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; 449 bool checked_extent_shared = false; 450 int ret; 451 452 /* 453 * There can be no delalloc past i_size, so don't waste time looking for 454 * it beyond i_size. 455 */ 456 while (cur_offset < end && cur_offset < i_size) { 457 u64 delalloc_start; 458 u64 delalloc_end; 459 u64 prealloc_start; 460 u64 prealloc_len = 0; 461 bool delalloc; 462 463 delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, 464 delalloc_cached_state, 465 &delalloc_start, 466 &delalloc_end); 467 if (!delalloc) 468 break; 469 470 /* 471 * If this is a prealloc extent we have to report every section 472 * of it that has no delalloc. 473 */ 474 if (disk_bytenr != 0) { 475 if (last_delalloc_end == 0) { 476 prealloc_start = start; 477 prealloc_len = delalloc_start - start; 478 } else { 479 prealloc_start = last_delalloc_end + 1; 480 prealloc_len = delalloc_start - prealloc_start; 481 } 482 } 483 484 if (prealloc_len > 0) { 485 if (!checked_extent_shared && fieinfo->fi_extents_max) { 486 ret = btrfs_is_data_extent_shared(inode, 487 disk_bytenr, 488 extent_gen, 489 backref_ctx); 490 if (ret < 0) 491 return ret; 492 else if (ret > 0) 493 prealloc_flags |= FIEMAP_EXTENT_SHARED; 494 495 checked_extent_shared = true; 496 } 497 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, 498 disk_bytenr + extent_offset, 499 prealloc_len, prealloc_flags); 500 if (ret) 501 return ret; 502 extent_offset += prealloc_len; 503 } 504 505 ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, 506 delalloc_end + 1 - delalloc_start, 507 FIEMAP_EXTENT_DELALLOC | 508 FIEMAP_EXTENT_UNKNOWN); 509 if (ret) 510 return ret; 511 512 last_delalloc_end = delalloc_end; 513 cur_offset = delalloc_end + 1; 514 extent_offset += cur_offset - delalloc_start; 515 cond_resched(); 516 } 517 518 /* 519 * Either we found no delalloc for the whole prealloc extent or we have 520 * a prealloc extent that spans i_size or starts at or after i_size. 521 */ 522 if (disk_bytenr != 0 && last_delalloc_end < end) { 523 u64 prealloc_start; 524 u64 prealloc_len; 525 526 if (last_delalloc_end == 0) { 527 prealloc_start = start; 528 prealloc_len = end + 1 - start; 529 } else { 530 prealloc_start = last_delalloc_end + 1; 531 prealloc_len = end + 1 - prealloc_start; 532 } 533 534 if (!checked_extent_shared && fieinfo->fi_extents_max) { 535 ret = btrfs_is_data_extent_shared(inode, 536 disk_bytenr, 537 extent_gen, 538 backref_ctx); 539 if (ret < 0) 540 return ret; 541 else if (ret > 0) 542 prealloc_flags |= FIEMAP_EXTENT_SHARED; 543 } 544 ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, 545 disk_bytenr + extent_offset, 546 prealloc_len, prealloc_flags); 547 if (ret) 548 return ret; 549 } 550 551 return 0; 552 } 553 554 static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, 555 struct btrfs_path *path, 556 u64 *last_extent_end_ret) 557 { 558 const u64 ino = btrfs_ino(inode); 559 struct btrfs_root *root = inode->root; 560 struct extent_buffer *leaf; 561 struct btrfs_file_extent_item *ei; 562 struct btrfs_key key; 563 u64 disk_bytenr; 564 int ret; 565 566 /* 567 * Lookup the last file extent. We're not using i_size here because 568 * there might be preallocation past i_size. 569 */ 570 ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); 571 /* There can't be a file extent item at offset (u64)-1 */ 572 ASSERT(ret != 0); 573 if (ret < 0) 574 return ret; 575 576 /* 577 * For a non-existing key, btrfs_search_slot() always leaves us at a 578 * slot > 0, except if the btree is empty, which is impossible because 579 * at least it has the inode item for this inode and all the items for 580 * the root inode 256. 581 */ 582 ASSERT(path->slots[0] > 0); 583 path->slots[0]--; 584 leaf = path->nodes[0]; 585 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 586 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { 587 /* No file extent items in the subvolume tree. */ 588 *last_extent_end_ret = 0; 589 return 0; 590 } 591 592 /* 593 * For an inline extent, the disk_bytenr is where inline data starts at, 594 * so first check if we have an inline extent item before checking if we 595 * have an implicit hole (disk_bytenr == 0). 596 */ 597 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 598 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { 599 *last_extent_end_ret = btrfs_file_extent_end(path); 600 return 0; 601 } 602 603 /* 604 * Find the last file extent item that is not a hole (when NO_HOLES is 605 * not enabled). This should take at most 2 iterations in the worst 606 * case: we have one hole file extent item at slot 0 of a leaf and 607 * another hole file extent item as the last item in the previous leaf. 608 * This is because we merge file extent items that represent holes. 609 */ 610 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 611 while (disk_bytenr == 0) { 612 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 613 if (ret < 0) { 614 return ret; 615 } else if (ret > 0) { 616 /* No file extent items that are not holes. */ 617 *last_extent_end_ret = 0; 618 return 0; 619 } 620 leaf = path->nodes[0]; 621 ei = btrfs_item_ptr(leaf, path->slots[0], 622 struct btrfs_file_extent_item); 623 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 624 } 625 626 *last_extent_end_ret = btrfs_file_extent_end(path); 627 return 0; 628 } 629 630 static int extent_fiemap(struct btrfs_inode *inode, 631 struct fiemap_extent_info *fieinfo, 632 u64 start, u64 len) 633 { 634 const u64 ino = btrfs_ino(inode); 635 struct extent_state *cached_state = NULL; 636 struct extent_state *delalloc_cached_state = NULL; 637 struct btrfs_path *path; 638 struct fiemap_cache cache = { 0 }; 639 struct btrfs_backref_share_check_ctx *backref_ctx; 640 u64 last_extent_end; 641 u64 prev_extent_end; 642 u64 range_start; 643 u64 range_end; 644 const u64 sectorsize = inode->root->fs_info->sectorsize; 645 bool stopped = false; 646 int ret; 647 648 cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); 649 cache.entries = kmalloc_array(cache.entries_size, 650 sizeof(struct btrfs_fiemap_entry), 651 GFP_KERNEL); 652 backref_ctx = btrfs_alloc_backref_share_check_ctx(); 653 path = btrfs_alloc_path(); 654 if (!cache.entries || !backref_ctx || !path) { 655 ret = -ENOMEM; 656 goto out; 657 } 658 659 restart: 660 range_start = round_down(start, sectorsize); 661 range_end = round_up(start + len, sectorsize); 662 prev_extent_end = range_start; 663 664 lock_extent(&inode->io_tree, range_start, range_end, &cached_state); 665 666 ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); 667 if (ret < 0) 668 goto out_unlock; 669 btrfs_release_path(path); 670 671 path->reada = READA_FORWARD; 672 ret = fiemap_search_slot(inode, path, range_start); 673 if (ret < 0) { 674 goto out_unlock; 675 } else if (ret > 0) { 676 /* 677 * No file extent item found, but we may have delalloc between 678 * the current offset and i_size. So check for that. 679 */ 680 ret = 0; 681 goto check_eof_delalloc; 682 } 683 684 while (prev_extent_end < range_end) { 685 struct extent_buffer *leaf = path->nodes[0]; 686 struct btrfs_file_extent_item *ei; 687 struct btrfs_key key; 688 u64 extent_end; 689 u64 extent_len; 690 u64 extent_offset = 0; 691 u64 extent_gen; 692 u64 disk_bytenr = 0; 693 u64 flags = 0; 694 int extent_type; 695 u8 compression; 696 697 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 698 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 699 break; 700 701 extent_end = btrfs_file_extent_end(path); 702 703 /* 704 * The first iteration can leave us at an extent item that ends 705 * before our range's start. Move to the next item. 706 */ 707 if (extent_end <= range_start) 708 goto next_item; 709 710 backref_ctx->curr_leaf_bytenr = leaf->start; 711 712 /* We have in implicit hole (NO_HOLES feature enabled). */ 713 if (prev_extent_end < key.offset) { 714 const u64 hole_end = min(key.offset, range_end) - 1; 715 716 ret = fiemap_process_hole(inode, fieinfo, &cache, 717 &delalloc_cached_state, 718 backref_ctx, 0, 0, 0, 719 prev_extent_end, hole_end); 720 if (ret < 0) { 721 goto out_unlock; 722 } else if (ret > 0) { 723 /* fiemap_fill_next_extent() told us to stop. */ 724 stopped = true; 725 break; 726 } 727 728 /* We've reached the end of the fiemap range, stop. */ 729 if (key.offset >= range_end) { 730 stopped = true; 731 break; 732 } 733 } 734 735 extent_len = extent_end - key.offset; 736 ei = btrfs_item_ptr(leaf, path->slots[0], 737 struct btrfs_file_extent_item); 738 compression = btrfs_file_extent_compression(leaf, ei); 739 extent_type = btrfs_file_extent_type(leaf, ei); 740 extent_gen = btrfs_file_extent_generation(leaf, ei); 741 742 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 743 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 744 if (compression == BTRFS_COMPRESS_NONE) 745 extent_offset = btrfs_file_extent_offset(leaf, ei); 746 } 747 748 if (compression != BTRFS_COMPRESS_NONE) 749 flags |= FIEMAP_EXTENT_ENCODED; 750 751 if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 752 flags |= FIEMAP_EXTENT_DATA_INLINE; 753 flags |= FIEMAP_EXTENT_NOT_ALIGNED; 754 ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, 755 extent_len, flags); 756 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 757 ret = fiemap_process_hole(inode, fieinfo, &cache, 758 &delalloc_cached_state, 759 backref_ctx, 760 disk_bytenr, extent_offset, 761 extent_gen, key.offset, 762 extent_end - 1); 763 } else if (disk_bytenr == 0) { 764 /* We have an explicit hole. */ 765 ret = fiemap_process_hole(inode, fieinfo, &cache, 766 &delalloc_cached_state, 767 backref_ctx, 0, 0, 0, 768 key.offset, extent_end - 1); 769 } else { 770 /* We have a regular extent. */ 771 if (fieinfo->fi_extents_max) { 772 ret = btrfs_is_data_extent_shared(inode, 773 disk_bytenr, 774 extent_gen, 775 backref_ctx); 776 if (ret < 0) 777 goto out_unlock; 778 else if (ret > 0) 779 flags |= FIEMAP_EXTENT_SHARED; 780 } 781 782 ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 783 disk_bytenr + extent_offset, 784 extent_len, flags); 785 } 786 787 if (ret < 0) { 788 goto out_unlock; 789 } else if (ret > 0) { 790 /* emit_fiemap_extent() told us to stop. */ 791 stopped = true; 792 break; 793 } 794 795 prev_extent_end = extent_end; 796 next_item: 797 if (fatal_signal_pending(current)) { 798 ret = -EINTR; 799 goto out_unlock; 800 } 801 802 ret = fiemap_next_leaf_item(inode, path); 803 if (ret < 0) { 804 goto out_unlock; 805 } else if (ret > 0) { 806 /* No more file extent items for this inode. */ 807 break; 808 } 809 cond_resched(); 810 } 811 812 check_eof_delalloc: 813 if (!stopped && prev_extent_end < range_end) { 814 ret = fiemap_process_hole(inode, fieinfo, &cache, 815 &delalloc_cached_state, backref_ctx, 816 0, 0, 0, prev_extent_end, range_end - 1); 817 if (ret < 0) 818 goto out_unlock; 819 prev_extent_end = range_end; 820 } 821 822 if (cache.cached && cache.offset + cache.len >= last_extent_end) { 823 const u64 i_size = i_size_read(&inode->vfs_inode); 824 825 if (prev_extent_end < i_size) { 826 u64 delalloc_start; 827 u64 delalloc_end; 828 bool delalloc; 829 830 delalloc = btrfs_find_delalloc_in_range(inode, 831 prev_extent_end, 832 i_size - 1, 833 &delalloc_cached_state, 834 &delalloc_start, 835 &delalloc_end); 836 if (!delalloc) 837 cache.flags |= FIEMAP_EXTENT_LAST; 838 } else { 839 cache.flags |= FIEMAP_EXTENT_LAST; 840 } 841 } 842 843 out_unlock: 844 unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); 845 846 if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { 847 btrfs_release_path(path); 848 ret = flush_fiemap_cache(fieinfo, &cache); 849 if (ret) 850 goto out; 851 len -= cache.next_search_offset - start; 852 start = cache.next_search_offset; 853 goto restart; 854 } else if (ret < 0) { 855 goto out; 856 } 857 858 /* 859 * Must free the path before emitting to the fiemap buffer because we 860 * may have a non-cloned leaf and if the fiemap buffer is memory mapped 861 * to a file, a write into it (through btrfs_page_mkwrite()) may trigger 862 * waiting for an ordered extent that in order to complete needs to 863 * modify that leaf, therefore leading to a deadlock. 864 */ 865 btrfs_free_path(path); 866 path = NULL; 867 868 ret = flush_fiemap_cache(fieinfo, &cache); 869 if (ret) 870 goto out; 871 872 ret = emit_last_fiemap_cache(fieinfo, &cache); 873 out: 874 free_extent_state(delalloc_cached_state); 875 kfree(cache.entries); 876 btrfs_free_backref_share_ctx(backref_ctx); 877 btrfs_free_path(path); 878 return ret; 879 } 880 881 int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 882 u64 start, u64 len) 883 { 884 struct btrfs_inode *btrfs_inode = BTRFS_I(inode); 885 int ret; 886 887 ret = fiemap_prep(inode, fieinfo, start, &len, 0); 888 if (ret) 889 return ret; 890 891 /* 892 * fiemap_prep() called filemap_write_and_wait() for the whole possible 893 * file range (0 to LLONG_MAX), but that is not enough if we have 894 * compression enabled. The first filemap_fdatawrite_range() only kicks 895 * in the compression of data (in an async thread) and will return 896 * before the compression is done and writeback is started. A second 897 * filemap_fdatawrite_range() is needed to wait for the compression to 898 * complete and writeback to start. We also need to wait for ordered 899 * extents to complete, because our fiemap implementation uses mainly 900 * file extent items to list the extents, searching for extent maps 901 * only for file ranges with holes or prealloc extents to figure out 902 * if we have delalloc in those ranges. 903 */ 904 if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { 905 ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); 906 if (ret) 907 return ret; 908 } 909 910 btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); 911 912 /* 913 * We did an initial flush to avoid holding the inode's lock while 914 * triggering writeback and waiting for the completion of IO and ordered 915 * extents. Now after we locked the inode we do it again, because it's 916 * possible a new write may have happened in between those two steps. 917 */ 918 if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { 919 ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX); 920 if (ret) { 921 btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); 922 return ret; 923 } 924 } 925 926 ret = extent_fiemap(btrfs_inode, fieinfo, start, len); 927 btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); 928 929 return ret; 930 } 931