1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "extent_io.h" 18 #include "extent-io-tree.h" 19 #include "extent_map.h" 20 #include "ctree.h" 21 #include "btrfs_inode.h" 22 #include "bio.h" 23 #include "locking.h" 24 #include "backref.h" 25 #include "disk-io.h" 26 #include "subpage.h" 27 #include "zoned.h" 28 #include "block-group.h" 29 #include "compression.h" 30 #include "fs.h" 31 #include "accessors.h" 32 #include "file-item.h" 33 #include "file.h" 34 #include "dev-replace.h" 35 #include "super.h" 36 #include "transaction.h" 37 38 static struct kmem_cache *extent_buffer_cache; 39 40 #ifdef CONFIG_BTRFS_DEBUG 41 static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) 42 { 43 struct btrfs_fs_info *fs_info = eb->fs_info; 44 unsigned long flags; 45 46 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 47 list_add(&eb->leak_list, &fs_info->allocated_ebs); 48 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 49 } 50 51 static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) 52 { 53 struct btrfs_fs_info *fs_info = eb->fs_info; 54 unsigned long flags; 55 56 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 57 list_del(&eb->leak_list); 58 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 59 } 60 61 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 62 { 63 struct extent_buffer *eb; 64 unsigned long flags; 65 66 /* 67 * If we didn't get into open_ctree our allocated_ebs will not be 68 * initialized, so just skip this. 69 */ 70 if (!fs_info->allocated_ebs.next) 71 return; 72 73 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 74 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 75 while (!list_empty(&fs_info->allocated_ebs)) { 76 eb = list_first_entry(&fs_info->allocated_ebs, 77 struct extent_buffer, leak_list); 78 btrfs_err(fs_info, 79 "buffer leak start %llu len %u refs %d bflags %lu owner %llu", 80 eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, 81 btrfs_header_owner(eb)); 82 list_del(&eb->leak_list); 83 WARN_ON_ONCE(1); 84 kmem_cache_free(extent_buffer_cache, eb); 85 } 86 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 87 } 88 #else 89 #define btrfs_leak_debug_add_eb(eb) do {} while (0) 90 #define btrfs_leak_debug_del_eb(eb) do {} while (0) 91 #endif 92 93 /* 94 * Structure to record info about the bio being assembled, and other info like 95 * how many bytes are there before stripe/ordered extent boundary. 96 */ 97 struct btrfs_bio_ctrl { 98 struct btrfs_bio *bbio; 99 /* Last byte contained in bbio + 1 . */ 100 loff_t next_file_offset; 101 enum btrfs_compression_type compress_type; 102 u32 len_to_oe_boundary; 103 blk_opf_t opf; 104 /* 105 * For data read bios, we attempt to optimize csum lookups if the extent 106 * generation is older than the current one. To make this possible, we 107 * need to track the maximum generation of an extent in a bio_ctrl to 108 * make the decision when submitting the bio. 109 * 110 * The pattern between do_readpage(), submit_one_bio() and 111 * submit_extent_folio() is quite subtle, so tracking this is tricky. 
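 * As a rough illustration: if the current filesystem generation is 100,
 * a bio whose extents all have generation 99 or older can have its csums
 * looked up in the commit root, while a bio that contains an extent of
 * generation 100 cannot, so it matters which bio an extent's generation
 * gets attributed to.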
112 * 113 * As we process extent E, we might submit a bio with existing built up 114 * extents before adding E to a new bio, or we might just add E to the 115 * bio. As a result, E's generation could apply to the current bio or 116 * to the next one, so we need to be careful to update the bio_ctrl's 117 * generation with E's only when we are sure E is added to bio_ctrl->bbio 118 * in submit_extent_folio(). 119 * 120 * See the comment in btrfs_lookup_bio_sums() for more detail on the 121 * need for this optimization. 122 */ 123 u64 generation; 124 btrfs_bio_end_io_t end_io_func; 125 struct writeback_control *wbc; 126 127 /* 128 * The sectors of the page which are going to be submitted by 129 * extent_writepage_io(). 130 * This is to avoid touching ranges covered by compression/inline. 131 */ 132 unsigned long submit_bitmap; 133 struct readahead_control *ractl; 134 135 /* 136 * The start offset of the last used extent map by a read operation. 137 * 138 * This is for proper compressed read merge. 139 * U64_MAX means we are starting the read and have made no progress yet. 140 * 141 * The current btrfs_bio_is_contig() only uses disk_bytenr as 142 * the condition to check if the read can be merged with previous 143 * bio, which is not correct. E.g. two file extents pointing to the 144 * same extent but with different offset. 145 * 146 * So here we need to do extra checks to only merge reads that are 147 * covered by the same extent map. 148 * Just extent_map::start will be enough, as they are unique 149 * inside the same inode. 150 */ 151 u64 last_em_start; 152 }; 153 154 /* 155 * Helper to set the csum search commit root option for a bio_ctrl's bbio 156 * before submitting the bio. 157 * 158 * Only for use by submit_one_bio(). 159 */ 160 static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) 161 { 162 struct btrfs_bio *bbio = bio_ctrl->bbio; 163 164 ASSERT(bbio); 165 166 if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) 167 return; 168 169 bio_ctrl->bbio->csum_search_commit_root = 170 (bio_ctrl->generation && 171 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); 172 } 173 174 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 175 { 176 struct btrfs_bio *bbio = bio_ctrl->bbio; 177 178 if (!bbio) 179 return; 180 181 /* Caller should ensure the bio has at least some range added */ 182 ASSERT(bbio->bio.bi_iter.bi_size); 183 184 bio_set_csum_search_commit_root(bio_ctrl); 185 186 if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && 187 bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 188 btrfs_submit_compressed_read(bbio); 189 else 190 btrfs_submit_bbio(bbio, 0); 191 192 /* The bbio is owned by the end_io handler now */ 193 bio_ctrl->bbio = NULL; 194 /* 195 * We used the generation to decide whether to lookup csums in the 196 * commit_root or not when we called bio_set_csum_search_commit_root() 197 * above. Now, reset the generation for the next bio. 198 */ 199 bio_ctrl->generation = 0; 200 } 201 202 /* 203 * Submit or fail the current bio in the bio_ctrl structure. 
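 * If @ret is zero the bio is submitted normally; if @ret is negative the
 * bio is not submitted but completed immediately with the matching error
 * status. Either way ownership of the bio passes to the end_io handler.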
204 */ 205 static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) 206 { 207 struct btrfs_bio *bbio = bio_ctrl->bbio; 208 209 if (!bbio) 210 return; 211 212 if (ret) { 213 ASSERT(ret < 0); 214 btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 215 /* The bio is owned by the end_io handler now */ 216 bio_ctrl->bbio = NULL; 217 } else { 218 submit_one_bio(bio_ctrl); 219 } 220 } 221 222 int __init extent_buffer_init_cachep(void) 223 { 224 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 225 sizeof(struct extent_buffer), 0, 0, 226 NULL); 227 if (!extent_buffer_cache) 228 return -ENOMEM; 229 230 return 0; 231 } 232 233 void __cold extent_buffer_free_cachep(void) 234 { 235 /* 236 * Make sure all delayed rcu free are flushed before we 237 * destroy caches. 238 */ 239 rcu_barrier(); 240 kmem_cache_destroy(extent_buffer_cache); 241 } 242 243 static void process_one_folio(struct btrfs_fs_info *fs_info, 244 struct folio *folio, const struct folio *locked_folio, 245 unsigned long page_ops, u64 start, u64 end) 246 { 247 u32 len; 248 249 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 250 len = end + 1 - start; 251 252 if (page_ops & PAGE_SET_ORDERED) 253 btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); 254 if (page_ops & PAGE_START_WRITEBACK) { 255 btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); 256 btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); 257 } 258 if (page_ops & PAGE_END_WRITEBACK) 259 btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); 260 261 if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) 262 btrfs_folio_end_lock(fs_info, folio, start, len); 263 } 264 265 static void __process_folios_contig(struct address_space *mapping, 266 const struct folio *locked_folio, u64 start, 267 u64 end, unsigned long page_ops) 268 { 269 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 270 pgoff_t index = start >> PAGE_SHIFT; 271 pgoff_t end_index = end >> PAGE_SHIFT; 272 struct folio_batch fbatch; 273 int i; 274 275 folio_batch_init(&fbatch); 276 while (index <= end_index) { 277 int found_folios; 278 279 found_folios = filemap_get_folios_contig(mapping, &index, 280 end_index, &fbatch); 281 for (i = 0; i < found_folios; i++) { 282 struct folio *folio = fbatch.folios[i]; 283 284 process_one_folio(fs_info, folio, locked_folio, 285 page_ops, start, end); 286 } 287 folio_batch_release(&fbatch); 288 cond_resched(); 289 } 290 } 291 292 static noinline void unlock_delalloc_folio(const struct inode *inode, 293 struct folio *locked_folio, 294 u64 start, u64 end) 295 { 296 ASSERT(locked_folio); 297 298 __process_folios_contig(inode->i_mapping, locked_folio, start, end, 299 PAGE_UNLOCK); 300 } 301 302 static noinline int lock_delalloc_folios(struct inode *inode, 303 struct folio *locked_folio, 304 u64 start, u64 end) 305 { 306 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 307 struct address_space *mapping = inode->i_mapping; 308 pgoff_t index = start >> PAGE_SHIFT; 309 pgoff_t end_index = end >> PAGE_SHIFT; 310 u64 processed_end = start; 311 struct folio_batch fbatch; 312 313 folio_batch_init(&fbatch); 314 while (index <= end_index) { 315 unsigned int found_folios, i; 316 317 found_folios = filemap_get_folios_contig(mapping, &index, 318 end_index, &fbatch); 319 if (found_folios == 0) 320 goto out; 321 322 for (i = 0; i < found_folios; i++) { 323 struct folio *folio = fbatch.folios[i]; 324 u64 range_start; 325 u32 range_len; 326 327 if (folio == locked_folio) 328 continue; 329 330 folio_lock(folio); 331 if 
(!folio_test_dirty(folio) || folio->mapping != mapping) { 332 folio_unlock(folio); 333 goto out; 334 } 335 range_start = max_t(u64, folio_pos(folio), start); 336 range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start; 337 btrfs_folio_set_lock(fs_info, folio, range_start, range_len); 338 339 processed_end = range_start + range_len - 1; 340 } 341 folio_batch_release(&fbatch); 342 cond_resched(); 343 } 344 345 return 0; 346 out: 347 folio_batch_release(&fbatch); 348 if (processed_end > start) 349 unlock_delalloc_folio(inode, locked_folio, start, processed_end); 350 return -EAGAIN; 351 } 352 353 /* 354 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 355 * more than @max_bytes. 356 * 357 * @start: The original start bytenr to search. 358 * Will store the extent range start bytenr. 359 * @end: The original end bytenr of the search range 360 * Will store the extent range end bytenr. 361 * 362 * Return true if we find a delalloc range which starts inside the original 363 * range, and @start/@end will store the delalloc range start/end. 364 * 365 * Return false if we can't find any delalloc range which starts inside the 366 * original range, and @start/@end will be the non-delalloc range start/end. 367 */ 368 EXPORT_FOR_TESTS 369 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 370 struct folio *locked_folio, 371 u64 *start, u64 *end) 372 { 373 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 375 const u64 orig_start = *start; 376 const u64 orig_end = *end; 377 u64 max_bytes = fs_info->max_extent_size; 378 u64 delalloc_start; 379 u64 delalloc_end; 380 bool found; 381 struct extent_state *cached_state = NULL; 382 int ret; 383 int loops = 0; 384 385 /* Caller should pass a valid @end to indicate the search range end */ 386 ASSERT(orig_end > orig_start); 387 388 /* The range should at least cover part of the folio */ 389 ASSERT(!(orig_start >= folio_next_pos(locked_folio) || 390 orig_end <= folio_pos(locked_folio))); 391 again: 392 /* step one, find a bunch of delalloc bytes starting at start */ 393 delalloc_start = *start; 394 delalloc_end = 0; 395 396 /* 397 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can 398 * return early without handling any dirty ranges. 399 */ 400 ASSERT(max_bytes >= fs_info->sectorsize); 401 402 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 403 max_bytes, &cached_state); 404 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 405 *start = delalloc_start; 406 407 /* @delalloc_end can be -1, never go beyond @orig_end */ 408 *end = min(delalloc_end, orig_end); 409 btrfs_free_extent_state(cached_state); 410 return false; 411 } 412 413 /* 414 * start comes from the offset of locked_folio. 
We have to lock 415 * folios in order, so we can't process delalloc bytes before 416 * locked_folio. 417 */ 418 if (delalloc_start < *start) 419 delalloc_start = *start; 420 421 /* 422 * make sure to limit the number of folios we try to lock down 423 */ 424 if (delalloc_end + 1 - delalloc_start > max_bytes) 425 delalloc_end = delalloc_start + max_bytes - 1; 426 427 /* step two, lock all the folios after the folio that has start */ 428 ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, 429 delalloc_end); 430 ASSERT(!ret || ret == -EAGAIN); 431 if (ret == -EAGAIN) { 432 /* 433 * Some of the folios are gone, let's avoid looping by 434 * shortening the size of the delalloc range we're searching. 435 */ 436 btrfs_free_extent_state(cached_state); 437 cached_state = NULL; 438 if (!loops) { 439 max_bytes = fs_info->sectorsize; 440 loops = 1; 441 goto again; 442 } else { 443 return false; 444 } 445 } 446 447 /* step three, lock the state bits for the whole range */ 448 btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); 449 450 /* then test to make sure it is all still delalloc */ 451 ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, 452 EXTENT_DELALLOC, cached_state); 453 454 btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); 455 if (!ret) { 456 unlock_delalloc_folio(inode, locked_folio, delalloc_start, 457 delalloc_end); 458 cond_resched(); 459 goto again; 460 } 461 *start = delalloc_start; 462 *end = delalloc_end; 463 464 return found; 465 } 466 467 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 468 const struct folio *locked_folio, 469 struct extent_state **cached, 470 u32 clear_bits, unsigned long page_ops) 471 { 472 btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); 473 474 __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, 475 end, page_ops); 476 } 477 478 static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) 479 { 480 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 481 482 if (!fsverity_active(folio->mapping->host) || 483 btrfs_folio_test_uptodate(fs_info, folio, start, len) || 484 start >= i_size_read(folio->mapping->host)) 485 return true; 486 return fsverity_verify_folio(folio); 487 } 488 489 static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) 490 { 491 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 492 493 ASSERT(folio_pos(folio) <= start && 494 start + len <= folio_next_pos(folio)); 495 496 if (uptodate && btrfs_verify_folio(folio, start, len)) 497 btrfs_folio_set_uptodate(fs_info, folio, start, len); 498 else 499 btrfs_folio_clear_uptodate(fs_info, folio, start, len); 500 501 if (!btrfs_is_subpage(fs_info, folio)) 502 folio_unlock(folio); 503 else 504 btrfs_folio_end_lock(fs_info, folio, start, len); 505 } 506 507 /* 508 * After a write IO is done, we need to: 509 * 510 * - clear the uptodate bits on error 511 * - clear the writeback bits in the extent tree for the range 512 * - folio_end_writeback() if there is no more pending io for the folio 513 * 514 * Scheduling is not allowed, so the extent state tree is expected 515 * to have one and only one object corresponding to this IO.
516 */ 517 static void end_bbio_data_write(struct btrfs_bio *bbio) 518 { 519 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 520 struct bio *bio = &bbio->bio; 521 int error = blk_status_to_errno(bio->bi_status); 522 struct folio_iter fi; 523 const u32 sectorsize = fs_info->sectorsize; 524 525 ASSERT(!bio_flagged(bio, BIO_CLONED)); 526 bio_for_each_folio_all(fi, bio) { 527 struct folio *folio = fi.folio; 528 u64 start = folio_pos(folio) + fi.offset; 529 u32 len = fi.length; 530 531 /* Our read/write should always be sector aligned. */ 532 if (!IS_ALIGNED(fi.offset, sectorsize)) 533 btrfs_err(fs_info, 534 "partial page write in btrfs with offset %zu and length %zu", 535 fi.offset, fi.length); 536 else if (!IS_ALIGNED(fi.length, sectorsize)) 537 btrfs_info(fs_info, 538 "incomplete page write with offset %zu and length %zu", 539 fi.offset, fi.length); 540 541 btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, 542 !error); 543 if (error) 544 mapping_set_error(folio->mapping, error); 545 btrfs_folio_clear_writeback(fs_info, folio, start, len); 546 } 547 548 bio_put(bio); 549 } 550 551 static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) 552 { 553 ASSERT(folio_test_locked(folio)); 554 if (!btrfs_is_subpage(fs_info, folio)) 555 return; 556 557 ASSERT(folio_test_private(folio)); 558 btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); 559 } 560 561 /* 562 * After a data read IO is done, we need to: 563 * 564 * - clear the uptodate bits on error 565 * - set the uptodate bits if things worked 566 * - set the folio up to date if all extents in the tree are uptodate 567 * - clear the lock bit in the extent tree 568 * - unlock the folio if there are no other extents locked for it 569 * 570 * Scheduling is not allowed, so the extent state tree is expected 571 * to have one and only one object corresponding to this IO. 572 */ 573 static void end_bbio_data_read(struct btrfs_bio *bbio) 574 { 575 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 576 struct bio *bio = &bbio->bio; 577 struct folio_iter fi; 578 579 ASSERT(!bio_flagged(bio, BIO_CLONED)); 580 bio_for_each_folio_all(fi, &bbio->bio) { 581 bool uptodate = !bio->bi_status; 582 struct folio *folio = fi.folio; 583 struct inode *inode = folio->mapping->host; 584 u64 start = folio_pos(folio) + fi.offset; 585 586 btrfs_debug(fs_info, 587 "%s: bi_sector=%llu, err=%d, mirror=%u", 588 __func__, bio->bi_iter.bi_sector, bio->bi_status, 589 bbio->mirror_num); 590 591 592 if (likely(uptodate)) { 593 u64 end = start + fi.length - 1; 594 loff_t i_size = i_size_read(inode); 595 596 /* 597 * Zero out the remaining part if this range straddles 598 * i_size. 599 * 600 * Here we should only zero the range inside the folio, 601 * not touch anything else. 602 * 603 * NOTE: i_size is exclusive while end is inclusive and 604 * folio_contains() takes PAGE_SIZE units. 605 */ 606 if (folio_contains(folio, i_size >> PAGE_SHIFT) && 607 i_size <= end) { 608 u32 zero_start = max(offset_in_folio(folio, i_size), 609 offset_in_folio(folio, start)); 610 u32 zero_len = offset_in_folio(folio, end) + 1 - 611 zero_start; 612 613 folio_zero_range(folio, zero_start, zero_len); 614 } 615 } 616 617 /* Update page status and unlock. */ 618 end_folio_read(folio, uptodate, start, fi.length); 619 } 620 bio_put(bio); 621 } 622 623 /* 624 * Populate every free slot in a provided array with folios using GFP_NOFS. 
625 * 626 * @nr_folios: number of folios to allocate 627 * @order: the order of the folios to be allocated 628 * @folio_array: the array to fill with folios; any existing non-NULL entries in 629 * the array will be skipped 630 * 631 * Return: 0 if all folios were able to be allocated; 632 * -ENOMEM otherwise, the partially allocated folios would be freed and 633 * the array slots zeroed 634 */ 635 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, 636 struct folio **folio_array) 637 { 638 for (int i = 0; i < nr_folios; i++) { 639 if (folio_array[i]) 640 continue; 641 folio_array[i] = folio_alloc(GFP_NOFS, order); 642 if (!folio_array[i]) 643 goto error; 644 } 645 return 0; 646 error: 647 for (int i = 0; i < nr_folios; i++) { 648 if (folio_array[i]) 649 folio_put(folio_array[i]); 650 folio_array[i] = NULL; 651 } 652 return -ENOMEM; 653 } 654 655 /* 656 * Populate every free slot in a provided array with pages, using GFP_NOFS. 657 * 658 * @nr_pages: number of pages to allocate 659 * @page_array: the array to fill with pages; any existing non-null entries in 660 * the array will be skipped 661 * @nofail: whether using __GFP_NOFAIL flag 662 * 663 * Return: 0 if all pages were able to be allocated; 664 * -ENOMEM otherwise, the partially allocated pages would be freed and 665 * the array slots zeroed 666 */ 667 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, 668 bool nofail) 669 { 670 const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; 671 unsigned int allocated; 672 673 for (allocated = 0; allocated < nr_pages;) { 674 unsigned int last = allocated; 675 676 allocated = alloc_pages_bulk(gfp, nr_pages, page_array); 677 if (unlikely(allocated == last)) { 678 /* No progress, fail and do cleanup. */ 679 for (int i = 0; i < allocated; i++) { 680 __free_page(page_array[i]); 681 page_array[i] = NULL; 682 } 683 return -ENOMEM; 684 } 685 } 686 return 0; 687 } 688 689 /* 690 * Populate needed folios for the extent buffer. 691 * 692 * For now, the folios populated are always in order 0 (aka, single page). 693 */ 694 static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) 695 { 696 struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; 697 int num_pages = num_extent_pages(eb); 698 int ret; 699 700 ret = btrfs_alloc_page_array(num_pages, page_array, nofail); 701 if (ret < 0) 702 return ret; 703 704 for (int i = 0; i < num_pages; i++) 705 eb->folios[i] = page_folio(page_array[i]); 706 eb->folio_size = PAGE_SIZE; 707 eb->folio_shift = PAGE_SHIFT; 708 return 0; 709 } 710 711 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, 712 u64 disk_bytenr, loff_t file_offset) 713 { 714 struct bio *bio = &bio_ctrl->bbio->bio; 715 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 716 717 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 718 /* 719 * For compression, all IO should have its logical bytenr set 720 * to the starting bytenr of the compressed extent. 721 */ 722 return bio->bi_iter.bi_sector == sector; 723 } 724 725 /* 726 * To merge into a bio both the disk sector and the logical offset in 727 * the file need to be contiguous. 
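 * E.g. a new range is merged only when its file offset equals
 * bio_ctrl->next_file_offset and its disk_bytenr resolves to the sector
 * right at the bio's current end; a range that is contiguous on disk but
 * not in the file (or vice versa) starts a new bio.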
728 */ 729 return bio_ctrl->next_file_offset == file_offset && 730 bio_end_sector(bio) == sector; 731 } 732 733 static void alloc_new_bio(struct btrfs_inode *inode, 734 struct btrfs_bio_ctrl *bio_ctrl, 735 u64 disk_bytenr, u64 file_offset) 736 { 737 struct btrfs_fs_info *fs_info = inode->root->fs_info; 738 struct btrfs_bio *bbio; 739 740 bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode, 741 file_offset, bio_ctrl->end_io_func, NULL); 742 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 743 bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; 744 bio_ctrl->bbio = bbio; 745 bio_ctrl->len_to_oe_boundary = U32_MAX; 746 bio_ctrl->next_file_offset = file_offset; 747 748 /* Limit data write bios to the ordered boundary. */ 749 if (bio_ctrl->wbc) { 750 struct btrfs_ordered_extent *ordered; 751 752 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 753 if (ordered) { 754 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 755 ordered->file_offset + 756 ordered->disk_num_bytes - file_offset); 757 bbio->ordered = ordered; 758 } 759 760 /* 761 * Pick the last added device to support cgroup writeback. For 762 * multi-device file systems this means blk-cgroup policies have 763 * to always be set on the last added/replaced device. 764 * This is a bit odd but has been like that for a long time. 765 */ 766 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 767 wbc_init_bio(bio_ctrl->wbc, &bbio->bio); 768 } 769 } 770 771 /* 772 * @disk_bytenr: logical bytenr where the write will be 773 * @page: page to add to the bio 774 * @size: portion of page that we want to write to 775 * @pg_offset: offset of the new bio or to check whether we are adding 776 * a contiguous page to the previous one 777 * @read_em_generation: generation of the extent_map we are submitting 778 * (only used for read) 779 * 780 * The will either add the page into the existing @bio_ctrl->bbio, or allocate a 781 * new one in @bio_ctrl->bbio. 782 * The mirror number for this IO should already be initialized in 783 * @bio_ctrl->mirror_num. 784 */ 785 static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, 786 u64 disk_bytenr, struct folio *folio, 787 size_t size, unsigned long pg_offset, 788 u64 read_em_generation) 789 { 790 struct btrfs_inode *inode = folio_to_inode(folio); 791 loff_t file_offset = folio_pos(folio) + pg_offset; 792 793 ASSERT(pg_offset + size <= folio_size(folio)); 794 ASSERT(bio_ctrl->end_io_func); 795 796 if (bio_ctrl->bbio && 797 !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) 798 submit_one_bio(bio_ctrl); 799 800 do { 801 u32 len = size; 802 803 /* Allocate new bio if needed */ 804 if (!bio_ctrl->bbio) 805 alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); 806 807 /* Cap to the current ordered extent boundary if there is one. */ 808 if (len > bio_ctrl->len_to_oe_boundary) { 809 ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); 810 ASSERT(is_data_inode(inode)); 811 len = bio_ctrl->len_to_oe_boundary; 812 } 813 814 if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { 815 /* bio full: move on to a new one */ 816 submit_one_bio(bio_ctrl); 817 continue; 818 } 819 /* 820 * Now that the folio is definitely added to the bio, include its 821 * generation in the max generation calculation. 
822 */ 823 bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); 824 bio_ctrl->next_file_offset += len; 825 826 if (bio_ctrl->wbc) 827 wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); 828 829 size -= len; 830 pg_offset += len; 831 disk_bytenr += len; 832 file_offset += len; 833 834 /* 835 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or 836 * sector aligned. alloc_new_bio() then sets it to the end of 837 * our ordered extent for writes into zoned devices. 838 * 839 * When len_to_oe_boundary is tracking an ordered extent, we 840 * trust the ordered extent code to align things properly, and 841 * the check above to cap our write to the ordered extent 842 * boundary is correct. 843 * 844 * When len_to_oe_boundary is U32_MAX, the cap above would 845 * result in a 4095 byte IO for the last folio right before 846 * we hit the bio limit of UINT_MAX. bio_add_folio() has all 847 * the checks required to make sure we don't overflow the bio, 848 * and we should just ignore len_to_oe_boundary completely 849 * unless we're using it to track an ordered extent. 850 * 851 * It's pretty hard to make a bio sized U32_MAX, but it can 852 * happen when the page cache is able to feed us contiguous 853 * folios for large extents. 854 */ 855 if (bio_ctrl->len_to_oe_boundary != U32_MAX) 856 bio_ctrl->len_to_oe_boundary -= len; 857 858 /* Ordered extent boundary: move on to a new bio. */ 859 if (bio_ctrl->len_to_oe_boundary == 0) 860 submit_one_bio(bio_ctrl); 861 } while (size); 862 } 863 864 static int attach_extent_buffer_folio(struct extent_buffer *eb, 865 struct folio *folio, 866 struct btrfs_folio_state *prealloc) 867 { 868 struct btrfs_fs_info *fs_info = eb->fs_info; 869 int ret = 0; 870 871 /* 872 * If the page is mapped to btree inode, we should hold the private 873 * lock to prevent race. 874 * For cloned or dummy extent buffers, their pages are not mapped and 875 * will not race with any other ebs. 
876 */ 877 if (folio->mapping) 878 lockdep_assert_held(&folio->mapping->i_private_lock); 879 880 if (!btrfs_meta_is_subpage(fs_info)) { 881 if (!folio_test_private(folio)) 882 folio_attach_private(folio, eb); 883 else 884 WARN_ON(folio_get_private(folio) != eb); 885 return 0; 886 } 887 888 /* Already mapped, just free prealloc */ 889 if (folio_test_private(folio)) { 890 btrfs_free_folio_state(prealloc); 891 return 0; 892 } 893 894 if (prealloc) 895 /* Has preallocated memory for subpage */ 896 folio_attach_private(folio, prealloc); 897 else 898 /* Do new allocation to attach subpage */ 899 ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 900 return ret; 901 } 902 903 int set_folio_extent_mapped(struct folio *folio) 904 { 905 struct btrfs_fs_info *fs_info; 906 907 ASSERT(folio->mapping); 908 909 if (folio_test_private(folio)) 910 return 0; 911 912 fs_info = folio_to_fs_info(folio); 913 914 if (btrfs_is_subpage(fs_info, folio)) 915 return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 916 917 folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); 918 return 0; 919 } 920 921 void clear_folio_extent_mapped(struct folio *folio) 922 { 923 struct btrfs_fs_info *fs_info; 924 925 ASSERT(folio->mapping); 926 927 if (!folio_test_private(folio)) 928 return; 929 930 fs_info = folio_to_fs_info(folio); 931 if (btrfs_is_subpage(fs_info, folio)) 932 return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 933 934 folio_detach_private(folio); 935 } 936 937 static struct extent_map *get_extent_map(struct btrfs_inode *inode, 938 struct folio *folio, u64 start, 939 u64 len, struct extent_map **em_cached) 940 { 941 struct extent_map *em; 942 943 ASSERT(em_cached); 944 945 if (*em_cached) { 946 em = *em_cached; 947 if (btrfs_extent_map_in_tree(em) && start >= em->start && 948 start < btrfs_extent_map_end(em)) { 949 refcount_inc(&em->refs); 950 return em; 951 } 952 953 btrfs_free_extent_map(em); 954 *em_cached = NULL; 955 } 956 957 em = btrfs_get_extent(inode, folio, start, len); 958 if (!IS_ERR(em)) { 959 BUG_ON(*em_cached); 960 refcount_inc(&em->refs); 961 *em_cached = em; 962 } 963 964 return em; 965 } 966 967 static void btrfs_readahead_expand(struct readahead_control *ractl, 968 const struct extent_map *em) 969 { 970 const u64 ra_pos = readahead_pos(ractl); 971 const u64 ra_end = ra_pos + readahead_length(ractl); 972 const u64 em_end = btrfs_extent_map_end(em); 973 974 /* No expansion for holes and inline extents. */ 975 if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) 976 return; 977 978 ASSERT(em_end >= ra_pos, 979 "extent_map %llu %llu ends before current readahead position %llu", 980 em->start, em->len, ra_pos); 981 if (em_end > ra_end) 982 readahead_expand(ractl, ra_pos, em_end - ra_pos); 983 } 984 985 /* 986 * basic readpage implementation. 
Locked extent state structs are inserted 987 * into the tree that are removed when the IO is done (by the end_io 988 * handlers) 989 * XXX JDM: This needs looking at to ensure proper page locking 990 * return 0 on success, otherwise return error 991 */ 992 static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, 993 struct btrfs_bio_ctrl *bio_ctrl) 994 { 995 struct inode *inode = folio->mapping->host; 996 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 997 u64 start = folio_pos(folio); 998 const u64 end = start + folio_size(folio) - 1; 999 u64 extent_offset; 1000 u64 locked_end; 1001 u64 last_byte = i_size_read(inode); 1002 struct extent_map *em; 1003 int ret = 0; 1004 const size_t blocksize = fs_info->sectorsize; 1005 1006 if (bio_ctrl->ractl) 1007 locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1; 1008 else 1009 locked_end = end; 1010 1011 ret = set_folio_extent_mapped(folio); 1012 if (ret < 0) { 1013 folio_unlock(folio); 1014 return ret; 1015 } 1016 1017 if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { 1018 size_t zero_offset = offset_in_folio(folio, last_byte); 1019 1020 if (zero_offset) 1021 folio_zero_range(folio, zero_offset, 1022 folio_size(folio) - zero_offset); 1023 } 1024 bio_ctrl->end_io_func = end_bbio_data_read; 1025 begin_folio_read(fs_info, folio); 1026 for (u64 cur = start; cur <= end; cur += blocksize) { 1027 enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; 1028 unsigned long pg_offset = offset_in_folio(folio, cur); 1029 bool force_bio_submit = false; 1030 u64 disk_bytenr; 1031 u64 block_start; 1032 u64 em_gen; 1033 1034 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 1035 if (cur >= last_byte) { 1036 folio_zero_range(folio, pg_offset, end - cur + 1); 1037 end_folio_read(folio, true, cur, end - cur + 1); 1038 break; 1039 } 1040 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1041 end_folio_read(folio, true, cur, blocksize); 1042 continue; 1043 } 1044 /* 1045 * Search extent map for the whole locked range. 1046 * This will allow btrfs_get_extent() to return a larger hole 1047 * when possible. 1048 * This can reduce duplicated btrfs_get_extent() calls for large 1049 * holes. 1050 */ 1051 em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached); 1052 if (IS_ERR(em)) { 1053 end_folio_read(folio, false, cur, end + 1 - cur); 1054 return PTR_ERR(em); 1055 } 1056 extent_offset = cur - em->start; 1057 BUG_ON(btrfs_extent_map_end(em) <= cur); 1058 BUG_ON(end < cur); 1059 1060 compress_type = btrfs_extent_map_compression(em); 1061 1062 /* 1063 * Only expand readahead for extents which are already creating 1064 * the pages anyway in add_ra_bio_pages, which is compressed 1065 * extents in the non subpage case. 
1066 */ 1067 if (bio_ctrl->ractl && 1068 !btrfs_is_subpage(fs_info, folio) && 1069 compress_type != BTRFS_COMPRESS_NONE) 1070 btrfs_readahead_expand(bio_ctrl->ractl, em); 1071 1072 if (compress_type != BTRFS_COMPRESS_NONE) 1073 disk_bytenr = em->disk_bytenr; 1074 else 1075 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1076 1077 if (em->flags & EXTENT_FLAG_PREALLOC) 1078 block_start = EXTENT_MAP_HOLE; 1079 else 1080 block_start = btrfs_extent_map_block_start(em); 1081 1082 /* 1083 * If we have a file range that points to a compressed extent 1084 * and it's followed by a consecutive file range that points 1085 * to the same compressed extent (possibly with a different 1086 * offset and/or length, so it either points to the whole extent 1087 * or only part of it), we must make sure we do not submit a 1088 * single bio to populate the folios for the 2 ranges because 1089 * this makes the compressed extent read zero out the folios 1090 * belonging to the 2nd range. Imagine the following scenario: 1091 * 1092 * File layout 1093 * [0 - 8K] [8K - 24K] 1094 * | | 1095 * | | 1096 * points to extent X, points to extent X, 1097 * offset 4K, length of 8K offset 0, length 16K 1098 * 1099 * [extent X, compressed length = 4K uncompressed length = 16K] 1100 * 1101 * If the bio to read the compressed extent covers both ranges, 1102 * it will decompress extent X into the folios belonging to the 1103 * first range and then it will stop, zeroing out the remaining 1104 * folios that belong to the other range that points to extent X. 1105 * So here we make sure we submit 2 bios, one for the first 1106 * range and another one for the third range. Both will target 1107 * the same physical extent from disk, but we can't currently 1108 * make the compressed bio endio callback populate the folios 1109 * for both ranges because each compressed bio is tightly 1110 * coupled with a single extent map, and each range can have 1111 * an extent map with a different offset value relative to the 1112 * uncompressed data of our extent and different lengths. This 1113 * is a corner case so we prioritize correctness over 1114 * non-optimal behavior (submitting 2 bios for the same extent). 1115 */ 1116 if (compress_type != BTRFS_COMPRESS_NONE && 1117 bio_ctrl->last_em_start != U64_MAX && 1118 bio_ctrl->last_em_start != em->start) 1119 force_bio_submit = true; 1120 1121 bio_ctrl->last_em_start = em->start; 1122 1123 em_gen = em->generation; 1124 btrfs_free_extent_map(em); 1125 em = NULL; 1126 1127 /* we've found a hole, just zero and go on */ 1128 if (block_start == EXTENT_MAP_HOLE) { 1129 folio_zero_range(folio, pg_offset, blocksize); 1130 end_folio_read(folio, true, cur, blocksize); 1131 continue; 1132 } 1133 /* the get_extent function already copied into the folio */ 1134 if (block_start == EXTENT_MAP_INLINE) { 1135 end_folio_read(folio, true, cur, blocksize); 1136 continue; 1137 } 1138 1139 if (bio_ctrl->compress_type != compress_type) { 1140 submit_one_bio(bio_ctrl); 1141 bio_ctrl->compress_type = compress_type; 1142 } 1143 1144 if (force_bio_submit) 1145 submit_one_bio(bio_ctrl); 1146 submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, 1147 pg_offset, em_gen); 1148 } 1149 return 0; 1150 } 1151 1152 /* 1153 * Check if we can skip waiting the @ordered extent covering the block at @fileoff. 1154 * 1155 * @fileoff: Both input and output. 1156 * Input as the file offset where the check should start at. 1157 * Output as where the next check should start at, 1158 * if the function returns true. 
1159 * 1160 * Return true if we can skip to @fileoff. The caller needs to check the new 1161 * @fileoff value to make sure it covers the full range, before skipping the 1162 * full OE. 1163 * 1164 * Return false if we must wait for the ordered extent. 1165 */ 1166 static bool can_skip_one_ordered_range(struct btrfs_inode *inode, 1167 struct btrfs_ordered_extent *ordered, 1168 u64 *fileoff) 1169 { 1170 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 1171 struct folio *folio; 1172 const u32 blocksize = fs_info->sectorsize; 1173 u64 cur = *fileoff; 1174 bool ret; 1175 1176 folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); 1177 1178 /* 1179 * We should have locked the folio(s) for range [start, end], thus 1180 * there must be a folio and it must be locked. 1181 */ 1182 ASSERT(!IS_ERR(folio)); 1183 ASSERT(folio_test_locked(folio)); 1184 1185 /* 1186 * There are several cases for the folio and OE combination: 1187 * 1188 * 1) Folio has no private flag 1189 * The OE has all its IO done but not yet finished, and the folio got 1190 * invalidated. 1191 * 1192 * Here we have to wait for the OE to finish, as it may contain the 1193 * to-be-inserted data checksum. 1194 * Without the data checksum inserted into the csum tree, read will 1195 * just fail with missing csum. 1196 */ 1197 if (!folio_test_private(folio)) { 1198 ret = false; 1199 goto out; 1200 } 1201 1202 /* 1203 * 2) The first block is DIRTY. 1204 * 1205 * This means the OE was created by some other folios whose file pos is 1206 * before this one. And since we are holding the folio lock, the writeback 1207 * of this folio cannot start. 1208 * 1209 * We must skip the whole OE, because it will never start until we 1210 * finish our folio read and unlock the folio. 1211 */ 1212 if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { 1213 u64 range_len = umin(folio_next_pos(folio), 1214 ordered->file_offset + ordered->num_bytes) - cur; 1215 1216 ret = true; 1217 /* 1218 * At least inside the folio, all the remaining blocks should 1219 * also be dirty. 1220 */ 1221 ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); 1222 *fileoff = ordered->file_offset + ordered->num_bytes; 1223 goto out; 1224 } 1225 1226 /* 1227 * 3) The first block is uptodate. 1228 * 1229 * At least the first block can be skipped, but we are still not fully 1230 * sure. E.g. if the OE has some other folios in the range that cannot 1231 * be skipped. 1232 * So we return true and update @fileoff to the OE/folio boundary. 1233 */ 1234 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1235 u64 range_len = umin(folio_next_pos(folio), 1236 ordered->file_offset + ordered->num_bytes) - cur; 1237 1238 /* 1239 * The whole range to the OE end or folio boundary should also 1240 * be uptodate. 1241 */ 1242 ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); 1243 ret = true; 1244 *fileoff = cur + range_len; 1245 goto out; 1246 } 1247 1248 /* 1249 * 4) The first block is not uptodate. 1250 * 1251 * This means the folio was invalidated after the writeback finished, 1252 * but some other operation (e.g. a block aligned buffered write) 1253 * inserted the folio back into the filemap. 1254 * Very much the same as case 1).
1255 */ 1256 ret = false; 1257 out: 1258 folio_put(folio); 1259 return ret; 1260 } 1261 1262 static bool can_skip_ordered_extent(struct btrfs_inode *inode, 1263 struct btrfs_ordered_extent *ordered, 1264 u64 start, u64 end) 1265 { 1266 const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); 1267 u64 cur = max(start, ordered->file_offset); 1268 1269 while (cur < range_end) { 1270 bool can_skip; 1271 1272 can_skip = can_skip_one_ordered_range(inode, ordered, &cur); 1273 if (!can_skip) 1274 return false; 1275 } 1276 return true; 1277 } 1278 1279 /* 1280 * Locking helper to make sure we get a stable view of extent maps for the 1281 * involved range. 1282 * 1283 * This is for folio read paths (read and readahead), thus the involved range 1284 * should have all the folios locked. 1285 */ 1286 static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, 1287 struct extent_state **cached_state) 1288 { 1289 u64 cur_pos; 1290 1291 /* Caller must provide a valid @cached_state. */ 1292 ASSERT(cached_state); 1293 1294 /* The range must at least be page aligned, as all read paths are folio based. */ 1295 ASSERT(IS_ALIGNED(start, PAGE_SIZE)); 1296 ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); 1297 1298 again: 1299 btrfs_lock_extent(&inode->io_tree, start, end, cached_state); 1300 cur_pos = start; 1301 while (cur_pos < end) { 1302 struct btrfs_ordered_extent *ordered; 1303 1304 ordered = btrfs_lookup_ordered_range(inode, cur_pos, 1305 end - cur_pos + 1); 1306 /* 1307 * No ordered extents in the range, and we hold the extent lock, 1308 * no one can modify the extent maps in the range, we're safe to return. 1309 */ 1310 if (!ordered) 1311 break; 1312 1313 /* Check if we can skip waiting for the whole OE. */ 1314 if (can_skip_ordered_extent(inode, ordered, start, end)) { 1315 cur_pos = min(ordered->file_offset + ordered->num_bytes, 1316 end + 1); 1317 btrfs_put_ordered_extent(ordered); 1318 continue; 1319 } 1320 1321 /* Now wait for the OE to finish. */ 1322 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1323 btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); 1324 btrfs_put_ordered_extent(ordered); 1325 /* We have unlocked the whole range, restart from the beginning. */ 1326 goto again; 1327 } 1328 } 1329 1330 int btrfs_read_folio(struct file *file, struct folio *folio) 1331 { 1332 struct btrfs_inode *inode = folio_to_inode(folio); 1333 const u64 start = folio_pos(folio); 1334 const u64 end = start + folio_size(folio) - 1; 1335 struct extent_state *cached_state = NULL; 1336 struct btrfs_bio_ctrl bio_ctrl = { 1337 .opf = REQ_OP_READ, 1338 .last_em_start = U64_MAX, 1339 }; 1340 struct extent_map *em_cached = NULL; 1341 int ret; 1342 1343 lock_extents_for_read(inode, start, end, &cached_state); 1344 ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 1345 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 1346 1347 btrfs_free_extent_map(em_cached); 1348 1349 /* 1350 * If btrfs_do_readpage() failed we will want to submit the assembled 1351 * bio to do the cleanup. 
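 * Note that submit_one_bio() is a no-op when no bio was assembled
 * (bio_ctrl->bbio is NULL), so it is safe to call it unconditionally here.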
1352 */ 1353 submit_one_bio(&bio_ctrl); 1354 return ret; 1355 } 1356 1357 static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, 1358 u64 start, u32 len) 1359 { 1360 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1361 const u64 folio_start = folio_pos(folio); 1362 unsigned int start_bit; 1363 unsigned int nbits; 1364 1365 ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); 1366 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1367 nbits = len >> fs_info->sectorsize_bits; 1368 ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); 1369 bitmap_set(delalloc_bitmap, start_bit, nbits); 1370 } 1371 1372 static bool find_next_delalloc_bitmap(struct folio *folio, 1373 unsigned long *delalloc_bitmap, u64 start, 1374 u64 *found_start, u32 *found_len) 1375 { 1376 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1377 const u64 folio_start = folio_pos(folio); 1378 const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); 1379 unsigned int start_bit; 1380 unsigned int first_zero; 1381 unsigned int first_set; 1382 1383 ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); 1384 1385 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1386 first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); 1387 if (first_set >= bitmap_size) 1388 return false; 1389 1390 *found_start = folio_start + (first_set << fs_info->sectorsize_bits); 1391 first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); 1392 *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; 1393 return true; 1394 } 1395 1396 /* 1397 * Do all of the delayed allocation setup. 1398 * 1399 * Return >0 if all the dirty blocks are submitted async (compression) or inlined. 1400 * The @folio should no longer be touched (treat it as already unlocked). 1401 * 1402 * Return 0 if there is still dirty block that needs to be submitted through 1403 * extent_writepage_io(). 1404 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be 1405 * submitted, and @folio is still kept locked. 1406 * 1407 * Return <0 if there is any error hit. 1408 * Any allocated ordered extent range covering this folio will be marked 1409 * finished (IOERR), and @folio is still kept locked. 1410 */ 1411 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 1412 struct folio *folio, 1413 struct btrfs_bio_ctrl *bio_ctrl) 1414 { 1415 struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); 1416 struct writeback_control *wbc = bio_ctrl->wbc; 1417 const bool is_subpage = btrfs_is_subpage(fs_info, folio); 1418 const u64 page_start = folio_pos(folio); 1419 const u64 page_end = page_start + folio_size(folio) - 1; 1420 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1421 unsigned long delalloc_bitmap = 0; 1422 /* 1423 * Save the last found delalloc end. As the delalloc end can go beyond 1424 * page boundary, thus we cannot rely on subpage bitmap to locate the 1425 * last delalloc end. 1426 */ 1427 u64 last_delalloc_end = 0; 1428 /* 1429 * The range end (exclusive) of the last successfully finished delalloc 1430 * range. 1431 * Any range covered by ordered extent must either be manually marked 1432 * finished (error handling), or has IO submitted (and finish the 1433 * ordered extent normally). 1434 * 1435 * This records the end of ordered extent cleanup if we hit an error. 
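 * For example, if btrfs_run_delalloc_range() fails halfway through the
 * folio, the ranges before this offset already have ordered extents
 * allocated but will never get bios submitted, so the error path below
 * walks the submit bitmap up to this offset and marks them finished.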
1436 */ 1437 u64 last_finished_delalloc_end = page_start; 1438 u64 delalloc_start = page_start; 1439 u64 delalloc_end = page_end; 1440 u64 delalloc_to_write = 0; 1441 unsigned int start_bit; 1442 unsigned int end_bit; 1443 int ret = 0; 1444 1445 /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ 1446 if (btrfs_is_subpage(fs_info, folio)) { 1447 ASSERT(blocks_per_folio > 1); 1448 btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); 1449 } else { 1450 bio_ctrl->submit_bitmap = 1; 1451 } 1452 1453 for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, 1454 blocks_per_folio) { 1455 u64 start = page_start + (start_bit << fs_info->sectorsize_bits); 1456 u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; 1457 1458 btrfs_folio_set_lock(fs_info, folio, start, len); 1459 } 1460 1461 /* Lock all (subpage) delalloc ranges inside the folio first. */ 1462 while (delalloc_start < page_end) { 1463 delalloc_end = page_end; 1464 if (!find_lock_delalloc_range(&inode->vfs_inode, folio, 1465 &delalloc_start, &delalloc_end)) { 1466 delalloc_start = delalloc_end + 1; 1467 continue; 1468 } 1469 set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, 1470 min(delalloc_end, page_end) + 1 - delalloc_start); 1471 last_delalloc_end = delalloc_end; 1472 delalloc_start = delalloc_end + 1; 1473 } 1474 delalloc_start = page_start; 1475 1476 if (!last_delalloc_end) 1477 goto out; 1478 1479 /* Run the delalloc ranges for the above locked ranges. */ 1480 while (delalloc_start < page_end) { 1481 u64 found_start; 1482 u32 found_len; 1483 bool found; 1484 1485 if (!is_subpage) { 1486 /* 1487 * For non-subpage case, the found delalloc range must 1488 * cover this folio and there must be only one locked 1489 * delalloc range. 1490 */ 1491 found_start = page_start; 1492 found_len = last_delalloc_end + 1 - found_start; 1493 found = true; 1494 } else { 1495 found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, 1496 delalloc_start, &found_start, &found_len); 1497 } 1498 if (!found) 1499 break; 1500 /* 1501 * The subpage range covers the last sector, the delalloc range may 1502 * end beyond the folio boundary, use the saved delalloc_end 1503 * instead. 1504 */ 1505 if (found_start + found_len >= page_end) 1506 found_len = last_delalloc_end + 1 - found_start; 1507 1508 if (ret >= 0) { 1509 /* 1510 * Some delalloc range may be created by previous folios. 1511 * Thus we still need to clean up this range during error 1512 * handling. 1513 */ 1514 last_finished_delalloc_end = found_start; 1515 /* No errors hit so far, run the current delalloc range. */ 1516 ret = btrfs_run_delalloc_range(inode, folio, 1517 found_start, 1518 found_start + found_len - 1, 1519 wbc); 1520 if (ret >= 0) 1521 last_finished_delalloc_end = found_start + found_len; 1522 if (unlikely(ret < 0)) 1523 btrfs_err_rl(fs_info, 1524 "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", 1525 btrfs_root_id(inode->root), 1526 btrfs_ino(inode), 1527 folio_pos(folio), 1528 blocks_per_folio, 1529 &bio_ctrl->submit_bitmap, 1530 found_start, found_len, ret); 1531 } else { 1532 /* 1533 * We've hit an error during previous delalloc range, 1534 * have to cleanup the remaining locked ranges. 
1535 */ 1536 btrfs_unlock_extent(&inode->io_tree, found_start, 1537 found_start + found_len - 1, NULL); 1538 unlock_delalloc_folio(&inode->vfs_inode, folio, 1539 found_start, 1540 found_start + found_len - 1); 1541 } 1542 1543 /* 1544 * We have some ranges that's going to be submitted asynchronously 1545 * (compression or inline). These range have their own control 1546 * on when to unlock the pages. We should not touch them 1547 * anymore, so clear the range from the submission bitmap. 1548 */ 1549 if (ret > 0) { 1550 unsigned int start_bit = (found_start - page_start) >> 1551 fs_info->sectorsize_bits; 1552 unsigned int end_bit = (min(page_end + 1, found_start + found_len) - 1553 page_start) >> fs_info->sectorsize_bits; 1554 bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); 1555 } 1556 /* 1557 * Above btrfs_run_delalloc_range() may have unlocked the folio, 1558 * thus for the last range, we cannot touch the folio anymore. 1559 */ 1560 if (found_start + found_len >= last_delalloc_end + 1) 1561 break; 1562 1563 delalloc_start = found_start + found_len; 1564 } 1565 /* 1566 * It's possible we had some ordered extents created before we hit 1567 * an error, cleanup non-async successfully created delalloc ranges. 1568 */ 1569 if (unlikely(ret < 0)) { 1570 unsigned int bitmap_size = min( 1571 (last_finished_delalloc_end - page_start) >> 1572 fs_info->sectorsize_bits, 1573 blocks_per_folio); 1574 1575 for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, 1576 bitmap_size) { 1577 u64 start = page_start + (start_bit << fs_info->sectorsize_bits); 1578 u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; 1579 1580 btrfs_mark_ordered_io_finished(inode, folio, start, len, false); 1581 } 1582 return ret; 1583 } 1584 out: 1585 if (last_delalloc_end) 1586 delalloc_end = last_delalloc_end; 1587 else 1588 delalloc_end = page_end; 1589 /* 1590 * delalloc_end is already one less than the total length, so 1591 * we don't subtract one from PAGE_SIZE. 1592 */ 1593 delalloc_to_write += 1594 DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); 1595 1596 /* 1597 * If all ranges are submitted asynchronously, we just need to account 1598 * for them here. 1599 */ 1600 if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { 1601 wbc->nr_to_write -= delalloc_to_write; 1602 return 1; 1603 } 1604 1605 if (wbc->nr_to_write < delalloc_to_write) { 1606 int thresh = 8192; 1607 1608 if (delalloc_to_write < thresh * 2) 1609 thresh = delalloc_to_write; 1610 wbc->nr_to_write = min_t(u64, delalloc_to_write, 1611 thresh); 1612 } 1613 1614 return 0; 1615 } 1616 1617 /* 1618 * Return 0 if we have submitted or queued the sector for submission. 1619 * Return <0 for critical errors, and the involved sector will be cleaned up. 1620 * 1621 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1622 */ 1623 static int submit_one_sector(struct btrfs_inode *inode, 1624 struct folio *folio, 1625 u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, 1626 loff_t i_size) 1627 { 1628 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1629 struct extent_map *em; 1630 u64 block_start; 1631 u64 disk_bytenr; 1632 u64 extent_offset; 1633 u64 em_end; 1634 const u32 sectorsize = fs_info->sectorsize; 1635 1636 ASSERT(IS_ALIGNED(filepos, sectorsize)); 1637 1638 /* @filepos >= i_size case should be handled by the caller. 
*/ 1639 ASSERT(filepos < i_size); 1640 1641 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1642 if (IS_ERR(em)) { 1643 /* 1644 * bio_ctrl may contain a bio crossing several folios. 1645 * Submit it immediately so that the bio has a chance 1646 * to finish normally, other than marked as error. 1647 */ 1648 submit_one_bio(bio_ctrl); 1649 1650 /* 1651 * When submission failed, we should still clear the folio dirty. 1652 * Or the folio will be written back again but without any 1653 * ordered extent. 1654 */ 1655 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1656 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1657 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1658 1659 /* 1660 * Since there is no bio submitted to finish the ordered 1661 * extent, we have to manually finish this sector. 1662 */ 1663 btrfs_mark_ordered_io_finished(inode, folio, filepos, 1664 fs_info->sectorsize, false); 1665 return PTR_ERR(em); 1666 } 1667 1668 extent_offset = filepos - em->start; 1669 em_end = btrfs_extent_map_end(em); 1670 ASSERT(filepos <= em_end); 1671 ASSERT(IS_ALIGNED(em->start, sectorsize)); 1672 ASSERT(IS_ALIGNED(em->len, sectorsize)); 1673 1674 block_start = btrfs_extent_map_block_start(em); 1675 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1676 1677 ASSERT(!btrfs_extent_map_is_compressed(em)); 1678 ASSERT(block_start != EXTENT_MAP_HOLE); 1679 ASSERT(block_start != EXTENT_MAP_INLINE); 1680 1681 btrfs_free_extent_map(em); 1682 em = NULL; 1683 1684 /* 1685 * Although the PageDirty bit is cleared before entering this 1686 * function, subpage dirty bit is not cleared. 1687 * So clear subpage dirty bit here so next time we won't submit 1688 * a folio for a range already written to disk. 1689 */ 1690 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1691 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1692 /* 1693 * Above call should set the whole folio with writeback flag, even 1694 * just for a single subpage sector. 1695 * As long as the folio is properly locked and the range is correct, 1696 * we should always get the folio with writeback flag. 1697 */ 1698 ASSERT(folio_test_writeback(folio)); 1699 1700 submit_extent_folio(bio_ctrl, disk_bytenr, folio, 1701 sectorsize, filepos - folio_pos(folio), 0); 1702 return 0; 1703 } 1704 1705 /* 1706 * Helper for extent_writepage(). This calls the writepage start hooks, 1707 * and does the loop to map the page into extents and bios. 
1708 * 1709 * We return 1 if the IO is started and the page is unlocked, 1710 * 0 if all went well (page still locked), 1711 * or < 0 if there were errors (page still locked) 1712 */ 1713 static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, 1714 struct folio *folio, 1715 u64 start, u32 len, 1716 struct btrfs_bio_ctrl *bio_ctrl, 1717 loff_t i_size) 1718 { 1719 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1720 unsigned long range_bitmap = 0; 1721 bool submitted_io = false; 1722 int found_error = 0; 1723 const u64 end = start + len; 1724 const u64 folio_start = folio_pos(folio); 1725 const u64 folio_end = folio_start + folio_size(folio); 1726 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1727 u64 cur; 1728 int bit; 1729 int ret = 0; 1730 1731 ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start); 1732 ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu", 1733 start, len, folio_start, folio_size(folio)); 1734 1735 ret = btrfs_writepage_cow_fixup(folio); 1736 if (ret == -EAGAIN) { 1737 /* Fixup worker will requeue */ 1738 folio_redirty_for_writepage(bio_ctrl->wbc, folio); 1739 folio_unlock(folio); 1740 return 1; 1741 } 1742 if (ret < 0) { 1743 btrfs_folio_clear_dirty(fs_info, folio, start, len); 1744 btrfs_folio_set_writeback(fs_info, folio, start, len); 1745 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1746 return ret; 1747 } 1748 1749 bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits, 1750 len >> fs_info->sectorsize_bits); 1751 bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, 1752 blocks_per_folio); 1753 1754 bio_ctrl->end_io_func = end_bbio_data_write; 1755 1756 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1757 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); 1758 1759 if (cur >= i_size) { 1760 struct btrfs_ordered_extent *ordered; 1761 1762 ordered = btrfs_lookup_first_ordered_range(inode, cur, 1763 fs_info->sectorsize); 1764 /* 1765 * We have just run delalloc before getting here, so 1766 * there must be an ordered extent. 1767 */ 1768 ASSERT(ordered != NULL); 1769 spin_lock(&inode->ordered_tree_lock); 1770 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 1771 ordered->truncated_len = min(ordered->truncated_len, 1772 cur - ordered->file_offset); 1773 spin_unlock(&inode->ordered_tree_lock); 1774 btrfs_put_ordered_extent(ordered); 1775 1776 btrfs_mark_ordered_io_finished(inode, folio, cur, 1777 fs_info->sectorsize, true); 1778 /* 1779 * This range is beyond i_size, thus we don't need to 1780 * bother writing back. 1781 * But we still need to clear the dirty subpage bit, or 1782 * the next time the folio gets dirtied, we will try to 1783 * writeback the sectors with subpage dirty bits, 1784 * causing writeback without ordered extent. 1785 */ 1786 btrfs_folio_clear_dirty(fs_info, folio, cur, fs_info->sectorsize); 1787 continue; 1788 } 1789 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1790 if (unlikely(ret < 0)) { 1791 if (!found_error) 1792 found_error = ret; 1793 continue; 1794 } 1795 submitted_io = true; 1796 } 1797 1798 /* 1799 * If we didn't submit any sector (>= i_size), the folio dirty flag gets 1800 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared 1801 * by folio_start_writeback() if the folio is not dirty). 1802 * 1803 * Here we set writeback and clear for the range.
If the full folio 1804 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1805 * 1806 * If we hit any error, the corresponding sector will have its dirty 1807 * flag cleared and writeback finished, thus no need to handle the error case. 1808 */ 1809 if (!submitted_io && !found_error) { 1810 btrfs_folio_set_writeback(fs_info, folio, start, len); 1811 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1812 } 1813 return found_error; 1814 } 1815 1816 /* 1817 * the writepage semantics are similar to regular writepage. extent 1818 * records are inserted to lock ranges in the tree, and as dirty areas 1819 * are found, they are marked writeback. Then the lock bits are removed 1820 * and the end_io handler clears the writeback ranges 1821 * 1822 * Return 0 if everything goes well. 1823 * Return <0 for error. 1824 */ 1825 static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) 1826 { 1827 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 1828 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1829 int ret; 1830 size_t pg_offset; 1831 loff_t i_size = i_size_read(&inode->vfs_inode); 1832 const pgoff_t end_index = i_size >> PAGE_SHIFT; 1833 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1834 1835 trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); 1836 1837 WARN_ON(!folio_test_locked(folio)); 1838 1839 pg_offset = offset_in_folio(folio, i_size); 1840 if (folio->index > end_index || 1841 (folio->index == end_index && !pg_offset)) { 1842 folio_invalidate(folio, 0, folio_size(folio)); 1843 folio_unlock(folio); 1844 return 0; 1845 } 1846 1847 if (folio_contains(folio, end_index)) 1848 folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); 1849 1850 /* 1851 * Default to unlock the whole folio. 1852 * The proper bitmap can only be initialized until writepage_delalloc(). 1853 */ 1854 bio_ctrl->submit_bitmap = (unsigned long)-1; 1855 1856 /* 1857 * If the page is dirty but without private set, it's marked dirty 1858 * without informing the fs. 1859 * Nowadays that is a bug, since the introduction of 1860 * pin_user_pages*(). 1861 * 1862 * So here we check if the page has private set to rule out such 1863 * case. 1864 * But we also have a long history of relying on the COW fixup, 1865 * so here we only enable this check for experimental builds until 1866 * we're sure it's safe. 
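 * A failed check results in a rate limited error message and the folio is
 * failed with -EUCLEAN; with CONFIG_BTRFS_DEBUG it additionally trips the
 * WARN_ON() below.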
1867 */ 1868 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && 1869 unlikely(!folio_test_private(folio))) { 1870 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 1871 btrfs_err_rl(fs_info, 1872 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 1873 btrfs_root_id(inode->root), 1874 btrfs_ino(inode), folio_pos(folio)); 1875 ret = -EUCLEAN; 1876 goto done; 1877 } 1878 1879 ret = set_folio_extent_mapped(folio); 1880 if (ret < 0) 1881 goto done; 1882 1883 ret = writepage_delalloc(inode, folio, bio_ctrl); 1884 if (ret == 1) 1885 return 0; 1886 if (ret) 1887 goto done; 1888 1889 ret = extent_writepage_io(inode, folio, folio_pos(folio), 1890 folio_size(folio), bio_ctrl, i_size); 1891 if (ret == 1) 1892 return 0; 1893 if (unlikely(ret < 0)) 1894 btrfs_err_rl(fs_info, 1895 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", 1896 btrfs_root_id(inode->root), btrfs_ino(inode), 1897 folio_pos(folio), blocks_per_folio, 1898 &bio_ctrl->submit_bitmap, ret); 1899 1900 bio_ctrl->wbc->nr_to_write--; 1901 1902 done: 1903 if (ret < 0) 1904 mapping_set_error(folio->mapping, ret); 1905 /* 1906 * Only unlock ranges that are submitted. As there can be some async 1907 * submitted ranges inside the folio. 1908 */ 1909 btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); 1910 ASSERT(ret <= 0); 1911 return ret; 1912 } 1913 1914 /* 1915 * Lock extent buffer status and pages for writeback. 1916 * 1917 * Return %false if the extent buffer doesn't need to be submitted (e.g. the 1918 * extent buffer is not dirty) 1919 * Return %true is the extent buffer is submitted to bio. 1920 */ 1921 static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, 1922 struct writeback_control *wbc) 1923 { 1924 struct btrfs_fs_info *fs_info = eb->fs_info; 1925 bool ret = false; 1926 1927 btrfs_tree_lock(eb); 1928 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 1929 btrfs_tree_unlock(eb); 1930 if (wbc->sync_mode != WB_SYNC_ALL) 1931 return false; 1932 wait_on_extent_buffer_writeback(eb); 1933 btrfs_tree_lock(eb); 1934 } 1935 1936 /* 1937 * We need to do this to prevent races in people who check if the eb is 1938 * under IO since we can end up having no IO bits set for a short period 1939 * of time. 1940 */ 1941 spin_lock(&eb->refs_lock); 1942 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1943 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1944 unsigned long flags; 1945 1946 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 1947 spin_unlock(&eb->refs_lock); 1948 1949 xas_lock_irqsave(&xas, flags); 1950 xas_load(&xas); 1951 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 1952 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 1953 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 1954 xas_unlock_irqrestore(&xas, flags); 1955 1956 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 1957 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 1958 -eb->len, 1959 fs_info->dirty_metadata_batch); 1960 ret = true; 1961 } else { 1962 spin_unlock(&eb->refs_lock); 1963 } 1964 btrfs_tree_unlock(eb); 1965 return ret; 1966 } 1967 1968 static void set_btree_ioerr(struct extent_buffer *eb) 1969 { 1970 struct btrfs_fs_info *fs_info = eb->fs_info; 1971 1972 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 1973 1974 /* 1975 * A read may stumble upon this buffer later, make sure that it gets an 1976 * error and knows there was an error. 
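 * That is done by clearing the uptodate bit below, so the buffer is no
 * longer treated as containing valid data.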
1977 */ 1978 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 1979 1980 /* 1981 * We need to set the mapping with the io error as well because a write 1982 * error will flip the file system readonly, and then syncfs() will 1983 * return a 0 because we are readonly if we don't modify the err seq for 1984 * the superblock. 1985 */ 1986 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); 1987 1988 /* 1989 * If writeback for a btree extent that doesn't belong to a log tree 1990 * failed, increment the counter transaction->eb_write_errors. 1991 * We do this because while the transaction is running and before it's 1992 * committing (when we call filemap_fdata[write|wait]_range against 1993 * the btree inode), we might have 1994 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 1995 * returns an error or an error happens during writeback, when we're 1996 * committing the transaction we wouldn't know about it, since the pages 1997 * can be no longer dirty nor marked anymore for writeback (if a 1998 * subsequent modification to the extent buffer didn't happen before the 1999 * transaction commit), which makes filemap_fdata[write|wait]_range not 2000 * able to find the pages which contain errors at transaction 2001 * commit time. So if this happens we must abort the transaction, 2002 * otherwise we commit a super block with btree roots that point to 2003 * btree nodes/leafs whose content on disk is invalid - either garbage 2004 * or the content of some node/leaf from a past generation that got 2005 * cowed or deleted and is no longer valid. 2006 * 2007 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 2008 * not be enough - we need to distinguish between log tree extents vs 2009 * non-log tree extents, and the next filemap_fdatawait_range() call 2010 * will catch and clear such errors in the mapping - and that call might 2011 * be from a log sync and not from a transaction commit. Also, checking 2012 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 2013 * not done and would not be reliable - the eb might have been released 2014 * from memory and reading it back again means that flag would not be 2015 * set (since it's a runtime flag, not persisted on disk). 2016 * 2017 * Using the flags below in the btree inode also makes us achieve the 2018 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 2019 * writeback for all dirty pages and before filemap_fdatawait_range() 2020 * is called, the writeback for all dirty pages had already finished 2021 * with errors - because we were not using AS_EIO/AS_ENOSPC, 2022 * filemap_fdatawait_range() would return success, as it could not know 2023 * that writeback errors happened (the pages were no longer tagged for 2024 * writeback). 
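 *
 * Which flag is set below depends on the tree the buffer belongs to:
 * log_index == -1 means a non-log tree (BTRFS_FS_BTREE_ERR), while 0 and 1
 * select the error bit of the corresponding log tree.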
2025 */ 2026 switch (eb->log_index) { 2027 case -1: 2028 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 2029 break; 2030 case 0: 2031 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 2032 break; 2033 case 1: 2034 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2035 break; 2036 default: 2037 BUG(); /* unexpected, logic error */ 2038 } 2039 } 2040 2041 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) 2042 { 2043 struct btrfs_fs_info *fs_info = eb->fs_info; 2044 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2045 unsigned long flags; 2046 2047 xas_lock_irqsave(&xas, flags); 2048 xas_load(&xas); 2049 xas_set_mark(&xas, mark); 2050 xas_unlock_irqrestore(&xas, flags); 2051 } 2052 2053 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) 2054 { 2055 struct btrfs_fs_info *fs_info = eb->fs_info; 2056 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2057 unsigned long flags; 2058 2059 xas_lock_irqsave(&xas, flags); 2060 xas_load(&xas); 2061 xas_clear_mark(&xas, mark); 2062 xas_unlock_irqrestore(&xas, flags); 2063 } 2064 2065 static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, 2066 unsigned long start, unsigned long end) 2067 { 2068 XA_STATE(xas, &fs_info->buffer_tree, start); 2069 unsigned int tagged = 0; 2070 void *eb; 2071 2072 xas_lock_irq(&xas); 2073 xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { 2074 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); 2075 if (++tagged % XA_CHECK_SCHED) 2076 continue; 2077 xas_pause(&xas); 2078 xas_unlock_irq(&xas); 2079 cond_resched(); 2080 xas_lock_irq(&xas); 2081 } 2082 xas_unlock_irq(&xas); 2083 } 2084 2085 struct eb_batch { 2086 unsigned int nr; 2087 unsigned int cur; 2088 struct extent_buffer *ebs[PAGEVEC_SIZE]; 2089 }; 2090 2091 static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) 2092 { 2093 batch->ebs[batch->nr++] = eb; 2094 return (batch->nr < PAGEVEC_SIZE); 2095 } 2096 2097 static inline void eb_batch_init(struct eb_batch *batch) 2098 { 2099 batch->nr = 0; 2100 batch->cur = 0; 2101 } 2102 2103 static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) 2104 { 2105 if (batch->cur >= batch->nr) 2106 return NULL; 2107 return batch->ebs[batch->cur++]; 2108 } 2109 2110 static inline void eb_batch_release(struct eb_batch *batch) 2111 { 2112 for (unsigned int i = 0; i < batch->nr; i++) 2113 free_extent_buffer(batch->ebs[i]); 2114 eb_batch_init(batch); 2115 } 2116 2117 static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, 2118 xa_mark_t mark) 2119 { 2120 struct extent_buffer *eb; 2121 2122 retry: 2123 eb = xas_find_marked(xas, max, mark); 2124 2125 if (xas_retry(xas, eb)) 2126 goto retry; 2127 2128 if (!eb) 2129 return NULL; 2130 2131 if (!refcount_inc_not_zero(&eb->refs)) { 2132 xas_reset(xas); 2133 goto retry; 2134 } 2135 2136 if (unlikely(eb != xas_reload(xas))) { 2137 free_extent_buffer(eb); 2138 xas_reset(xas); 2139 goto retry; 2140 } 2141 2142 return eb; 2143 } 2144 2145 static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, 2146 unsigned long *start, 2147 unsigned long end, xa_mark_t tag, 2148 struct eb_batch *batch) 2149 { 2150 XA_STATE(xas, &fs_info->buffer_tree, *start); 2151 struct extent_buffer *eb; 2152 2153 rcu_read_lock(); 2154 while ((eb = find_get_eb(&xas, end, tag)) != NULL) { 2155 if (!eb_batch_add(batch, eb)) { 2156 *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); 2157 goto out; 2158 } 2159 } 2160 if 
(end == ULONG_MAX) 2161 *start = ULONG_MAX; 2162 else 2163 *start = end + 1; 2164 out: 2165 rcu_read_unlock(); 2166 2167 return batch->nr; 2168 } 2169 2170 /* 2171 * The endio specific version which won't touch any unsafe spinlock in endio 2172 * context. 2173 */ 2174 static struct extent_buffer *find_extent_buffer_nolock( 2175 struct btrfs_fs_info *fs_info, u64 start) 2176 { 2177 struct extent_buffer *eb; 2178 unsigned long index = (start >> fs_info->nodesize_bits); 2179 2180 rcu_read_lock(); 2181 eb = xa_load(&fs_info->buffer_tree, index); 2182 if (eb && !refcount_inc_not_zero(&eb->refs)) 2183 eb = NULL; 2184 rcu_read_unlock(); 2185 return eb; 2186 } 2187 2188 static void end_bbio_meta_write(struct btrfs_bio *bbio) 2189 { 2190 struct extent_buffer *eb = bbio->private; 2191 struct folio_iter fi; 2192 2193 if (bbio->bio.bi_status != BLK_STS_OK) 2194 set_btree_ioerr(eb); 2195 2196 bio_for_each_folio_all(fi, &bbio->bio) { 2197 btrfs_meta_folio_clear_writeback(fi.folio, eb); 2198 } 2199 2200 buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); 2201 clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 2202 bio_put(&bbio->bio); 2203 } 2204 2205 static void prepare_eb_write(struct extent_buffer *eb) 2206 { 2207 u32 nritems; 2208 unsigned long start; 2209 unsigned long end; 2210 2211 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2212 2213 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 2214 nritems = btrfs_header_nritems(eb); 2215 if (btrfs_header_level(eb) > 0) { 2216 end = btrfs_node_key_ptr_offset(eb, nritems); 2217 memzero_extent_buffer(eb, end, eb->len - end); 2218 } else { 2219 /* 2220 * Leaf: 2221 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 2222 */ 2223 start = btrfs_item_nr_offset(eb, nritems); 2224 end = btrfs_item_nr_offset(eb, 0); 2225 if (nritems == 0) 2226 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); 2227 else 2228 end += btrfs_item_offset(eb, nritems - 1); 2229 memzero_extent_buffer(eb, start, end - start); 2230 } 2231 } 2232 2233 static noinline_for_stack void write_one_eb(struct extent_buffer *eb, 2234 struct writeback_control *wbc) 2235 { 2236 struct btrfs_fs_info *fs_info = eb->fs_info; 2237 struct btrfs_bio *bbio; 2238 2239 prepare_eb_write(eb); 2240 2241 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 2242 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), 2243 BTRFS_I(fs_info->btree_inode), eb->start, 2244 end_bbio_meta_write, eb); 2245 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 2246 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 2247 wbc_init_bio(wbc, &bbio->bio); 2248 for (int i = 0; i < num_extent_folios(eb); i++) { 2249 struct folio *folio = eb->folios[i]; 2250 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 2251 u32 range_len = min_t(u64, folio_next_pos(folio), 2252 eb->start + eb->len) - range_start; 2253 2254 folio_lock(folio); 2255 btrfs_meta_folio_clear_dirty(folio, eb); 2256 btrfs_meta_folio_set_writeback(folio, eb); 2257 if (!folio_test_dirty(folio)) 2258 wbc->nr_to_write -= folio_nr_pages(folio); 2259 bio_add_folio_nofail(&bbio->bio, folio, range_len, 2260 offset_in_folio(folio, range_start)); 2261 wbc_account_cgroup_owner(wbc, folio, range_len); 2262 folio_unlock(folio); 2263 } 2264 /* 2265 * If the fs is already in error status, do not submit any writeback 2266 * but immediately finish it. 
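 * Completing the bbio with the fs error status still runs
 * end_bbio_meta_write(), so the writeback flags and tags set above are
 * cleared just like for a bio that failed after submission.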
2267 */ 2268 if (unlikely(BTRFS_FS_ERROR(fs_info))) { 2269 btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); 2270 return; 2271 } 2272 btrfs_submit_bbio(bbio, 0); 2273 } 2274 2275 /* 2276 * Wait for all eb writeback in the given range to finish. 2277 * 2278 * @fs_info: The fs_info for this file system. 2279 * @start: The offset of the range to start waiting on writeback. 2280 * @end: The end of the range, inclusive. This is meant to be used in 2281 * conjunction with wait_marked_extents, so this will usually be 2282 * the_next_eb->start - 1. 2283 */ 2284 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, 2285 u64 end) 2286 { 2287 struct eb_batch batch; 2288 unsigned long start_index = (start >> fs_info->nodesize_bits); 2289 unsigned long end_index = (end >> fs_info->nodesize_bits); 2290 2291 eb_batch_init(&batch); 2292 while (start_index <= end_index) { 2293 struct extent_buffer *eb; 2294 unsigned int nr_ebs; 2295 2296 nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, 2297 PAGECACHE_TAG_WRITEBACK, &batch); 2298 if (!nr_ebs) 2299 break; 2300 2301 while ((eb = eb_batch_next(&batch)) != NULL) 2302 wait_on_extent_buffer_writeback(eb); 2303 eb_batch_release(&batch); 2304 cond_resched(); 2305 } 2306 } 2307 2308 int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) 2309 { 2310 struct btrfs_eb_write_context ctx = { .wbc = wbc }; 2311 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 2312 int ret = 0; 2313 int done = 0; 2314 int nr_to_write_done = 0; 2315 struct eb_batch batch; 2316 unsigned int nr_ebs; 2317 unsigned long index; 2318 unsigned long end; 2319 int scanned = 0; 2320 xa_mark_t tag; 2321 2322 eb_batch_init(&batch); 2323 if (wbc->range_cyclic) { 2324 index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); 2325 end = -1; 2326 2327 /* 2328 * Start from the beginning does not need to cycle over the 2329 * range, mark it as scanned. 2330 */ 2331 scanned = (index == 0); 2332 } else { 2333 index = (wbc->range_start >> fs_info->nodesize_bits); 2334 end = (wbc->range_end >> fs_info->nodesize_bits); 2335 2336 scanned = 1; 2337 } 2338 if (wbc->sync_mode == WB_SYNC_ALL) 2339 tag = PAGECACHE_TAG_TOWRITE; 2340 else 2341 tag = PAGECACHE_TAG_DIRTY; 2342 btrfs_zoned_meta_io_lock(fs_info); 2343 retry: 2344 if (wbc->sync_mode == WB_SYNC_ALL) 2345 buffer_tree_tag_for_writeback(fs_info, index, end); 2346 while (!done && !nr_to_write_done && (index <= end) && 2347 (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { 2348 struct extent_buffer *eb; 2349 2350 while ((eb = eb_batch_next(&batch)) != NULL) { 2351 ctx.eb = eb; 2352 2353 ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); 2354 if (ret) { 2355 if (ret == -EBUSY) 2356 ret = 0; 2357 2358 if (ret) { 2359 done = 1; 2360 break; 2361 } 2362 continue; 2363 } 2364 2365 if (!lock_extent_buffer_for_io(eb, wbc)) 2366 continue; 2367 2368 /* Implies write in zoned mode. */ 2369 if (ctx.zoned_bg) { 2370 /* Mark the last eb in the block group. 
*/ 2371 btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); 2372 ctx.zoned_bg->meta_write_pointer += eb->len; 2373 } 2374 write_one_eb(eb, wbc); 2375 } 2376 nr_to_write_done = (wbc->nr_to_write <= 0); 2377 eb_batch_release(&batch); 2378 cond_resched(); 2379 } 2380 if (!scanned && !done) { 2381 /* 2382 * We hit the last page and there is more work to be done: wrap 2383 * back to the start of the file 2384 */ 2385 scanned = 1; 2386 index = 0; 2387 goto retry; 2388 } 2389 /* 2390 * If something went wrong, don't allow any metadata write bio to be 2391 * submitted. 2392 * 2393 * This would prevent use-after-free if we had dirty pages not 2394 * cleaned up, which can still happen by fuzzed images. 2395 * 2396 * - Bad extent tree 2397 * Allowing existing tree block to be allocated for other trees. 2398 * 2399 * - Log tree operations 2400 * Exiting tree blocks get allocated to log tree, bumps its 2401 * generation, then get cleaned in tree re-balance. 2402 * Such tree block will not be written back, since it's clean, 2403 * thus no WRITTEN flag set. 2404 * And after log writes back, this tree block is not traced by 2405 * any dirty extent_io_tree. 2406 * 2407 * - Offending tree block gets re-dirtied from its original owner 2408 * Since it has bumped generation, no WRITTEN flag, it can be 2409 * reused without COWing. This tree block will not be traced 2410 * by btrfs_transaction::dirty_pages. 2411 * 2412 * Now such dirty tree block will not be cleaned by any dirty 2413 * extent io tree. Thus we don't want to submit such wild eb 2414 * if the fs already has error. 2415 * 2416 * We can get ret > 0 from submit_extent_folio() indicating how many ebs 2417 * were submitted. Reset it to 0 to avoid false alerts for the caller. 2418 */ 2419 if (ret > 0) 2420 ret = 0; 2421 if (!ret && BTRFS_FS_ERROR(fs_info)) 2422 ret = -EROFS; 2423 2424 if (ctx.zoned_bg) 2425 btrfs_put_block_group(ctx.zoned_bg); 2426 btrfs_zoned_meta_io_unlock(fs_info); 2427 return ret; 2428 } 2429 2430 /* 2431 * Walk the list of dirty pages of the given address space and write all of them. 2432 * 2433 * @mapping: address space structure to write 2434 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2435 * @bio_ctrl: holds context for the write, namely the bio 2436 * 2437 * If a page is already under I/O, write_cache_pages() skips it, even 2438 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2439 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2440 * and msync() need to guarantee that all the data which was dirty at the time 2441 * the call was made get new I/O started against them. If wbc->sync_mode is 2442 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2443 * existing IO to complete. 2444 */ 2445 static int extent_write_cache_pages(struct address_space *mapping, 2446 struct btrfs_bio_ctrl *bio_ctrl) 2447 { 2448 struct writeback_control *wbc = bio_ctrl->wbc; 2449 struct inode *inode = mapping->host; 2450 int ret = 0; 2451 int done = 0; 2452 int nr_to_write_done = 0; 2453 struct folio_batch fbatch; 2454 unsigned int nr_folios; 2455 pgoff_t index; 2456 pgoff_t end; /* Inclusive */ 2457 pgoff_t done_index; 2458 int range_whole = 0; 2459 int scanned = 0; 2460 xa_mark_t tag; 2461 2462 /* 2463 * We have to hold onto the inode so that ordered extents can do their 2464 * work when the IO finishes. 
The alternative to this is failing to add 2465 * an ordered extent if the igrab() fails there and that is a huge pain 2466 * to deal with, so instead just hold onto the inode throughout the 2467 * writepages operation. If it fails here we are freeing up the inode 2468 * anyway and we'd rather not waste our time writing out stuff that is 2469 * going to be truncated anyway. 2470 */ 2471 if (!igrab(inode)) 2472 return 0; 2473 2474 folio_batch_init(&fbatch); 2475 if (wbc->range_cyclic) { 2476 index = mapping->writeback_index; /* Start from prev offset */ 2477 end = -1; 2478 /* 2479 * Start from the beginning does not need to cycle over the 2480 * range, mark it as scanned. 2481 */ 2482 scanned = (index == 0); 2483 } else { 2484 index = wbc->range_start >> PAGE_SHIFT; 2485 end = wbc->range_end >> PAGE_SHIFT; 2486 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2487 range_whole = 1; 2488 scanned = 1; 2489 } 2490 2491 /* 2492 * We do the tagged writepage as long as the snapshot flush bit is set 2493 * and we are the first one who do the filemap_flush() on this inode. 2494 * 2495 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 2496 * not race in and drop the bit. 2497 */ 2498 if (range_whole && wbc->nr_to_write == LONG_MAX && 2499 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 2500 &BTRFS_I(inode)->runtime_flags)) 2501 wbc->tagged_writepages = 1; 2502 2503 tag = wbc_to_tag(wbc); 2504 retry: 2505 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2506 tag_pages_for_writeback(mapping, index, end); 2507 done_index = index; 2508 while (!done && !nr_to_write_done && (index <= end) && 2509 (nr_folios = filemap_get_folios_tag(mapping, &index, 2510 end, tag, &fbatch))) { 2511 unsigned i; 2512 2513 for (i = 0; i < nr_folios; i++) { 2514 struct folio *folio = fbatch.folios[i]; 2515 2516 done_index = folio_next_index(folio); 2517 /* 2518 * At this point we hold neither the i_pages lock nor 2519 * the folio lock: the folio may be truncated or 2520 * invalidated (changing folio->mapping to NULL). 2521 */ 2522 if (!folio_trylock(folio)) { 2523 submit_write_bio(bio_ctrl, 0); 2524 folio_lock(folio); 2525 } 2526 2527 if (unlikely(folio->mapping != mapping)) { 2528 folio_unlock(folio); 2529 continue; 2530 } 2531 2532 if (!folio_test_dirty(folio)) { 2533 /* Someone wrote it for us. */ 2534 folio_unlock(folio); 2535 continue; 2536 } 2537 2538 /* 2539 * For subpage case, compression can lead to mixed 2540 * writeback and dirty flags, e.g: 2541 * 0 32K 64K 96K 128K 2542 * | |//////||/////| |//| 2543 * 2544 * In above case, [32K, 96K) is asynchronously submitted 2545 * for compression, and [124K, 128K) needs to be written back. 2546 * 2547 * If we didn't wait writeback for page 64K, [128K, 128K) 2548 * won't be submitted as the page still has writeback flag 2549 * and will be skipped in the next check. 2550 * 2551 * This mixed writeback and dirty case is only possible for 2552 * subpage case. 2553 * 2554 * TODO: Remove this check after migrating compression to 2555 * regular submission. 
2556 */ 2557 if (wbc->sync_mode != WB_SYNC_NONE || 2558 btrfs_is_subpage(inode_to_fs_info(inode), folio)) { 2559 if (folio_test_writeback(folio)) 2560 submit_write_bio(bio_ctrl, 0); 2561 folio_wait_writeback(folio); 2562 } 2563 2564 if (folio_test_writeback(folio) || 2565 !folio_clear_dirty_for_io(folio)) { 2566 folio_unlock(folio); 2567 continue; 2568 } 2569 2570 ret = extent_writepage(folio, bio_ctrl); 2571 if (ret < 0) { 2572 done = 1; 2573 break; 2574 } 2575 2576 /* 2577 * The filesystem may choose to bump up nr_to_write. 2578 * We have to make sure to honor the new nr_to_write 2579 * at any time. 2580 */ 2581 nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && 2582 wbc->nr_to_write <= 0); 2583 } 2584 folio_batch_release(&fbatch); 2585 cond_resched(); 2586 } 2587 if (!scanned && !done) { 2588 /* 2589 * We hit the last page and there is more work to be done: wrap 2590 * back to the start of the file 2591 */ 2592 scanned = 1; 2593 index = 0; 2594 2595 /* 2596 * If we're looping we could run into a page that is locked by a 2597 * writer and that writer could be waiting on writeback for a 2598 * page in our current bio, and thus deadlock, so flush the 2599 * write bio here. 2600 */ 2601 submit_write_bio(bio_ctrl, 0); 2602 goto retry; 2603 } 2604 2605 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 2606 mapping->writeback_index = done_index; 2607 2608 btrfs_add_delayed_iput(BTRFS_I(inode)); 2609 return ret; 2610 } 2611 2612 /* 2613 * Submit the pages in the range to bio for call sites which delalloc range has 2614 * already been ran (aka, ordered extent inserted) and all pages are still 2615 * locked. 2616 */ 2617 void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, 2618 u64 start, u64 end, struct writeback_control *wbc, 2619 bool pages_dirty) 2620 { 2621 bool found_error = false; 2622 int ret = 0; 2623 struct address_space *mapping = inode->i_mapping; 2624 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2625 const u32 sectorsize = fs_info->sectorsize; 2626 loff_t i_size = i_size_read(inode); 2627 u64 cur = start; 2628 struct btrfs_bio_ctrl bio_ctrl = { 2629 .wbc = wbc, 2630 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2631 }; 2632 2633 if (wbc->no_cgroup_owner) 2634 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; 2635 2636 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 2637 2638 while (cur <= end) { 2639 u64 cur_end; 2640 u32 cur_len; 2641 struct folio *folio; 2642 2643 folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); 2644 2645 /* 2646 * This shouldn't happen, the pages are pinned and locked, this 2647 * code is just in case, but shouldn't actually be run. 2648 */ 2649 if (IS_ERR(folio)) { 2650 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2651 cur_len = cur_end + 1 - cur; 2652 btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, 2653 cur, cur_len, false); 2654 mapping_set_error(mapping, PTR_ERR(folio)); 2655 cur = cur_end; 2656 continue; 2657 } 2658 2659 cur_end = min_t(u64, folio_next_pos(folio) - 1, end); 2660 cur_len = cur_end + 1 - cur; 2661 2662 ASSERT(folio_test_locked(folio)); 2663 if (pages_dirty && folio != locked_folio) 2664 ASSERT(folio_test_dirty(folio)); 2665 2666 /* 2667 * Set the submission bitmap to submit all sectors. 2668 * extent_writepage_io() will do the truncation correctly. 
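 * All bits set simply means "consider every block of this folio";
 * extent_writepage_io() masks the bitmap down to the [cur, cur_end] range
 * before submitting anything.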
2669 */ 2670 bio_ctrl.submit_bitmap = (unsigned long)-1; 2671 ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, 2672 &bio_ctrl, i_size); 2673 if (ret == 1) 2674 goto next_page; 2675 2676 if (ret) 2677 mapping_set_error(mapping, ret); 2678 btrfs_folio_end_lock(fs_info, folio, cur, cur_len); 2679 if (ret < 0) 2680 found_error = true; 2681 next_page: 2682 folio_put(folio); 2683 cur = cur_end + 1; 2684 } 2685 2686 submit_write_bio(&bio_ctrl, found_error ? ret : 0); 2687 } 2688 2689 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 2690 { 2691 struct inode *inode = mapping->host; 2692 int ret = 0; 2693 struct btrfs_bio_ctrl bio_ctrl = { 2694 .wbc = wbc, 2695 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2696 }; 2697 2698 /* 2699 * Allow only a single thread to do the reloc work in zoned mode to 2700 * protect the write pointer updates. 2701 */ 2702 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 2703 ret = extent_write_cache_pages(mapping, &bio_ctrl); 2704 submit_write_bio(&bio_ctrl, ret); 2705 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 2706 return ret; 2707 } 2708 2709 void btrfs_readahead(struct readahead_control *rac) 2710 { 2711 struct btrfs_bio_ctrl bio_ctrl = { 2712 .opf = REQ_OP_READ | REQ_RAHEAD, 2713 .ractl = rac, 2714 .last_em_start = U64_MAX, 2715 }; 2716 struct folio *folio; 2717 struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); 2718 const u64 start = readahead_pos(rac); 2719 const u64 end = start + readahead_length(rac) - 1; 2720 struct extent_state *cached_state = NULL; 2721 struct extent_map *em_cached = NULL; 2722 2723 lock_extents_for_read(inode, start, end, &cached_state); 2724 2725 while ((folio = readahead_folio(rac)) != NULL) 2726 btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 2727 2728 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2729 2730 if (em_cached) 2731 btrfs_free_extent_map(em_cached); 2732 submit_one_bio(&bio_ctrl); 2733 } 2734 2735 /* 2736 * basic invalidate_folio code, this waits on any locked or writeback 2737 * ranges corresponding to the folio, and then deletes any extent state 2738 * records from the tree 2739 */ 2740 int extent_invalidate_folio(struct extent_io_tree *tree, 2741 struct folio *folio, size_t offset) 2742 { 2743 struct extent_state *cached_state = NULL; 2744 u64 start = folio_pos(folio); 2745 u64 end = start + folio_size(folio) - 1; 2746 size_t blocksize = folio_to_fs_info(folio)->sectorsize; 2747 2748 /* This function is only called for the btree inode */ 2749 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 2750 2751 start += ALIGN(offset, blocksize); 2752 if (start > end) 2753 return 0; 2754 2755 btrfs_lock_extent(tree, start, end, &cached_state); 2756 folio_wait_writeback(folio); 2757 2758 /* 2759 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 2760 * so here we only need to unlock the extent range to free any 2761 * existing extent state. 2762 */ 2763 btrfs_unlock_extent(tree, start, end, &cached_state); 2764 return 0; 2765 } 2766 2767 /* 2768 * A helper for struct address_space_operations::release_folio, this tests for 2769 * areas of the folio that are locked or under IO and drops the related state 2770 * bits if it is safe to drop the folio. 
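 *
 * Returns true if it is safe to release the folio, false if the range is
 * still locked for something other than ordered extent completion or if
 * clearing the state bits failed (e.g. -ENOMEM).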
2771 */ 2772 static bool try_release_extent_state(struct extent_io_tree *tree, 2773 struct folio *folio) 2774 { 2775 struct extent_state *cached_state = NULL; 2776 u64 start = folio_pos(folio); 2777 u64 end = start + folio_size(folio) - 1; 2778 u32 range_bits; 2779 u32 clear_bits; 2780 bool ret = false; 2781 int ret2; 2782 2783 btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); 2784 2785 /* 2786 * We can release the folio if it's locked only for ordered extent 2787 * completion, since that doesn't require using the folio. 2788 */ 2789 if ((range_bits & EXTENT_LOCKED) && 2790 !(range_bits & EXTENT_FINISHING_ORDERED)) 2791 goto out; 2792 2793 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | 2794 EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | 2795 EXTENT_FINISHING_ORDERED); 2796 /* 2797 * At this point we can safely clear everything except the locked, 2798 * nodatasum, delalloc new and finishing ordered bits. The delalloc new 2799 * bit will be cleared by ordered extent completion. 2800 */ 2801 ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); 2802 /* 2803 * If clear_extent_bit failed for enomem reasons, we can't allow the 2804 * release to continue. 2805 */ 2806 if (ret2 == 0) 2807 ret = true; 2808 out: 2809 btrfs_free_extent_state(cached_state); 2810 2811 return ret; 2812 } 2813 2814 /* 2815 * a helper for release_folio. As long as there are no locked extents 2816 * in the range corresponding to the page, both state records and extent 2817 * map records are removed 2818 */ 2819 bool try_release_extent_mapping(struct folio *folio, gfp_t mask) 2820 { 2821 u64 start = folio_pos(folio); 2822 u64 end = start + folio_size(folio) - 1; 2823 struct btrfs_inode *inode = folio_to_inode(folio); 2824 struct extent_io_tree *io_tree = &inode->io_tree; 2825 2826 while (start <= end) { 2827 const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); 2828 const u64 len = end - start + 1; 2829 struct extent_map_tree *extent_tree = &inode->extent_tree; 2830 struct extent_map *em; 2831 2832 write_lock(&extent_tree->lock); 2833 em = btrfs_lookup_extent_mapping(extent_tree, start, len); 2834 if (!em) { 2835 write_unlock(&extent_tree->lock); 2836 break; 2837 } 2838 if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { 2839 write_unlock(&extent_tree->lock); 2840 btrfs_free_extent_map(em); 2841 break; 2842 } 2843 if (btrfs_test_range_bit_exists(io_tree, em->start, 2844 btrfs_extent_map_end(em) - 1, 2845 EXTENT_LOCKED)) 2846 goto next; 2847 /* 2848 * If it's not in the list of modified extents, used by a fast 2849 * fsync, we can remove it. If it's being logged we can safely 2850 * remove it since fsync took an extra reference on the em. 2851 */ 2852 if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) 2853 goto remove_em; 2854 /* 2855 * If it's in the list of modified extents, remove it only if 2856 * its generation is older then the current one, in which case 2857 * we don't need it for a fast fsync. Otherwise don't remove it, 2858 * we could be racing with an ongoing fast fsync that could miss 2859 * the new extent. 
2860 */ 2861 if (em->generation >= cur_gen) 2862 goto next; 2863 remove_em: 2864 /* 2865 * We only remove extent maps that are not in the list of 2866 * modified extents or that are in the list but with a 2867 * generation lower then the current generation, so there is no 2868 * need to set the full fsync flag on the inode (it hurts the 2869 * fsync performance for workloads with a data size that exceeds 2870 * or is close to the system's memory). 2871 */ 2872 btrfs_remove_extent_mapping(inode, em); 2873 /* Once for the inode's extent map tree. */ 2874 btrfs_free_extent_map(em); 2875 next: 2876 start = btrfs_extent_map_end(em); 2877 write_unlock(&extent_tree->lock); 2878 2879 /* Once for us, for the lookup_extent_mapping() reference. */ 2880 btrfs_free_extent_map(em); 2881 2882 if (need_resched()) { 2883 /* 2884 * If we need to resched but we can't block just exit 2885 * and leave any remaining extent maps. 2886 */ 2887 if (!gfpflags_allow_blocking(mask)) 2888 break; 2889 2890 cond_resched(); 2891 } 2892 } 2893 return try_release_extent_state(io_tree, folio); 2894 } 2895 2896 static int extent_buffer_under_io(const struct extent_buffer *eb) 2897 { 2898 return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 2899 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2900 } 2901 2902 static bool folio_range_has_eb(struct folio *folio) 2903 { 2904 struct btrfs_folio_state *bfs; 2905 2906 lockdep_assert_held(&folio->mapping->i_private_lock); 2907 2908 if (folio_test_private(folio)) { 2909 bfs = folio_get_private(folio); 2910 if (atomic_read(&bfs->eb_refs)) 2911 return true; 2912 } 2913 return false; 2914 } 2915 2916 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) 2917 { 2918 struct btrfs_fs_info *fs_info = eb->fs_info; 2919 struct address_space *mapping = folio->mapping; 2920 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 2921 2922 /* 2923 * For mapped eb, we're going to change the folio private, which should 2924 * be done under the i_private_lock. 2925 */ 2926 if (mapped) 2927 spin_lock(&mapping->i_private_lock); 2928 2929 if (!folio_test_private(folio)) { 2930 if (mapped) 2931 spin_unlock(&mapping->i_private_lock); 2932 return; 2933 } 2934 2935 if (!btrfs_meta_is_subpage(fs_info)) { 2936 /* 2937 * We do this since we'll remove the pages after we've removed 2938 * the eb from the xarray, so we could race and have this page 2939 * now attached to the new eb. So only clear folio if it's 2940 * still connected to this eb. 2941 */ 2942 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2943 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2944 BUG_ON(folio_test_dirty(folio)); 2945 BUG_ON(folio_test_writeback(folio)); 2946 /* We need to make sure we haven't be attached to a new eb. */ 2947 folio_detach_private(folio); 2948 } 2949 if (mapped) 2950 spin_unlock(&mapping->i_private_lock); 2951 return; 2952 } 2953 2954 /* 2955 * For subpage, we can have dummy eb with folio private attached. In 2956 * this case, we can directly detach the private as such folio is only 2957 * attached to one dummy eb, no sharing. 2958 */ 2959 if (!mapped) { 2960 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2961 return; 2962 } 2963 2964 btrfs_folio_dec_eb_refs(fs_info, folio); 2965 2966 /* 2967 * We can only detach the folio private if there are no other ebs in the 2968 * page range and no unfinished IO. 
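 * Whether any other eb still uses this folio is tracked by the eb_refs
 * counter in the folio's btrfs_folio_state, checked via folio_range_has_eb().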
2969 */ 2970 if (!folio_range_has_eb(folio)) 2971 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2972 2973 spin_unlock(&mapping->i_private_lock); 2974 } 2975 2976 /* Release all folios attached to the extent buffer */ 2977 static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) 2978 { 2979 ASSERT(!extent_buffer_under_io(eb)); 2980 2981 for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { 2982 struct folio *folio = eb->folios[i]; 2983 2984 if (!folio) 2985 continue; 2986 2987 detach_extent_buffer_folio(eb, folio); 2988 } 2989 } 2990 2991 /* 2992 * Helper for releasing the extent buffer. 2993 */ 2994 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 2995 { 2996 btrfs_release_extent_buffer_folios(eb); 2997 btrfs_leak_debug_del_eb(eb); 2998 kmem_cache_free(extent_buffer_cache, eb); 2999 } 3000 3001 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3002 u64 start) 3003 { 3004 struct extent_buffer *eb = NULL; 3005 3006 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 3007 eb->start = start; 3008 eb->len = fs_info->nodesize; 3009 eb->fs_info = fs_info; 3010 init_rwsem(&eb->lock); 3011 3012 btrfs_leak_debug_add_eb(eb); 3013 3014 spin_lock_init(&eb->refs_lock); 3015 refcount_set(&eb->refs, 1); 3016 3017 ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); 3018 3019 return eb; 3020 } 3021 3022 /* 3023 * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() 3024 * does not call folio_put(), and we need to set the folios to NULL so that 3025 * btrfs_release_extent_buffer() will not detach them a second time. 3026 */ 3027 static void cleanup_extent_buffer_folios(struct extent_buffer *eb) 3028 { 3029 const int num_folios = num_extent_folios(eb); 3030 3031 /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ 3032 for (int i = 0; i < num_folios; i++) { 3033 ASSERT(eb->folios[i]); 3034 detach_extent_buffer_folio(eb, eb->folios[i]); 3035 folio_put(eb->folios[i]); 3036 eb->folios[i] = NULL; 3037 } 3038 } 3039 3040 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 3041 { 3042 struct extent_buffer *new; 3043 int num_folios; 3044 int ret; 3045 3046 new = __alloc_extent_buffer(src->fs_info, src->start); 3047 if (new == NULL) 3048 return NULL; 3049 3050 /* 3051 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 3052 * btrfs_release_extent_buffer() have different behavior for 3053 * UNMAPPED subpage extent buffer. 3054 */ 3055 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 3056 3057 ret = alloc_eb_folio_array(new, false); 3058 if (ret) 3059 goto release_eb; 3060 3061 ASSERT(num_extent_folios(src) == num_extent_folios(new), 3062 "%d != %d", num_extent_folios(src), num_extent_folios(new)); 3063 /* Explicitly use the cached num_extent value from now on. 
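 * (i.e. call num_extent_folios() once and reuse the result for both loops
 * below)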
*/ 3064 num_folios = num_extent_folios(src); 3065 for (int i = 0; i < num_folios; i++) { 3066 struct folio *folio = new->folios[i]; 3067 3068 ret = attach_extent_buffer_folio(new, folio, NULL); 3069 if (ret < 0) 3070 goto cleanup_folios; 3071 WARN_ON(folio_test_dirty(folio)); 3072 } 3073 for (int i = 0; i < num_folios; i++) 3074 folio_put(new->folios[i]); 3075 3076 copy_extent_buffer_full(new, src); 3077 set_extent_buffer_uptodate(new); 3078 3079 return new; 3080 3081 cleanup_folios: 3082 cleanup_extent_buffer_folios(new); 3083 release_eb: 3084 btrfs_release_extent_buffer(new); 3085 return NULL; 3086 } 3087 3088 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 3089 u64 start) 3090 { 3091 struct extent_buffer *eb; 3092 int ret; 3093 3094 eb = __alloc_extent_buffer(fs_info, start); 3095 if (!eb) 3096 return NULL; 3097 3098 ret = alloc_eb_folio_array(eb, false); 3099 if (ret) 3100 goto release_eb; 3101 3102 for (int i = 0; i < num_extent_folios(eb); i++) { 3103 ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); 3104 if (ret < 0) 3105 goto cleanup_folios; 3106 } 3107 for (int i = 0; i < num_extent_folios(eb); i++) 3108 folio_put(eb->folios[i]); 3109 3110 set_extent_buffer_uptodate(eb); 3111 btrfs_set_header_nritems(eb, 0); 3112 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 3113 3114 return eb; 3115 3116 cleanup_folios: 3117 cleanup_extent_buffer_folios(eb); 3118 release_eb: 3119 btrfs_release_extent_buffer(eb); 3120 return NULL; 3121 } 3122 3123 static void check_buffer_tree_ref(struct extent_buffer *eb) 3124 { 3125 int refs; 3126 /* 3127 * The TREE_REF bit is first set when the extent_buffer is added to the 3128 * xarray. It is also reset, if unset, when a new reference is created 3129 * by find_extent_buffer. 3130 * 3131 * It is only cleared in two cases: freeing the last non-tree 3132 * reference to the extent_buffer when its STALE bit is set or 3133 * calling release_folio when the tree reference is the only reference. 3134 * 3135 * In both cases, care is taken to ensure that the extent_buffer's 3136 * pages are not under io. However, release_folio can be concurrently 3137 * called with creating new references, which is prone to race 3138 * conditions between the calls to check_buffer_tree_ref in those 3139 * codepaths and clearing TREE_REF in try_release_extent_buffer. 3140 * 3141 * The actual lifetime of the extent_buffer in the xarray is adequately 3142 * protected by the refcount, but the TREE_REF bit and its corresponding 3143 * reference are not. To protect against this class of races, we call 3144 * check_buffer_tree_ref() from the code paths which trigger io. Note that 3145 * once io is initiated, TREE_REF can no longer be cleared, so that is 3146 * the moment at which any such race is best fixed. 
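 *
 * The unlocked check below is only a fast path; if it does not hit, the
 * TREE_REF bit and its reference are (re)taken under refs_lock.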
3147 */ 3148 refs = refcount_read(&eb->refs); 3149 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3150 return; 3151 3152 spin_lock(&eb->refs_lock); 3153 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3154 refcount_inc(&eb->refs); 3155 spin_unlock(&eb->refs_lock); 3156 } 3157 3158 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 3159 { 3160 check_buffer_tree_ref(eb); 3161 3162 for (int i = 0; i < num_extent_folios(eb); i++) 3163 folio_mark_accessed(eb->folios[i]); 3164 } 3165 3166 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 3167 u64 start) 3168 { 3169 struct extent_buffer *eb; 3170 3171 eb = find_extent_buffer_nolock(fs_info, start); 3172 if (!eb) 3173 return NULL; 3174 /* 3175 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 3176 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 3177 * another task running free_extent_buffer() might have seen that flag 3178 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 3179 * writeback flags not set) and it's still in the tree (flag 3180 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 3181 * decrementing the extent buffer's reference count twice. So here we 3182 * could race and increment the eb's reference count, clear its stale 3183 * flag, mark it as dirty and drop our reference before the other task 3184 * finishes executing free_extent_buffer, which would later result in 3185 * an attempt to free an extent buffer that is dirty. 3186 */ 3187 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 3188 spin_lock(&eb->refs_lock); 3189 spin_unlock(&eb->refs_lock); 3190 } 3191 mark_extent_buffer_accessed(eb); 3192 return eb; 3193 } 3194 3195 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 3196 u64 start) 3197 { 3198 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3199 struct extent_buffer *eb, *exists = NULL; 3200 int ret; 3201 3202 eb = find_extent_buffer(fs_info, start); 3203 if (eb) 3204 return eb; 3205 eb = alloc_dummy_extent_buffer(fs_info, start); 3206 if (!eb) 3207 return ERR_PTR(-ENOMEM); 3208 eb->fs_info = fs_info; 3209 again: 3210 xa_lock_irq(&fs_info->buffer_tree); 3211 exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, 3212 NULL, eb, GFP_NOFS); 3213 if (xa_is_err(exists)) { 3214 ret = xa_err(exists); 3215 xa_unlock_irq(&fs_info->buffer_tree); 3216 btrfs_release_extent_buffer(eb); 3217 return ERR_PTR(ret); 3218 } 3219 if (exists) { 3220 if (!refcount_inc_not_zero(&exists->refs)) { 3221 /* The extent buffer is being freed, retry. */ 3222 xa_unlock_irq(&fs_info->buffer_tree); 3223 goto again; 3224 } 3225 xa_unlock_irq(&fs_info->buffer_tree); 3226 btrfs_release_extent_buffer(eb); 3227 return exists; 3228 } 3229 xa_unlock_irq(&fs_info->buffer_tree); 3230 check_buffer_tree_ref(eb); 3231 3232 return eb; 3233 #else 3234 /* Stub to avoid linker error when compiled with optimizations turned off. */ 3235 return NULL; 3236 #endif 3237 } 3238 3239 static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, 3240 struct folio *folio) 3241 { 3242 struct extent_buffer *exists; 3243 3244 lockdep_assert_held(&folio->mapping->i_private_lock); 3245 3246 /* 3247 * For subpage case, we completely rely on xarray to ensure we don't try 3248 * to insert two ebs for the same bytenr. So here we always return NULL 3249 * and just continue. 
3250 */ 3251 if (btrfs_meta_is_subpage(fs_info)) 3252 return NULL; 3253 3254 /* Page not yet attached to an extent buffer */ 3255 if (!folio_test_private(folio)) 3256 return NULL; 3257 3258 /* 3259 * We could have already allocated an eb for this folio and attached one 3260 * so lets see if we can get a ref on the existing eb, and if we can we 3261 * know it's good and we can just return that one, else we know we can 3262 * just overwrite folio private. 3263 */ 3264 exists = folio_get_private(folio); 3265 if (refcount_inc_not_zero(&exists->refs)) 3266 return exists; 3267 3268 WARN_ON(folio_test_dirty(folio)); 3269 folio_detach_private(folio); 3270 return NULL; 3271 } 3272 3273 /* 3274 * Validate alignment constraints of eb at logical address @start. 3275 */ 3276 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 3277 { 3278 const u32 nodesize = fs_info->nodesize; 3279 3280 if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { 3281 btrfs_err(fs_info, "bad tree block start %llu", start); 3282 return true; 3283 } 3284 3285 if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { 3286 btrfs_err(fs_info, 3287 "tree block is not nodesize aligned, start %llu nodesize %u", 3288 start, nodesize); 3289 return true; 3290 } 3291 if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { 3292 btrfs_err(fs_info, 3293 "tree block is not page aligned, start %llu nodesize %u", 3294 start, nodesize); 3295 return true; 3296 } 3297 if (unlikely(!IS_ALIGNED(start, nodesize) && 3298 !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { 3299 btrfs_warn(fs_info, 3300 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", 3301 start, nodesize); 3302 } 3303 return false; 3304 } 3305 3306 /* 3307 * Return 0 if eb->folios[i] is attached to btree inode successfully. 3308 * Return >0 if there is already another extent buffer for the range, 3309 * and @found_eb_ret would be updated. 3310 * Return -EAGAIN if the filemap has an existing folio but with different size 3311 * than @eb. 3312 * The caller needs to free the existing folios and retry using the same order. 3313 */ 3314 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3315 struct btrfs_folio_state *prealloc, 3316 struct extent_buffer **found_eb_ret) 3317 { 3318 3319 struct btrfs_fs_info *fs_info = eb->fs_info; 3320 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3321 const pgoff_t index = eb->start >> PAGE_SHIFT; 3322 struct folio *existing_folio; 3323 int ret; 3324 3325 ASSERT(found_eb_ret); 3326 3327 /* Caller should ensure the folio exists. */ 3328 ASSERT(eb->folios[i]); 3329 3330 retry: 3331 existing_folio = NULL; 3332 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3333 GFP_NOFS | __GFP_NOFAIL); 3334 if (!ret) 3335 goto finish; 3336 3337 existing_folio = filemap_lock_folio(mapping, index + i); 3338 /* The page cache only exists for a very short time, just retry. */ 3339 if (IS_ERR(existing_folio)) 3340 goto retry; 3341 3342 /* For now, we should only have single-page folios for btree inode. */ 3343 ASSERT(folio_nr_pages(existing_folio) == 1); 3344 3345 if (folio_size(existing_folio) != eb->folio_size) { 3346 folio_unlock(existing_folio); 3347 folio_put(existing_folio); 3348 return -EAGAIN; 3349 } 3350 3351 finish: 3352 spin_lock(&mapping->i_private_lock); 3353 if (existing_folio && btrfs_meta_is_subpage(fs_info)) { 3354 /* We're going to reuse the existing page, can drop our folio now. 
*/ 3355 __free_page(folio_page(eb->folios[i], 0)); 3356 eb->folios[i] = existing_folio; 3357 } else if (existing_folio) { 3358 struct extent_buffer *existing_eb; 3359 3360 existing_eb = grab_extent_buffer(fs_info, existing_folio); 3361 if (existing_eb) { 3362 /* The extent buffer still exists, we can use it directly. */ 3363 *found_eb_ret = existing_eb; 3364 spin_unlock(&mapping->i_private_lock); 3365 folio_unlock(existing_folio); 3366 folio_put(existing_folio); 3367 return 1; 3368 } 3369 /* The extent buffer no longer exists, we can reuse the folio. */ 3370 __free_page(folio_page(eb->folios[i], 0)); 3371 eb->folios[i] = existing_folio; 3372 } 3373 eb->folio_size = folio_size(eb->folios[i]); 3374 eb->folio_shift = folio_shift(eb->folios[i]); 3375 /* Should not fail, as we have preallocated the memory. */ 3376 ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); 3377 ASSERT(!ret); 3378 /* 3379 * To inform we have an extra eb under allocation, so that 3380 * detach_extent_buffer_page() won't release the folio private when the 3381 * eb hasn't been inserted into the xarray yet. 3382 * 3383 * The ref will be decreased when the eb releases the page, in 3384 * detach_extent_buffer_page(). Thus needs no special handling in the 3385 * error path. 3386 */ 3387 btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); 3388 spin_unlock(&mapping->i_private_lock); 3389 return 0; 3390 } 3391 3392 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3393 u64 start, u64 owner_root, int level) 3394 { 3395 int attached = 0; 3396 struct extent_buffer *eb; 3397 struct extent_buffer *existing_eb = NULL; 3398 struct btrfs_folio_state *prealloc = NULL; 3399 u64 lockdep_owner = owner_root; 3400 bool page_contig = true; 3401 int uptodate = 1; 3402 int ret; 3403 3404 if (check_eb_alignment(fs_info, start)) 3405 return ERR_PTR(-EINVAL); 3406 3407 #if BITS_PER_LONG == 32 3408 if (start >= MAX_LFS_FILESIZE) { 3409 btrfs_err_rl(fs_info, 3410 "extent buffer %llu is beyond 32bit page cache limit", start); 3411 btrfs_err_32bit_limit(fs_info); 3412 return ERR_PTR(-EOVERFLOW); 3413 } 3414 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 3415 btrfs_warn_32bit_limit(fs_info); 3416 #endif 3417 3418 eb = find_extent_buffer(fs_info, start); 3419 if (eb) 3420 return eb; 3421 3422 eb = __alloc_extent_buffer(fs_info, start); 3423 if (!eb) 3424 return ERR_PTR(-ENOMEM); 3425 3426 /* 3427 * The reloc trees are just snapshots, so we need them to appear to be 3428 * just like any other fs tree WRT lockdep. 3429 */ 3430 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 3431 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 3432 3433 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 3434 3435 /* 3436 * Preallocate folio private for subpage case, so that we won't 3437 * allocate memory with i_private_lock nor page lock hold. 3438 * 3439 * The memory will be freed by attach_extent_buffer_page() or freed 3440 * manually if we exit earlier. 3441 */ 3442 if (btrfs_meta_is_subpage(fs_info)) { 3443 prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); 3444 if (IS_ERR(prealloc)) { 3445 ret = PTR_ERR(prealloc); 3446 goto out; 3447 } 3448 } 3449 3450 reallocate: 3451 /* Allocate all pages first. */ 3452 ret = alloc_eb_folio_array(eb, true); 3453 if (ret < 0) { 3454 btrfs_free_folio_state(prealloc); 3455 goto out; 3456 } 3457 3458 /* Attach all pages to the filemap. 
*/ 3459 for (int i = 0; i < num_extent_folios(eb); i++) { 3460 struct folio *folio; 3461 3462 ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); 3463 if (ret > 0) { 3464 ASSERT(existing_eb); 3465 goto out; 3466 } 3467 3468 /* 3469 * TODO: Special handling for a corner case where the order of 3470 * folios mismatch between the new eb and filemap. 3471 * 3472 * This happens when: 3473 * 3474 * - the new eb is using higher order folio 3475 * 3476 * - the filemap is still using 0-order folios for the range 3477 * This can happen at the previous eb allocation, and we don't 3478 * have higher order folio for the call. 3479 * 3480 * - the existing eb has already been freed 3481 * 3482 * In this case, we have to free the existing folios first, and 3483 * re-allocate using the same order. 3484 * Thankfully this is not going to happen yet, as we're still 3485 * using 0-order folios. 3486 */ 3487 if (unlikely(ret == -EAGAIN)) { 3488 DEBUG_WARN("folio order mismatch between new eb and filemap"); 3489 goto reallocate; 3490 } 3491 attached++; 3492 3493 /* 3494 * Only after attach_eb_folio_to_filemap(), eb->folios[] is 3495 * reliable, as we may choose to reuse the existing page cache 3496 * and free the allocated page. 3497 */ 3498 folio = eb->folios[i]; 3499 WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); 3500 3501 /* 3502 * Check if the current page is physically contiguous with previous eb 3503 * page. 3504 * At this stage, either we allocated a large folio, thus @i 3505 * would only be 0, or we fall back to per-page allocation. 3506 */ 3507 if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) 3508 page_contig = false; 3509 3510 if (!btrfs_meta_folio_test_uptodate(folio, eb)) 3511 uptodate = 0; 3512 3513 /* 3514 * We can't unlock the pages just yet since the extent buffer 3515 * hasn't been properly inserted into the xarray, this opens a 3516 * race with btree_release_folio() which can free a page while we 3517 * are still filling in all pages for the buffer and we could crash. 3518 */ 3519 } 3520 if (uptodate) 3521 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3522 /* All pages are physically contiguous, can skip cross page handling. */ 3523 if (page_contig) 3524 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); 3525 again: 3526 xa_lock_irq(&fs_info->buffer_tree); 3527 existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, 3528 start >> fs_info->nodesize_bits, NULL, eb, 3529 GFP_NOFS); 3530 if (xa_is_err(existing_eb)) { 3531 ret = xa_err(existing_eb); 3532 xa_unlock_irq(&fs_info->buffer_tree); 3533 goto out; 3534 } 3535 if (existing_eb) { 3536 if (!refcount_inc_not_zero(&existing_eb->refs)) { 3537 xa_unlock_irq(&fs_info->buffer_tree); 3538 goto again; 3539 } 3540 xa_unlock_irq(&fs_info->buffer_tree); 3541 goto out; 3542 } 3543 xa_unlock_irq(&fs_info->buffer_tree); 3544 3545 /* add one reference for the tree */ 3546 check_buffer_tree_ref(eb); 3547 3548 /* 3549 * Now it's safe to unlock the pages because any calls to 3550 * btree_release_folio will correctly detect that a page belongs to a 3551 * live buffer and won't free them prematurely. 3552 */ 3553 for (int i = 0; i < num_extent_folios(eb); i++) { 3554 folio_unlock(eb->folios[i]); 3555 /* 3556 * A folio that has been added to an address_space mapping 3557 * should not continue holding the refcount from its original 3558 * allocation indefinitely. 
3559 */ 3560 folio_put(eb->folios[i]); 3561 } 3562 return eb; 3563 3564 out: 3565 WARN_ON(!refcount_dec_and_test(&eb->refs)); 3566 3567 /* 3568 * Any attached folios need to be detached before we unlock them. This 3569 * is because when we're inserting our new folios into the mapping, and 3570 * then attaching our eb to that folio. If we fail to insert our folio 3571 * we'll lookup the folio for that index, and grab that EB. We do not 3572 * want that to grab this eb, as we're getting ready to free it. So we 3573 * have to detach it first and then unlock it. 3574 * 3575 * Note: the bounds is num_extent_pages() as we need to go through all slots. 3576 */ 3577 for (int i = 0; i < num_extent_pages(eb); i++) { 3578 struct folio *folio = eb->folios[i]; 3579 3580 if (i < attached) { 3581 ASSERT(folio); 3582 detach_extent_buffer_folio(eb, folio); 3583 folio_unlock(folio); 3584 } else if (!folio) { 3585 continue; 3586 } 3587 3588 folio_put(folio); 3589 eb->folios[i] = NULL; 3590 } 3591 btrfs_release_extent_buffer(eb); 3592 if (ret < 0) 3593 return ERR_PTR(ret); 3594 ASSERT(existing_eb); 3595 return existing_eb; 3596 } 3597 3598 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3599 { 3600 struct extent_buffer *eb = 3601 container_of(head, struct extent_buffer, rcu_head); 3602 3603 kmem_cache_free(extent_buffer_cache, eb); 3604 } 3605 3606 static int release_extent_buffer(struct extent_buffer *eb) 3607 __releases(&eb->refs_lock) 3608 { 3609 lockdep_assert_held(&eb->refs_lock); 3610 3611 if (refcount_dec_and_test(&eb->refs)) { 3612 struct btrfs_fs_info *fs_info = eb->fs_info; 3613 3614 spin_unlock(&eb->refs_lock); 3615 3616 /* 3617 * We're erasing, theoretically there will be no allocations, so 3618 * just use GFP_ATOMIC. 3619 * 3620 * We use cmpxchg instead of erase because we do not know if 3621 * this eb is actually in the tree or not, we could be cleaning 3622 * up an eb that we allocated but never inserted into the tree. 3623 * Thus use cmpxchg to remove it from the tree if it is there, 3624 * or leave the other entry if this isn't in the tree. 3625 * 3626 * The documentation says that putting a NULL value is the same 3627 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't 3628 * in this case. 3629 */ 3630 xa_cmpxchg_irq(&fs_info->buffer_tree, 3631 eb->start >> fs_info->nodesize_bits, eb, NULL, 3632 GFP_ATOMIC); 3633 3634 btrfs_leak_debug_del_eb(eb); 3635 /* Should be safe to release folios at this point. */ 3636 btrfs_release_extent_buffer_folios(eb); 3637 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3638 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 3639 kmem_cache_free(extent_buffer_cache, eb); 3640 return 1; 3641 } 3642 #endif 3643 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3644 return 1; 3645 } 3646 spin_unlock(&eb->refs_lock); 3647 3648 return 0; 3649 } 3650 3651 void free_extent_buffer(struct extent_buffer *eb) 3652 { 3653 int refs; 3654 if (!eb) 3655 return; 3656 3657 refs = refcount_read(&eb->refs); 3658 while (1) { 3659 if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { 3660 if (refs == 1) 3661 break; 3662 } else if (refs <= 3) { 3663 break; 3664 } 3665 3666 /* Optimization to avoid locking eb->refs_lock. 
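 * While the refcount is still above the thresholds checked above, this put
 * cannot be the one that needs the STALE/TREE_REF handling below, so just
 * drop one ref with a cmpxchg and return. On failure @refs is refreshed
 * with the current value and the checks above are redone.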
*/ 3667 if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) 3668 return; 3669 } 3670 3671 spin_lock(&eb->refs_lock); 3672 if (refcount_read(&eb->refs) == 2 && 3673 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 3674 !extent_buffer_under_io(eb) && 3675 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3676 refcount_dec(&eb->refs); 3677 3678 /* 3679 * I know this is terrible, but it's temporary until we stop tracking 3680 * the uptodate bits and such for the extent buffers. 3681 */ 3682 release_extent_buffer(eb); 3683 } 3684 3685 void free_extent_buffer_stale(struct extent_buffer *eb) 3686 { 3687 if (!eb) 3688 return; 3689 3690 spin_lock(&eb->refs_lock); 3691 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 3692 3693 if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 3694 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3695 refcount_dec(&eb->refs); 3696 release_extent_buffer(eb); 3697 } 3698 3699 static void btree_clear_folio_dirty_tag(struct folio *folio) 3700 { 3701 ASSERT(!folio_test_dirty(folio)); 3702 ASSERT(folio_test_locked(folio)); 3703 xa_lock_irq(&folio->mapping->i_pages); 3704 if (!folio_test_dirty(folio)) 3705 __xa_clear_mark(&folio->mapping->i_pages, folio->index, 3706 PAGECACHE_TAG_DIRTY); 3707 xa_unlock_irq(&folio->mapping->i_pages); 3708 } 3709 3710 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, 3711 struct extent_buffer *eb) 3712 { 3713 struct btrfs_fs_info *fs_info = eb->fs_info; 3714 3715 btrfs_assert_tree_write_locked(eb); 3716 3717 if (trans && btrfs_header_generation(eb) != trans->transid) 3718 return; 3719 3720 /* 3721 * Instead of clearing the dirty flag off of the buffer, mark it as 3722 * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve 3723 * write-ordering in zoned mode, without the need to later re-dirty 3724 * the extent_buffer. 3725 * 3726 * The actual zeroout of the buffer will happen later in 3727 * btree_csum_one_bio. 3728 */ 3729 if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3730 set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); 3731 return; 3732 } 3733 3734 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) 3735 return; 3736 3737 buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); 3738 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, 3739 fs_info->dirty_metadata_batch); 3740 3741 for (int i = 0; i < num_extent_folios(eb); i++) { 3742 struct folio *folio = eb->folios[i]; 3743 bool last; 3744 3745 if (!folio_test_dirty(folio)) 3746 continue; 3747 folio_lock(folio); 3748 last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); 3749 if (last) 3750 btree_clear_folio_dirty_tag(folio); 3751 folio_unlock(folio); 3752 } 3753 WARN_ON(refcount_read(&eb->refs) == 0); 3754 } 3755 3756 void set_extent_buffer_dirty(struct extent_buffer *eb) 3757 { 3758 bool was_dirty; 3759 3760 check_buffer_tree_ref(eb); 3761 3762 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3763 3764 WARN_ON(refcount_read(&eb->refs) == 0); 3765 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 3766 WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); 3767 3768 if (!was_dirty) { 3769 bool subpage = btrfs_meta_is_subpage(eb->fs_info); 3770 3771 /* 3772 * For subpage case, we can have other extent buffers in the 3773 * same page, and in clear_extent_buffer_dirty() we 3774 * have to clear page dirty without subpage lock held. 3775 * This can cause race where our page gets dirty cleared after 3776 * we just set it. 
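 * (Roughly: the clearing side can sample the per-eb dirty bitmap before our
 * bit is set, conclude no dirty range is left in the folio, and then clear
 * the folio dirty flag after we have already marked it dirty.)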
3777 * 3778 * Thankfully, clear_extent_buffer_dirty() has locked 3779 * its page for other reasons, we can use page lock to prevent 3780 * the above race. 3781 */ 3782 if (subpage) 3783 folio_lock(eb->folios[0]); 3784 for (int i = 0; i < num_extent_folios(eb); i++) 3785 btrfs_meta_folio_set_dirty(eb->folios[i], eb); 3786 buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); 3787 if (subpage) 3788 folio_unlock(eb->folios[0]); 3789 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, 3790 eb->len, 3791 eb->fs_info->dirty_metadata_batch); 3792 } 3793 #ifdef CONFIG_BTRFS_DEBUG 3794 for (int i = 0; i < num_extent_folios(eb); i++) 3795 ASSERT(folio_test_dirty(eb->folios[i])); 3796 #endif 3797 } 3798 3799 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 3800 { 3801 3802 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3803 for (int i = 0; i < num_extent_folios(eb); i++) { 3804 struct folio *folio = eb->folios[i]; 3805 3806 if (!folio) 3807 continue; 3808 3809 btrfs_meta_folio_clear_uptodate(folio, eb); 3810 } 3811 } 3812 3813 void set_extent_buffer_uptodate(struct extent_buffer *eb) 3814 { 3815 3816 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3817 for (int i = 0; i < num_extent_folios(eb); i++) 3818 btrfs_meta_folio_set_uptodate(eb->folios[i], eb); 3819 } 3820 3821 static void clear_extent_buffer_reading(struct extent_buffer *eb) 3822 { 3823 clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); 3824 } 3825 3826 static void end_bbio_meta_read(struct btrfs_bio *bbio) 3827 { 3828 struct extent_buffer *eb = bbio->private; 3829 bool uptodate = !bbio->bio.bi_status; 3830 3831 /* 3832 * If the extent buffer is marked UPTODATE before the read operation 3833 * completes, other calls to read_extent_buffer_pages() will return 3834 * early without waiting for the read to finish, causing data races. 3835 */ 3836 WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); 3837 3838 eb->read_mirror = bbio->mirror_num; 3839 3840 if (uptodate && 3841 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) 3842 uptodate = false; 3843 3844 if (uptodate) 3845 set_extent_buffer_uptodate(eb); 3846 else 3847 clear_extent_buffer_uptodate(eb); 3848 3849 clear_extent_buffer_reading(eb); 3850 free_extent_buffer(eb); 3851 3852 bio_put(&bbio->bio); 3853 } 3854 3855 int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, 3856 const struct btrfs_tree_parent_check *check) 3857 { 3858 struct btrfs_fs_info *fs_info = eb->fs_info; 3859 struct btrfs_bio *bbio; 3860 3861 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3862 return 0; 3863 3864 /* 3865 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 3866 * operation, which could potentially still be in flight. In this case 3867 * we simply want to return an error. 3868 */ 3869 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 3870 return -EIO; 3871 3872 /* Someone else is already reading the buffer, just wait for it. */ 3873 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) 3874 return 0; 3875 3876 /* 3877 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above 3878 * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have 3879 * started and finished reading the same eb. In this case, UPTODATE 3880 * will now be set, and we shouldn't read it in again. 
3881 */ 3882 if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { 3883 clear_extent_buffer_reading(eb); 3884 return 0; 3885 } 3886 3887 eb->read_mirror = 0; 3888 check_buffer_tree_ref(eb); 3889 refcount_inc(&eb->refs); 3890 3891 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 3892 REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode), 3893 eb->start, end_bbio_meta_read, eb); 3894 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 3895 memcpy(&bbio->parent_check, check, sizeof(*check)); 3896 for (int i = 0; i < num_extent_folios(eb); i++) { 3897 struct folio *folio = eb->folios[i]; 3898 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 3899 u32 range_len = min_t(u64, folio_next_pos(folio), 3900 eb->start + eb->len) - range_start; 3901 3902 bio_add_folio_nofail(&bbio->bio, folio, range_len, 3903 offset_in_folio(folio, range_start)); 3904 } 3905 btrfs_submit_bbio(bbio, mirror_num); 3906 return 0; 3907 } 3908 3909 int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, 3910 const struct btrfs_tree_parent_check *check) 3911 { 3912 int ret; 3913 3914 ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); 3915 if (ret < 0) 3916 return ret; 3917 3918 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3919 if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) 3920 return -EIO; 3921 return 0; 3922 } 3923 3924 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 3925 unsigned long len) 3926 { 3927 btrfs_warn(eb->fs_info, 3928 "access to eb bytenr %llu len %u out of range start %lu len %lu", 3929 eb->start, eb->len, start, len); 3930 DEBUG_WARN(); 3931 3932 return true; 3933 } 3934 3935 /* 3936 * Check if the [start, start + len) range is valid before reading/writing 3937 * the eb. 3938 * NOTE: @start and @len are offset inside the eb, not logical address. 3939 * 3940 * Caller should not touch the dst/src memory if this function returns error. 3941 */ 3942 static inline int check_eb_range(const struct extent_buffer *eb, 3943 unsigned long start, unsigned long len) 3944 { 3945 unsigned long offset; 3946 3947 /* start, start + len should not go beyond eb->len nor overflow */ 3948 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 3949 return report_eb_range(eb, start, len); 3950 3951 return false; 3952 } 3953 3954 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 3955 unsigned long start, unsigned long len) 3956 { 3957 const int unit_size = eb->folio_size; 3958 size_t cur; 3959 size_t offset; 3960 char *dst = (char *)dstv; 3961 unsigned long i = get_eb_folio_index(eb, start); 3962 3963 if (check_eb_range(eb, start, len)) { 3964 /* 3965 * Invalid range hit, reset the memory, so callers won't get 3966 * some random garbage for their uninitialized memory. 
3967 */ 3968 memset(dstv, 0, len); 3969 return; 3970 } 3971 3972 if (eb->addr) { 3973 memcpy(dstv, eb->addr + start, len); 3974 return; 3975 } 3976 3977 offset = get_eb_offset_in_folio(eb, start); 3978 3979 while (len > 0) { 3980 char *kaddr; 3981 3982 cur = min(len, unit_size - offset); 3983 kaddr = folio_address(eb->folios[i]); 3984 memcpy(dst, kaddr + offset, cur); 3985 3986 dst += cur; 3987 len -= cur; 3988 offset = 0; 3989 i++; 3990 } 3991 } 3992 3993 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 3994 void __user *dstv, 3995 unsigned long start, unsigned long len) 3996 { 3997 const int unit_size = eb->folio_size; 3998 size_t cur; 3999 size_t offset; 4000 char __user *dst = (char __user *)dstv; 4001 unsigned long i = get_eb_folio_index(eb, start); 4002 int ret = 0; 4003 4004 WARN_ON(start > eb->len); 4005 WARN_ON(start + len > eb->start + eb->len); 4006 4007 if (eb->addr) { 4008 if (copy_to_user_nofault(dstv, eb->addr + start, len)) 4009 ret = -EFAULT; 4010 return ret; 4011 } 4012 4013 offset = get_eb_offset_in_folio(eb, start); 4014 4015 while (len > 0) { 4016 char *kaddr; 4017 4018 cur = min(len, unit_size - offset); 4019 kaddr = folio_address(eb->folios[i]); 4020 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 4021 ret = -EFAULT; 4022 break; 4023 } 4024 4025 dst += cur; 4026 len -= cur; 4027 offset = 0; 4028 i++; 4029 } 4030 4031 return ret; 4032 } 4033 4034 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 4035 unsigned long start, unsigned long len) 4036 { 4037 const int unit_size = eb->folio_size; 4038 size_t cur; 4039 size_t offset; 4040 char *kaddr; 4041 char *ptr = (char *)ptrv; 4042 unsigned long i = get_eb_folio_index(eb, start); 4043 int ret = 0; 4044 4045 if (check_eb_range(eb, start, len)) 4046 return -EINVAL; 4047 4048 if (eb->addr) 4049 return memcmp(ptrv, eb->addr + start, len); 4050 4051 offset = get_eb_offset_in_folio(eb, start); 4052 4053 while (len > 0) { 4054 cur = min(len, unit_size - offset); 4055 kaddr = folio_address(eb->folios[i]); 4056 ret = memcmp(ptr, kaddr + offset, cur); 4057 if (ret) 4058 break; 4059 4060 ptr += cur; 4061 len -= cur; 4062 offset = 0; 4063 i++; 4064 } 4065 return ret; 4066 } 4067 4068 /* 4069 * Check that the extent buffer is uptodate. 4070 * 4071 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 4072 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 4073 */ 4074 static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) 4075 { 4076 struct btrfs_fs_info *fs_info = eb->fs_info; 4077 struct folio *folio = eb->folios[i]; 4078 4079 ASSERT(folio); 4080 4081 /* 4082 * If we are using the commit root we could potentially clear a page 4083 * Uptodate while we're using the extent buffer that we've previously 4084 * looked up. We don't want to complain in this case, as the page was 4085 * valid before, we just didn't write it out. Instead we want to catch 4086 * the case where we didn't actually read the block properly, which 4087 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. 
4088 */ 4089 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4090 return; 4091 4092 if (btrfs_meta_is_subpage(fs_info)) { 4093 folio = eb->folios[0]; 4094 ASSERT(i == 0); 4095 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, 4096 eb->start, eb->len))) 4097 btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); 4098 } else { 4099 WARN_ON(!folio_test_uptodate(folio)); 4100 } 4101 } 4102 4103 static void __write_extent_buffer(const struct extent_buffer *eb, 4104 const void *srcv, unsigned long start, 4105 unsigned long len, bool use_memmove) 4106 { 4107 const int unit_size = eb->folio_size; 4108 size_t cur; 4109 size_t offset; 4110 char *kaddr; 4111 const char *src = (const char *)srcv; 4112 unsigned long i = get_eb_folio_index(eb, start); 4113 /* For unmapped (dummy) ebs, no need to check their uptodate status. */ 4114 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4115 4116 if (check_eb_range(eb, start, len)) 4117 return; 4118 4119 if (eb->addr) { 4120 if (use_memmove) 4121 memmove(eb->addr + start, srcv, len); 4122 else 4123 memcpy(eb->addr + start, srcv, len); 4124 return; 4125 } 4126 4127 offset = get_eb_offset_in_folio(eb, start); 4128 4129 while (len > 0) { 4130 if (check_uptodate) 4131 assert_eb_folio_uptodate(eb, i); 4132 4133 cur = min(len, unit_size - offset); 4134 kaddr = folio_address(eb->folios[i]); 4135 if (use_memmove) 4136 memmove(kaddr + offset, src, cur); 4137 else 4138 memcpy(kaddr + offset, src, cur); 4139 4140 src += cur; 4141 len -= cur; 4142 offset = 0; 4143 i++; 4144 } 4145 } 4146 4147 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 4148 unsigned long start, unsigned long len) 4149 { 4150 return __write_extent_buffer(eb, srcv, start, len, false); 4151 } 4152 4153 static void memset_extent_buffer(const struct extent_buffer *eb, int c, 4154 unsigned long start, unsigned long len) 4155 { 4156 const int unit_size = eb->folio_size; 4157 unsigned long cur = start; 4158 4159 if (eb->addr) { 4160 memset(eb->addr + start, c, len); 4161 return; 4162 } 4163 4164 while (cur < start + len) { 4165 unsigned long index = get_eb_folio_index(eb, cur); 4166 unsigned int offset = get_eb_offset_in_folio(eb, cur); 4167 unsigned int cur_len = min(start + len - cur, unit_size - offset); 4168 4169 assert_eb_folio_uptodate(eb, index); 4170 memset(folio_address(eb->folios[index]) + offset, c, cur_len); 4171 4172 cur += cur_len; 4173 } 4174 } 4175 4176 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 4177 unsigned long len) 4178 { 4179 if (check_eb_range(eb, start, len)) 4180 return; 4181 return memset_extent_buffer(eb, 0, start, len); 4182 } 4183 4184 void copy_extent_buffer_full(const struct extent_buffer *dst, 4185 const struct extent_buffer *src) 4186 { 4187 const int unit_size = src->folio_size; 4188 unsigned long cur = 0; 4189 4190 ASSERT(dst->len == src->len); 4191 4192 while (cur < src->len) { 4193 unsigned long index = get_eb_folio_index(src, cur); 4194 unsigned long offset = get_eb_offset_in_folio(src, cur); 4195 unsigned long cur_len = min(src->len, unit_size - offset); 4196 void *addr = folio_address(src->folios[index]) + offset; 4197 4198 write_extent_buffer(dst, addr, cur, cur_len); 4199 4200 cur += cur_len; 4201 } 4202 } 4203 4204 void copy_extent_buffer(const struct extent_buffer *dst, 4205 const struct extent_buffer *src, 4206 unsigned long dst_offset, unsigned long src_offset, 4207 unsigned long len) 4208 { 4209 const int unit_size = dst->folio_size; 4210 u64 dst_len = 
dst->len; 4211 size_t cur; 4212 size_t offset; 4213 char *kaddr; 4214 unsigned long i = get_eb_folio_index(dst, dst_offset); 4215 4216 if (check_eb_range(dst, dst_offset, len) || 4217 check_eb_range(src, src_offset, len)) 4218 return; 4219 4220 WARN_ON(src->len != dst_len); 4221 4222 offset = get_eb_offset_in_folio(dst, dst_offset); 4223 4224 while (len > 0) { 4225 assert_eb_folio_uptodate(dst, i); 4226 4227 cur = min(len, (unsigned long)(unit_size - offset)); 4228 4229 kaddr = folio_address(dst->folios[i]); 4230 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4231 4232 src_offset += cur; 4233 len -= cur; 4234 offset = 0; 4235 i++; 4236 } 4237 } 4238 4239 /* 4240 * Calculate the folio and offset of the byte containing the given bit number. 4241 * 4242 * @eb: the extent buffer 4243 * @start: offset of the bitmap item in the extent buffer 4244 * @nr: bit number 4245 * @folio_index: return index of the folio in the extent buffer that contains 4246 * the given bit number 4247 * @folio_offset: return offset into the folio given by folio_index 4248 * 4249 * This helper hides the ugliness of finding the byte in an extent buffer which 4250 * contains a given bit. 4251 */ 4252 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 4253 unsigned long start, unsigned long nr, 4254 unsigned long *folio_index, 4255 size_t *folio_offset) 4256 { 4257 size_t byte_offset = BIT_BYTE(nr); 4258 size_t offset; 4259 4260 /* 4261 * The byte we want is the offset of the extent buffer + the offset of 4262 * the bitmap item in the extent buffer + the offset of the byte in the 4263 * bitmap item. 4264 */ 4265 offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; 4266 4267 *folio_index = offset >> eb->folio_shift; 4268 *folio_offset = offset_in_eb_folio(eb, offset); 4269 } 4270 4271 /* 4272 * Determine whether a bit in a bitmap item is set. 4273 * 4274 * @eb: the extent buffer 4275 * @start: offset of the bitmap item in the extent buffer 4276 * @nr: bit number to test 4277 */ 4278 bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 4279 unsigned long nr) 4280 { 4281 unsigned long i; 4282 size_t offset; 4283 u8 *kaddr; 4284 4285 eb_bitmap_offset(eb, start, nr, &i, &offset); 4286 assert_eb_folio_uptodate(eb, i); 4287 kaddr = folio_address(eb->folios[i]); 4288 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 4289 } 4290 4291 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) 4292 { 4293 unsigned long index = get_eb_folio_index(eb, bytenr); 4294 4295 if (check_eb_range(eb, bytenr, 1)) 4296 return NULL; 4297 return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); 4298 } 4299 4300 /* 4301 * Set an area of a bitmap to 1. 4302 * 4303 * @eb: the extent buffer 4304 * @start: offset of the bitmap item in the extent buffer 4305 * @pos: bit number of the first bit 4306 * @len: number of bits to set 4307 */ 4308 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 4309 unsigned long pos, unsigned long len) 4310 { 4311 unsigned int first_byte = start + BIT_BYTE(pos); 4312 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4313 const bool same_byte = (first_byte == last_byte); 4314 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4315 u8 *kaddr; 4316 4317 if (same_byte) 4318 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4319 4320 /* Handle the first byte. 
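 * For example, pos = 3 and len = 18 cover bits 3..20: the first byte is
 * OR'ed with BITMAP_FIRST_BYTE_MASK(3) == 0xf8, the single fully covered
 * byte in between is memset to 0xff, and the last byte is OR'ed with
 * BITMAP_LAST_BYTE_MASK(21) == 0x1f.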
*/ 4321 kaddr = extent_buffer_get_byte(eb, first_byte); 4322 *kaddr |= mask; 4323 if (same_byte) 4324 return; 4325 4326 /* Handle the byte aligned part. */ 4327 ASSERT(first_byte + 1 <= last_byte); 4328 memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); 4329 4330 /* Handle the last byte. */ 4331 kaddr = extent_buffer_get_byte(eb, last_byte); 4332 *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); 4333 } 4334 4335 4336 /* 4337 * Clear an area of a bitmap. 4338 * 4339 * @eb: the extent buffer 4340 * @start: offset of the bitmap item in the extent buffer 4341 * @pos: bit number of the first bit 4342 * @len: number of bits to clear 4343 */ 4344 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 4345 unsigned long start, unsigned long pos, 4346 unsigned long len) 4347 { 4348 unsigned int first_byte = start + BIT_BYTE(pos); 4349 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4350 const bool same_byte = (first_byte == last_byte); 4351 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4352 u8 *kaddr; 4353 4354 if (same_byte) 4355 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4356 4357 /* Handle the first byte. */ 4358 kaddr = extent_buffer_get_byte(eb, first_byte); 4359 *kaddr &= ~mask; 4360 if (same_byte) 4361 return; 4362 4363 /* Handle the byte aligned part. */ 4364 ASSERT(first_byte + 1 <= last_byte); 4365 memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); 4366 4367 /* Handle the last byte. */ 4368 kaddr = extent_buffer_get_byte(eb, last_byte); 4369 *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); 4370 } 4371 4372 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4373 { 4374 unsigned long distance = (src > dst) ? src - dst : dst - src; 4375 return distance < len; 4376 } 4377 4378 void memcpy_extent_buffer(const struct extent_buffer *dst, 4379 unsigned long dst_offset, unsigned long src_offset, 4380 unsigned long len) 4381 { 4382 const int unit_size = dst->folio_size; 4383 unsigned long cur_off = 0; 4384 4385 if (check_eb_range(dst, dst_offset, len) || 4386 check_eb_range(dst, src_offset, len)) 4387 return; 4388 4389 if (dst->addr) { 4390 const bool use_memmove = areas_overlap(src_offset, dst_offset, len); 4391 4392 if (use_memmove) 4393 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4394 else 4395 memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); 4396 return; 4397 } 4398 4399 while (cur_off < len) { 4400 unsigned long cur_src = cur_off + src_offset; 4401 unsigned long folio_index = get_eb_folio_index(dst, cur_src); 4402 unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); 4403 unsigned long cur_len = min(src_offset + len - cur_src, 4404 unit_size - folio_off); 4405 void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; 4406 const bool use_memmove = areas_overlap(src_offset + cur_off, 4407 dst_offset + cur_off, cur_len); 4408 4409 __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, 4410 use_memmove); 4411 cur_off += cur_len; 4412 } 4413 } 4414 4415 void memmove_extent_buffer(const struct extent_buffer *dst, 4416 unsigned long dst_offset, unsigned long src_offset, 4417 unsigned long len) 4418 { 4419 unsigned long dst_end = dst_offset + len - 1; 4420 unsigned long src_end = src_offset + len - 1; 4421 4422 if (check_eb_range(dst, dst_offset, len) || 4423 check_eb_range(dst, src_offset, len)) 4424 return; 4425 4426 if (dst_offset < src_offset) { 4427 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4428 return; 4429 } 4430 4431 if 
(dst->addr) { 4432 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4433 return; 4434 } 4435 4436 while (len > 0) { 4437 unsigned long src_i; 4438 size_t cur; 4439 size_t dst_off_in_folio; 4440 size_t src_off_in_folio; 4441 void *src_addr; 4442 bool use_memmove; 4443 4444 src_i = get_eb_folio_index(dst, src_end); 4445 4446 dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); 4447 src_off_in_folio = get_eb_offset_in_folio(dst, src_end); 4448 4449 cur = min_t(unsigned long, len, src_off_in_folio + 1); 4450 cur = min(cur, dst_off_in_folio + 1); 4451 4452 src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - 4453 cur + 1; 4454 use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, 4455 cur); 4456 4457 __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, 4458 use_memmove); 4459 4460 dst_end -= cur; 4461 src_end -= cur; 4462 len -= cur; 4463 } 4464 } 4465 4466 static int try_release_subpage_extent_buffer(struct folio *folio) 4467 { 4468 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4469 struct extent_buffer *eb; 4470 unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); 4471 unsigned long index = start; 4472 unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; 4473 int ret; 4474 4475 rcu_read_lock(); 4476 xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4477 /* 4478 * The same as try_release_extent_buffer(), to ensure the eb 4479 * won't disappear out from under us. 4480 */ 4481 spin_lock(&eb->refs_lock); 4482 rcu_read_unlock(); 4483 4484 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4485 spin_unlock(&eb->refs_lock); 4486 rcu_read_lock(); 4487 continue; 4488 } 4489 4490 /* 4491 * If tree ref isn't set then we know the ref on this eb is a 4492 * real ref, so just return, this eb will likely be freed soon 4493 * anyway. 4494 */ 4495 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4496 spin_unlock(&eb->refs_lock); 4497 break; 4498 } 4499 4500 /* 4501 * Here we don't care about the return value, we will always 4502 * check the folio private at the end. And 4503 * release_extent_buffer() will release the refs_lock. 4504 */ 4505 release_extent_buffer(eb); 4506 rcu_read_lock(); 4507 } 4508 rcu_read_unlock(); 4509 4510 /* 4511 * Finally to check if we have cleared folio private, as if we have 4512 * released all ebs in the page, the folio private should be cleared now. 4513 */ 4514 spin_lock(&folio->mapping->i_private_lock); 4515 if (!folio_test_private(folio)) 4516 ret = 1; 4517 else 4518 ret = 0; 4519 spin_unlock(&folio->mapping->i_private_lock); 4520 return ret; 4521 } 4522 4523 int try_release_extent_buffer(struct folio *folio) 4524 { 4525 struct extent_buffer *eb; 4526 4527 if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) 4528 return try_release_subpage_extent_buffer(folio); 4529 4530 /* 4531 * We need to make sure nobody is changing folio private, as we rely on 4532 * folio private as the pointer to extent buffer. 4533 */ 4534 spin_lock(&folio->mapping->i_private_lock); 4535 if (!folio_test_private(folio)) { 4536 spin_unlock(&folio->mapping->i_private_lock); 4537 return 1; 4538 } 4539 4540 eb = folio_get_private(folio); 4541 BUG_ON(!eb); 4542 4543 /* 4544 * This is a little awful but should be ok, we need to make sure that 4545 * the eb doesn't disappear out from under us while we're looking at 4546 * this page. 
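 * Taking eb->refs_lock while still holding i_private_lock pins the eb: the
 * final release path needs refs_lock as well, so the eb cannot be freed
 * while we are inspecting it, and the mapping lock can be dropped right
 * after the ref check.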
4547 */ 4548 spin_lock(&eb->refs_lock); 4549 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4550 spin_unlock(&eb->refs_lock); 4551 spin_unlock(&folio->mapping->i_private_lock); 4552 return 0; 4553 } 4554 spin_unlock(&folio->mapping->i_private_lock); 4555 4556 /* 4557 * If tree ref isn't set then we know the ref on this eb is a real ref, 4558 * so just return, this page will likely be freed soon anyway. 4559 */ 4560 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4561 spin_unlock(&eb->refs_lock); 4562 return 0; 4563 } 4564 4565 return release_extent_buffer(eb); 4566 } 4567 4568 /* 4569 * Attempt to readahead a child block. 4570 * 4571 * @fs_info: the fs_info 4572 * @bytenr: bytenr to read 4573 * @owner_root: objectid of the root that owns this eb 4574 * @gen: generation for the uptodate check, can be 0 4575 * @level: level for the eb 4576 * 4577 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 4578 * normal uptodate check of the eb, without checking the generation. If we have 4579 * to read the block we will not block on anything. 4580 */ 4581 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 4582 u64 bytenr, u64 owner_root, u64 gen, int level) 4583 { 4584 struct btrfs_tree_parent_check check = { 4585 .level = level, 4586 .transid = gen 4587 }; 4588 struct extent_buffer *eb; 4589 int ret; 4590 4591 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 4592 if (IS_ERR(eb)) 4593 return; 4594 4595 if (btrfs_buffer_uptodate(eb, gen, true)) { 4596 free_extent_buffer(eb); 4597 return; 4598 } 4599 4600 ret = read_extent_buffer_pages_nowait(eb, 0, &check); 4601 if (ret < 0) 4602 free_extent_buffer_stale(eb); 4603 else 4604 free_extent_buffer(eb); 4605 } 4606 4607 /* 4608 * Readahead a node's child block. 4609 * 4610 * @node: parent node we're reading from 4611 * @slot: slot in the parent node for the child we want to read 4612 * 4613 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 4614 * the slot in the node provided. 4615 */ 4616 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 4617 { 4618 btrfs_readahead_tree_block(node->fs_info, 4619 btrfs_node_blockptr(node, slot), 4620 btrfs_header_owner(node), 4621 btrfs_node_ptr_generation(node, slot), 4622 btrfs_header_level(node) - 1); 4623 } 4624
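/*
 * Illustrative sketch only (kept in a comment so it is not compiled): one
 * plausible way a caller could string together the read-side helpers defined
 * above, mirroring what btrfs_readahead_tree_block() does but waiting for
 * the result. The variables fs_info, bytenr, owner_root, level, gen, buf,
 * offset and len are assumptions of the example, not names from this file:
 *
 *	struct btrfs_tree_parent_check check = {
 *		.level = level,
 *		.transid = gen,
 *	};
 *	struct extent_buffer *eb;
 *	int ret;
 *
 *	eb = alloc_extent_buffer(fs_info, bytenr, owner_root, level);
 *	if (IS_ERR(eb))
 *		return PTR_ERR(eb);
 *
 *	ret = read_extent_buffer_pages(eb, 0, &check);
 *	if (ret < 0) {
 *		free_extent_buffer_stale(eb);
 *		return ret;
 *	}
 *
 *	read_extent_buffer(eb, buf, offset, len);
 *	free_extent_buffer(eb);
 */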