1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "extent_io.h" 18 #include "extent-io-tree.h" 19 #include "extent_map.h" 20 #include "ctree.h" 21 #include "btrfs_inode.h" 22 #include "bio.h" 23 #include "locking.h" 24 #include "backref.h" 25 #include "disk-io.h" 26 #include "subpage.h" 27 #include "zoned.h" 28 #include "block-group.h" 29 #include "compression.h" 30 #include "fs.h" 31 #include "accessors.h" 32 #include "file-item.h" 33 #include "file.h" 34 #include "dev-replace.h" 35 #include "super.h" 36 #include "transaction.h" 37 38 static struct kmem_cache *extent_buffer_cache; 39 40 #ifdef CONFIG_BTRFS_DEBUG 41 static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) 42 { 43 struct btrfs_fs_info *fs_info = eb->fs_info; 44 unsigned long flags; 45 46 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 47 list_add(&eb->leak_list, &fs_info->allocated_ebs); 48 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 49 } 50 51 static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) 52 { 53 struct btrfs_fs_info *fs_info = eb->fs_info; 54 unsigned long flags; 55 56 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 57 list_del(&eb->leak_list); 58 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 59 } 60 61 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 62 { 63 struct extent_buffer *eb; 64 unsigned long flags; 65 66 /* 67 * If we didn't get into open_ctree our allocated_ebs will not be 68 * initialized, so just skip this. 69 */ 70 if (!fs_info->allocated_ebs.next) 71 return; 72 73 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 74 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 75 while (!list_empty(&fs_info->allocated_ebs)) { 76 eb = list_first_entry(&fs_info->allocated_ebs, 77 struct extent_buffer, leak_list); 78 btrfs_err(fs_info, 79 "buffer leak start %llu len %u refs %d bflags %lu owner %llu", 80 eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, 81 btrfs_header_owner(eb)); 82 list_del(&eb->leak_list); 83 WARN_ON_ONCE(1); 84 kmem_cache_free(extent_buffer_cache, eb); 85 } 86 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 87 } 88 #else 89 #define btrfs_leak_debug_add_eb(eb) do {} while (0) 90 #define btrfs_leak_debug_del_eb(eb) do {} while (0) 91 #endif 92 93 /* 94 * Structure to record info about the bio being assembled, and other info like 95 * how many bytes are there before stripe/ordered extent boundary. 96 */ 97 struct btrfs_bio_ctrl { 98 struct btrfs_bio *bbio; 99 /* Last byte contained in bbio + 1 . */ 100 loff_t next_file_offset; 101 enum btrfs_compression_type compress_type; 102 u32 len_to_oe_boundary; 103 blk_opf_t opf; 104 /* 105 * For data read bios, we attempt to optimize csum lookups if the extent 106 * generation is older than the current one. To make this possible, we 107 * need to track the maximum generation of an extent in a bio_ctrl to 108 * make the decision when submitting the bio. 109 * 110 * The pattern between do_readpage(), submit_one_bio() and 111 * submit_extent_folio() is quite subtle, so tracking this is tricky. 
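	 *
	 * For example, with extent E1 (generation 10, already added to the
	 * bbio) and a non-contiguous extent E2 (generation 5), the bio
	 * carrying E1 must be submitted with generation 10 before E2's
	 * generation may be folded into a fresh bio_ctrl->generation.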
112 * 113 * As we process extent E, we might submit a bio with existing built up 114 * extents before adding E to a new bio, or we might just add E to the 115 * bio. As a result, E's generation could apply to the current bio or 116 * to the next one, so we need to be careful to update the bio_ctrl's 117 * generation with E's only when we are sure E is added to bio_ctrl->bbio 118 * in submit_extent_folio(). 119 * 120 * See the comment in btrfs_lookup_bio_sums() for more detail on the 121 * need for this optimization. 122 */ 123 u64 generation; 124 btrfs_bio_end_io_t end_io_func; 125 struct writeback_control *wbc; 126 127 /* 128 * The sectors of the page which are going to be submitted by 129 * extent_writepage_io(). 130 * This is to avoid touching ranges covered by compression/inline. 131 */ 132 unsigned long submit_bitmap; 133 struct readahead_control *ractl; 134 135 /* 136 * The start offset of the last used extent map by a read operation. 137 * 138 * This is for proper compressed read merge. 139 * U64_MAX means we are starting the read and have made no progress yet. 140 * 141 * The current btrfs_bio_is_contig() only uses disk_bytenr as 142 * the condition to check if the read can be merged with previous 143 * bio, which is not correct. E.g. two file extents pointing to the 144 * same extent but with different offset. 145 * 146 * So here we need to do extra checks to only merge reads that are 147 * covered by the same extent map. 148 * Just extent_map::start will be enough, as they are unique 149 * inside the same inode. 150 */ 151 u64 last_em_start; 152 }; 153 154 /* 155 * Helper to set the csum search commit root option for a bio_ctrl's bbio 156 * before submitting the bio. 157 * 158 * Only for use by submit_one_bio(). 159 */ 160 static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) 161 { 162 struct btrfs_bio *bbio = bio_ctrl->bbio; 163 164 ASSERT(bbio); 165 166 if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) 167 return; 168 169 bio_ctrl->bbio->csum_search_commit_root = 170 (bio_ctrl->generation && 171 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); 172 } 173 174 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 175 { 176 struct btrfs_bio *bbio = bio_ctrl->bbio; 177 178 if (!bbio) 179 return; 180 181 /* Caller should ensure the bio has at least some range added */ 182 ASSERT(bbio->bio.bi_iter.bi_size); 183 184 bio_set_csum_search_commit_root(bio_ctrl); 185 186 if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && 187 bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 188 btrfs_submit_compressed_read(bbio); 189 else 190 btrfs_submit_bbio(bbio, 0); 191 192 /* The bbio is owned by the end_io handler now */ 193 bio_ctrl->bbio = NULL; 194 /* 195 * We used the generation to decide whether to lookup csums in the 196 * commit_root or not when we called bio_set_csum_search_commit_root() 197 * above. Now, reset the generation for the next bio. 198 */ 199 bio_ctrl->generation = 0; 200 } 201 202 /* 203 * Submit or fail the current bio in the bio_ctrl structure. 
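 *
 * @ret must be 0 or a negative errno. A typical (illustrative) call
 * pattern from a write path is:
 *
 *	ret = extent_writepage(folio, &bio_ctrl);
 *	...
 *	submit_write_bio(&bio_ctrl, ret);
 *
 * so a partially built bio is either submitted normally or completed
 * with the matching blk_status_t.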
204 */ 205 static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) 206 { 207 struct btrfs_bio *bbio = bio_ctrl->bbio; 208 209 if (!bbio) 210 return; 211 212 if (ret) { 213 ASSERT(ret < 0); 214 btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 215 /* The bio is owned by the end_io handler now */ 216 bio_ctrl->bbio = NULL; 217 } else { 218 submit_one_bio(bio_ctrl); 219 } 220 } 221 222 int __init extent_buffer_init_cachep(void) 223 { 224 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 225 sizeof(struct extent_buffer), 0, 0, 226 NULL); 227 if (!extent_buffer_cache) 228 return -ENOMEM; 229 230 return 0; 231 } 232 233 void __cold extent_buffer_free_cachep(void) 234 { 235 /* 236 * Make sure all delayed rcu free are flushed before we 237 * destroy caches. 238 */ 239 rcu_barrier(); 240 kmem_cache_destroy(extent_buffer_cache); 241 } 242 243 static void process_one_folio(struct btrfs_fs_info *fs_info, 244 struct folio *folio, const struct folio *locked_folio, 245 unsigned long page_ops, u64 start, u64 end) 246 { 247 u32 len; 248 249 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 250 len = end + 1 - start; 251 252 if (page_ops & PAGE_SET_ORDERED) 253 btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); 254 if (page_ops & PAGE_START_WRITEBACK) { 255 btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); 256 btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); 257 } 258 if (page_ops & PAGE_END_WRITEBACK) 259 btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); 260 261 if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) 262 btrfs_folio_end_lock(fs_info, folio, start, len); 263 } 264 265 static void __process_folios_contig(struct address_space *mapping, 266 const struct folio *locked_folio, u64 start, 267 u64 end, unsigned long page_ops) 268 { 269 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 270 pgoff_t index = start >> PAGE_SHIFT; 271 pgoff_t end_index = end >> PAGE_SHIFT; 272 struct folio_batch fbatch; 273 int i; 274 275 folio_batch_init(&fbatch); 276 while (index <= end_index) { 277 int found_folios; 278 279 found_folios = filemap_get_folios_contig(mapping, &index, 280 end_index, &fbatch); 281 for (i = 0; i < found_folios; i++) { 282 struct folio *folio = fbatch.folios[i]; 283 284 process_one_folio(fs_info, folio, locked_folio, 285 page_ops, start, end); 286 } 287 folio_batch_release(&fbatch); 288 cond_resched(); 289 } 290 } 291 292 static noinline void unlock_delalloc_folio(const struct inode *inode, 293 struct folio *locked_folio, 294 u64 start, u64 end) 295 { 296 ASSERT(locked_folio); 297 298 __process_folios_contig(inode->i_mapping, locked_folio, start, end, 299 PAGE_UNLOCK); 300 } 301 302 static noinline int lock_delalloc_folios(struct inode *inode, 303 struct folio *locked_folio, 304 u64 start, u64 end) 305 { 306 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 307 struct address_space *mapping = inode->i_mapping; 308 pgoff_t index = start >> PAGE_SHIFT; 309 pgoff_t end_index = end >> PAGE_SHIFT; 310 u64 processed_end = start; 311 struct folio_batch fbatch; 312 313 folio_batch_init(&fbatch); 314 while (index <= end_index) { 315 unsigned int found_folios, i; 316 317 found_folios = filemap_get_folios_contig(mapping, &index, 318 end_index, &fbatch); 319 if (found_folios == 0) 320 goto out; 321 322 for (i = 0; i < found_folios; i++) { 323 struct folio *folio = fbatch.folios[i]; 324 u64 range_start; 325 u32 range_len; 326 327 if (folio == locked_folio) 328 continue; 329 330 folio_lock(folio); 331 if 
(!folio_test_dirty(folio) || folio->mapping != mapping) { 332 folio_unlock(folio); 333 goto out; 334 } 335 range_start = max_t(u64, folio_pos(folio), start); 336 range_len = min_t(u64, folio_end(folio), end + 1) - range_start; 337 btrfs_folio_set_lock(fs_info, folio, range_start, range_len); 338 339 processed_end = range_start + range_len - 1; 340 } 341 folio_batch_release(&fbatch); 342 cond_resched(); 343 } 344 345 return 0; 346 out: 347 folio_batch_release(&fbatch); 348 if (processed_end > start) 349 unlock_delalloc_folio(inode, locked_folio, start, processed_end); 350 return -EAGAIN; 351 } 352 353 /* 354 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 355 * more than @max_bytes. 356 * 357 * @start: The original start bytenr to search. 358 * Will store the extent range start bytenr. 359 * @end: The original end bytenr of the search range 360 * Will store the extent range end bytenr. 361 * 362 * Return true if we find a delalloc range which starts inside the original 363 * range, and @start/@end will store the delalloc range start/end. 364 * 365 * Return false if we can't find any delalloc range which starts inside the 366 * original range, and @start/@end will be the non-delalloc range start/end. 367 */ 368 EXPORT_FOR_TESTS 369 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 370 struct folio *locked_folio, 371 u64 *start, u64 *end) 372 { 373 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 375 const u64 orig_start = *start; 376 const u64 orig_end = *end; 377 /* The sanity tests may not set a valid fs_info. */ 378 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; 379 u64 delalloc_start; 380 u64 delalloc_end; 381 bool found; 382 struct extent_state *cached_state = NULL; 383 int ret; 384 int loops = 0; 385 386 /* Caller should pass a valid @end to indicate the search range end */ 387 ASSERT(orig_end > orig_start); 388 389 /* The range should at least cover part of the folio */ 390 ASSERT(!(orig_start >= folio_end(locked_folio) || 391 orig_end <= folio_pos(locked_folio))); 392 again: 393 /* step one, find a bunch of delalloc bytes starting at start */ 394 delalloc_start = *start; 395 delalloc_end = 0; 396 397 /* 398 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can 399 * return early without handling any dirty ranges. 400 */ 401 ASSERT(max_bytes >= fs_info->sectorsize); 402 403 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 404 max_bytes, &cached_state); 405 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 406 *start = delalloc_start; 407 408 /* @delalloc_end can be -1, never go beyond @orig_end */ 409 *end = min(delalloc_end, orig_end); 410 btrfs_free_extent_state(cached_state); 411 return false; 412 } 413 414 /* 415 * start comes from the offset of locked_folio. 
We have to lock
	 * folios in order, so we can't process delalloc bytes before
	 * locked_folio.
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of folios we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the folios after the folio that has start */
	ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
				   delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/*
		 * Some of the folios are gone, let's avoid looping by
		 * shortening the size of the delalloc range we're searching.
		 */
		btrfs_free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = fs_info->sectorsize;
			loops = 1;
			goto again;
		} else {
			found = false;
			goto out_failed;
		}
	}

	/* step three, lock the state bits for the whole range */
	btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
				   EXTENT_DELALLOC, cached_state);

	btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
	if (!ret) {
		unlock_delalloc_folio(inode, locked_folio, delalloc_start,
				      delalloc_end);
		cond_resched();
		goto again;
	}
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  const struct folio *locked_folio,
				  struct extent_state **cached,
				  u32 clear_bits, unsigned long page_ops)
{
	btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);

	__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
				end, page_ops);
}

static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	if (!fsverity_active(folio->mapping->host) ||
	    btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
	    start >= i_size_read(folio->mapping->host))
		return true;
	return fsverity_verify_folio(folio);
}

static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	ASSERT(folio_pos(folio) <= start &&
	       start + len <= folio_end(folio));

	if (uptodate && btrfs_verify_folio(folio, start, len))
		btrfs_folio_set_uptodate(fs_info, folio, start, len);
	else
		btrfs_folio_clear_uptodate(fs_info, folio, start, len);

	if (!btrfs_is_subpage(fs_info, folio))
		folio_unlock(folio);
	else
		btrfs_folio_end_lock(fs_info, folio, start, len);
}

/*
 * After a write IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - clear the writeback bits in the extent tree for the range
 * - folio_end_writeback() if there is no more pending io for the folio
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
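 *
 * Each bio_vec segment is expected to be sector aligned, e.g. with a 4K
 * sectorsize a segment at offset 4K with length 8K is fine, while an
 * offset of 2K would trigger the partial write warning below.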
518 */ 519 static void end_bbio_data_write(struct btrfs_bio *bbio) 520 { 521 struct btrfs_fs_info *fs_info = bbio->fs_info; 522 struct bio *bio = &bbio->bio; 523 int error = blk_status_to_errno(bio->bi_status); 524 struct folio_iter fi; 525 const u32 sectorsize = fs_info->sectorsize; 526 527 ASSERT(!bio_flagged(bio, BIO_CLONED)); 528 bio_for_each_folio_all(fi, bio) { 529 struct folio *folio = fi.folio; 530 u64 start = folio_pos(folio) + fi.offset; 531 u32 len = fi.length; 532 533 /* Our read/write should always be sector aligned. */ 534 if (!IS_ALIGNED(fi.offset, sectorsize)) 535 btrfs_err(fs_info, 536 "partial page write in btrfs with offset %zu and length %zu", 537 fi.offset, fi.length); 538 else if (!IS_ALIGNED(fi.length, sectorsize)) 539 btrfs_info(fs_info, 540 "incomplete page write with offset %zu and length %zu", 541 fi.offset, fi.length); 542 543 btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, 544 !error); 545 if (error) 546 mapping_set_error(folio->mapping, error); 547 btrfs_folio_clear_writeback(fs_info, folio, start, len); 548 } 549 550 bio_put(bio); 551 } 552 553 static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) 554 { 555 ASSERT(folio_test_locked(folio)); 556 if (!btrfs_is_subpage(fs_info, folio)) 557 return; 558 559 ASSERT(folio_test_private(folio)); 560 btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); 561 } 562 563 /* 564 * After a data read IO is done, we need to: 565 * 566 * - clear the uptodate bits on error 567 * - set the uptodate bits if things worked 568 * - set the folio up to date if all extents in the tree are uptodate 569 * - clear the lock bit in the extent tree 570 * - unlock the folio if there are no other extents locked for it 571 * 572 * Scheduling is not allowed, so the extent state tree is expected 573 * to have one and only one object corresponding to this IO. 574 */ 575 static void end_bbio_data_read(struct btrfs_bio *bbio) 576 { 577 struct btrfs_fs_info *fs_info = bbio->fs_info; 578 struct bio *bio = &bbio->bio; 579 struct folio_iter fi; 580 581 ASSERT(!bio_flagged(bio, BIO_CLONED)); 582 bio_for_each_folio_all(fi, &bbio->bio) { 583 bool uptodate = !bio->bi_status; 584 struct folio *folio = fi.folio; 585 struct inode *inode = folio->mapping->host; 586 u64 start = folio_pos(folio) + fi.offset; 587 588 btrfs_debug(fs_info, 589 "%s: bi_sector=%llu, err=%d, mirror=%u", 590 __func__, bio->bi_iter.bi_sector, bio->bi_status, 591 bbio->mirror_num); 592 593 594 if (likely(uptodate)) { 595 u64 end = start + fi.length - 1; 596 loff_t i_size = i_size_read(inode); 597 598 /* 599 * Zero out the remaining part if this range straddles 600 * i_size. 601 * 602 * Here we should only zero the range inside the folio, 603 * not touch anything else. 604 * 605 * NOTE: i_size is exclusive while end is inclusive and 606 * folio_contains() takes PAGE_SIZE units. 607 */ 608 if (folio_contains(folio, i_size >> PAGE_SHIFT) && 609 i_size <= end) { 610 u32 zero_start = max(offset_in_folio(folio, i_size), 611 offset_in_folio(folio, start)); 612 u32 zero_len = offset_in_folio(folio, end) + 1 - 613 zero_start; 614 615 folio_zero_range(folio, zero_start, zero_len); 616 } 617 } 618 619 /* Update page status and unlock. */ 620 end_folio_read(folio, uptodate, start, fi.length); 621 } 622 bio_put(bio); 623 } 624 625 /* 626 * Populate every free slot in a provided array with folios using GFP_NOFS. 
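 * Slots that already hold a folio are skipped, so the array may be
 * partially pre-populated by the caller. Illustrative usage:
 *
 *	struct folio *folios[16] = { NULL };
 *
 *	if (btrfs_alloc_folio_array(16, 0, folios))
 *		return -ENOMEM;
 *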
627 * 628 * @nr_folios: number of folios to allocate 629 * @order: the order of the folios to be allocated 630 * @folio_array: the array to fill with folios; any existing non-NULL entries in 631 * the array will be skipped 632 * 633 * Return: 0 if all folios were able to be allocated; 634 * -ENOMEM otherwise, the partially allocated folios would be freed and 635 * the array slots zeroed 636 */ 637 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, 638 struct folio **folio_array) 639 { 640 for (int i = 0; i < nr_folios; i++) { 641 if (folio_array[i]) 642 continue; 643 folio_array[i] = folio_alloc(GFP_NOFS, order); 644 if (!folio_array[i]) 645 goto error; 646 } 647 return 0; 648 error: 649 for (int i = 0; i < nr_folios; i++) { 650 if (folio_array[i]) 651 folio_put(folio_array[i]); 652 folio_array[i] = NULL; 653 } 654 return -ENOMEM; 655 } 656 657 /* 658 * Populate every free slot in a provided array with pages, using GFP_NOFS. 659 * 660 * @nr_pages: number of pages to allocate 661 * @page_array: the array to fill with pages; any existing non-null entries in 662 * the array will be skipped 663 * @nofail: whether using __GFP_NOFAIL flag 664 * 665 * Return: 0 if all pages were able to be allocated; 666 * -ENOMEM otherwise, the partially allocated pages would be freed and 667 * the array slots zeroed 668 */ 669 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, 670 bool nofail) 671 { 672 const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; 673 unsigned int allocated; 674 675 for (allocated = 0; allocated < nr_pages;) { 676 unsigned int last = allocated; 677 678 allocated = alloc_pages_bulk(gfp, nr_pages, page_array); 679 if (unlikely(allocated == last)) { 680 /* No progress, fail and do cleanup. */ 681 for (int i = 0; i < allocated; i++) { 682 __free_page(page_array[i]); 683 page_array[i] = NULL; 684 } 685 return -ENOMEM; 686 } 687 } 688 return 0; 689 } 690 691 /* 692 * Populate needed folios for the extent buffer. 693 * 694 * For now, the folios populated are always in order 0 (aka, single page). 695 */ 696 static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) 697 { 698 struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; 699 int num_pages = num_extent_pages(eb); 700 int ret; 701 702 ret = btrfs_alloc_page_array(num_pages, page_array, nofail); 703 if (ret < 0) 704 return ret; 705 706 for (int i = 0; i < num_pages; i++) 707 eb->folios[i] = page_folio(page_array[i]); 708 eb->folio_size = PAGE_SIZE; 709 eb->folio_shift = PAGE_SHIFT; 710 return 0; 711 } 712 713 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, 714 u64 disk_bytenr, loff_t file_offset) 715 { 716 struct bio *bio = &bio_ctrl->bbio->bio; 717 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 718 719 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 720 /* 721 * For compression, all IO should have its logical bytenr set 722 * to the starting bytenr of the compressed extent. 723 */ 724 return bio->bi_iter.bi_sector == sector; 725 } 726 727 /* 728 * To merge into a bio both the disk sector and the logical offset in 729 * the file need to be contiguous. 
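	 * E.g. a block that is physically adjacent on disk but belongs to a
	 * non-adjacent file offset (or vice versa) must start a new bio,
	 * which is why both next_file_offset and bio_end_sector() are
	 * checked below.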
730 */ 731 return bio_ctrl->next_file_offset == file_offset && 732 bio_end_sector(bio) == sector; 733 } 734 735 static void alloc_new_bio(struct btrfs_inode *inode, 736 struct btrfs_bio_ctrl *bio_ctrl, 737 u64 disk_bytenr, u64 file_offset) 738 { 739 struct btrfs_fs_info *fs_info = inode->root->fs_info; 740 struct btrfs_bio *bbio; 741 742 bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, 743 bio_ctrl->end_io_func, NULL); 744 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 745 bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; 746 bbio->inode = inode; 747 bbio->file_offset = file_offset; 748 bio_ctrl->bbio = bbio; 749 bio_ctrl->len_to_oe_boundary = U32_MAX; 750 bio_ctrl->next_file_offset = file_offset; 751 752 /* Limit data write bios to the ordered boundary. */ 753 if (bio_ctrl->wbc) { 754 struct btrfs_ordered_extent *ordered; 755 756 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 757 if (ordered) { 758 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 759 ordered->file_offset + 760 ordered->disk_num_bytes - file_offset); 761 bbio->ordered = ordered; 762 } 763 764 /* 765 * Pick the last added device to support cgroup writeback. For 766 * multi-device file systems this means blk-cgroup policies have 767 * to always be set on the last added/replaced device. 768 * This is a bit odd but has been like that for a long time. 769 */ 770 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 771 wbc_init_bio(bio_ctrl->wbc, &bbio->bio); 772 } 773 } 774 775 /* 776 * @disk_bytenr: logical bytenr where the write will be 777 * @page: page to add to the bio 778 * @size: portion of page that we want to write to 779 * @pg_offset: offset of the new bio or to check whether we are adding 780 * a contiguous page to the previous one 781 * @read_em_generation: generation of the extent_map we are submitting 782 * (only used for read) 783 * 784 * The will either add the page into the existing @bio_ctrl->bbio, or allocate a 785 * new one in @bio_ctrl->bbio. 786 * The mirror number for this IO should already be initialized in 787 * @bio_ctrl->mirror_num. 788 */ 789 static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, 790 u64 disk_bytenr, struct folio *folio, 791 size_t size, unsigned long pg_offset, 792 u64 read_em_generation) 793 { 794 struct btrfs_inode *inode = folio_to_inode(folio); 795 loff_t file_offset = folio_pos(folio) + pg_offset; 796 797 ASSERT(pg_offset + size <= folio_size(folio)); 798 ASSERT(bio_ctrl->end_io_func); 799 800 if (bio_ctrl->bbio && 801 !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) 802 submit_one_bio(bio_ctrl); 803 804 do { 805 u32 len = size; 806 807 /* Allocate new bio if needed */ 808 if (!bio_ctrl->bbio) 809 alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); 810 811 /* Cap to the current ordered extent boundary if there is one. */ 812 if (len > bio_ctrl->len_to_oe_boundary) { 813 ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); 814 ASSERT(is_data_inode(inode)); 815 len = bio_ctrl->len_to_oe_boundary; 816 } 817 818 if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { 819 /* bio full: move on to a new one */ 820 submit_one_bio(bio_ctrl); 821 continue; 822 } 823 /* 824 * Now that the folio is definitely added to the bio, include its 825 * generation in the max generation calculation. 
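		 * If bio_add_folio() had failed just above, this range would
		 * end up in the next bio instead, and updating the generation
		 * here would have tagged the wrong bio.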
826 */ 827 bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); 828 bio_ctrl->next_file_offset += len; 829 830 if (bio_ctrl->wbc) 831 wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); 832 833 size -= len; 834 pg_offset += len; 835 disk_bytenr += len; 836 file_offset += len; 837 838 /* 839 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or 840 * sector aligned. alloc_new_bio() then sets it to the end of 841 * our ordered extent for writes into zoned devices. 842 * 843 * When len_to_oe_boundary is tracking an ordered extent, we 844 * trust the ordered extent code to align things properly, and 845 * the check above to cap our write to the ordered extent 846 * boundary is correct. 847 * 848 * When len_to_oe_boundary is U32_MAX, the cap above would 849 * result in a 4095 byte IO for the last folio right before 850 * we hit the bio limit of UINT_MAX. bio_add_folio() has all 851 * the checks required to make sure we don't overflow the bio, 852 * and we should just ignore len_to_oe_boundary completely 853 * unless we're using it to track an ordered extent. 854 * 855 * It's pretty hard to make a bio sized U32_MAX, but it can 856 * happen when the page cache is able to feed us contiguous 857 * folios for large extents. 858 */ 859 if (bio_ctrl->len_to_oe_boundary != U32_MAX) 860 bio_ctrl->len_to_oe_boundary -= len; 861 862 /* Ordered extent boundary: move on to a new bio. */ 863 if (bio_ctrl->len_to_oe_boundary == 0) 864 submit_one_bio(bio_ctrl); 865 } while (size); 866 } 867 868 static int attach_extent_buffer_folio(struct extent_buffer *eb, 869 struct folio *folio, 870 struct btrfs_folio_state *prealloc) 871 { 872 struct btrfs_fs_info *fs_info = eb->fs_info; 873 int ret = 0; 874 875 /* 876 * If the page is mapped to btree inode, we should hold the private 877 * lock to prevent race. 878 * For cloned or dummy extent buffers, their pages are not mapped and 879 * will not race with any other ebs. 
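	 * (E.g. an eb cloned by btrfs_clone_extent_buffer() has freshly
	 * allocated folios with folio->mapping == NULL, so the lockdep
	 * assertion below is simply skipped for it.)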
880 */ 881 if (folio->mapping) 882 lockdep_assert_held(&folio->mapping->i_private_lock); 883 884 if (!btrfs_meta_is_subpage(fs_info)) { 885 if (!folio_test_private(folio)) 886 folio_attach_private(folio, eb); 887 else 888 WARN_ON(folio_get_private(folio) != eb); 889 return 0; 890 } 891 892 /* Already mapped, just free prealloc */ 893 if (folio_test_private(folio)) { 894 btrfs_free_folio_state(prealloc); 895 return 0; 896 } 897 898 if (prealloc) 899 /* Has preallocated memory for subpage */ 900 folio_attach_private(folio, prealloc); 901 else 902 /* Do new allocation to attach subpage */ 903 ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 904 return ret; 905 } 906 907 int set_folio_extent_mapped(struct folio *folio) 908 { 909 struct btrfs_fs_info *fs_info; 910 911 ASSERT(folio->mapping); 912 913 if (folio_test_private(folio)) 914 return 0; 915 916 fs_info = folio_to_fs_info(folio); 917 918 if (btrfs_is_subpage(fs_info, folio)) 919 return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 920 921 folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); 922 return 0; 923 } 924 925 void clear_folio_extent_mapped(struct folio *folio) 926 { 927 struct btrfs_fs_info *fs_info; 928 929 ASSERT(folio->mapping); 930 931 if (!folio_test_private(folio)) 932 return; 933 934 fs_info = folio_to_fs_info(folio); 935 if (btrfs_is_subpage(fs_info, folio)) 936 return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 937 938 folio_detach_private(folio); 939 } 940 941 static struct extent_map *get_extent_map(struct btrfs_inode *inode, 942 struct folio *folio, u64 start, 943 u64 len, struct extent_map **em_cached) 944 { 945 struct extent_map *em; 946 947 ASSERT(em_cached); 948 949 if (*em_cached) { 950 em = *em_cached; 951 if (btrfs_extent_map_in_tree(em) && start >= em->start && 952 start < btrfs_extent_map_end(em)) { 953 refcount_inc(&em->refs); 954 return em; 955 } 956 957 btrfs_free_extent_map(em); 958 *em_cached = NULL; 959 } 960 961 em = btrfs_get_extent(inode, folio, start, len); 962 if (!IS_ERR(em)) { 963 BUG_ON(*em_cached); 964 refcount_inc(&em->refs); 965 *em_cached = em; 966 } 967 968 return em; 969 } 970 971 static void btrfs_readahead_expand(struct readahead_control *ractl, 972 const struct extent_map *em) 973 { 974 const u64 ra_pos = readahead_pos(ractl); 975 const u64 ra_end = ra_pos + readahead_length(ractl); 976 const u64 em_end = em->start + em->len; 977 978 /* No expansion for holes and inline extents. */ 979 if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) 980 return; 981 982 ASSERT(em_end >= ra_pos, 983 "extent_map %llu %llu ends before current readahead position %llu", 984 em->start, em->len, ra_pos); 985 if (em_end > ra_end) 986 readahead_expand(ractl, ra_pos, em_end - ra_pos); 987 } 988 989 /* 990 * basic readpage implementation. 
Locked extent state structs are inserted 991 * into the tree that are removed when the IO is done (by the end_io 992 * handlers) 993 * XXX JDM: This needs looking at to ensure proper page locking 994 * return 0 on success, otherwise return error 995 */ 996 static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, 997 struct btrfs_bio_ctrl *bio_ctrl) 998 { 999 struct inode *inode = folio->mapping->host; 1000 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1001 u64 start = folio_pos(folio); 1002 const u64 end = start + folio_size(folio) - 1; 1003 u64 extent_offset; 1004 u64 last_byte = i_size_read(inode); 1005 struct extent_map *em; 1006 int ret = 0; 1007 const size_t blocksize = fs_info->sectorsize; 1008 1009 ret = set_folio_extent_mapped(folio); 1010 if (ret < 0) { 1011 folio_unlock(folio); 1012 return ret; 1013 } 1014 1015 if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { 1016 size_t zero_offset = offset_in_folio(folio, last_byte); 1017 1018 if (zero_offset) 1019 folio_zero_range(folio, zero_offset, 1020 folio_size(folio) - zero_offset); 1021 } 1022 bio_ctrl->end_io_func = end_bbio_data_read; 1023 begin_folio_read(fs_info, folio); 1024 for (u64 cur = start; cur <= end; cur += blocksize) { 1025 enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; 1026 unsigned long pg_offset = offset_in_folio(folio, cur); 1027 bool force_bio_submit = false; 1028 u64 disk_bytenr; 1029 u64 block_start; 1030 u64 em_gen; 1031 1032 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 1033 if (cur >= last_byte) { 1034 folio_zero_range(folio, pg_offset, end - cur + 1); 1035 end_folio_read(folio, true, cur, end - cur + 1); 1036 break; 1037 } 1038 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1039 end_folio_read(folio, true, cur, blocksize); 1040 continue; 1041 } 1042 em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); 1043 if (IS_ERR(em)) { 1044 end_folio_read(folio, false, cur, end + 1 - cur); 1045 return PTR_ERR(em); 1046 } 1047 extent_offset = cur - em->start; 1048 BUG_ON(btrfs_extent_map_end(em) <= cur); 1049 BUG_ON(end < cur); 1050 1051 compress_type = btrfs_extent_map_compression(em); 1052 1053 /* 1054 * Only expand readahead for extents which are already creating 1055 * the pages anyway in add_ra_bio_pages, which is compressed 1056 * extents in the non subpage case. 1057 */ 1058 if (bio_ctrl->ractl && 1059 !btrfs_is_subpage(fs_info, folio) && 1060 compress_type != BTRFS_COMPRESS_NONE) 1061 btrfs_readahead_expand(bio_ctrl->ractl, em); 1062 1063 if (compress_type != BTRFS_COMPRESS_NONE) 1064 disk_bytenr = em->disk_bytenr; 1065 else 1066 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1067 1068 if (em->flags & EXTENT_FLAG_PREALLOC) 1069 block_start = EXTENT_MAP_HOLE; 1070 else 1071 block_start = btrfs_extent_map_block_start(em); 1072 1073 /* 1074 * If we have a file range that points to a compressed extent 1075 * and it's followed by a consecutive file range that points 1076 * to the same compressed extent (possibly with a different 1077 * offset and/or length, so it either points to the whole extent 1078 * or only part of it), we must make sure we do not submit a 1079 * single bio to populate the folios for the 2 ranges because 1080 * this makes the compressed extent read zero out the folios 1081 * belonging to the 2nd range. 
Imagine the following scenario: 1082 * 1083 * File layout 1084 * [0 - 8K] [8K - 24K] 1085 * | | 1086 * | | 1087 * points to extent X, points to extent X, 1088 * offset 4K, length of 8K offset 0, length 16K 1089 * 1090 * [extent X, compressed length = 4K uncompressed length = 16K] 1091 * 1092 * If the bio to read the compressed extent covers both ranges, 1093 * it will decompress extent X into the folios belonging to the 1094 * first range and then it will stop, zeroing out the remaining 1095 * folios that belong to the other range that points to extent X. 1096 * So here we make sure we submit 2 bios, one for the first 1097 * range and another one for the third range. Both will target 1098 * the same physical extent from disk, but we can't currently 1099 * make the compressed bio endio callback populate the folios 1100 * for both ranges because each compressed bio is tightly 1101 * coupled with a single extent map, and each range can have 1102 * an extent map with a different offset value relative to the 1103 * uncompressed data of our extent and different lengths. This 1104 * is a corner case so we prioritize correctness over 1105 * non-optimal behavior (submitting 2 bios for the same extent). 1106 */ 1107 if (compress_type != BTRFS_COMPRESS_NONE && 1108 bio_ctrl->last_em_start != U64_MAX && 1109 bio_ctrl->last_em_start != em->start) 1110 force_bio_submit = true; 1111 1112 bio_ctrl->last_em_start = em->start; 1113 1114 em_gen = em->generation; 1115 btrfs_free_extent_map(em); 1116 em = NULL; 1117 1118 /* we've found a hole, just zero and go on */ 1119 if (block_start == EXTENT_MAP_HOLE) { 1120 folio_zero_range(folio, pg_offset, blocksize); 1121 end_folio_read(folio, true, cur, blocksize); 1122 continue; 1123 } 1124 /* the get_extent function already copied into the folio */ 1125 if (block_start == EXTENT_MAP_INLINE) { 1126 end_folio_read(folio, true, cur, blocksize); 1127 continue; 1128 } 1129 1130 if (bio_ctrl->compress_type != compress_type) { 1131 submit_one_bio(bio_ctrl); 1132 bio_ctrl->compress_type = compress_type; 1133 } 1134 1135 if (force_bio_submit) 1136 submit_one_bio(bio_ctrl); 1137 submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, 1138 pg_offset, em_gen); 1139 } 1140 return 0; 1141 } 1142 1143 /* 1144 * Check if we can skip waiting the @ordered extent covering the block at @fileoff. 1145 * 1146 * @fileoff: Both input and output. 1147 * Input as the file offset where the check should start at. 1148 * Output as where the next check should start at, 1149 * if the function returns true. 1150 * 1151 * Return true if we can skip to @fileoff. The caller needs to check the new 1152 * @fileoff value to make sure it covers the full range, before skipping the 1153 * full OE. 1154 * 1155 * Return false if we must wait for the ordered extent. 1156 */ 1157 static bool can_skip_one_ordered_range(struct btrfs_inode *inode, 1158 struct btrfs_ordered_extent *ordered, 1159 u64 *fileoff) 1160 { 1161 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 1162 struct folio *folio; 1163 const u32 blocksize = fs_info->sectorsize; 1164 u64 cur = *fileoff; 1165 bool ret; 1166 1167 folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); 1168 1169 /* 1170 * We should have locked the folio(s) for range [start, end], thus 1171 * there must be a folio and it must be locked. 
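	 * (This helper is reached via lock_extents_for_read(), which is used
	 * by the folio read paths, and those paths lock every folio in the
	 * range first.)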
 */
	ASSERT(!IS_ERR(folio));
	ASSERT(folio_test_locked(folio));

	/*
	 * There are several cases for the folio and OE combination:
	 *
	 * 1) Folio has no private flag
	 *    The OE has all its IO done but not yet finished, and folio got
	 *    invalidated.
	 *
	 *    Hence we have to wait for the OE to finish, as it may contain the
	 *    to-be-inserted data checksum.
	 *    Without the data checksum inserted into the csum tree, read will
	 *    just fail with missing csum.
	 */
	if (!folio_test_private(folio)) {
		ret = false;
		goto out;
	}

	/*
	 * 2) The first block is DIRTY.
	 *
	 *    This means the OE is created by some other folios whose file pos
	 *    is before this one. And since we are holding the folio lock, the
	 *    writeback of this folio cannot start.
	 *
	 *    We must skip the whole OE, because it will never start until we
	 *    finish our folio read and unlock the folio.
	 */
	if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
		u64 range_len = min(folio_end(folio),
				    ordered->file_offset + ordered->num_bytes) - cur;

		ret = true;
		/*
		 * At least inside the folio, all the remaining blocks should
		 * also be dirty.
		 */
		ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
		*fileoff = ordered->file_offset + ordered->num_bytes;
		goto out;
	}

	/*
	 * 3) The first block is uptodate.
	 *
	 *    At least the first block can be skipped, but we are still not
	 *    fully sure. E.g. if the OE has some other folios in the range
	 *    that cannot be skipped.
	 *    So we return true and update @fileoff to the OE/folio boundary.
	 */
	if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
		u64 range_len = min(folio_end(folio),
				    ordered->file_offset + ordered->num_bytes) - cur;

		/*
		 * The whole range to the OE end or folio boundary should also
		 * be uptodate.
		 */
		ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
		ret = true;
		*fileoff = cur + range_len;
		goto out;
	}

	/*
	 * 4) The first block is not uptodate.
	 *
	 *    This means the folio is invalidated after the writeback was
	 *    finished, but by some other operations (e.g. block aligned
	 *    buffered write) the folio is inserted into filemap.
	 *    Very much the same as case 1).
	 */
	ret = false;
out:
	folio_put(folio);
	return ret;
}

static bool can_skip_ordered_extent(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent *ordered,
				    u64 start, u64 end)
{
	const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
	u64 cur = max(start, ordered->file_offset);

	while (cur < range_end) {
		bool can_skip;

		can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
		if (!can_skip)
			return false;
	}
	return true;
}

/*
 * Locking helper to make sure we get a stable view of extent maps for the
 * involved range.
 *
 * This is for folio read paths (read and readahead), thus the involved range
 * should have all the folios locked.
 */
static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	u64 cur_pos;

	/* Caller must provide a valid @cached_state.
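	 * On the wait-and-retry path below, the cached state is dropped
	 * together with the extent lock and re-populated by the next
	 * btrfs_lock_extent() call.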
*/ 1283 ASSERT(cached_state); 1284 1285 /* The range must at least be page aligned, as all read paths are folio based. */ 1286 ASSERT(IS_ALIGNED(start, PAGE_SIZE)); 1287 ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); 1288 1289 again: 1290 btrfs_lock_extent(&inode->io_tree, start, end, cached_state); 1291 cur_pos = start; 1292 while (cur_pos < end) { 1293 struct btrfs_ordered_extent *ordered; 1294 1295 ordered = btrfs_lookup_ordered_range(inode, cur_pos, 1296 end - cur_pos + 1); 1297 /* 1298 * No ordered extents in the range, and we hold the extent lock, 1299 * no one can modify the extent maps in the range, we're safe to return. 1300 */ 1301 if (!ordered) 1302 break; 1303 1304 /* Check if we can skip waiting for the whole OE. */ 1305 if (can_skip_ordered_extent(inode, ordered, start, end)) { 1306 cur_pos = min(ordered->file_offset + ordered->num_bytes, 1307 end + 1); 1308 btrfs_put_ordered_extent(ordered); 1309 continue; 1310 } 1311 1312 /* Now wait for the OE to finish. */ 1313 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1314 btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); 1315 btrfs_put_ordered_extent(ordered); 1316 /* We have unlocked the whole range, restart from the beginning. */ 1317 goto again; 1318 } 1319 } 1320 1321 int btrfs_read_folio(struct file *file, struct folio *folio) 1322 { 1323 struct btrfs_inode *inode = folio_to_inode(folio); 1324 const u64 start = folio_pos(folio); 1325 const u64 end = start + folio_size(folio) - 1; 1326 struct extent_state *cached_state = NULL; 1327 struct btrfs_bio_ctrl bio_ctrl = { 1328 .opf = REQ_OP_READ, 1329 .last_em_start = U64_MAX, 1330 }; 1331 struct extent_map *em_cached = NULL; 1332 int ret; 1333 1334 lock_extents_for_read(inode, start, end, &cached_state); 1335 ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 1336 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 1337 1338 btrfs_free_extent_map(em_cached); 1339 1340 /* 1341 * If btrfs_do_readpage() failed we will want to submit the assembled 1342 * bio to do the cleanup. 
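	 * Any range already added to bio_ctrl.bbio still holds its folio and
	 * block locks; letting that bio complete, even with an error, is what
	 * releases them again through end_bbio_data_read().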
1343 */ 1344 submit_one_bio(&bio_ctrl); 1345 return ret; 1346 } 1347 1348 static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, 1349 u64 start, u32 len) 1350 { 1351 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1352 const u64 folio_start = folio_pos(folio); 1353 unsigned int start_bit; 1354 unsigned int nbits; 1355 1356 ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); 1357 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1358 nbits = len >> fs_info->sectorsize_bits; 1359 ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); 1360 bitmap_set(delalloc_bitmap, start_bit, nbits); 1361 } 1362 1363 static bool find_next_delalloc_bitmap(struct folio *folio, 1364 unsigned long *delalloc_bitmap, u64 start, 1365 u64 *found_start, u32 *found_len) 1366 { 1367 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1368 const u64 folio_start = folio_pos(folio); 1369 const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); 1370 unsigned int start_bit; 1371 unsigned int first_zero; 1372 unsigned int first_set; 1373 1374 ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); 1375 1376 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1377 first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); 1378 if (first_set >= bitmap_size) 1379 return false; 1380 1381 *found_start = folio_start + (first_set << fs_info->sectorsize_bits); 1382 first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); 1383 *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; 1384 return true; 1385 } 1386 1387 /* 1388 * Do all of the delayed allocation setup. 1389 * 1390 * Return >0 if all the dirty blocks are submitted async (compression) or inlined. 1391 * The @folio should no longer be touched (treat it as already unlocked). 1392 * 1393 * Return 0 if there is still dirty block that needs to be submitted through 1394 * extent_writepage_io(). 1395 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be 1396 * submitted, and @folio is still kept locked. 1397 * 1398 * Return <0 if there is any error hit. 1399 * Any allocated ordered extent range covering this folio will be marked 1400 * finished (IOERR), and @folio is still kept locked. 1401 */ 1402 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 1403 struct folio *folio, 1404 struct btrfs_bio_ctrl *bio_ctrl) 1405 { 1406 struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); 1407 struct writeback_control *wbc = bio_ctrl->wbc; 1408 const bool is_subpage = btrfs_is_subpage(fs_info, folio); 1409 const u64 page_start = folio_pos(folio); 1410 const u64 page_end = page_start + folio_size(folio) - 1; 1411 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1412 unsigned long delalloc_bitmap = 0; 1413 /* 1414 * Save the last found delalloc end. As the delalloc end can go beyond 1415 * page boundary, thus we cannot rely on subpage bitmap to locate the 1416 * last delalloc end. 1417 */ 1418 u64 last_delalloc_end = 0; 1419 /* 1420 * The range end (exclusive) of the last successfully finished delalloc 1421 * range. 1422 * Any range covered by ordered extent must either be manually marked 1423 * finished (error handling), or has IO submitted (and finish the 1424 * ordered extent normally). 1425 * 1426 * This records the end of ordered extent cleanup if we hit an error. 
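	 * For example, if the folio covers two delalloc ranges and only the
	 * first one runs successfully, this points past the first range so
	 * the error path can force-finish the ordered extents that were
	 * created but will never see bio submission.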
1427 */ 1428 u64 last_finished_delalloc_end = page_start; 1429 u64 delalloc_start = page_start; 1430 u64 delalloc_end = page_end; 1431 u64 delalloc_to_write = 0; 1432 int ret = 0; 1433 int bit; 1434 1435 /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ 1436 if (btrfs_is_subpage(fs_info, folio)) { 1437 ASSERT(blocks_per_folio > 1); 1438 btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); 1439 } else { 1440 bio_ctrl->submit_bitmap = 1; 1441 } 1442 1443 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1444 u64 start = page_start + (bit << fs_info->sectorsize_bits); 1445 1446 btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); 1447 } 1448 1449 /* Lock all (subpage) delalloc ranges inside the folio first. */ 1450 while (delalloc_start < page_end) { 1451 delalloc_end = page_end; 1452 if (!find_lock_delalloc_range(&inode->vfs_inode, folio, 1453 &delalloc_start, &delalloc_end)) { 1454 delalloc_start = delalloc_end + 1; 1455 continue; 1456 } 1457 set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, 1458 min(delalloc_end, page_end) + 1 - delalloc_start); 1459 last_delalloc_end = delalloc_end; 1460 delalloc_start = delalloc_end + 1; 1461 } 1462 delalloc_start = page_start; 1463 1464 if (!last_delalloc_end) 1465 goto out; 1466 1467 /* Run the delalloc ranges for the above locked ranges. */ 1468 while (delalloc_start < page_end) { 1469 u64 found_start; 1470 u32 found_len; 1471 bool found; 1472 1473 if (!is_subpage) { 1474 /* 1475 * For non-subpage case, the found delalloc range must 1476 * cover this folio and there must be only one locked 1477 * delalloc range. 1478 */ 1479 found_start = page_start; 1480 found_len = last_delalloc_end + 1 - found_start; 1481 found = true; 1482 } else { 1483 found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, 1484 delalloc_start, &found_start, &found_len); 1485 } 1486 if (!found) 1487 break; 1488 /* 1489 * The subpage range covers the last sector, the delalloc range may 1490 * end beyond the folio boundary, use the saved delalloc_end 1491 * instead. 1492 */ 1493 if (found_start + found_len >= page_end) 1494 found_len = last_delalloc_end + 1 - found_start; 1495 1496 if (ret >= 0) { 1497 /* 1498 * Some delalloc range may be created by previous folios. 1499 * Thus we still need to clean up this range during error 1500 * handling. 1501 */ 1502 last_finished_delalloc_end = found_start; 1503 /* No errors hit so far, run the current delalloc range. */ 1504 ret = btrfs_run_delalloc_range(inode, folio, 1505 found_start, 1506 found_start + found_len - 1, 1507 wbc); 1508 if (ret >= 0) 1509 last_finished_delalloc_end = found_start + found_len; 1510 if (unlikely(ret < 0)) 1511 btrfs_err_rl(fs_info, 1512 "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", 1513 btrfs_root_id(inode->root), 1514 btrfs_ino(inode), 1515 folio_pos(folio), 1516 blocks_per_folio, 1517 &bio_ctrl->submit_bitmap, 1518 found_start, found_len, ret); 1519 } else { 1520 /* 1521 * We've hit an error during previous delalloc range, 1522 * have to cleanup the remaining locked ranges. 1523 */ 1524 btrfs_unlock_extent(&inode->io_tree, found_start, 1525 found_start + found_len - 1, NULL); 1526 unlock_delalloc_folio(&inode->vfs_inode, folio, 1527 found_start, 1528 found_start + found_len - 1); 1529 } 1530 1531 /* 1532 * We have some ranges that's going to be submitted asynchronously 1533 * (compression or inline). 
These range have their own control 1534 * on when to unlock the pages. We should not touch them 1535 * anymore, so clear the range from the submission bitmap. 1536 */ 1537 if (ret > 0) { 1538 unsigned int start_bit = (found_start - page_start) >> 1539 fs_info->sectorsize_bits; 1540 unsigned int end_bit = (min(page_end + 1, found_start + found_len) - 1541 page_start) >> fs_info->sectorsize_bits; 1542 bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); 1543 } 1544 /* 1545 * Above btrfs_run_delalloc_range() may have unlocked the folio, 1546 * thus for the last range, we cannot touch the folio anymore. 1547 */ 1548 if (found_start + found_len >= last_delalloc_end + 1) 1549 break; 1550 1551 delalloc_start = found_start + found_len; 1552 } 1553 /* 1554 * It's possible we had some ordered extents created before we hit 1555 * an error, cleanup non-async successfully created delalloc ranges. 1556 */ 1557 if (unlikely(ret < 0)) { 1558 unsigned int bitmap_size = min( 1559 (last_finished_delalloc_end - page_start) >> 1560 fs_info->sectorsize_bits, 1561 blocks_per_folio); 1562 1563 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) 1564 btrfs_mark_ordered_io_finished(inode, folio, 1565 page_start + (bit << fs_info->sectorsize_bits), 1566 fs_info->sectorsize, false); 1567 return ret; 1568 } 1569 out: 1570 if (last_delalloc_end) 1571 delalloc_end = last_delalloc_end; 1572 else 1573 delalloc_end = page_end; 1574 /* 1575 * delalloc_end is already one less than the total length, so 1576 * we don't subtract one from PAGE_SIZE. 1577 */ 1578 delalloc_to_write += 1579 DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); 1580 1581 /* 1582 * If all ranges are submitted asynchronously, we just need to account 1583 * for them here. 1584 */ 1585 if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { 1586 wbc->nr_to_write -= delalloc_to_write; 1587 return 1; 1588 } 1589 1590 if (wbc->nr_to_write < delalloc_to_write) { 1591 int thresh = 8192; 1592 1593 if (delalloc_to_write < thresh * 2) 1594 thresh = delalloc_to_write; 1595 wbc->nr_to_write = min_t(u64, delalloc_to_write, 1596 thresh); 1597 } 1598 1599 return 0; 1600 } 1601 1602 /* 1603 * Return 0 if we have submitted or queued the sector for submission. 1604 * Return <0 for critical errors, and the sector will have its dirty flag cleared. 1605 * 1606 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1607 */ 1608 static int submit_one_sector(struct btrfs_inode *inode, 1609 struct folio *folio, 1610 u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, 1611 loff_t i_size) 1612 { 1613 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1614 struct extent_map *em; 1615 u64 block_start; 1616 u64 disk_bytenr; 1617 u64 extent_offset; 1618 u64 em_end; 1619 const u32 sectorsize = fs_info->sectorsize; 1620 1621 ASSERT(IS_ALIGNED(filepos, sectorsize)); 1622 1623 /* @filepos >= i_size case should be handled by the caller. */ 1624 ASSERT(filepos < i_size); 1625 1626 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1627 if (IS_ERR(em)) { 1628 /* 1629 * When submission failed, we should still clear the folio dirty. 1630 * Or the folio will be written back again but without any 1631 * ordered extent. 
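		 * The writeback set+clear pair below makes sure writeback is
		 * properly started and finished for this block even though no
		 * bio will ever be submitted for it.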
1632 */ 1633 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1634 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1635 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1636 return PTR_ERR(em); 1637 } 1638 1639 extent_offset = filepos - em->start; 1640 em_end = btrfs_extent_map_end(em); 1641 ASSERT(filepos <= em_end); 1642 ASSERT(IS_ALIGNED(em->start, sectorsize)); 1643 ASSERT(IS_ALIGNED(em->len, sectorsize)); 1644 1645 block_start = btrfs_extent_map_block_start(em); 1646 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1647 1648 ASSERT(!btrfs_extent_map_is_compressed(em)); 1649 ASSERT(block_start != EXTENT_MAP_HOLE); 1650 ASSERT(block_start != EXTENT_MAP_INLINE); 1651 1652 btrfs_free_extent_map(em); 1653 em = NULL; 1654 1655 /* 1656 * Although the PageDirty bit is cleared before entering this 1657 * function, subpage dirty bit is not cleared. 1658 * So clear subpage dirty bit here so next time we won't submit 1659 * a folio for a range already written to disk. 1660 */ 1661 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1662 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1663 /* 1664 * Above call should set the whole folio with writeback flag, even 1665 * just for a single subpage sector. 1666 * As long as the folio is properly locked and the range is correct, 1667 * we should always get the folio with writeback flag. 1668 */ 1669 ASSERT(folio_test_writeback(folio)); 1670 1671 submit_extent_folio(bio_ctrl, disk_bytenr, folio, 1672 sectorsize, filepos - folio_pos(folio), 0); 1673 return 0; 1674 } 1675 1676 /* 1677 * Helper for extent_writepage(). This calls the writepage start hooks, 1678 * and does the loop to map the page into extents and bios. 1679 * 1680 * We return 1 if the IO is started and the page is unlocked, 1681 * 0 if all went well (page still locked) 1682 * < 0 if there were errors (page still locked) 1683 */ 1684 static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, 1685 struct folio *folio, 1686 u64 start, u32 len, 1687 struct btrfs_bio_ctrl *bio_ctrl, 1688 loff_t i_size) 1689 { 1690 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1691 unsigned long range_bitmap = 0; 1692 bool submitted_io = false; 1693 int found_error = 0; 1694 const u64 folio_start = folio_pos(folio); 1695 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1696 u64 cur; 1697 int bit; 1698 int ret = 0; 1699 1700 ASSERT(start >= folio_start && 1701 start + len <= folio_start + folio_size(folio)); 1702 1703 ret = btrfs_writepage_cow_fixup(folio); 1704 if (ret == -EAGAIN) { 1705 /* Fixup worker will requeue */ 1706 folio_redirty_for_writepage(bio_ctrl->wbc, folio); 1707 folio_unlock(folio); 1708 return 1; 1709 } 1710 if (ret < 0) { 1711 btrfs_folio_clear_dirty(fs_info, folio, start, len); 1712 btrfs_folio_set_writeback(fs_info, folio, start, len); 1713 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1714 return ret; 1715 } 1716 1717 for (cur = start; cur < start + len; cur += fs_info->sectorsize) 1718 set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); 1719 bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, 1720 blocks_per_folio); 1721 1722 bio_ctrl->end_io_func = end_bbio_data_write; 1723 1724 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1725 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); 1726 1727 if (cur >= i_size) { 1728 btrfs_mark_ordered_io_finished(inode, folio, cur, 
1729 start + len - cur, true); 1730 /* 1731 * This range is beyond i_size, thus we don't need to 1732 * bother writing back. 1733 * But we still need to clear the dirty subpage bit, or 1734 * the next time the folio gets dirtied, we will try to 1735 * writeback the sectors with subpage dirty bits, 1736 * causing writeback without ordered extent. 1737 */ 1738 btrfs_folio_clear_dirty(fs_info, folio, cur, 1739 start + len - cur); 1740 break; 1741 } 1742 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1743 if (unlikely(ret < 0)) { 1744 /* 1745 * bio_ctrl may contain a bio crossing several folios. 1746 * Submit it immediately so that the bio has a chance 1747 * to finish normally, other than marked as error. 1748 */ 1749 submit_one_bio(bio_ctrl); 1750 /* 1751 * Failed to grab the extent map which should be very rare. 1752 * Since there is no bio submitted to finish the ordered 1753 * extent, we have to manually finish this sector. 1754 */ 1755 btrfs_mark_ordered_io_finished(inode, folio, cur, 1756 fs_info->sectorsize, false); 1757 if (!found_error) 1758 found_error = ret; 1759 continue; 1760 } 1761 submitted_io = true; 1762 } 1763 1764 /* 1765 * If we didn't submitted any sector (>= i_size), folio dirty get 1766 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared 1767 * by folio_start_writeback() if the folio is not dirty). 1768 * 1769 * Here we set writeback and clear for the range. If the full folio 1770 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1771 * 1772 * If we hit any error, the corresponding sector will have its dirty 1773 * flag cleared and writeback finished, thus no need to handle the error case. 1774 */ 1775 if (!submitted_io && !found_error) { 1776 btrfs_folio_set_writeback(fs_info, folio, start, len); 1777 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1778 } 1779 return found_error; 1780 } 1781 1782 /* 1783 * the writepage semantics are similar to regular writepage. extent 1784 * records are inserted to lock ranges in the tree, and as dirty areas 1785 * are found, they are marked writeback. Then the lock bits are removed 1786 * and the end_io handler clears the writeback ranges 1787 * 1788 * Return 0 if everything goes well. 1789 * Return <0 for error. 1790 */ 1791 static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) 1792 { 1793 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 1794 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1795 int ret; 1796 size_t pg_offset; 1797 loff_t i_size = i_size_read(&inode->vfs_inode); 1798 const pgoff_t end_index = i_size >> PAGE_SHIFT; 1799 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1800 1801 trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); 1802 1803 WARN_ON(!folio_test_locked(folio)); 1804 1805 pg_offset = offset_in_folio(folio, i_size); 1806 if (folio->index > end_index || 1807 (folio->index == end_index && !pg_offset)) { 1808 folio_invalidate(folio, 0, folio_size(folio)); 1809 folio_unlock(folio); 1810 return 0; 1811 } 1812 1813 if (folio_contains(folio, end_index)) 1814 folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); 1815 1816 /* 1817 * Default to unlock the whole folio. 1818 * The proper bitmap can only be initialized until writepage_delalloc(). 1819 */ 1820 bio_ctrl->submit_bitmap = (unsigned long)-1; 1821 1822 /* 1823 * If the page is dirty but without private set, it's marked dirty 1824 * without informing the fs. 
1825 * Nowadays that is a bug, since the introduction of 1826 * pin_user_pages*(). 1827 * 1828 * So here we check if the page has private set to rule out such 1829 * case. 1830 * But we also have a long history of relying on the COW fixup, 1831 * so here we only enable this check for experimental builds until 1832 * we're sure it's safe. 1833 */ 1834 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && 1835 unlikely(!folio_test_private(folio))) { 1836 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 1837 btrfs_err_rl(fs_info, 1838 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 1839 btrfs_root_id(inode->root), 1840 btrfs_ino(inode), folio_pos(folio)); 1841 ret = -EUCLEAN; 1842 goto done; 1843 } 1844 1845 ret = set_folio_extent_mapped(folio); 1846 if (ret < 0) 1847 goto done; 1848 1849 ret = writepage_delalloc(inode, folio, bio_ctrl); 1850 if (ret == 1) 1851 return 0; 1852 if (ret) 1853 goto done; 1854 1855 ret = extent_writepage_io(inode, folio, folio_pos(folio), 1856 folio_size(folio), bio_ctrl, i_size); 1857 if (ret == 1) 1858 return 0; 1859 if (ret < 0) 1860 btrfs_err_rl(fs_info, 1861 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", 1862 btrfs_root_id(inode->root), btrfs_ino(inode), 1863 folio_pos(folio), blocks_per_folio, 1864 &bio_ctrl->submit_bitmap, ret); 1865 1866 bio_ctrl->wbc->nr_to_write--; 1867 1868 done: 1869 if (ret < 0) 1870 mapping_set_error(folio->mapping, ret); 1871 /* 1872 * Only unlock ranges that are submitted, as there can be some async 1873 * submitted ranges inside the folio. 1874 */ 1875 btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); 1876 ASSERT(ret <= 0); 1877 return ret; 1878 } 1879 1880 /* 1881 * Lock extent buffer status and pages for writeback. 1882 * 1883 * Return %false if the extent buffer doesn't need to be submitted (e.g. the 1884 * extent buffer is not dirty). 1885 * Return %true if the extent buffer was submitted to a bio. 1886 */ 1887 static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, 1888 struct writeback_control *wbc) 1889 { 1890 struct btrfs_fs_info *fs_info = eb->fs_info; 1891 bool ret = false; 1892 1893 btrfs_tree_lock(eb); 1894 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 1895 btrfs_tree_unlock(eb); 1896 if (wbc->sync_mode != WB_SYNC_ALL) 1897 return false; 1898 wait_on_extent_buffer_writeback(eb); 1899 btrfs_tree_lock(eb); 1900 } 1901 1902 /* 1903 * We need to do this to prevent races with callers that check if the eb is 1904 * under IO since we can end up having no IO bits set for a short period 1905 * of time.
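 * Readers such as extent_buffer_under_io() test the DIRTY and WRITEBACK
 * bits without taking eb->refs_lock, so below we set WRITEBACK under the
 * lock right after clearing DIRTY to keep that window as small as possible.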
1906 */ 1907 spin_lock(&eb->refs_lock); 1908 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1909 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1910 unsigned long flags; 1911 1912 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 1913 spin_unlock(&eb->refs_lock); 1914 1915 xas_lock_irqsave(&xas, flags); 1916 xas_load(&xas); 1917 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 1918 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 1919 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 1920 xas_unlock_irqrestore(&xas, flags); 1921 1922 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 1923 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 1924 -eb->len, 1925 fs_info->dirty_metadata_batch); 1926 ret = true; 1927 } else { 1928 spin_unlock(&eb->refs_lock); 1929 } 1930 btrfs_tree_unlock(eb); 1931 return ret; 1932 } 1933 1934 static void set_btree_ioerr(struct extent_buffer *eb) 1935 { 1936 struct btrfs_fs_info *fs_info = eb->fs_info; 1937 1938 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 1939 1940 /* 1941 * A read may stumble upon this buffer later, make sure that it gets an 1942 * error and knows there was an error. 1943 */ 1944 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 1945 1946 /* 1947 * We need to set the mapping with the io error as well because a write 1948 * error will flip the file system readonly, and then syncfs() will 1949 * return a 0 because we are readonly if we don't modify the err seq for 1950 * the superblock. 1951 */ 1952 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); 1953 1954 /* 1955 * If writeback for a btree extent that doesn't belong to a log tree 1956 * failed, increment the counter transaction->eb_write_errors. 1957 * We do this because while the transaction is running and before it's 1958 * committing (when we call filemap_fdata[write|wait]_range against 1959 * the btree inode), we might have 1960 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 1961 * returns an error or an error happens during writeback, when we're 1962 * committing the transaction we wouldn't know about it, since the pages 1963 * can be no longer dirty nor marked anymore for writeback (if a 1964 * subsequent modification to the extent buffer didn't happen before the 1965 * transaction commit), which makes filemap_fdata[write|wait]_range not 1966 * able to find the pages which contain errors at transaction 1967 * commit time. So if this happens we must abort the transaction, 1968 * otherwise we commit a super block with btree roots that point to 1969 * btree nodes/leafs whose content on disk is invalid - either garbage 1970 * or the content of some node/leaf from a past generation that got 1971 * cowed or deleted and is no longer valid. 1972 * 1973 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 1974 * not be enough - we need to distinguish between log tree extents vs 1975 * non-log tree extents, and the next filemap_fdatawait_range() call 1976 * will catch and clear such errors in the mapping - and that call might 1977 * be from a log sync and not from a transaction commit. Also, checking 1978 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 1979 * not done and would not be reliable - the eb might have been released 1980 * from memory and reading it back again means that flag would not be 1981 * set (since it's a runtime flag, not persisted on disk). 
1982 * 1983 * Using the flags below in the btree inode also makes us achieve the 1984 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 1985 * writeback for all dirty pages and before filemap_fdatawait_range() 1986 * is called, the writeback for all dirty pages had already finished 1987 * with errors - because we were not using AS_EIO/AS_ENOSPC, 1988 * filemap_fdatawait_range() would return success, as it could not know 1989 * that writeback errors happened (the pages were no longer tagged for 1990 * writeback). 1991 */ 1992 switch (eb->log_index) { 1993 case -1: 1994 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 1995 break; 1996 case 0: 1997 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 1998 break; 1999 case 1: 2000 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2001 break; 2002 default: 2003 BUG(); /* unexpected, logic error */ 2004 } 2005 } 2006 2007 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) 2008 { 2009 struct btrfs_fs_info *fs_info = eb->fs_info; 2010 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2011 unsigned long flags; 2012 2013 xas_lock_irqsave(&xas, flags); 2014 xas_load(&xas); 2015 xas_set_mark(&xas, mark); 2016 xas_unlock_irqrestore(&xas, flags); 2017 } 2018 2019 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) 2020 { 2021 struct btrfs_fs_info *fs_info = eb->fs_info; 2022 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2023 unsigned long flags; 2024 2025 xas_lock_irqsave(&xas, flags); 2026 xas_load(&xas); 2027 xas_clear_mark(&xas, mark); 2028 xas_unlock_irqrestore(&xas, flags); 2029 } 2030 2031 static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, 2032 unsigned long start, unsigned long end) 2033 { 2034 XA_STATE(xas, &fs_info->buffer_tree, start); 2035 unsigned int tagged = 0; 2036 void *eb; 2037 2038 xas_lock_irq(&xas); 2039 xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { 2040 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); 2041 if (++tagged % XA_CHECK_SCHED) 2042 continue; 2043 xas_pause(&xas); 2044 xas_unlock_irq(&xas); 2045 cond_resched(); 2046 xas_lock_irq(&xas); 2047 } 2048 xas_unlock_irq(&xas); 2049 } 2050 2051 struct eb_batch { 2052 unsigned int nr; 2053 unsigned int cur; 2054 struct extent_buffer *ebs[PAGEVEC_SIZE]; 2055 }; 2056 2057 static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) 2058 { 2059 batch->ebs[batch->nr++] = eb; 2060 return (batch->nr < PAGEVEC_SIZE); 2061 } 2062 2063 static inline void eb_batch_init(struct eb_batch *batch) 2064 { 2065 batch->nr = 0; 2066 batch->cur = 0; 2067 } 2068 2069 static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) 2070 { 2071 if (batch->cur >= batch->nr) 2072 return NULL; 2073 return batch->ebs[batch->cur++]; 2074 } 2075 2076 static inline void eb_batch_release(struct eb_batch *batch) 2077 { 2078 for (unsigned int i = 0; i < batch->nr; i++) 2079 free_extent_buffer(batch->ebs[i]); 2080 eb_batch_init(batch); 2081 } 2082 2083 static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, 2084 xa_mark_t mark) 2085 { 2086 struct extent_buffer *eb; 2087 2088 retry: 2089 eb = xas_find_marked(xas, max, mark); 2090 2091 if (xas_retry(xas, eb)) 2092 goto retry; 2093 2094 if (!eb) 2095 return NULL; 2096 2097 if (!refcount_inc_not_zero(&eb->refs)) { 2098 xas_reset(xas); 2099 goto retry; 2100 } 2101 2102 if (unlikely(eb != xas_reload(xas))) { 2103 free_extent_buffer(eb); 2104 xas_reset(xas); 2105 goto 
retry; 2106 } 2107 2108 return eb; 2109 } 2110 2111 static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, 2112 unsigned long *start, 2113 unsigned long end, xa_mark_t tag, 2114 struct eb_batch *batch) 2115 { 2116 XA_STATE(xas, &fs_info->buffer_tree, *start); 2117 struct extent_buffer *eb; 2118 2119 rcu_read_lock(); 2120 while ((eb = find_get_eb(&xas, end, tag)) != NULL) { 2121 if (!eb_batch_add(batch, eb)) { 2122 *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); 2123 goto out; 2124 } 2125 } 2126 if (end == ULONG_MAX) 2127 *start = ULONG_MAX; 2128 else 2129 *start = end + 1; 2130 out: 2131 rcu_read_unlock(); 2132 2133 return batch->nr; 2134 } 2135 2136 /* 2137 * The endio specific version which won't touch any unsafe spinlock in endio 2138 * context. 2139 */ 2140 static struct extent_buffer *find_extent_buffer_nolock( 2141 struct btrfs_fs_info *fs_info, u64 start) 2142 { 2143 struct extent_buffer *eb; 2144 unsigned long index = (start >> fs_info->nodesize_bits); 2145 2146 rcu_read_lock(); 2147 eb = xa_load(&fs_info->buffer_tree, index); 2148 if (eb && !refcount_inc_not_zero(&eb->refs)) 2149 eb = NULL; 2150 rcu_read_unlock(); 2151 return eb; 2152 } 2153 2154 static void end_bbio_meta_write(struct btrfs_bio *bbio) 2155 { 2156 struct extent_buffer *eb = bbio->private; 2157 struct folio_iter fi; 2158 2159 if (bbio->bio.bi_status != BLK_STS_OK) 2160 set_btree_ioerr(eb); 2161 2162 bio_for_each_folio_all(fi, &bbio->bio) { 2163 btrfs_meta_folio_clear_writeback(fi.folio, eb); 2164 } 2165 2166 buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); 2167 clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 2168 bio_put(&bbio->bio); 2169 } 2170 2171 static void prepare_eb_write(struct extent_buffer *eb) 2172 { 2173 u32 nritems; 2174 unsigned long start; 2175 unsigned long end; 2176 2177 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2178 2179 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 2180 nritems = btrfs_header_nritems(eb); 2181 if (btrfs_header_level(eb) > 0) { 2182 end = btrfs_node_key_ptr_offset(eb, nritems); 2183 memzero_extent_buffer(eb, end, eb->len - end); 2184 } else { 2185 /* 2186 * Leaf: 2187 * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 2188 */ 2189 start = btrfs_item_nr_offset(eb, nritems); 2190 end = btrfs_item_nr_offset(eb, 0); 2191 if (nritems == 0) 2192 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); 2193 else 2194 end += btrfs_item_offset(eb, nritems - 1); 2195 memzero_extent_buffer(eb, start, end - start); 2196 } 2197 } 2198 2199 static noinline_for_stack void write_one_eb(struct extent_buffer *eb, 2200 struct writeback_control *wbc) 2201 { 2202 struct btrfs_fs_info *fs_info = eb->fs_info; 2203 struct btrfs_bio *bbio; 2204 2205 prepare_eb_write(eb); 2206 2207 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 2208 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), 2209 eb->fs_info, end_bbio_meta_write, eb); 2210 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 2211 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 2212 wbc_init_bio(wbc, &bbio->bio); 2213 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 2214 bbio->file_offset = eb->start; 2215 for (int i = 0; i < num_extent_folios(eb); i++) { 2216 struct folio *folio = eb->folios[i]; 2217 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 2218 u32 range_len = min_t(u64, folio_end(folio), 2219 eb->start + eb->len) - range_start; 2220 2221 folio_lock(folio); 2222 btrfs_meta_folio_clear_dirty(folio, eb); 2223 btrfs_meta_folio_set_writeback(folio, eb); 2224 if (!folio_test_dirty(folio)) 2225 wbc->nr_to_write -= folio_nr_pages(folio); 2226 bio_add_folio_nofail(&bbio->bio, folio, range_len, 2227 offset_in_folio(folio, range_start)); 2228 wbc_account_cgroup_owner(wbc, folio, range_len); 2229 folio_unlock(folio); 2230 } 2231 /* 2232 * If the fs is already in error status, do not submit any writeback 2233 * but immediately finish it. 2234 */ 2235 if (unlikely(BTRFS_FS_ERROR(fs_info))) { 2236 btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); 2237 return; 2238 } 2239 btrfs_submit_bbio(bbio, 0); 2240 } 2241 2242 /* 2243 * Wait for all eb writeback in the given range to finish. 2244 * 2245 * @fs_info: The fs_info for this file system. 2246 * @start: The offset of the range to start waiting on writeback. 2247 * @end: The end of the range, inclusive. This is meant to be used in 2248 * conjunction with wait_marked_extents, so this will usually be 2249 * the_next_eb->start - 1. 
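 *
 * A minimal usage sketch (hypothetical caller, for illustration only;
 * real callers follow the wait_marked_extents pattern mentioned above):
 *
 *   btrfs_btree_wait_writeback_range(fs_info, eb->start,
 *                                    eb->start + eb->len - 1);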
2250 */ 2251 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, 2252 u64 end) 2253 { 2254 struct eb_batch batch; 2255 unsigned long start_index = (start >> fs_info->nodesize_bits); 2256 unsigned long end_index = (end >> fs_info->nodesize_bits); 2257 2258 eb_batch_init(&batch); 2259 while (start_index <= end_index) { 2260 struct extent_buffer *eb; 2261 unsigned int nr_ebs; 2262 2263 nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, 2264 PAGECACHE_TAG_WRITEBACK, &batch); 2265 if (!nr_ebs) 2266 break; 2267 2268 while ((eb = eb_batch_next(&batch)) != NULL) 2269 wait_on_extent_buffer_writeback(eb); 2270 eb_batch_release(&batch); 2271 cond_resched(); 2272 } 2273 } 2274 2275 int btree_write_cache_pages(struct address_space *mapping, 2276 struct writeback_control *wbc) 2277 { 2278 struct btrfs_eb_write_context ctx = { .wbc = wbc }; 2279 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 2280 int ret = 0; 2281 int done = 0; 2282 int nr_to_write_done = 0; 2283 struct eb_batch batch; 2284 unsigned int nr_ebs; 2285 unsigned long index; 2286 unsigned long end; 2287 int scanned = 0; 2288 xa_mark_t tag; 2289 2290 eb_batch_init(&batch); 2291 if (wbc->range_cyclic) { 2292 index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); 2293 end = -1; 2294 2295 /* 2296 * Start from the beginning does not need to cycle over the 2297 * range, mark it as scanned. 2298 */ 2299 scanned = (index == 0); 2300 } else { 2301 index = (wbc->range_start >> fs_info->nodesize_bits); 2302 end = (wbc->range_end >> fs_info->nodesize_bits); 2303 2304 scanned = 1; 2305 } 2306 if (wbc->sync_mode == WB_SYNC_ALL) 2307 tag = PAGECACHE_TAG_TOWRITE; 2308 else 2309 tag = PAGECACHE_TAG_DIRTY; 2310 btrfs_zoned_meta_io_lock(fs_info); 2311 retry: 2312 if (wbc->sync_mode == WB_SYNC_ALL) 2313 buffer_tree_tag_for_writeback(fs_info, index, end); 2314 while (!done && !nr_to_write_done && (index <= end) && 2315 (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { 2316 struct extent_buffer *eb; 2317 2318 while ((eb = eb_batch_next(&batch)) != NULL) { 2319 ctx.eb = eb; 2320 2321 ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); 2322 if (ret) { 2323 if (ret == -EBUSY) 2324 ret = 0; 2325 2326 if (ret) { 2327 done = 1; 2328 break; 2329 } 2330 continue; 2331 } 2332 2333 if (!lock_extent_buffer_for_io(eb, wbc)) 2334 continue; 2335 2336 /* Implies write in zoned mode. */ 2337 if (ctx.zoned_bg) { 2338 /* Mark the last eb in the block group. */ 2339 btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); 2340 ctx.zoned_bg->meta_write_pointer += eb->len; 2341 } 2342 write_one_eb(eb, wbc); 2343 } 2344 nr_to_write_done = (wbc->nr_to_write <= 0); 2345 eb_batch_release(&batch); 2346 cond_resched(); 2347 } 2348 if (!scanned && !done) { 2349 /* 2350 * We hit the last page and there is more work to be done: wrap 2351 * back to the start of the file 2352 */ 2353 scanned = 1; 2354 index = 0; 2355 goto retry; 2356 } 2357 /* 2358 * If something went wrong, don't allow any metadata write bio to be 2359 * submitted. 2360 * 2361 * This would prevent use-after-free if we had dirty pages not 2362 * cleaned up, which can still happen by fuzzed images. 2363 * 2364 * - Bad extent tree 2365 * Allowing existing tree block to be allocated for other trees. 2366 * 2367 * - Log tree operations 2368 * Exiting tree blocks get allocated to log tree, bumps its 2369 * generation, then get cleaned in tree re-balance. 
2370 * Such tree block will not be written back, since it's clean, 2371 * thus no WRITTEN flag set. 2372 * And after log writes back, this tree block is not traced by 2373 * any dirty extent_io_tree. 2374 * 2375 * - Offending tree block gets re-dirtied from its original owner 2376 * Since it has bumped generation, no WRITTEN flag, it can be 2377 * reused without COWing. This tree block will not be traced 2378 * by btrfs_transaction::dirty_pages. 2379 * 2380 * Now such dirty tree block will not be cleaned by any dirty 2381 * extent io tree. Thus we don't want to submit such wild eb 2382 * if the fs already has error. 2383 * 2384 * We can get ret > 0 from submit_extent_folio() indicating how many ebs 2385 * were submitted. Reset it to 0 to avoid false alerts for the caller. 2386 */ 2387 if (ret > 0) 2388 ret = 0; 2389 if (!ret && BTRFS_FS_ERROR(fs_info)) 2390 ret = -EROFS; 2391 2392 if (ctx.zoned_bg) 2393 btrfs_put_block_group(ctx.zoned_bg); 2394 btrfs_zoned_meta_io_unlock(fs_info); 2395 return ret; 2396 } 2397 2398 /* 2399 * Walk the list of dirty pages of the given address space and write all of them. 2400 * 2401 * @mapping: address space structure to write 2402 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2403 * @bio_ctrl: holds context for the write, namely the bio 2404 * 2405 * If a page is already under I/O, write_cache_pages() skips it, even 2406 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2407 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2408 * and msync() need to guarantee that all the data which was dirty at the time 2409 * the call was made get new I/O started against them. If wbc->sync_mode is 2410 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2411 * existing IO to complete. 2412 */ 2413 static int extent_write_cache_pages(struct address_space *mapping, 2414 struct btrfs_bio_ctrl *bio_ctrl) 2415 { 2416 struct writeback_control *wbc = bio_ctrl->wbc; 2417 struct inode *inode = mapping->host; 2418 int ret = 0; 2419 int done = 0; 2420 int nr_to_write_done = 0; 2421 struct folio_batch fbatch; 2422 unsigned int nr_folios; 2423 pgoff_t index; 2424 pgoff_t end; /* Inclusive */ 2425 pgoff_t done_index; 2426 int range_whole = 0; 2427 int scanned = 0; 2428 xa_mark_t tag; 2429 2430 /* 2431 * We have to hold onto the inode so that ordered extents can do their 2432 * work when the IO finishes. The alternative to this is failing to add 2433 * an ordered extent if the igrab() fails there and that is a huge pain 2434 * to deal with, so instead just hold onto the inode throughout the 2435 * writepages operation. If it fails here we are freeing up the inode 2436 * anyway and we'd rather not waste our time writing out stuff that is 2437 * going to be truncated anyway. 2438 */ 2439 if (!igrab(inode)) 2440 return 0; 2441 2442 folio_batch_init(&fbatch); 2443 if (wbc->range_cyclic) { 2444 index = mapping->writeback_index; /* Start from prev offset */ 2445 end = -1; 2446 /* 2447 * Start from the beginning does not need to cycle over the 2448 * range, mark it as scanned. 2449 */ 2450 scanned = (index == 0); 2451 } else { 2452 index = wbc->range_start >> PAGE_SHIFT; 2453 end = wbc->range_end >> PAGE_SHIFT; 2454 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2455 range_whole = 1; 2456 scanned = 1; 2457 } 2458 2459 /* 2460 * We do the tagged writepage as long as the snapshot flush bit is set 2461 * and we are the first one who do the filemap_flush() on this inode. 
2462 * 2463 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 2464 * not race in and drop the bit. 2465 */ 2466 if (range_whole && wbc->nr_to_write == LONG_MAX && 2467 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 2468 &BTRFS_I(inode)->runtime_flags)) 2469 wbc->tagged_writepages = 1; 2470 2471 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2472 tag = PAGECACHE_TAG_TOWRITE; 2473 else 2474 tag = PAGECACHE_TAG_DIRTY; 2475 retry: 2476 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2477 tag_pages_for_writeback(mapping, index, end); 2478 done_index = index; 2479 while (!done && !nr_to_write_done && (index <= end) && 2480 (nr_folios = filemap_get_folios_tag(mapping, &index, 2481 end, tag, &fbatch))) { 2482 unsigned i; 2483 2484 for (i = 0; i < nr_folios; i++) { 2485 struct folio *folio = fbatch.folios[i]; 2486 2487 done_index = folio_next_index(folio); 2488 /* 2489 * At this point we hold neither the i_pages lock nor 2490 * the folio lock: the folio may be truncated or 2491 * invalidated (changing folio->mapping to NULL). 2492 */ 2493 if (!folio_trylock(folio)) { 2494 submit_write_bio(bio_ctrl, 0); 2495 folio_lock(folio); 2496 } 2497 2498 if (unlikely(folio->mapping != mapping)) { 2499 folio_unlock(folio); 2500 continue; 2501 } 2502 2503 if (!folio_test_dirty(folio)) { 2504 /* Someone wrote it for us. */ 2505 folio_unlock(folio); 2506 continue; 2507 } 2508 2509 /* 2510 * For subpage case, compression can lead to mixed 2511 * writeback and dirty flags, e.g: 2512 * 0 32K 64K 96K 128K 2513 * | |//////||/////| |//| 2514 * 2515 * In above case, [32K, 96K) is asynchronously submitted 2516 * for compression, and [124K, 128K) needs to be written back. 2517 * 2518 * If we didn't wait writeback for page 64K, [128K, 128K) 2519 * won't be submitted as the page still has writeback flag 2520 * and will be skipped in the next check. 2521 * 2522 * This mixed writeback and dirty case is only possible for 2523 * subpage case. 2524 * 2525 * TODO: Remove this check after migrating compression to 2526 * regular submission. 2527 */ 2528 if (wbc->sync_mode != WB_SYNC_NONE || 2529 btrfs_is_subpage(inode_to_fs_info(inode), folio)) { 2530 if (folio_test_writeback(folio)) 2531 submit_write_bio(bio_ctrl, 0); 2532 folio_wait_writeback(folio); 2533 } 2534 2535 if (folio_test_writeback(folio) || 2536 !folio_clear_dirty_for_io(folio)) { 2537 folio_unlock(folio); 2538 continue; 2539 } 2540 2541 ret = extent_writepage(folio, bio_ctrl); 2542 if (ret < 0) { 2543 done = 1; 2544 break; 2545 } 2546 2547 /* 2548 * The filesystem may choose to bump up nr_to_write. 2549 * We have to make sure to honor the new nr_to_write 2550 * at any time. 2551 */ 2552 nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && 2553 wbc->nr_to_write <= 0); 2554 } 2555 folio_batch_release(&fbatch); 2556 cond_resched(); 2557 } 2558 if (!scanned && !done) { 2559 /* 2560 * We hit the last page and there is more work to be done: wrap 2561 * back to the start of the file 2562 */ 2563 scanned = 1; 2564 index = 0; 2565 2566 /* 2567 * If we're looping we could run into a page that is locked by a 2568 * writer and that writer could be waiting on writeback for a 2569 * page in our current bio, and thus deadlock, so flush the 2570 * write bio here. 
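 * (I.e. we would wait on a folio that another task holds locked, while
 * that task waits for writeback of a folio sitting in our not-yet-submitted
 * bio.)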
2571 */ 2572 submit_write_bio(bio_ctrl, 0); 2573 goto retry; 2574 } 2575 2576 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 2577 mapping->writeback_index = done_index; 2578 2579 btrfs_add_delayed_iput(BTRFS_I(inode)); 2580 return ret; 2581 } 2582 2583 /* 2584 * Submit the pages in the range to bio for call sites which delalloc range has 2585 * already been ran (aka, ordered extent inserted) and all pages are still 2586 * locked. 2587 */ 2588 void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, 2589 u64 start, u64 end, struct writeback_control *wbc, 2590 bool pages_dirty) 2591 { 2592 bool found_error = false; 2593 int ret = 0; 2594 struct address_space *mapping = inode->i_mapping; 2595 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2596 const u32 sectorsize = fs_info->sectorsize; 2597 loff_t i_size = i_size_read(inode); 2598 u64 cur = start; 2599 struct btrfs_bio_ctrl bio_ctrl = { 2600 .wbc = wbc, 2601 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2602 }; 2603 2604 if (wbc->no_cgroup_owner) 2605 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; 2606 2607 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 2608 2609 while (cur <= end) { 2610 u64 cur_end; 2611 u32 cur_len; 2612 struct folio *folio; 2613 2614 folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); 2615 2616 /* 2617 * This shouldn't happen, the pages are pinned and locked, this 2618 * code is just in case, but shouldn't actually be run. 2619 */ 2620 if (IS_ERR(folio)) { 2621 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2622 cur_len = cur_end + 1 - cur; 2623 btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, 2624 cur, cur_len, false); 2625 mapping_set_error(mapping, PTR_ERR(folio)); 2626 cur = cur_end; 2627 continue; 2628 } 2629 2630 cur_end = min_t(u64, folio_end(folio) - 1, end); 2631 cur_len = cur_end + 1 - cur; 2632 2633 ASSERT(folio_test_locked(folio)); 2634 if (pages_dirty && folio != locked_folio) 2635 ASSERT(folio_test_dirty(folio)); 2636 2637 /* 2638 * Set the submission bitmap to submit all sectors. 2639 * extent_writepage_io() will do the truncation correctly. 2640 */ 2641 bio_ctrl.submit_bitmap = (unsigned long)-1; 2642 ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, 2643 &bio_ctrl, i_size); 2644 if (ret == 1) 2645 goto next_page; 2646 2647 if (ret) 2648 mapping_set_error(mapping, ret); 2649 btrfs_folio_end_lock(fs_info, folio, cur, cur_len); 2650 if (ret < 0) 2651 found_error = true; 2652 next_page: 2653 folio_put(folio); 2654 cur = cur_end + 1; 2655 } 2656 2657 submit_write_bio(&bio_ctrl, found_error ? ret : 0); 2658 } 2659 2660 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 2661 { 2662 struct inode *inode = mapping->host; 2663 int ret = 0; 2664 struct btrfs_bio_ctrl bio_ctrl = { 2665 .wbc = wbc, 2666 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2667 }; 2668 2669 /* 2670 * Allow only a single thread to do the reloc work in zoned mode to 2671 * protect the write pointer updates. 
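 * (btrfs_zoned_data_reloc_lock() only takes the lock for the data
 * relocation inode on a zoned filesystem; for any other inode it is
 * effectively a no-op.)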
2672 */ 2673 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 2674 ret = extent_write_cache_pages(mapping, &bio_ctrl); 2675 submit_write_bio(&bio_ctrl, ret); 2676 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 2677 return ret; 2678 } 2679 2680 void btrfs_readahead(struct readahead_control *rac) 2681 { 2682 struct btrfs_bio_ctrl bio_ctrl = { 2683 .opf = REQ_OP_READ | REQ_RAHEAD, 2684 .ractl = rac, 2685 .last_em_start = U64_MAX, 2686 }; 2687 struct folio *folio; 2688 struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); 2689 const u64 start = readahead_pos(rac); 2690 const u64 end = start + readahead_length(rac) - 1; 2691 struct extent_state *cached_state = NULL; 2692 struct extent_map *em_cached = NULL; 2693 2694 lock_extents_for_read(inode, start, end, &cached_state); 2695 2696 while ((folio = readahead_folio(rac)) != NULL) 2697 btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 2698 2699 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2700 2701 if (em_cached) 2702 btrfs_free_extent_map(em_cached); 2703 submit_one_bio(&bio_ctrl); 2704 } 2705 2706 /* 2707 * basic invalidate_folio code, this waits on any locked or writeback 2708 * ranges corresponding to the folio, and then deletes any extent state 2709 * records from the tree 2710 */ 2711 int extent_invalidate_folio(struct extent_io_tree *tree, 2712 struct folio *folio, size_t offset) 2713 { 2714 struct extent_state *cached_state = NULL; 2715 u64 start = folio_pos(folio); 2716 u64 end = start + folio_size(folio) - 1; 2717 size_t blocksize = folio_to_fs_info(folio)->sectorsize; 2718 2719 /* This function is only called for the btree inode */ 2720 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 2721 2722 start += ALIGN(offset, blocksize); 2723 if (start > end) 2724 return 0; 2725 2726 btrfs_lock_extent(tree, start, end, &cached_state); 2727 folio_wait_writeback(folio); 2728 2729 /* 2730 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 2731 * so here we only need to unlock the extent range to free any 2732 * existing extent state. 2733 */ 2734 btrfs_unlock_extent(tree, start, end, &cached_state); 2735 return 0; 2736 } 2737 2738 /* 2739 * A helper for struct address_space_operations::release_folio, this tests for 2740 * areas of the folio that are locked or under IO and drops the related state 2741 * bits if it is safe to drop the folio. 2742 */ 2743 static bool try_release_extent_state(struct extent_io_tree *tree, 2744 struct folio *folio) 2745 { 2746 struct extent_state *cached_state = NULL; 2747 u64 start = folio_pos(folio); 2748 u64 end = start + folio_size(folio) - 1; 2749 u32 range_bits; 2750 u32 clear_bits; 2751 bool ret = false; 2752 int ret2; 2753 2754 btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); 2755 2756 /* 2757 * We can release the folio if it's locked only for ordered extent 2758 * completion, since that doesn't require using the folio. 2759 */ 2760 if ((range_bits & EXTENT_LOCKED) && 2761 !(range_bits & EXTENT_FINISHING_ORDERED)) 2762 goto out; 2763 2764 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | 2765 EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | 2766 EXTENT_FINISHING_ORDERED); 2767 /* 2768 * At this point we can safely clear everything except the locked, 2769 * nodatasum, delalloc new and finishing ordered bits. The delalloc new 2770 * bit will be cleared by ordered extent completion. 
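 * (The mask above also leaves EXTENT_CTLBITS and EXTENT_QGROUP_RESERVED
 * untouched.)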
2771 */ 2772 ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); 2773 /* 2774 * If btrfs_clear_extent_bit() failed due to -ENOMEM, we can't allow the 2775 * release to continue. 2776 */ 2777 if (ret2 == 0) 2778 ret = true; 2779 out: 2780 btrfs_free_extent_state(cached_state); 2781 2782 return ret; 2783 } 2784 2785 /* 2786 * A helper for release_folio. As long as there are no locked extents 2787 * in the range corresponding to the folio, both state records and extent 2788 * map records are removed. 2789 */ 2790 bool try_release_extent_mapping(struct folio *folio, gfp_t mask) 2791 { 2792 u64 start = folio_pos(folio); 2793 u64 end = start + folio_size(folio) - 1; 2794 struct btrfs_inode *inode = folio_to_inode(folio); 2795 struct extent_io_tree *io_tree = &inode->io_tree; 2796 2797 while (start <= end) { 2798 const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); 2799 const u64 len = end - start + 1; 2800 struct extent_map_tree *extent_tree = &inode->extent_tree; 2801 struct extent_map *em; 2802 2803 write_lock(&extent_tree->lock); 2804 em = btrfs_lookup_extent_mapping(extent_tree, start, len); 2805 if (!em) { 2806 write_unlock(&extent_tree->lock); 2807 break; 2808 } 2809 if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { 2810 write_unlock(&extent_tree->lock); 2811 btrfs_free_extent_map(em); 2812 break; 2813 } 2814 if (btrfs_test_range_bit_exists(io_tree, em->start, 2815 btrfs_extent_map_end(em) - 1, 2816 EXTENT_LOCKED)) 2817 goto next; 2818 /* 2819 * If it's not in the list of modified extents, used by a fast 2820 * fsync, we can remove it. If it's being logged we can safely 2821 * remove it since fsync took an extra reference on the em. 2822 */ 2823 if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) 2824 goto remove_em; 2825 /* 2826 * If it's in the list of modified extents, remove it only if 2827 * its generation is older than the current one, in which case 2828 * we don't need it for a fast fsync. Otherwise don't remove it, 2829 * we could be racing with an ongoing fast fsync that could miss 2830 * the new extent. 2831 */ 2832 if (em->generation >= cur_gen) 2833 goto next; 2834 remove_em: 2835 /* 2836 * We only remove extent maps that are not in the list of 2837 * modified extents or that are in the list but with a 2838 * generation lower than the current generation, so there is no 2839 * need to set the full fsync flag on the inode (it hurts the 2840 * fsync performance for workloads with a data size that exceeds 2841 * or is close to the system's memory). 2842 */ 2843 btrfs_remove_extent_mapping(inode, em); 2844 /* Once for the inode's extent map tree. */ 2845 btrfs_free_extent_map(em); 2846 next: 2847 start = btrfs_extent_map_end(em); 2848 write_unlock(&extent_tree->lock); 2849 2850 /* Once for us, for the btrfs_lookup_extent_mapping() reference. */ 2851 btrfs_free_extent_map(em); 2852 2853 if (need_resched()) { 2854 /* 2855 * If we need to resched but we can't block, just exit 2856 * and leave any remaining extent maps.
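 * Even then we still end up calling try_release_extent_state() below, so
 * the folio is only released if its remaining extent state allows it.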
2857 */ 2858 if (!gfpflags_allow_blocking(mask)) 2859 break; 2860 2861 cond_resched(); 2862 } 2863 } 2864 return try_release_extent_state(io_tree, folio); 2865 } 2866 2867 static int extent_buffer_under_io(const struct extent_buffer *eb) 2868 { 2869 return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 2870 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2871 } 2872 2873 static bool folio_range_has_eb(struct folio *folio) 2874 { 2875 struct btrfs_folio_state *bfs; 2876 2877 lockdep_assert_held(&folio->mapping->i_private_lock); 2878 2879 if (folio_test_private(folio)) { 2880 bfs = folio_get_private(folio); 2881 if (atomic_read(&bfs->eb_refs)) 2882 return true; 2883 } 2884 return false; 2885 } 2886 2887 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) 2888 { 2889 struct btrfs_fs_info *fs_info = eb->fs_info; 2890 struct address_space *mapping = folio->mapping; 2891 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 2892 2893 /* 2894 * For mapped eb, we're going to change the folio private, which should 2895 * be done under the i_private_lock. 2896 */ 2897 if (mapped) 2898 spin_lock(&mapping->i_private_lock); 2899 2900 if (!folio_test_private(folio)) { 2901 if (mapped) 2902 spin_unlock(&mapping->i_private_lock); 2903 return; 2904 } 2905 2906 if (!btrfs_meta_is_subpage(fs_info)) { 2907 /* 2908 * We do this since we'll remove the pages after we've removed 2909 * the eb from the xarray, so we could race and have this page 2910 * now attached to the new eb. So only clear folio if it's 2911 * still connected to this eb. 2912 */ 2913 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2914 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2915 BUG_ON(folio_test_dirty(folio)); 2916 BUG_ON(folio_test_writeback(folio)); 2917 /* We need to make sure we haven't be attached to a new eb. */ 2918 folio_detach_private(folio); 2919 } 2920 if (mapped) 2921 spin_unlock(&mapping->i_private_lock); 2922 return; 2923 } 2924 2925 /* 2926 * For subpage, we can have dummy eb with folio private attached. In 2927 * this case, we can directly detach the private as such folio is only 2928 * attached to one dummy eb, no sharing. 2929 */ 2930 if (!mapped) { 2931 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2932 return; 2933 } 2934 2935 btrfs_folio_dec_eb_refs(fs_info, folio); 2936 2937 /* 2938 * We can only detach the folio private if there are no other ebs in the 2939 * page range and no unfinished IO. 2940 */ 2941 if (!folio_range_has_eb(folio)) 2942 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2943 2944 spin_unlock(&mapping->i_private_lock); 2945 } 2946 2947 /* Release all folios attached to the extent buffer */ 2948 static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) 2949 { 2950 ASSERT(!extent_buffer_under_io(eb)); 2951 2952 for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { 2953 struct folio *folio = eb->folios[i]; 2954 2955 if (!folio) 2956 continue; 2957 2958 detach_extent_buffer_folio(eb, folio); 2959 } 2960 } 2961 2962 /* 2963 * Helper for releasing the extent buffer. 
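 * Detaches any attached folios and frees the eb itself, but does not drop
 * the folio references; see cleanup_extent_buffer_folios() for the
 * allocation error path helper that also calls folio_put().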
2964 */ 2965 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 2966 { 2967 btrfs_release_extent_buffer_folios(eb); 2968 btrfs_leak_debug_del_eb(eb); 2969 kmem_cache_free(extent_buffer_cache, eb); 2970 } 2971 2972 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 2973 u64 start) 2974 { 2975 struct extent_buffer *eb = NULL; 2976 2977 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 2978 eb->start = start; 2979 eb->len = fs_info->nodesize; 2980 eb->fs_info = fs_info; 2981 init_rwsem(&eb->lock); 2982 2983 btrfs_leak_debug_add_eb(eb); 2984 2985 spin_lock_init(&eb->refs_lock); 2986 refcount_set(&eb->refs, 1); 2987 2988 ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); 2989 2990 return eb; 2991 } 2992 2993 /* 2994 * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() 2995 * does not call folio_put(), and we need to set the folios to NULL so that 2996 * btrfs_release_extent_buffer() will not detach them a second time. 2997 */ 2998 static void cleanup_extent_buffer_folios(struct extent_buffer *eb) 2999 { 3000 const int num_folios = num_extent_folios(eb); 3001 3002 /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ 3003 for (int i = 0; i < num_folios; i++) { 3004 ASSERT(eb->folios[i]); 3005 detach_extent_buffer_folio(eb, eb->folios[i]); 3006 folio_put(eb->folios[i]); 3007 eb->folios[i] = NULL; 3008 } 3009 } 3010 3011 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 3012 { 3013 struct extent_buffer *new; 3014 int num_folios; 3015 int ret; 3016 3017 new = __alloc_extent_buffer(src->fs_info, src->start); 3018 if (new == NULL) 3019 return NULL; 3020 3021 /* 3022 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 3023 * btrfs_release_extent_buffer() have different behavior for 3024 * UNMAPPED subpage extent buffer. 3025 */ 3026 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 3027 3028 ret = alloc_eb_folio_array(new, false); 3029 if (ret) 3030 goto release_eb; 3031 3032 ASSERT(num_extent_folios(src) == num_extent_folios(new), 3033 "%d != %d", num_extent_folios(src), num_extent_folios(new)); 3034 /* Explicitly use the cached num_extent value from now on. 
*/ 3035 num_folios = num_extent_folios(src); 3036 for (int i = 0; i < num_folios; i++) { 3037 struct folio *folio = new->folios[i]; 3038 3039 ret = attach_extent_buffer_folio(new, folio, NULL); 3040 if (ret < 0) 3041 goto cleanup_folios; 3042 WARN_ON(folio_test_dirty(folio)); 3043 } 3044 for (int i = 0; i < num_folios; i++) 3045 folio_put(new->folios[i]); 3046 3047 copy_extent_buffer_full(new, src); 3048 set_extent_buffer_uptodate(new); 3049 3050 return new; 3051 3052 cleanup_folios: 3053 cleanup_extent_buffer_folios(new); 3054 release_eb: 3055 btrfs_release_extent_buffer(new); 3056 return NULL; 3057 } 3058 3059 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 3060 u64 start) 3061 { 3062 struct extent_buffer *eb; 3063 int ret; 3064 3065 eb = __alloc_extent_buffer(fs_info, start); 3066 if (!eb) 3067 return NULL; 3068 3069 ret = alloc_eb_folio_array(eb, false); 3070 if (ret) 3071 goto release_eb; 3072 3073 for (int i = 0; i < num_extent_folios(eb); i++) { 3074 ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); 3075 if (ret < 0) 3076 goto cleanup_folios; 3077 } 3078 for (int i = 0; i < num_extent_folios(eb); i++) 3079 folio_put(eb->folios[i]); 3080 3081 set_extent_buffer_uptodate(eb); 3082 btrfs_set_header_nritems(eb, 0); 3083 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 3084 3085 return eb; 3086 3087 cleanup_folios: 3088 cleanup_extent_buffer_folios(eb); 3089 release_eb: 3090 btrfs_release_extent_buffer(eb); 3091 return NULL; 3092 } 3093 3094 static void check_buffer_tree_ref(struct extent_buffer *eb) 3095 { 3096 int refs; 3097 /* 3098 * The TREE_REF bit is first set when the extent_buffer is added to the 3099 * xarray. It is also reset, if unset, when a new reference is created 3100 * by find_extent_buffer. 3101 * 3102 * It is only cleared in two cases: freeing the last non-tree 3103 * reference to the extent_buffer when its STALE bit is set or 3104 * calling release_folio when the tree reference is the only reference. 3105 * 3106 * In both cases, care is taken to ensure that the extent_buffer's 3107 * pages are not under io. However, release_folio can be concurrently 3108 * called with creating new references, which is prone to race 3109 * conditions between the calls to check_buffer_tree_ref in those 3110 * codepaths and clearing TREE_REF in try_release_extent_buffer. 3111 * 3112 * The actual lifetime of the extent_buffer in the xarray is adequately 3113 * protected by the refcount, but the TREE_REF bit and its corresponding 3114 * reference are not. To protect against this class of races, we call 3115 * check_buffer_tree_ref() from the code paths which trigger io. Note that 3116 * once io is initiated, TREE_REF can no longer be cleared, so that is 3117 * the moment at which any such race is best fixed. 
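 * The fast path below therefore only takes eb->refs_lock when it cannot
 * already observe both a second reference and the TREE_REF bit being set.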
3118 */ 3119 refs = refcount_read(&eb->refs); 3120 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3121 return; 3122 3123 spin_lock(&eb->refs_lock); 3124 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3125 refcount_inc(&eb->refs); 3126 spin_unlock(&eb->refs_lock); 3127 } 3128 3129 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 3130 { 3131 check_buffer_tree_ref(eb); 3132 3133 for (int i = 0; i < num_extent_folios(eb); i++) 3134 folio_mark_accessed(eb->folios[i]); 3135 } 3136 3137 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 3138 u64 start) 3139 { 3140 struct extent_buffer *eb; 3141 3142 eb = find_extent_buffer_nolock(fs_info, start); 3143 if (!eb) 3144 return NULL; 3145 /* 3146 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 3147 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 3148 * another task running free_extent_buffer() might have seen that flag 3149 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 3150 * writeback flags not set) and it's still in the tree (flag 3151 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 3152 * decrementing the extent buffer's reference count twice. So here we 3153 * could race and increment the eb's reference count, clear its stale 3154 * flag, mark it as dirty and drop our reference before the other task 3155 * finishes executing free_extent_buffer, which would later result in 3156 * an attempt to free an extent buffer that is dirty. 3157 */ 3158 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 3159 spin_lock(&eb->refs_lock); 3160 spin_unlock(&eb->refs_lock); 3161 } 3162 mark_extent_buffer_accessed(eb); 3163 return eb; 3164 } 3165 3166 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 3167 u64 start) 3168 { 3169 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3170 struct extent_buffer *eb, *exists = NULL; 3171 int ret; 3172 3173 eb = find_extent_buffer(fs_info, start); 3174 if (eb) 3175 return eb; 3176 eb = alloc_dummy_extent_buffer(fs_info, start); 3177 if (!eb) 3178 return ERR_PTR(-ENOMEM); 3179 eb->fs_info = fs_info; 3180 again: 3181 xa_lock_irq(&fs_info->buffer_tree); 3182 exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, 3183 NULL, eb, GFP_NOFS); 3184 if (xa_is_err(exists)) { 3185 ret = xa_err(exists); 3186 xa_unlock_irq(&fs_info->buffer_tree); 3187 btrfs_release_extent_buffer(eb); 3188 return ERR_PTR(ret); 3189 } 3190 if (exists) { 3191 if (!refcount_inc_not_zero(&exists->refs)) { 3192 /* The extent buffer is being freed, retry. */ 3193 xa_unlock_irq(&fs_info->buffer_tree); 3194 goto again; 3195 } 3196 xa_unlock_irq(&fs_info->buffer_tree); 3197 btrfs_release_extent_buffer(eb); 3198 return exists; 3199 } 3200 xa_unlock_irq(&fs_info->buffer_tree); 3201 check_buffer_tree_ref(eb); 3202 3203 return eb; 3204 #else 3205 /* Stub to avoid linker error when compiled with optimizations turned off. */ 3206 return NULL; 3207 #endif 3208 } 3209 3210 static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, 3211 struct folio *folio) 3212 { 3213 struct extent_buffer *exists; 3214 3215 lockdep_assert_held(&folio->mapping->i_private_lock); 3216 3217 /* 3218 * For subpage case, we completely rely on xarray to ensure we don't try 3219 * to insert two ebs for the same bytenr. So here we always return NULL 3220 * and just continue. 
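 * (In the subpage case multiple ebs can share one folio and the folio
 * private points to the btrfs_folio_state instead of a single eb, so
 * there is nothing to grab here.)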
3221 */ 3222 if (btrfs_meta_is_subpage(fs_info)) 3223 return NULL; 3224 3225 /* Page not yet attached to an extent buffer */ 3226 if (!folio_test_private(folio)) 3227 return NULL; 3228 3229 /* 3230 * We could have already allocated an eb for this folio and attached one 3231 * so lets see if we can get a ref on the existing eb, and if we can we 3232 * know it's good and we can just return that one, else we know we can 3233 * just overwrite folio private. 3234 */ 3235 exists = folio_get_private(folio); 3236 if (refcount_inc_not_zero(&exists->refs)) 3237 return exists; 3238 3239 WARN_ON(folio_test_dirty(folio)); 3240 folio_detach_private(folio); 3241 return NULL; 3242 } 3243 3244 /* 3245 * Validate alignment constraints of eb at logical address @start. 3246 */ 3247 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 3248 { 3249 const u32 nodesize = fs_info->nodesize; 3250 3251 if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { 3252 btrfs_err(fs_info, "bad tree block start %llu", start); 3253 return true; 3254 } 3255 3256 if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { 3257 btrfs_err(fs_info, 3258 "tree block is not nodesize aligned, start %llu nodesize %u", 3259 start, nodesize); 3260 return true; 3261 } 3262 if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { 3263 btrfs_err(fs_info, 3264 "tree block is not page aligned, start %llu nodesize %u", 3265 start, nodesize); 3266 return true; 3267 } 3268 if (unlikely(!IS_ALIGNED(start, nodesize) && 3269 !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { 3270 btrfs_warn(fs_info, 3271 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", 3272 start, nodesize); 3273 } 3274 return false; 3275 } 3276 3277 /* 3278 * Return 0 if eb->folios[i] is attached to btree inode successfully. 3279 * Return >0 if there is already another extent buffer for the range, 3280 * and @found_eb_ret would be updated. 3281 * Return -EAGAIN if the filemap has an existing folio but with different size 3282 * than @eb. 3283 * The caller needs to free the existing folios and retry using the same order. 3284 */ 3285 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3286 struct btrfs_folio_state *prealloc, 3287 struct extent_buffer **found_eb_ret) 3288 { 3289 3290 struct btrfs_fs_info *fs_info = eb->fs_info; 3291 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3292 const pgoff_t index = eb->start >> PAGE_SHIFT; 3293 struct folio *existing_folio; 3294 int ret; 3295 3296 ASSERT(found_eb_ret); 3297 3298 /* Caller should ensure the folio exists. */ 3299 ASSERT(eb->folios[i]); 3300 3301 retry: 3302 existing_folio = NULL; 3303 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3304 GFP_NOFS | __GFP_NOFAIL); 3305 if (!ret) 3306 goto finish; 3307 3308 existing_folio = filemap_lock_folio(mapping, index + i); 3309 /* The page cache only exists for a very short time, just retry. */ 3310 if (IS_ERR(existing_folio)) 3311 goto retry; 3312 3313 /* For now, we should only have single-page folios for btree inode. */ 3314 ASSERT(folio_nr_pages(existing_folio) == 1); 3315 3316 if (folio_size(existing_folio) != eb->folio_size) { 3317 folio_unlock(existing_folio); 3318 folio_put(existing_folio); 3319 return -EAGAIN; 3320 } 3321 3322 finish: 3323 spin_lock(&mapping->i_private_lock); 3324 if (existing_folio && btrfs_meta_is_subpage(fs_info)) { 3325 /* We're going to reuse the existing page, can drop our folio now. 
*/ 3326 __free_page(folio_page(eb->folios[i], 0)); 3327 eb->folios[i] = existing_folio; 3328 } else if (existing_folio) { 3329 struct extent_buffer *existing_eb; 3330 3331 existing_eb = grab_extent_buffer(fs_info, existing_folio); 3332 if (existing_eb) { 3333 /* The extent buffer still exists, we can use it directly. */ 3334 *found_eb_ret = existing_eb; 3335 spin_unlock(&mapping->i_private_lock); 3336 folio_unlock(existing_folio); 3337 folio_put(existing_folio); 3338 return 1; 3339 } 3340 /* The extent buffer no longer exists, we can reuse the folio. */ 3341 __free_page(folio_page(eb->folios[i], 0)); 3342 eb->folios[i] = existing_folio; 3343 } 3344 eb->folio_size = folio_size(eb->folios[i]); 3345 eb->folio_shift = folio_shift(eb->folios[i]); 3346 /* Should not fail, as we have preallocated the memory. */ 3347 ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); 3348 ASSERT(!ret); 3349 /* 3350 * To inform we have an extra eb under allocation, so that 3351 * detach_extent_buffer_page() won't release the folio private when the 3352 * eb hasn't been inserted into the xarray yet. 3353 * 3354 * The ref will be decreased when the eb releases the page, in 3355 * detach_extent_buffer_page(). Thus needs no special handling in the 3356 * error path. 3357 */ 3358 btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); 3359 spin_unlock(&mapping->i_private_lock); 3360 return 0; 3361 } 3362 3363 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3364 u64 start, u64 owner_root, int level) 3365 { 3366 int attached = 0; 3367 struct extent_buffer *eb; 3368 struct extent_buffer *existing_eb = NULL; 3369 struct btrfs_folio_state *prealloc = NULL; 3370 u64 lockdep_owner = owner_root; 3371 bool page_contig = true; 3372 int uptodate = 1; 3373 int ret; 3374 3375 if (check_eb_alignment(fs_info, start)) 3376 return ERR_PTR(-EINVAL); 3377 3378 #if BITS_PER_LONG == 32 3379 if (start >= MAX_LFS_FILESIZE) { 3380 btrfs_err_rl(fs_info, 3381 "extent buffer %llu is beyond 32bit page cache limit", start); 3382 btrfs_err_32bit_limit(fs_info); 3383 return ERR_PTR(-EOVERFLOW); 3384 } 3385 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 3386 btrfs_warn_32bit_limit(fs_info); 3387 #endif 3388 3389 eb = find_extent_buffer(fs_info, start); 3390 if (eb) 3391 return eb; 3392 3393 eb = __alloc_extent_buffer(fs_info, start); 3394 if (!eb) 3395 return ERR_PTR(-ENOMEM); 3396 3397 /* 3398 * The reloc trees are just snapshots, so we need them to appear to be 3399 * just like any other fs tree WRT lockdep. 3400 */ 3401 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 3402 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 3403 3404 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 3405 3406 /* 3407 * Preallocate folio private for subpage case, so that we won't 3408 * allocate memory with i_private_lock nor page lock hold. 3409 * 3410 * The memory will be freed by attach_extent_buffer_page() or freed 3411 * manually if we exit earlier. 3412 */ 3413 if (btrfs_meta_is_subpage(fs_info)) { 3414 prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); 3415 if (IS_ERR(prealloc)) { 3416 ret = PTR_ERR(prealloc); 3417 goto out; 3418 } 3419 } 3420 3421 reallocate: 3422 /* Allocate all pages first. */ 3423 ret = alloc_eb_folio_array(eb, true); 3424 if (ret < 0) { 3425 btrfs_free_folio_state(prealloc); 3426 goto out; 3427 } 3428 3429 /* Attach all pages to the filemap. 
*/ 3430 for (int i = 0; i < num_extent_folios(eb); i++) { 3431 struct folio *folio; 3432 3433 ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); 3434 if (ret > 0) { 3435 ASSERT(existing_eb); 3436 goto out; 3437 } 3438 3439 /* 3440 * TODO: Special handling for a corner case where the order of 3441 * folios mismatch between the new eb and filemap. 3442 * 3443 * This happens when: 3444 * 3445 * - the new eb is using higher order folio 3446 * 3447 * - the filemap is still using 0-order folios for the range 3448 * This can happen at the previous eb allocation, and we don't 3449 * have higher order folio for the call. 3450 * 3451 * - the existing eb has already been freed 3452 * 3453 * In this case, we have to free the existing folios first, and 3454 * re-allocate using the same order. 3455 * Thankfully this is not going to happen yet, as we're still 3456 * using 0-order folios. 3457 */ 3458 if (unlikely(ret == -EAGAIN)) { 3459 DEBUG_WARN("folio order mismatch between new eb and filemap"); 3460 goto reallocate; 3461 } 3462 attached++; 3463 3464 /* 3465 * Only after attach_eb_folio_to_filemap(), eb->folios[] is 3466 * reliable, as we may choose to reuse the existing page cache 3467 * and free the allocated page. 3468 */ 3469 folio = eb->folios[i]; 3470 WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); 3471 3472 /* 3473 * Check if the current page is physically contiguous with previous eb 3474 * page. 3475 * At this stage, either we allocated a large folio, thus @i 3476 * would only be 0, or we fall back to per-page allocation. 3477 */ 3478 if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) 3479 page_contig = false; 3480 3481 if (!btrfs_meta_folio_test_uptodate(folio, eb)) 3482 uptodate = 0; 3483 3484 /* 3485 * We can't unlock the pages just yet since the extent buffer 3486 * hasn't been properly inserted into the xarray, this opens a 3487 * race with btree_release_folio() which can free a page while we 3488 * are still filling in all pages for the buffer and we could crash. 3489 */ 3490 } 3491 if (uptodate) 3492 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3493 /* All pages are physically contiguous, can skip cross page handling. */ 3494 if (page_contig) 3495 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); 3496 again: 3497 xa_lock_irq(&fs_info->buffer_tree); 3498 existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, 3499 start >> fs_info->nodesize_bits, NULL, eb, 3500 GFP_NOFS); 3501 if (xa_is_err(existing_eb)) { 3502 ret = xa_err(existing_eb); 3503 xa_unlock_irq(&fs_info->buffer_tree); 3504 goto out; 3505 } 3506 if (existing_eb) { 3507 if (!refcount_inc_not_zero(&existing_eb->refs)) { 3508 xa_unlock_irq(&fs_info->buffer_tree); 3509 goto again; 3510 } 3511 xa_unlock_irq(&fs_info->buffer_tree); 3512 goto out; 3513 } 3514 xa_unlock_irq(&fs_info->buffer_tree); 3515 3516 /* add one reference for the tree */ 3517 check_buffer_tree_ref(eb); 3518 3519 /* 3520 * Now it's safe to unlock the pages because any calls to 3521 * btree_release_folio will correctly detect that a page belongs to a 3522 * live buffer and won't free them prematurely. 3523 */ 3524 for (int i = 0; i < num_extent_folios(eb); i++) { 3525 folio_unlock(eb->folios[i]); 3526 /* 3527 * A folio that has been added to an address_space mapping 3528 * should not continue holding the refcount from its original 3529 * allocation indefinitely. 
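 * The address_space keeps its own reference from filemap_add_folio(),
 * and that is what keeps these folios alive from here on.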
3530 */ 3531 folio_put(eb->folios[i]); 3532 } 3533 return eb; 3534 3535 out: 3536 WARN_ON(!refcount_dec_and_test(&eb->refs)); 3537 3538 /* 3539 * Any attached folios need to be detached before we unlock them. This 3540 * is because when we're inserting our new folios into the mapping, and 3541 * then attaching our eb to that folio. If we fail to insert our folio 3542 * we'll lookup the folio for that index, and grab that EB. We do not 3543 * want that to grab this eb, as we're getting ready to free it. So we 3544 * have to detach it first and then unlock it. 3545 * 3546 * Note: the bounds is num_extent_pages() as we need to go through all slots. 3547 */ 3548 for (int i = 0; i < num_extent_pages(eb); i++) { 3549 struct folio *folio = eb->folios[i]; 3550 3551 if (i < attached) { 3552 ASSERT(folio); 3553 detach_extent_buffer_folio(eb, folio); 3554 folio_unlock(folio); 3555 } else if (!folio) { 3556 continue; 3557 } 3558 3559 folio_put(folio); 3560 eb->folios[i] = NULL; 3561 } 3562 btrfs_release_extent_buffer(eb); 3563 if (ret < 0) 3564 return ERR_PTR(ret); 3565 ASSERT(existing_eb); 3566 return existing_eb; 3567 } 3568 3569 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3570 { 3571 struct extent_buffer *eb = 3572 container_of(head, struct extent_buffer, rcu_head); 3573 3574 kmem_cache_free(extent_buffer_cache, eb); 3575 } 3576 3577 static int release_extent_buffer(struct extent_buffer *eb) 3578 __releases(&eb->refs_lock) 3579 { 3580 lockdep_assert_held(&eb->refs_lock); 3581 3582 if (refcount_dec_and_test(&eb->refs)) { 3583 struct btrfs_fs_info *fs_info = eb->fs_info; 3584 3585 spin_unlock(&eb->refs_lock); 3586 3587 /* 3588 * We're erasing, theoretically there will be no allocations, so 3589 * just use GFP_ATOMIC. 3590 * 3591 * We use cmpxchg instead of erase because we do not know if 3592 * this eb is actually in the tree or not, we could be cleaning 3593 * up an eb that we allocated but never inserted into the tree. 3594 * Thus use cmpxchg to remove it from the tree if it is there, 3595 * or leave the other entry if this isn't in the tree. 3596 * 3597 * The documentation says that putting a NULL value is the same 3598 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't 3599 * in this case. 3600 */ 3601 xa_cmpxchg_irq(&fs_info->buffer_tree, 3602 eb->start >> fs_info->nodesize_bits, eb, NULL, 3603 GFP_ATOMIC); 3604 3605 btrfs_leak_debug_del_eb(eb); 3606 /* Should be safe to release folios at this point. */ 3607 btrfs_release_extent_buffer_folios(eb); 3608 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3609 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 3610 kmem_cache_free(extent_buffer_cache, eb); 3611 return 1; 3612 } 3613 #endif 3614 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3615 return 1; 3616 } 3617 spin_unlock(&eb->refs_lock); 3618 3619 return 0; 3620 } 3621 3622 void free_extent_buffer(struct extent_buffer *eb) 3623 { 3624 int refs; 3625 if (!eb) 3626 return; 3627 3628 refs = refcount_read(&eb->refs); 3629 while (1) { 3630 if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { 3631 if (refs == 1) 3632 break; 3633 } else if (refs <= 3) { 3634 break; 3635 } 3636 3637 /* Optimization to avoid locking eb->refs_lock. 
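 * As long as the refcount stays above the thresholds checked above (3 for
 * regular ebs, 1 for unmapped ones), dropping a single reference can
 * neither free the buffer nor require the STALE/TREE_REF handling done
 * under refs_lock below, so a plain atomic decrement is enough.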
*/ 3638 if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) 3639 return; 3640 } 3641 3642 spin_lock(&eb->refs_lock); 3643 if (refcount_read(&eb->refs) == 2 && 3644 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 3645 !extent_buffer_under_io(eb) && 3646 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3647 refcount_dec(&eb->refs); 3648 3649 /* 3650 * I know this is terrible, but it's temporary until we stop tracking 3651 * the uptodate bits and such for the extent buffers. 3652 */ 3653 release_extent_buffer(eb); 3654 } 3655 3656 void free_extent_buffer_stale(struct extent_buffer *eb) 3657 { 3658 if (!eb) 3659 return; 3660 3661 spin_lock(&eb->refs_lock); 3662 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 3663 3664 if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 3665 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3666 refcount_dec(&eb->refs); 3667 release_extent_buffer(eb); 3668 } 3669 3670 static void btree_clear_folio_dirty_tag(struct folio *folio) 3671 { 3672 ASSERT(!folio_test_dirty(folio)); 3673 ASSERT(folio_test_locked(folio)); 3674 xa_lock_irq(&folio->mapping->i_pages); 3675 if (!folio_test_dirty(folio)) 3676 __xa_clear_mark(&folio->mapping->i_pages, folio->index, 3677 PAGECACHE_TAG_DIRTY); 3678 xa_unlock_irq(&folio->mapping->i_pages); 3679 } 3680 3681 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, 3682 struct extent_buffer *eb) 3683 { 3684 struct btrfs_fs_info *fs_info = eb->fs_info; 3685 3686 btrfs_assert_tree_write_locked(eb); 3687 3688 if (trans && btrfs_header_generation(eb) != trans->transid) 3689 return; 3690 3691 /* 3692 * Instead of clearing the dirty flag off of the buffer, mark it as 3693 * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve 3694 * write-ordering in zoned mode, without the need to later re-dirty 3695 * the extent_buffer. 3696 * 3697 * The actual zeroout of the buffer will happen later in 3698 * btree_csum_one_bio. 3699 */ 3700 if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3701 set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); 3702 return; 3703 } 3704 3705 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) 3706 return; 3707 3708 buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); 3709 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, 3710 fs_info->dirty_metadata_batch); 3711 3712 for (int i = 0; i < num_extent_folios(eb); i++) { 3713 struct folio *folio = eb->folios[i]; 3714 bool last; 3715 3716 if (!folio_test_dirty(folio)) 3717 continue; 3718 folio_lock(folio); 3719 last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); 3720 if (last) 3721 btree_clear_folio_dirty_tag(folio); 3722 folio_unlock(folio); 3723 } 3724 WARN_ON(refcount_read(&eb->refs) == 0); 3725 } 3726 3727 void set_extent_buffer_dirty(struct extent_buffer *eb) 3728 { 3729 bool was_dirty; 3730 3731 check_buffer_tree_ref(eb); 3732 3733 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3734 3735 WARN_ON(refcount_read(&eb->refs) == 0); 3736 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 3737 WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); 3738 3739 if (!was_dirty) { 3740 bool subpage = btrfs_meta_is_subpage(eb->fs_info); 3741 3742 /* 3743 * For subpage case, we can have other extent buffers in the 3744 * same page, and in clear_extent_buffer_dirty() we 3745 * have to clear page dirty without subpage lock held. 3746 * This can cause race where our page gets dirty cleared after 3747 * we just set it. 
3748 * 3749 * Thankfully, clear_extent_buffer_dirty() has locked 3750 * its page for other reasons, we can use page lock to prevent 3751 * the above race. 3752 */ 3753 if (subpage) 3754 folio_lock(eb->folios[0]); 3755 for (int i = 0; i < num_extent_folios(eb); i++) 3756 btrfs_meta_folio_set_dirty(eb->folios[i], eb); 3757 buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); 3758 if (subpage) 3759 folio_unlock(eb->folios[0]); 3760 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, 3761 eb->len, 3762 eb->fs_info->dirty_metadata_batch); 3763 } 3764 #ifdef CONFIG_BTRFS_DEBUG 3765 for (int i = 0; i < num_extent_folios(eb); i++) 3766 ASSERT(folio_test_dirty(eb->folios[i])); 3767 #endif 3768 } 3769 3770 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 3771 { 3772 3773 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3774 for (int i = 0; i < num_extent_folios(eb); i++) { 3775 struct folio *folio = eb->folios[i]; 3776 3777 if (!folio) 3778 continue; 3779 3780 btrfs_meta_folio_clear_uptodate(folio, eb); 3781 } 3782 } 3783 3784 void set_extent_buffer_uptodate(struct extent_buffer *eb) 3785 { 3786 3787 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3788 for (int i = 0; i < num_extent_folios(eb); i++) 3789 btrfs_meta_folio_set_uptodate(eb->folios[i], eb); 3790 } 3791 3792 static void clear_extent_buffer_reading(struct extent_buffer *eb) 3793 { 3794 clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); 3795 } 3796 3797 static void end_bbio_meta_read(struct btrfs_bio *bbio) 3798 { 3799 struct extent_buffer *eb = bbio->private; 3800 bool uptodate = !bbio->bio.bi_status; 3801 3802 /* 3803 * If the extent buffer is marked UPTODATE before the read operation 3804 * completes, other calls to read_extent_buffer_pages() will return 3805 * early without waiting for the read to finish, causing data races. 3806 */ 3807 WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); 3808 3809 eb->read_mirror = bbio->mirror_num; 3810 3811 if (uptodate && 3812 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) 3813 uptodate = false; 3814 3815 if (uptodate) 3816 set_extent_buffer_uptodate(eb); 3817 else 3818 clear_extent_buffer_uptodate(eb); 3819 3820 clear_extent_buffer_reading(eb); 3821 free_extent_buffer(eb); 3822 3823 bio_put(&bbio->bio); 3824 } 3825 3826 int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, 3827 const struct btrfs_tree_parent_check *check) 3828 { 3829 struct btrfs_bio *bbio; 3830 3831 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3832 return 0; 3833 3834 /* 3835 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 3836 * operation, which could potentially still be in flight. In this case 3837 * we simply want to return an error. 3838 */ 3839 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 3840 return -EIO; 3841 3842 /* Someone else is already reading the buffer, just wait for it. */ 3843 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) 3844 return 0; 3845 3846 /* 3847 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above 3848 * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have 3849 * started and finished reading the same eb. In this case, UPTODATE 3850 * will now be set, and we shouldn't read it in again. 
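 *
 * For example: task A finishes its read, sets UPTODATE and clears
 * READING, while task B passed the initial UPTODATE check before A
 * completed and only now wins the test_and_set_bit() on READING. B must
 * notice that the buffer became uptodate in the meantime and back out
 * (clearing READING again) instead of issuing a second read.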
3851 */ 3852 if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { 3853 clear_extent_buffer_reading(eb); 3854 return 0; 3855 } 3856 3857 eb->read_mirror = 0; 3858 check_buffer_tree_ref(eb); 3859 refcount_inc(&eb->refs); 3860 3861 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 3862 REQ_OP_READ | REQ_META, eb->fs_info, 3863 end_bbio_meta_read, eb); 3864 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 3865 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 3866 bbio->file_offset = eb->start; 3867 memcpy(&bbio->parent_check, check, sizeof(*check)); 3868 for (int i = 0; i < num_extent_folios(eb); i++) { 3869 struct folio *folio = eb->folios[i]; 3870 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 3871 u32 range_len = min_t(u64, folio_end(folio), 3872 eb->start + eb->len) - range_start; 3873 3874 bio_add_folio_nofail(&bbio->bio, folio, range_len, 3875 offset_in_folio(folio, range_start)); 3876 } 3877 btrfs_submit_bbio(bbio, mirror_num); 3878 return 0; 3879 } 3880 3881 int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, 3882 const struct btrfs_tree_parent_check *check) 3883 { 3884 int ret; 3885 3886 ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); 3887 if (ret < 0) 3888 return ret; 3889 3890 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3891 if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) 3892 return -EIO; 3893 return 0; 3894 } 3895 3896 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 3897 unsigned long len) 3898 { 3899 btrfs_warn(eb->fs_info, 3900 "access to eb bytenr %llu len %u out of range start %lu len %lu", 3901 eb->start, eb->len, start, len); 3902 DEBUG_WARN(); 3903 3904 return true; 3905 } 3906 3907 /* 3908 * Check if the [start, start + len) range is valid before reading/writing 3909 * the eb. 3910 * NOTE: @start and @len are offset inside the eb, not logical address. 3911 * 3912 * Caller should not touch the dst/src memory if this function returns error. 3913 */ 3914 static inline int check_eb_range(const struct extent_buffer *eb, 3915 unsigned long start, unsigned long len) 3916 { 3917 unsigned long offset; 3918 3919 /* start, start + len should not go beyond eb->len nor overflow */ 3920 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 3921 return report_eb_range(eb, start, len); 3922 3923 return false; 3924 } 3925 3926 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 3927 unsigned long start, unsigned long len) 3928 { 3929 const int unit_size = eb->folio_size; 3930 size_t cur; 3931 size_t offset; 3932 char *dst = (char *)dstv; 3933 unsigned long i = get_eb_folio_index(eb, start); 3934 3935 if (check_eb_range(eb, start, len)) { 3936 /* 3937 * Invalid range hit, reset the memory, so callers won't get 3938 * some random garbage for their uninitialized memory. 
3939 */ 3940 memset(dstv, 0, len); 3941 return; 3942 } 3943 3944 if (eb->addr) { 3945 memcpy(dstv, eb->addr + start, len); 3946 return; 3947 } 3948 3949 offset = get_eb_offset_in_folio(eb, start); 3950 3951 while (len > 0) { 3952 char *kaddr; 3953 3954 cur = min(len, unit_size - offset); 3955 kaddr = folio_address(eb->folios[i]); 3956 memcpy(dst, kaddr + offset, cur); 3957 3958 dst += cur; 3959 len -= cur; 3960 offset = 0; 3961 i++; 3962 } 3963 } 3964 3965 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 3966 void __user *dstv, 3967 unsigned long start, unsigned long len) 3968 { 3969 const int unit_size = eb->folio_size; 3970 size_t cur; 3971 size_t offset; 3972 char __user *dst = (char __user *)dstv; 3973 unsigned long i = get_eb_folio_index(eb, start); 3974 int ret = 0; 3975 3976 WARN_ON(start > eb->len); 3977 WARN_ON(start + len > eb->start + eb->len); 3978 3979 if (eb->addr) { 3980 if (copy_to_user_nofault(dstv, eb->addr + start, len)) 3981 ret = -EFAULT; 3982 return ret; 3983 } 3984 3985 offset = get_eb_offset_in_folio(eb, start); 3986 3987 while (len > 0) { 3988 char *kaddr; 3989 3990 cur = min(len, unit_size - offset); 3991 kaddr = folio_address(eb->folios[i]); 3992 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 3993 ret = -EFAULT; 3994 break; 3995 } 3996 3997 dst += cur; 3998 len -= cur; 3999 offset = 0; 4000 i++; 4001 } 4002 4003 return ret; 4004 } 4005 4006 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 4007 unsigned long start, unsigned long len) 4008 { 4009 const int unit_size = eb->folio_size; 4010 size_t cur; 4011 size_t offset; 4012 char *kaddr; 4013 char *ptr = (char *)ptrv; 4014 unsigned long i = get_eb_folio_index(eb, start); 4015 int ret = 0; 4016 4017 if (check_eb_range(eb, start, len)) 4018 return -EINVAL; 4019 4020 if (eb->addr) 4021 return memcmp(ptrv, eb->addr + start, len); 4022 4023 offset = get_eb_offset_in_folio(eb, start); 4024 4025 while (len > 0) { 4026 cur = min(len, unit_size - offset); 4027 kaddr = folio_address(eb->folios[i]); 4028 ret = memcmp(ptr, kaddr + offset, cur); 4029 if (ret) 4030 break; 4031 4032 ptr += cur; 4033 len -= cur; 4034 offset = 0; 4035 i++; 4036 } 4037 return ret; 4038 } 4039 4040 /* 4041 * Check that the extent buffer is uptodate. 4042 * 4043 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 4044 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 4045 */ 4046 static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) 4047 { 4048 struct btrfs_fs_info *fs_info = eb->fs_info; 4049 struct folio *folio = eb->folios[i]; 4050 4051 ASSERT(folio); 4052 4053 /* 4054 * If we are using the commit root we could potentially clear a page 4055 * Uptodate while we're using the extent buffer that we've previously 4056 * looked up. We don't want to complain in this case, as the page was 4057 * valid before, we just didn't write it out. Instead we want to catch 4058 * the case where we didn't actually read the block properly, which 4059 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. 
4060 */ 4061 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4062 return; 4063 4064 if (btrfs_meta_is_subpage(fs_info)) { 4065 folio = eb->folios[0]; 4066 ASSERT(i == 0); 4067 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, 4068 eb->start, eb->len))) 4069 btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); 4070 } else { 4071 WARN_ON(!folio_test_uptodate(folio)); 4072 } 4073 } 4074 4075 static void __write_extent_buffer(const struct extent_buffer *eb, 4076 const void *srcv, unsigned long start, 4077 unsigned long len, bool use_memmove) 4078 { 4079 const int unit_size = eb->folio_size; 4080 size_t cur; 4081 size_t offset; 4082 char *kaddr; 4083 const char *src = (const char *)srcv; 4084 unsigned long i = get_eb_folio_index(eb, start); 4085 /* For unmapped (dummy) ebs, no need to check their uptodate status. */ 4086 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4087 4088 if (check_eb_range(eb, start, len)) 4089 return; 4090 4091 if (eb->addr) { 4092 if (use_memmove) 4093 memmove(eb->addr + start, srcv, len); 4094 else 4095 memcpy(eb->addr + start, srcv, len); 4096 return; 4097 } 4098 4099 offset = get_eb_offset_in_folio(eb, start); 4100 4101 while (len > 0) { 4102 if (check_uptodate) 4103 assert_eb_folio_uptodate(eb, i); 4104 4105 cur = min(len, unit_size - offset); 4106 kaddr = folio_address(eb->folios[i]); 4107 if (use_memmove) 4108 memmove(kaddr + offset, src, cur); 4109 else 4110 memcpy(kaddr + offset, src, cur); 4111 4112 src += cur; 4113 len -= cur; 4114 offset = 0; 4115 i++; 4116 } 4117 } 4118 4119 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 4120 unsigned long start, unsigned long len) 4121 { 4122 return __write_extent_buffer(eb, srcv, start, len, false); 4123 } 4124 4125 static void memset_extent_buffer(const struct extent_buffer *eb, int c, 4126 unsigned long start, unsigned long len) 4127 { 4128 const int unit_size = eb->folio_size; 4129 unsigned long cur = start; 4130 4131 if (eb->addr) { 4132 memset(eb->addr + start, c, len); 4133 return; 4134 } 4135 4136 while (cur < start + len) { 4137 unsigned long index = get_eb_folio_index(eb, cur); 4138 unsigned int offset = get_eb_offset_in_folio(eb, cur); 4139 unsigned int cur_len = min(start + len - cur, unit_size - offset); 4140 4141 assert_eb_folio_uptodate(eb, index); 4142 memset(folio_address(eb->folios[index]) + offset, c, cur_len); 4143 4144 cur += cur_len; 4145 } 4146 } 4147 4148 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 4149 unsigned long len) 4150 { 4151 if (check_eb_range(eb, start, len)) 4152 return; 4153 return memset_extent_buffer(eb, 0, start, len); 4154 } 4155 4156 void copy_extent_buffer_full(const struct extent_buffer *dst, 4157 const struct extent_buffer *src) 4158 { 4159 const int unit_size = src->folio_size; 4160 unsigned long cur = 0; 4161 4162 ASSERT(dst->len == src->len); 4163 4164 while (cur < src->len) { 4165 unsigned long index = get_eb_folio_index(src, cur); 4166 unsigned long offset = get_eb_offset_in_folio(src, cur); 4167 unsigned long cur_len = min(src->len, unit_size - offset); 4168 void *addr = folio_address(src->folios[index]) + offset; 4169 4170 write_extent_buffer(dst, addr, cur, cur_len); 4171 4172 cur += cur_len; 4173 } 4174 } 4175 4176 void copy_extent_buffer(const struct extent_buffer *dst, 4177 const struct extent_buffer *src, 4178 unsigned long dst_offset, unsigned long src_offset, 4179 unsigned long len) 4180 { 4181 const int unit_size = dst->folio_size; 4182 u64 dst_len = 
dst->len; 4183 size_t cur; 4184 size_t offset; 4185 char *kaddr; 4186 unsigned long i = get_eb_folio_index(dst, dst_offset); 4187 4188 if (check_eb_range(dst, dst_offset, len) || 4189 check_eb_range(src, src_offset, len)) 4190 return; 4191 4192 WARN_ON(src->len != dst_len); 4193 4194 offset = get_eb_offset_in_folio(dst, dst_offset); 4195 4196 while (len > 0) { 4197 assert_eb_folio_uptodate(dst, i); 4198 4199 cur = min(len, (unsigned long)(unit_size - offset)); 4200 4201 kaddr = folio_address(dst->folios[i]); 4202 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4203 4204 src_offset += cur; 4205 len -= cur; 4206 offset = 0; 4207 i++; 4208 } 4209 } 4210 4211 /* 4212 * Calculate the folio and offset of the byte containing the given bit number. 4213 * 4214 * @eb: the extent buffer 4215 * @start: offset of the bitmap item in the extent buffer 4216 * @nr: bit number 4217 * @folio_index: return index of the folio in the extent buffer that contains 4218 * the given bit number 4219 * @folio_offset: return offset into the folio given by folio_index 4220 * 4221 * This helper hides the ugliness of finding the byte in an extent buffer which 4222 * contains a given bit. 4223 */ 4224 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 4225 unsigned long start, unsigned long nr, 4226 unsigned long *folio_index, 4227 size_t *folio_offset) 4228 { 4229 size_t byte_offset = BIT_BYTE(nr); 4230 size_t offset; 4231 4232 /* 4233 * The byte we want is the offset of the extent buffer + the offset of 4234 * the bitmap item in the extent buffer + the offset of the byte in the 4235 * bitmap item. 4236 */ 4237 offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; 4238 4239 *folio_index = offset >> eb->folio_shift; 4240 *folio_offset = offset_in_eb_folio(eb, offset); 4241 } 4242 4243 /* 4244 * Determine whether a bit in a bitmap item is set. 4245 * 4246 * @eb: the extent buffer 4247 * @start: offset of the bitmap item in the extent buffer 4248 * @nr: bit number to test 4249 */ 4250 bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 4251 unsigned long nr) 4252 { 4253 unsigned long i; 4254 size_t offset; 4255 u8 *kaddr; 4256 4257 eb_bitmap_offset(eb, start, nr, &i, &offset); 4258 assert_eb_folio_uptodate(eb, i); 4259 kaddr = folio_address(eb->folios[i]); 4260 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 4261 } 4262 4263 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) 4264 { 4265 unsigned long index = get_eb_folio_index(eb, bytenr); 4266 4267 if (check_eb_range(eb, bytenr, 1)) 4268 return NULL; 4269 return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); 4270 } 4271 4272 /* 4273 * Set an area of a bitmap to 1. 4274 * 4275 * @eb: the extent buffer 4276 * @start: offset of the bitmap item in the extent buffer 4277 * @pos: bit number of the first bit 4278 * @len: number of bits to set 4279 */ 4280 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 4281 unsigned long pos, unsigned long len) 4282 { 4283 unsigned int first_byte = start + BIT_BYTE(pos); 4284 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4285 const bool same_byte = (first_byte == last_byte); 4286 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4287 u8 *kaddr; 4288 4289 if (same_byte) 4290 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4291 4292 /* Handle the first byte. 
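 * Worked example (bits numbered from the LSB of each byte): for pos = 3
 * and len = 7 we set bits 3..9 of the bitmap. first_byte gets mask
 * BITMAP_FIRST_BYTE_MASK(3) == 0xf8 (bits 3-7), the byte aligned middle
 * part is empty, and last_byte gets BITMAP_LAST_BYTE_MASK(10) == 0x03
 * (bits 8-9 of the bitmap, i.e. bits 0-1 of that byte).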
*/ 4293 kaddr = extent_buffer_get_byte(eb, first_byte); 4294 *kaddr |= mask; 4295 if (same_byte) 4296 return; 4297 4298 /* Handle the byte aligned part. */ 4299 ASSERT(first_byte + 1 <= last_byte); 4300 memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); 4301 4302 /* Handle the last byte. */ 4303 kaddr = extent_buffer_get_byte(eb, last_byte); 4304 *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); 4305 } 4306 4307 4308 /* 4309 * Clear an area of a bitmap. 4310 * 4311 * @eb: the extent buffer 4312 * @start: offset of the bitmap item in the extent buffer 4313 * @pos: bit number of the first bit 4314 * @len: number of bits to clear 4315 */ 4316 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 4317 unsigned long start, unsigned long pos, 4318 unsigned long len) 4319 { 4320 unsigned int first_byte = start + BIT_BYTE(pos); 4321 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4322 const bool same_byte = (first_byte == last_byte); 4323 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4324 u8 *kaddr; 4325 4326 if (same_byte) 4327 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4328 4329 /* Handle the first byte. */ 4330 kaddr = extent_buffer_get_byte(eb, first_byte); 4331 *kaddr &= ~mask; 4332 if (same_byte) 4333 return; 4334 4335 /* Handle the byte aligned part. */ 4336 ASSERT(first_byte + 1 <= last_byte); 4337 memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); 4338 4339 /* Handle the last byte. */ 4340 kaddr = extent_buffer_get_byte(eb, last_byte); 4341 *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); 4342 } 4343 4344 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4345 { 4346 unsigned long distance = (src > dst) ? src - dst : dst - src; 4347 return distance < len; 4348 } 4349 4350 void memcpy_extent_buffer(const struct extent_buffer *dst, 4351 unsigned long dst_offset, unsigned long src_offset, 4352 unsigned long len) 4353 { 4354 const int unit_size = dst->folio_size; 4355 unsigned long cur_off = 0; 4356 4357 if (check_eb_range(dst, dst_offset, len) || 4358 check_eb_range(dst, src_offset, len)) 4359 return; 4360 4361 if (dst->addr) { 4362 const bool use_memmove = areas_overlap(src_offset, dst_offset, len); 4363 4364 if (use_memmove) 4365 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4366 else 4367 memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); 4368 return; 4369 } 4370 4371 while (cur_off < len) { 4372 unsigned long cur_src = cur_off + src_offset; 4373 unsigned long folio_index = get_eb_folio_index(dst, cur_src); 4374 unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); 4375 unsigned long cur_len = min(src_offset + len - cur_src, 4376 unit_size - folio_off); 4377 void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; 4378 const bool use_memmove = areas_overlap(src_offset + cur_off, 4379 dst_offset + cur_off, cur_len); 4380 4381 __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, 4382 use_memmove); 4383 cur_off += cur_len; 4384 } 4385 } 4386 4387 void memmove_extent_buffer(const struct extent_buffer *dst, 4388 unsigned long dst_offset, unsigned long src_offset, 4389 unsigned long len) 4390 { 4391 unsigned long dst_end = dst_offset + len - 1; 4392 unsigned long src_end = src_offset + len - 1; 4393 4394 if (check_eb_range(dst, dst_offset, len) || 4395 check_eb_range(dst, src_offset, len)) 4396 return; 4397 4398 if (dst_offset < src_offset) { 4399 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4400 return; 4401 } 4402 4403 if 
(dst->addr) { 4404 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4405 return; 4406 } 4407 4408 while (len > 0) { 4409 unsigned long src_i; 4410 size_t cur; 4411 size_t dst_off_in_folio; 4412 size_t src_off_in_folio; 4413 void *src_addr; 4414 bool use_memmove; 4415 4416 src_i = get_eb_folio_index(dst, src_end); 4417 4418 dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); 4419 src_off_in_folio = get_eb_offset_in_folio(dst, src_end); 4420 4421 cur = min_t(unsigned long, len, src_off_in_folio + 1); 4422 cur = min(cur, dst_off_in_folio + 1); 4423 4424 src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - 4425 cur + 1; 4426 use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, 4427 cur); 4428 4429 __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, 4430 use_memmove); 4431 4432 dst_end -= cur; 4433 src_end -= cur; 4434 len -= cur; 4435 } 4436 } 4437 4438 static int try_release_subpage_extent_buffer(struct folio *folio) 4439 { 4440 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4441 struct extent_buffer *eb; 4442 unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); 4443 unsigned long index = start; 4444 unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; 4445 int ret; 4446 4447 rcu_read_lock(); 4448 xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4449 /* 4450 * The same as try_release_extent_buffer(), to ensure the eb 4451 * won't disappear out from under us. 4452 */ 4453 spin_lock(&eb->refs_lock); 4454 rcu_read_unlock(); 4455 4456 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4457 spin_unlock(&eb->refs_lock); 4458 rcu_read_lock(); 4459 continue; 4460 } 4461 4462 /* 4463 * If tree ref isn't set then we know the ref on this eb is a 4464 * real ref, so just return, this eb will likely be freed soon 4465 * anyway. 4466 */ 4467 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4468 spin_unlock(&eb->refs_lock); 4469 break; 4470 } 4471 4472 /* 4473 * Here we don't care about the return value, we will always 4474 * check the folio private at the end. And 4475 * release_extent_buffer() will release the refs_lock. 4476 */ 4477 release_extent_buffer(eb); 4478 rcu_read_lock(); 4479 } 4480 rcu_read_unlock(); 4481 4482 /* 4483 * Finally to check if we have cleared folio private, as if we have 4484 * released all ebs in the page, the folio private should be cleared now. 4485 */ 4486 spin_lock(&folio->mapping->i_private_lock); 4487 if (!folio_test_private(folio)) 4488 ret = 1; 4489 else 4490 ret = 0; 4491 spin_unlock(&folio->mapping->i_private_lock); 4492 return ret; 4493 } 4494 4495 int try_release_extent_buffer(struct folio *folio) 4496 { 4497 struct extent_buffer *eb; 4498 4499 if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) 4500 return try_release_subpage_extent_buffer(folio); 4501 4502 /* 4503 * We need to make sure nobody is changing folio private, as we rely on 4504 * folio private as the pointer to extent buffer. 4505 */ 4506 spin_lock(&folio->mapping->i_private_lock); 4507 if (!folio_test_private(folio)) { 4508 spin_unlock(&folio->mapping->i_private_lock); 4509 return 1; 4510 } 4511 4512 eb = folio_get_private(folio); 4513 BUG_ON(!eb); 4514 4515 /* 4516 * This is a little awful but should be ok, we need to make sure that 4517 * the eb doesn't disappear out from under us while we're looking at 4518 * this page. 
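 * Folio private is only detached with i_private_lock held, so holding
 * that lock keeps the eb we got from folio private valid at least until
 * we have taken its refs_lock below.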
4519 */
4520 spin_lock(&eb->refs_lock);
4521 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
4522 spin_unlock(&eb->refs_lock);
4523 spin_unlock(&folio->mapping->i_private_lock);
4524 return 0;
4525 }
4526 spin_unlock(&folio->mapping->i_private_lock);
4527
4528 /*
4529 * If the tree ref isn't set then we know the ref on this eb is a real ref,
4530 * so just return; this page will likely be freed soon anyway.
4531 */
4532 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
4533 spin_unlock(&eb->refs_lock);
4534 return 0;
4535 }
4536
4537 return release_extent_buffer(eb);
4538 }
4539
4540 /*
4541 * Attempt to readahead a child block.
4542 *
4543 * @fs_info: the fs_info
4544 * @bytenr: bytenr to read
4545 * @owner_root: objectid of the root that owns this eb
4546 * @gen: generation for the uptodate check, can be 0
4547 * @level: level for the eb
4548 *
4549 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
4550 * normal uptodate check of the eb, without checking the generation. If we have
4551 * to read the block we will not block on anything.
4552 */
4553 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
4554 u64 bytenr, u64 owner_root, u64 gen, int level)
4555 {
4556 struct btrfs_tree_parent_check check = {
4557 .level = level,
4558 .transid = gen
4559 };
4560 struct extent_buffer *eb;
4561 int ret;
4562
4563 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
4564 if (IS_ERR(eb))
4565 return;
4566
4567 if (btrfs_buffer_uptodate(eb, gen, true)) {
4568 free_extent_buffer(eb);
4569 return;
4570 }
4571
4572 ret = read_extent_buffer_pages_nowait(eb, 0, &check);
4573 if (ret < 0)
4574 free_extent_buffer_stale(eb);
4575 else
4576 free_extent_buffer(eb);
4577 }
4578
4579 /*
4580 * Readahead a node's child block.
4581 *
4582 * @node: parent node we're reading from
4583 * @slot: slot in the parent node for the child we want to read
4584 *
4585 * A helper for btrfs_readahead_tree_block(): read the block whose bytenr is
4586 * pointed to by the given slot in the provided node.
4587 */
4588 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
4589 {
4590 btrfs_readahead_tree_block(node->fs_info,
4591 btrfs_node_blockptr(node, slot),
4592 btrfs_header_owner(node),
4593 btrfs_node_ptr_generation(node, slot),
4594 btrfs_header_level(node) - 1);
4595 }
4596
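
/*
 * Illustrative sketch only, not used anywhere in btrfs: a minimal example of
 * how the helpers above are typically combined to read a tree block and copy
 * bytes out of it. It assumes @len does not exceed the buffer length, and the
 * function name is made up for the example.
 */
static int __maybe_unused example_read_tree_block_bytes(struct btrfs_fs_info *fs_info,
							 u64 bytenr, u64 owner_root,
							 int level, void *dst, u32 len)
{
	struct btrfs_tree_parent_check check = { .level = level };
	struct extent_buffer *eb;
	int ret;

	/* Find or create the extent buffer covering @bytenr. */
	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/* Read and validate the block, waiting for the IO to finish. */
	ret = read_extent_buffer_pages(eb, 0, &check);
	if (ret) {
		free_extent_buffer_stale(eb);
		return ret;
	}

	/* Copy @len bytes from the start of the buffer into @dst. */
	read_extent_buffer(eb, dst, 0, len);
	free_extent_buffer(eb);
	return 0;
}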