1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "extent_io.h" 18 #include "extent-io-tree.h" 19 #include "extent_map.h" 20 #include "ctree.h" 21 #include "btrfs_inode.h" 22 #include "bio.h" 23 #include "locking.h" 24 #include "backref.h" 25 #include "disk-io.h" 26 #include "subpage.h" 27 #include "zoned.h" 28 #include "block-group.h" 29 #include "compression.h" 30 #include "fs.h" 31 #include "accessors.h" 32 #include "file-item.h" 33 #include "file.h" 34 #include "dev-replace.h" 35 #include "super.h" 36 #include "transaction.h" 37 38 static struct kmem_cache *extent_buffer_cache; 39 40 #ifdef CONFIG_BTRFS_DEBUG 41 static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) 42 { 43 struct btrfs_fs_info *fs_info = eb->fs_info; 44 unsigned long flags; 45 46 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 47 list_add(&eb->leak_list, &fs_info->allocated_ebs); 48 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 49 } 50 51 static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) 52 { 53 struct btrfs_fs_info *fs_info = eb->fs_info; 54 unsigned long flags; 55 56 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 57 list_del(&eb->leak_list); 58 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 59 } 60 61 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 62 { 63 struct extent_buffer *eb; 64 unsigned long flags; 65 66 /* 67 * If we didn't get into open_ctree our allocated_ebs will not be 68 * initialized, so just skip this. 69 */ 70 if (!fs_info->allocated_ebs.next) 71 return; 72 73 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 74 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 75 while (!list_empty(&fs_info->allocated_ebs)) { 76 eb = list_first_entry(&fs_info->allocated_ebs, 77 struct extent_buffer, leak_list); 78 btrfs_err(fs_info, 79 "buffer leak start %llu len %u refs %d bflags %lu owner %llu", 80 eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, 81 btrfs_header_owner(eb)); 82 list_del(&eb->leak_list); 83 WARN_ON_ONCE(1); 84 kmem_cache_free(extent_buffer_cache, eb); 85 } 86 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 87 } 88 #else 89 #define btrfs_leak_debug_add_eb(eb) do {} while (0) 90 #define btrfs_leak_debug_del_eb(eb) do {} while (0) 91 #endif 92 93 /* 94 * Structure to record info about the bio being assembled, and other info like 95 * how many bytes are there before stripe/ordered extent boundary. 96 */ 97 struct btrfs_bio_ctrl { 98 struct btrfs_bio *bbio; 99 /* Last byte contained in bbio + 1 . */ 100 loff_t next_file_offset; 101 enum btrfs_compression_type compress_type; 102 u32 len_to_oe_boundary; 103 blk_opf_t opf; 104 /* 105 * For data read bios, we attempt to optimize csum lookups if the extent 106 * generation is older than the current one. To make this possible, we 107 * need to track the maximum generation of an extent in a bio_ctrl to 108 * make the decision when submitting the bio. 109 * 110 * The pattern between do_readpage(), submit_one_bio() and 111 * submit_extent_folio() is quite subtle, so tracking this is tricky. 112 * 113 * As we process extent E, we might submit a bio with existing built up 114 * extents before adding E to a new bio, or we might just add E to the 115 * bio. As a result, E's generation could apply to the current bio or 116 * to the next one, so we need to be careful to update the bio_ctrl's 117 * generation with E's only when we are sure E is added to bio_ctrl->bbio 118 * in submit_extent_folio(). 119 * 120 * See the comment in btrfs_lookup_bio_sums() for more detail on the 121 * need for this optimization. 122 */ 123 u64 generation; 124 btrfs_bio_end_io_t end_io_func; 125 struct writeback_control *wbc; 126 127 /* 128 * The sectors of the page which are going to be submitted by 129 * extent_writepage_io(). 130 * This is to avoid touching ranges covered by compression/inline. 131 */ 132 unsigned long submit_bitmap; 133 struct readahead_control *ractl; 134 135 /* 136 * The start offset of the last used extent map by a read operation. 137 * 138 * This is for proper compressed read merge. 139 * U64_MAX means we are starting the read and have made no progress yet. 140 * 141 * The current btrfs_bio_is_contig() only uses disk_bytenr as 142 * the condition to check if the read can be merged with previous 143 * bio, which is not correct. E.g. two file extents pointing to the 144 * same extent but with different offset. 145 * 146 * So here we need to do extra checks to only merge reads that are 147 * covered by the same extent map. 148 * Just extent_map::start will be enough, as they are unique 149 * inside the same inode. 150 */ 151 u64 last_em_start; 152 }; 153 154 /* 155 * Helper to set the csum search commit root option for a bio_ctrl's bbio 156 * before submitting the bio. 157 * 158 * Only for use by submit_one_bio(). 159 */ 160 static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) 161 { 162 struct btrfs_bio *bbio = bio_ctrl->bbio; 163 164 ASSERT(bbio); 165 166 if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) 167 return; 168 169 bio_ctrl->bbio->csum_search_commit_root = 170 (bio_ctrl->generation && 171 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); 172 } 173 174 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 175 { 176 struct btrfs_bio *bbio = bio_ctrl->bbio; 177 178 if (!bbio) 179 return; 180 181 /* Caller should ensure the bio has at least some range added */ 182 ASSERT(bbio->bio.bi_iter.bi_size); 183 184 bio_set_csum_search_commit_root(bio_ctrl); 185 186 if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && 187 bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 188 btrfs_submit_compressed_read(bbio); 189 else 190 btrfs_submit_bbio(bbio, 0); 191 192 /* The bbio is owned by the end_io handler now */ 193 bio_ctrl->bbio = NULL; 194 /* 195 * We used the generation to decide whether to lookup csums in the 196 * commit_root or not when we called bio_set_csum_search_commit_root() 197 * above. Now, reset the generation for the next bio. 198 */ 199 bio_ctrl->generation = 0; 200 } 201 202 /* 203 * Submit or fail the current bio in the bio_ctrl structure. 204 */ 205 static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) 206 { 207 struct btrfs_bio *bbio = bio_ctrl->bbio; 208 209 if (!bbio) 210 return; 211 212 if (ret) { 213 ASSERT(ret < 0); 214 btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 215 /* The bio is owned by the end_io handler now */ 216 bio_ctrl->bbio = NULL; 217 } else { 218 submit_one_bio(bio_ctrl); 219 } 220 } 221 222 int __init extent_buffer_init_cachep(void) 223 { 224 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 225 sizeof(struct extent_buffer), 0, 0, 226 NULL); 227 if (!extent_buffer_cache) 228 return -ENOMEM; 229 230 return 0; 231 } 232 233 void __cold extent_buffer_free_cachep(void) 234 { 235 /* 236 * Make sure all delayed rcu free are flushed before we 237 * destroy caches. 238 */ 239 rcu_barrier(); 240 kmem_cache_destroy(extent_buffer_cache); 241 } 242 243 static void process_one_folio(struct btrfs_fs_info *fs_info, 244 struct folio *folio, const struct folio *locked_folio, 245 unsigned long page_ops, u64 start, u64 end) 246 { 247 u32 len; 248 249 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 250 len = end + 1 - start; 251 252 if (page_ops & PAGE_SET_ORDERED) 253 btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); 254 if (page_ops & PAGE_START_WRITEBACK) { 255 btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); 256 btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); 257 } 258 if (page_ops & PAGE_END_WRITEBACK) 259 btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); 260 261 if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) 262 btrfs_folio_end_lock(fs_info, folio, start, len); 263 } 264 265 static void __process_folios_contig(struct address_space *mapping, 266 const struct folio *locked_folio, u64 start, 267 u64 end, unsigned long page_ops) 268 { 269 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 270 pgoff_t index = start >> PAGE_SHIFT; 271 pgoff_t end_index = end >> PAGE_SHIFT; 272 struct folio_batch fbatch; 273 int i; 274 275 folio_batch_init(&fbatch); 276 while (index <= end_index) { 277 int found_folios; 278 279 found_folios = filemap_get_folios_contig(mapping, &index, 280 end_index, &fbatch); 281 for (i = 0; i < found_folios; i++) { 282 struct folio *folio = fbatch.folios[i]; 283 284 process_one_folio(fs_info, folio, locked_folio, 285 page_ops, start, end); 286 } 287 folio_batch_release(&fbatch); 288 cond_resched(); 289 } 290 } 291 292 static noinline void unlock_delalloc_folio(const struct inode *inode, 293 struct folio *locked_folio, 294 u64 start, u64 end) 295 { 296 ASSERT(locked_folio); 297 298 __process_folios_contig(inode->i_mapping, locked_folio, start, end, 299 PAGE_UNLOCK); 300 } 301 302 static noinline int lock_delalloc_folios(struct inode *inode, 303 struct folio *locked_folio, 304 u64 start, u64 end) 305 { 306 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 307 struct address_space *mapping = inode->i_mapping; 308 pgoff_t index = start >> PAGE_SHIFT; 309 pgoff_t end_index = end >> PAGE_SHIFT; 310 u64 processed_end = start; 311 struct folio_batch fbatch; 312 313 folio_batch_init(&fbatch); 314 while (index <= end_index) { 315 unsigned int found_folios, i; 316 317 found_folios = filemap_get_folios_contig(mapping, &index, 318 end_index, &fbatch); 319 if (found_folios == 0) 320 goto out; 321 322 for (i = 0; i < found_folios; i++) { 323 struct folio *folio = fbatch.folios[i]; 324 u64 range_start; 325 u32 range_len; 326 327 if (folio == locked_folio) 328 continue; 329 330 folio_lock(folio); 331 if (!folio_test_dirty(folio) || folio->mapping != mapping) { 332 folio_unlock(folio); 333 goto out; 334 } 335 range_start = max_t(u64, folio_pos(folio), start); 336 range_len = min_t(u64, folio_end(folio), end + 1) - range_start; 337 btrfs_folio_set_lock(fs_info, folio, range_start, range_len); 338 339 processed_end = range_start + range_len - 1; 340 } 341 folio_batch_release(&fbatch); 342 cond_resched(); 343 } 344 345 return 0; 346 out: 347 folio_batch_release(&fbatch); 348 if (processed_end > start) 349 unlock_delalloc_folio(inode, locked_folio, start, processed_end); 350 return -EAGAIN; 351 } 352 353 /* 354 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 355 * more than @max_bytes. 356 * 357 * @start: The original start bytenr to search. 358 * Will store the extent range start bytenr. 359 * @end: The original end bytenr of the search range 360 * Will store the extent range end bytenr. 361 * 362 * Return true if we find a delalloc range which starts inside the original 363 * range, and @start/@end will store the delalloc range start/end. 364 * 365 * Return false if we can't find any delalloc range which starts inside the 366 * original range, and @start/@end will be the non-delalloc range start/end. 367 */ 368 EXPORT_FOR_TESTS 369 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 370 struct folio *locked_folio, 371 u64 *start, u64 *end) 372 { 373 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 375 const u64 orig_start = *start; 376 const u64 orig_end = *end; 377 /* The sanity tests may not set a valid fs_info. */ 378 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; 379 u64 delalloc_start; 380 u64 delalloc_end; 381 bool found; 382 struct extent_state *cached_state = NULL; 383 int ret; 384 int loops = 0; 385 386 /* Caller should pass a valid @end to indicate the search range end */ 387 ASSERT(orig_end > orig_start); 388 389 /* The range should at least cover part of the folio */ 390 ASSERT(!(orig_start >= folio_end(locked_folio) || 391 orig_end <= folio_pos(locked_folio))); 392 again: 393 /* step one, find a bunch of delalloc bytes starting at start */ 394 delalloc_start = *start; 395 delalloc_end = 0; 396 397 /* 398 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can 399 * return early without handling any dirty ranges. 400 */ 401 ASSERT(max_bytes >= fs_info->sectorsize); 402 403 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 404 max_bytes, &cached_state); 405 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 406 *start = delalloc_start; 407 408 /* @delalloc_end can be -1, never go beyond @orig_end */ 409 *end = min(delalloc_end, orig_end); 410 btrfs_free_extent_state(cached_state); 411 return false; 412 } 413 414 /* 415 * start comes from the offset of locked_folio. We have to lock 416 * folios in order, so we can't process delalloc bytes before 417 * locked_folio 418 */ 419 if (delalloc_start < *start) 420 delalloc_start = *start; 421 422 /* 423 * make sure to limit the number of folios we try to lock down 424 */ 425 if (delalloc_end + 1 - delalloc_start > max_bytes) 426 delalloc_end = delalloc_start + max_bytes - 1; 427 428 /* step two, lock all the folios after the folios that has start */ 429 ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, 430 delalloc_end); 431 ASSERT(!ret || ret == -EAGAIN); 432 if (ret == -EAGAIN) { 433 /* 434 * Some of the folios are gone, lets avoid looping by 435 * shortening the size of the delalloc range we're searching. 436 */ 437 btrfs_free_extent_state(cached_state); 438 cached_state = NULL; 439 if (!loops) { 440 max_bytes = fs_info->sectorsize; 441 loops = 1; 442 goto again; 443 } else { 444 found = false; 445 goto out_failed; 446 } 447 } 448 449 /* step three, lock the state bits for the whole range */ 450 btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); 451 452 /* then test to make sure it is all still delalloc */ 453 ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, 454 EXTENT_DELALLOC, cached_state); 455 456 btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); 457 if (!ret) { 458 unlock_delalloc_folio(inode, locked_folio, delalloc_start, 459 delalloc_end); 460 cond_resched(); 461 goto again; 462 } 463 *start = delalloc_start; 464 *end = delalloc_end; 465 out_failed: 466 return found; 467 } 468 469 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 470 const struct folio *locked_folio, 471 struct extent_state **cached, 472 u32 clear_bits, unsigned long page_ops) 473 { 474 btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); 475 476 __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, 477 end, page_ops); 478 } 479 480 static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) 481 { 482 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 483 484 if (!fsverity_active(folio->mapping->host) || 485 btrfs_folio_test_uptodate(fs_info, folio, start, len) || 486 start >= i_size_read(folio->mapping->host)) 487 return true; 488 return fsverity_verify_folio(folio); 489 } 490 491 static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) 492 { 493 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 494 495 ASSERT(folio_pos(folio) <= start && 496 start + len <= folio_end(folio)); 497 498 if (uptodate && btrfs_verify_folio(folio, start, len)) 499 btrfs_folio_set_uptodate(fs_info, folio, start, len); 500 else 501 btrfs_folio_clear_uptodate(fs_info, folio, start, len); 502 503 if (!btrfs_is_subpage(fs_info, folio)) 504 folio_unlock(folio); 505 else 506 btrfs_folio_end_lock(fs_info, folio, start, len); 507 } 508 509 /* 510 * After a write IO is done, we need to: 511 * 512 * - clear the uptodate bits on error 513 * - clear the writeback bits in the extent tree for the range 514 * - filio_end_writeback() if there is no more pending io for the folio 515 * 516 * Scheduling is not allowed, so the extent state tree is expected 517 * to have one and only one object corresponding to this IO. 518 */ 519 static void end_bbio_data_write(struct btrfs_bio *bbio) 520 { 521 struct btrfs_fs_info *fs_info = bbio->fs_info; 522 struct bio *bio = &bbio->bio; 523 int error = blk_status_to_errno(bio->bi_status); 524 struct folio_iter fi; 525 const u32 sectorsize = fs_info->sectorsize; 526 527 ASSERT(!bio_flagged(bio, BIO_CLONED)); 528 bio_for_each_folio_all(fi, bio) { 529 struct folio *folio = fi.folio; 530 u64 start = folio_pos(folio) + fi.offset; 531 u32 len = fi.length; 532 533 /* Our read/write should always be sector aligned. */ 534 if (!IS_ALIGNED(fi.offset, sectorsize)) 535 btrfs_err(fs_info, 536 "partial page write in btrfs with offset %zu and length %zu", 537 fi.offset, fi.length); 538 else if (!IS_ALIGNED(fi.length, sectorsize)) 539 btrfs_info(fs_info, 540 "incomplete page write with offset %zu and length %zu", 541 fi.offset, fi.length); 542 543 btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, 544 !error); 545 if (error) 546 mapping_set_error(folio->mapping, error); 547 btrfs_folio_clear_writeback(fs_info, folio, start, len); 548 } 549 550 bio_put(bio); 551 } 552 553 static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) 554 { 555 ASSERT(folio_test_locked(folio)); 556 if (!btrfs_is_subpage(fs_info, folio)) 557 return; 558 559 ASSERT(folio_test_private(folio)); 560 btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); 561 } 562 563 /* 564 * After a data read IO is done, we need to: 565 * 566 * - clear the uptodate bits on error 567 * - set the uptodate bits if things worked 568 * - set the folio up to date if all extents in the tree are uptodate 569 * - clear the lock bit in the extent tree 570 * - unlock the folio if there are no other extents locked for it 571 * 572 * Scheduling is not allowed, so the extent state tree is expected 573 * to have one and only one object corresponding to this IO. 574 */ 575 static void end_bbio_data_read(struct btrfs_bio *bbio) 576 { 577 struct btrfs_fs_info *fs_info = bbio->fs_info; 578 struct bio *bio = &bbio->bio; 579 struct folio_iter fi; 580 581 ASSERT(!bio_flagged(bio, BIO_CLONED)); 582 bio_for_each_folio_all(fi, &bbio->bio) { 583 bool uptodate = !bio->bi_status; 584 struct folio *folio = fi.folio; 585 struct inode *inode = folio->mapping->host; 586 u64 start = folio_pos(folio) + fi.offset; 587 588 btrfs_debug(fs_info, 589 "%s: bi_sector=%llu, err=%d, mirror=%u", 590 __func__, bio->bi_iter.bi_sector, bio->bi_status, 591 bbio->mirror_num); 592 593 594 if (likely(uptodate)) { 595 u64 end = start + fi.length - 1; 596 loff_t i_size = i_size_read(inode); 597 598 /* 599 * Zero out the remaining part if this range straddles 600 * i_size. 601 * 602 * Here we should only zero the range inside the folio, 603 * not touch anything else. 604 * 605 * NOTE: i_size is exclusive while end is inclusive and 606 * folio_contains() takes PAGE_SIZE units. 607 */ 608 if (folio_contains(folio, i_size >> PAGE_SHIFT) && 609 i_size <= end) { 610 u32 zero_start = max(offset_in_folio(folio, i_size), 611 offset_in_folio(folio, start)); 612 u32 zero_len = offset_in_folio(folio, end) + 1 - 613 zero_start; 614 615 folio_zero_range(folio, zero_start, zero_len); 616 } 617 } 618 619 /* Update page status and unlock. */ 620 end_folio_read(folio, uptodate, start, fi.length); 621 } 622 bio_put(bio); 623 } 624 625 /* 626 * Populate every free slot in a provided array with folios using GFP_NOFS. 627 * 628 * @nr_folios: number of folios to allocate 629 * @order: the order of the folios to be allocated 630 * @folio_array: the array to fill with folios; any existing non-NULL entries in 631 * the array will be skipped 632 * 633 * Return: 0 if all folios were able to be allocated; 634 * -ENOMEM otherwise, the partially allocated folios would be freed and 635 * the array slots zeroed 636 */ 637 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, 638 struct folio **folio_array) 639 { 640 for (int i = 0; i < nr_folios; i++) { 641 if (folio_array[i]) 642 continue; 643 folio_array[i] = folio_alloc(GFP_NOFS, order); 644 if (!folio_array[i]) 645 goto error; 646 } 647 return 0; 648 error: 649 for (int i = 0; i < nr_folios; i++) { 650 if (folio_array[i]) 651 folio_put(folio_array[i]); 652 folio_array[i] = NULL; 653 } 654 return -ENOMEM; 655 } 656 657 /* 658 * Populate every free slot in a provided array with pages, using GFP_NOFS. 659 * 660 * @nr_pages: number of pages to allocate 661 * @page_array: the array to fill with pages; any existing non-null entries in 662 * the array will be skipped 663 * @nofail: whether using __GFP_NOFAIL flag 664 * 665 * Return: 0 if all pages were able to be allocated; 666 * -ENOMEM otherwise, the partially allocated pages would be freed and 667 * the array slots zeroed 668 */ 669 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, 670 bool nofail) 671 { 672 const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; 673 unsigned int allocated; 674 675 for (allocated = 0; allocated < nr_pages;) { 676 unsigned int last = allocated; 677 678 allocated = alloc_pages_bulk(gfp, nr_pages, page_array); 679 if (unlikely(allocated == last)) { 680 /* No progress, fail and do cleanup. */ 681 for (int i = 0; i < allocated; i++) { 682 __free_page(page_array[i]); 683 page_array[i] = NULL; 684 } 685 return -ENOMEM; 686 } 687 } 688 return 0; 689 } 690 691 /* 692 * Populate needed folios for the extent buffer. 693 * 694 * For now, the folios populated are always in order 0 (aka, single page). 695 */ 696 static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) 697 { 698 struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; 699 int num_pages = num_extent_pages(eb); 700 int ret; 701 702 ret = btrfs_alloc_page_array(num_pages, page_array, nofail); 703 if (ret < 0) 704 return ret; 705 706 for (int i = 0; i < num_pages; i++) 707 eb->folios[i] = page_folio(page_array[i]); 708 eb->folio_size = PAGE_SIZE; 709 eb->folio_shift = PAGE_SHIFT; 710 return 0; 711 } 712 713 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, 714 u64 disk_bytenr, loff_t file_offset) 715 { 716 struct bio *bio = &bio_ctrl->bbio->bio; 717 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 718 719 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 720 /* 721 * For compression, all IO should have its logical bytenr set 722 * to the starting bytenr of the compressed extent. 723 */ 724 return bio->bi_iter.bi_sector == sector; 725 } 726 727 /* 728 * To merge into a bio both the disk sector and the logical offset in 729 * the file need to be contiguous. 730 */ 731 return bio_ctrl->next_file_offset == file_offset && 732 bio_end_sector(bio) == sector; 733 } 734 735 static void alloc_new_bio(struct btrfs_inode *inode, 736 struct btrfs_bio_ctrl *bio_ctrl, 737 u64 disk_bytenr, u64 file_offset) 738 { 739 struct btrfs_fs_info *fs_info = inode->root->fs_info; 740 struct btrfs_bio *bbio; 741 742 bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, 743 bio_ctrl->end_io_func, NULL); 744 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 745 bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; 746 bbio->inode = inode; 747 bbio->file_offset = file_offset; 748 bio_ctrl->bbio = bbio; 749 bio_ctrl->len_to_oe_boundary = U32_MAX; 750 bio_ctrl->next_file_offset = file_offset; 751 752 /* Limit data write bios to the ordered boundary. */ 753 if (bio_ctrl->wbc) { 754 struct btrfs_ordered_extent *ordered; 755 756 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 757 if (ordered) { 758 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 759 ordered->file_offset + 760 ordered->disk_num_bytes - file_offset); 761 bbio->ordered = ordered; 762 } 763 764 /* 765 * Pick the last added device to support cgroup writeback. For 766 * multi-device file systems this means blk-cgroup policies have 767 * to always be set on the last added/replaced device. 768 * This is a bit odd but has been like that for a long time. 769 */ 770 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 771 wbc_init_bio(bio_ctrl->wbc, &bbio->bio); 772 } 773 } 774 775 /* 776 * @disk_bytenr: logical bytenr where the write will be 777 * @page: page to add to the bio 778 * @size: portion of page that we want to write to 779 * @pg_offset: offset of the new bio or to check whether we are adding 780 * a contiguous page to the previous one 781 * @read_em_generation: generation of the extent_map we are submitting 782 * (only used for read) 783 * 784 * The will either add the page into the existing @bio_ctrl->bbio, or allocate a 785 * new one in @bio_ctrl->bbio. 786 * The mirror number for this IO should already be initialized in 787 * @bio_ctrl->mirror_num. 788 */ 789 static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, 790 u64 disk_bytenr, struct folio *folio, 791 size_t size, unsigned long pg_offset, 792 u64 read_em_generation) 793 { 794 struct btrfs_inode *inode = folio_to_inode(folio); 795 loff_t file_offset = folio_pos(folio) + pg_offset; 796 797 ASSERT(pg_offset + size <= folio_size(folio)); 798 ASSERT(bio_ctrl->end_io_func); 799 800 if (bio_ctrl->bbio && 801 !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) 802 submit_one_bio(bio_ctrl); 803 804 do { 805 u32 len = size; 806 807 /* Allocate new bio if needed */ 808 if (!bio_ctrl->bbio) 809 alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); 810 811 /* Cap to the current ordered extent boundary if there is one. */ 812 if (len > bio_ctrl->len_to_oe_boundary) { 813 ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); 814 ASSERT(is_data_inode(inode)); 815 len = bio_ctrl->len_to_oe_boundary; 816 } 817 818 if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { 819 /* bio full: move on to a new one */ 820 submit_one_bio(bio_ctrl); 821 continue; 822 } 823 /* 824 * Now that the folio is definitely added to the bio, include its 825 * generation in the max generation calculation. 826 */ 827 bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); 828 bio_ctrl->next_file_offset += len; 829 830 if (bio_ctrl->wbc) 831 wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); 832 833 size -= len; 834 pg_offset += len; 835 disk_bytenr += len; 836 file_offset += len; 837 838 /* 839 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or 840 * sector aligned. alloc_new_bio() then sets it to the end of 841 * our ordered extent for writes into zoned devices. 842 * 843 * When len_to_oe_boundary is tracking an ordered extent, we 844 * trust the ordered extent code to align things properly, and 845 * the check above to cap our write to the ordered extent 846 * boundary is correct. 847 * 848 * When len_to_oe_boundary is U32_MAX, the cap above would 849 * result in a 4095 byte IO for the last folio right before 850 * we hit the bio limit of UINT_MAX. bio_add_folio() has all 851 * the checks required to make sure we don't overflow the bio, 852 * and we should just ignore len_to_oe_boundary completely 853 * unless we're using it to track an ordered extent. 854 * 855 * It's pretty hard to make a bio sized U32_MAX, but it can 856 * happen when the page cache is able to feed us contiguous 857 * folios for large extents. 858 */ 859 if (bio_ctrl->len_to_oe_boundary != U32_MAX) 860 bio_ctrl->len_to_oe_boundary -= len; 861 862 /* Ordered extent boundary: move on to a new bio. */ 863 if (bio_ctrl->len_to_oe_boundary == 0) 864 submit_one_bio(bio_ctrl); 865 } while (size); 866 } 867 868 static int attach_extent_buffer_folio(struct extent_buffer *eb, 869 struct folio *folio, 870 struct btrfs_folio_state *prealloc) 871 { 872 struct btrfs_fs_info *fs_info = eb->fs_info; 873 int ret = 0; 874 875 /* 876 * If the page is mapped to btree inode, we should hold the private 877 * lock to prevent race. 878 * For cloned or dummy extent buffers, their pages are not mapped and 879 * will not race with any other ebs. 880 */ 881 if (folio->mapping) 882 lockdep_assert_held(&folio->mapping->i_private_lock); 883 884 if (!btrfs_meta_is_subpage(fs_info)) { 885 if (!folio_test_private(folio)) 886 folio_attach_private(folio, eb); 887 else 888 WARN_ON(folio_get_private(folio) != eb); 889 return 0; 890 } 891 892 /* Already mapped, just free prealloc */ 893 if (folio_test_private(folio)) { 894 btrfs_free_folio_state(prealloc); 895 return 0; 896 } 897 898 if (prealloc) 899 /* Has preallocated memory for subpage */ 900 folio_attach_private(folio, prealloc); 901 else 902 /* Do new allocation to attach subpage */ 903 ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 904 return ret; 905 } 906 907 int set_folio_extent_mapped(struct folio *folio) 908 { 909 struct btrfs_fs_info *fs_info; 910 911 ASSERT(folio->mapping); 912 913 if (folio_test_private(folio)) 914 return 0; 915 916 fs_info = folio_to_fs_info(folio); 917 918 if (btrfs_is_subpage(fs_info, folio)) 919 return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 920 921 folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); 922 return 0; 923 } 924 925 void clear_folio_extent_mapped(struct folio *folio) 926 { 927 struct btrfs_fs_info *fs_info; 928 929 ASSERT(folio->mapping); 930 931 if (!folio_test_private(folio)) 932 return; 933 934 fs_info = folio_to_fs_info(folio); 935 if (btrfs_is_subpage(fs_info, folio)) 936 return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 937 938 folio_detach_private(folio); 939 } 940 941 static struct extent_map *get_extent_map(struct btrfs_inode *inode, 942 struct folio *folio, u64 start, 943 u64 len, struct extent_map **em_cached) 944 { 945 struct extent_map *em; 946 947 ASSERT(em_cached); 948 949 if (*em_cached) { 950 em = *em_cached; 951 if (btrfs_extent_map_in_tree(em) && start >= em->start && 952 start < btrfs_extent_map_end(em)) { 953 refcount_inc(&em->refs); 954 return em; 955 } 956 957 btrfs_free_extent_map(em); 958 *em_cached = NULL; 959 } 960 961 em = btrfs_get_extent(inode, folio, start, len); 962 if (!IS_ERR(em)) { 963 BUG_ON(*em_cached); 964 refcount_inc(&em->refs); 965 *em_cached = em; 966 } 967 968 return em; 969 } 970 971 static void btrfs_readahead_expand(struct readahead_control *ractl, 972 const struct extent_map *em) 973 { 974 const u64 ra_pos = readahead_pos(ractl); 975 const u64 ra_end = ra_pos + readahead_length(ractl); 976 const u64 em_end = em->start + em->ram_bytes; 977 978 /* No expansion for holes and inline extents. */ 979 if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) 980 return; 981 982 ASSERT(em_end >= ra_pos, 983 "extent_map %llu %llu ends before current readahead position %llu", 984 em->start, em->len, ra_pos); 985 if (em_end > ra_end) 986 readahead_expand(ractl, ra_pos, em_end - ra_pos); 987 } 988 989 /* 990 * basic readpage implementation. Locked extent state structs are inserted 991 * into the tree that are removed when the IO is done (by the end_io 992 * handlers) 993 * XXX JDM: This needs looking at to ensure proper page locking 994 * return 0 on success, otherwise return error 995 */ 996 static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, 997 struct btrfs_bio_ctrl *bio_ctrl) 998 { 999 struct inode *inode = folio->mapping->host; 1000 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1001 u64 start = folio_pos(folio); 1002 const u64 end = start + folio_size(folio) - 1; 1003 u64 extent_offset; 1004 u64 last_byte = i_size_read(inode); 1005 struct extent_map *em; 1006 int ret = 0; 1007 const size_t blocksize = fs_info->sectorsize; 1008 1009 ret = set_folio_extent_mapped(folio); 1010 if (ret < 0) { 1011 folio_unlock(folio); 1012 return ret; 1013 } 1014 1015 if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { 1016 size_t zero_offset = offset_in_folio(folio, last_byte); 1017 1018 if (zero_offset) 1019 folio_zero_range(folio, zero_offset, 1020 folio_size(folio) - zero_offset); 1021 } 1022 bio_ctrl->end_io_func = end_bbio_data_read; 1023 begin_folio_read(fs_info, folio); 1024 for (u64 cur = start; cur <= end; cur += blocksize) { 1025 enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; 1026 unsigned long pg_offset = offset_in_folio(folio, cur); 1027 bool force_bio_submit = false; 1028 u64 disk_bytenr; 1029 u64 block_start; 1030 u64 em_gen; 1031 1032 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 1033 if (cur >= last_byte) { 1034 folio_zero_range(folio, pg_offset, end - cur + 1); 1035 end_folio_read(folio, true, cur, end - cur + 1); 1036 break; 1037 } 1038 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1039 end_folio_read(folio, true, cur, blocksize); 1040 continue; 1041 } 1042 em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); 1043 if (IS_ERR(em)) { 1044 end_folio_read(folio, false, cur, end + 1 - cur); 1045 return PTR_ERR(em); 1046 } 1047 extent_offset = cur - em->start; 1048 BUG_ON(btrfs_extent_map_end(em) <= cur); 1049 BUG_ON(end < cur); 1050 1051 compress_type = btrfs_extent_map_compression(em); 1052 1053 /* 1054 * Only expand readahead for extents which are already creating 1055 * the pages anyway in add_ra_bio_pages, which is compressed 1056 * extents in the non subpage case. 1057 */ 1058 if (bio_ctrl->ractl && 1059 !btrfs_is_subpage(fs_info, folio) && 1060 compress_type != BTRFS_COMPRESS_NONE) 1061 btrfs_readahead_expand(bio_ctrl->ractl, em); 1062 1063 if (compress_type != BTRFS_COMPRESS_NONE) 1064 disk_bytenr = em->disk_bytenr; 1065 else 1066 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1067 1068 if (em->flags & EXTENT_FLAG_PREALLOC) 1069 block_start = EXTENT_MAP_HOLE; 1070 else 1071 block_start = btrfs_extent_map_block_start(em); 1072 1073 /* 1074 * If we have a file range that points to a compressed extent 1075 * and it's followed by a consecutive file range that points 1076 * to the same compressed extent (possibly with a different 1077 * offset and/or length, so it either points to the whole extent 1078 * or only part of it), we must make sure we do not submit a 1079 * single bio to populate the folios for the 2 ranges because 1080 * this makes the compressed extent read zero out the folios 1081 * belonging to the 2nd range. Imagine the following scenario: 1082 * 1083 * File layout 1084 * [0 - 8K] [8K - 24K] 1085 * | | 1086 * | | 1087 * points to extent X, points to extent X, 1088 * offset 4K, length of 8K offset 0, length 16K 1089 * 1090 * [extent X, compressed length = 4K uncompressed length = 16K] 1091 * 1092 * If the bio to read the compressed extent covers both ranges, 1093 * it will decompress extent X into the folios belonging to the 1094 * first range and then it will stop, zeroing out the remaining 1095 * folios that belong to the other range that points to extent X. 1096 * So here we make sure we submit 2 bios, one for the first 1097 * range and another one for the third range. Both will target 1098 * the same physical extent from disk, but we can't currently 1099 * make the compressed bio endio callback populate the folios 1100 * for both ranges because each compressed bio is tightly 1101 * coupled with a single extent map, and each range can have 1102 * an extent map with a different offset value relative to the 1103 * uncompressed data of our extent and different lengths. This 1104 * is a corner case so we prioritize correctness over 1105 * non-optimal behavior (submitting 2 bios for the same extent). 1106 */ 1107 if (compress_type != BTRFS_COMPRESS_NONE && 1108 bio_ctrl->last_em_start != U64_MAX && 1109 bio_ctrl->last_em_start != em->start) 1110 force_bio_submit = true; 1111 1112 bio_ctrl->last_em_start = em->start; 1113 1114 em_gen = em->generation; 1115 btrfs_free_extent_map(em); 1116 em = NULL; 1117 1118 /* we've found a hole, just zero and go on */ 1119 if (block_start == EXTENT_MAP_HOLE) { 1120 folio_zero_range(folio, pg_offset, blocksize); 1121 end_folio_read(folio, true, cur, blocksize); 1122 continue; 1123 } 1124 /* the get_extent function already copied into the folio */ 1125 if (block_start == EXTENT_MAP_INLINE) { 1126 end_folio_read(folio, true, cur, blocksize); 1127 continue; 1128 } 1129 1130 if (bio_ctrl->compress_type != compress_type) { 1131 submit_one_bio(bio_ctrl); 1132 bio_ctrl->compress_type = compress_type; 1133 } 1134 1135 if (force_bio_submit) 1136 submit_one_bio(bio_ctrl); 1137 submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, 1138 pg_offset, em_gen); 1139 } 1140 return 0; 1141 } 1142 1143 /* 1144 * Check if we can skip waiting the @ordered extent covering the block at @fileoff. 1145 * 1146 * @fileoff: Both input and output. 1147 * Input as the file offset where the check should start at. 1148 * Output as where the next check should start at, 1149 * if the function returns true. 1150 * 1151 * Return true if we can skip to @fileoff. The caller needs to check the new 1152 * @fileoff value to make sure it covers the full range, before skipping the 1153 * full OE. 1154 * 1155 * Return false if we must wait for the ordered extent. 1156 */ 1157 static bool can_skip_one_ordered_range(struct btrfs_inode *inode, 1158 struct btrfs_ordered_extent *ordered, 1159 u64 *fileoff) 1160 { 1161 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 1162 struct folio *folio; 1163 const u32 blocksize = fs_info->sectorsize; 1164 u64 cur = *fileoff; 1165 bool ret; 1166 1167 folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); 1168 1169 /* 1170 * We should have locked the folio(s) for range [start, end], thus 1171 * there must be a folio and it must be locked. 1172 */ 1173 ASSERT(!IS_ERR(folio)); 1174 ASSERT(folio_test_locked(folio)); 1175 1176 /* 1177 * There are several cases for the folio and OE combination: 1178 * 1179 * 1) Folio has no private flag 1180 * The OE has all its IO done but not yet finished, and folio got 1181 * invalidated. 1182 * 1183 * Have we have to wait for the OE to finish, as it may contain the 1184 * to-be-inserted data checksum. 1185 * Without the data checksum inserted into the csum tree, read will 1186 * just fail with missing csum. 1187 */ 1188 if (!folio_test_private(folio)) { 1189 ret = false; 1190 goto out; 1191 } 1192 1193 /* 1194 * 2) The first block is DIRTY. 1195 * 1196 * This means the OE is created by some other folios whose file pos is 1197 * before this one. And since we are holding the folio lock, the writeback 1198 * of this folio cannot start. 1199 * 1200 * We must skip the whole OE, because it will never start until we 1201 * finished our folio read and unlocked the folio. 1202 */ 1203 if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { 1204 u64 range_len = min(folio_end(folio), 1205 ordered->file_offset + ordered->num_bytes) - cur; 1206 1207 ret = true; 1208 /* 1209 * At least inside the folio, all the remaining blocks should 1210 * also be dirty. 1211 */ 1212 ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); 1213 *fileoff = ordered->file_offset + ordered->num_bytes; 1214 goto out; 1215 } 1216 1217 /* 1218 * 3) The first block is uptodate. 1219 * 1220 * At least the first block can be skipped, but we are still not fully 1221 * sure. E.g. if the OE has some other folios in the range that cannot 1222 * be skipped. 1223 * So we return true and update @next_ret to the OE/folio boundary. 1224 */ 1225 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1226 u64 range_len = min(folio_end(folio), 1227 ordered->file_offset + ordered->num_bytes) - cur; 1228 1229 /* 1230 * The whole range to the OE end or folio boundary should also 1231 * be uptodate. 1232 */ 1233 ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); 1234 ret = true; 1235 *fileoff = cur + range_len; 1236 goto out; 1237 } 1238 1239 /* 1240 * 4) The first block is not uptodate. 1241 * 1242 * This means the folio is invalidated after the writeback was finished, 1243 * but by some other operations (e.g. block aligned buffered write) the 1244 * folio is inserted into filemap. 1245 * Very much the same as case 1). 1246 */ 1247 ret = false; 1248 out: 1249 folio_put(folio); 1250 return ret; 1251 } 1252 1253 static bool can_skip_ordered_extent(struct btrfs_inode *inode, 1254 struct btrfs_ordered_extent *ordered, 1255 u64 start, u64 end) 1256 { 1257 const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); 1258 u64 cur = max(start, ordered->file_offset); 1259 1260 while (cur < range_end) { 1261 bool can_skip; 1262 1263 can_skip = can_skip_one_ordered_range(inode, ordered, &cur); 1264 if (!can_skip) 1265 return false; 1266 } 1267 return true; 1268 } 1269 1270 /* 1271 * Locking helper to make sure we get a stable view of extent maps for the 1272 * involved range. 1273 * 1274 * This is for folio read paths (read and readahead), thus the involved range 1275 * should have all the folios locked. 1276 */ 1277 static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, 1278 struct extent_state **cached_state) 1279 { 1280 u64 cur_pos; 1281 1282 /* Caller must provide a valid @cached_state. */ 1283 ASSERT(cached_state); 1284 1285 /* The range must at least be page aligned, as all read paths are folio based. */ 1286 ASSERT(IS_ALIGNED(start, PAGE_SIZE)); 1287 ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); 1288 1289 again: 1290 btrfs_lock_extent(&inode->io_tree, start, end, cached_state); 1291 cur_pos = start; 1292 while (cur_pos < end) { 1293 struct btrfs_ordered_extent *ordered; 1294 1295 ordered = btrfs_lookup_ordered_range(inode, cur_pos, 1296 end - cur_pos + 1); 1297 /* 1298 * No ordered extents in the range, and we hold the extent lock, 1299 * no one can modify the extent maps in the range, we're safe to return. 1300 */ 1301 if (!ordered) 1302 break; 1303 1304 /* Check if we can skip waiting for the whole OE. */ 1305 if (can_skip_ordered_extent(inode, ordered, start, end)) { 1306 cur_pos = min(ordered->file_offset + ordered->num_bytes, 1307 end + 1); 1308 btrfs_put_ordered_extent(ordered); 1309 continue; 1310 } 1311 1312 /* Now wait for the OE to finish. */ 1313 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1314 btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); 1315 btrfs_put_ordered_extent(ordered); 1316 /* We have unlocked the whole range, restart from the beginning. */ 1317 goto again; 1318 } 1319 } 1320 1321 int btrfs_read_folio(struct file *file, struct folio *folio) 1322 { 1323 struct btrfs_inode *inode = folio_to_inode(folio); 1324 const u64 start = folio_pos(folio); 1325 const u64 end = start + folio_size(folio) - 1; 1326 struct extent_state *cached_state = NULL; 1327 struct btrfs_bio_ctrl bio_ctrl = { 1328 .opf = REQ_OP_READ, 1329 .last_em_start = U64_MAX, 1330 }; 1331 struct extent_map *em_cached = NULL; 1332 int ret; 1333 1334 lock_extents_for_read(inode, start, end, &cached_state); 1335 ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 1336 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 1337 1338 btrfs_free_extent_map(em_cached); 1339 1340 /* 1341 * If btrfs_do_readpage() failed we will want to submit the assembled 1342 * bio to do the cleanup. 1343 */ 1344 submit_one_bio(&bio_ctrl); 1345 return ret; 1346 } 1347 1348 static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, 1349 u64 start, u32 len) 1350 { 1351 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1352 const u64 folio_start = folio_pos(folio); 1353 unsigned int start_bit; 1354 unsigned int nbits; 1355 1356 ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); 1357 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1358 nbits = len >> fs_info->sectorsize_bits; 1359 ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); 1360 bitmap_set(delalloc_bitmap, start_bit, nbits); 1361 } 1362 1363 static bool find_next_delalloc_bitmap(struct folio *folio, 1364 unsigned long *delalloc_bitmap, u64 start, 1365 u64 *found_start, u32 *found_len) 1366 { 1367 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1368 const u64 folio_start = folio_pos(folio); 1369 const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); 1370 unsigned int start_bit; 1371 unsigned int first_zero; 1372 unsigned int first_set; 1373 1374 ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); 1375 1376 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1377 first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); 1378 if (first_set >= bitmap_size) 1379 return false; 1380 1381 *found_start = folio_start + (first_set << fs_info->sectorsize_bits); 1382 first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); 1383 *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; 1384 return true; 1385 } 1386 1387 /* 1388 * Do all of the delayed allocation setup. 1389 * 1390 * Return >0 if all the dirty blocks are submitted async (compression) or inlined. 1391 * The @folio should no longer be touched (treat it as already unlocked). 1392 * 1393 * Return 0 if there is still dirty block that needs to be submitted through 1394 * extent_writepage_io(). 1395 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be 1396 * submitted, and @folio is still kept locked. 1397 * 1398 * Return <0 if there is any error hit. 1399 * Any allocated ordered extent range covering this folio will be marked 1400 * finished (IOERR), and @folio is still kept locked. 1401 */ 1402 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 1403 struct folio *folio, 1404 struct btrfs_bio_ctrl *bio_ctrl) 1405 { 1406 struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); 1407 struct writeback_control *wbc = bio_ctrl->wbc; 1408 const bool is_subpage = btrfs_is_subpage(fs_info, folio); 1409 const u64 page_start = folio_pos(folio); 1410 const u64 page_end = page_start + folio_size(folio) - 1; 1411 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1412 unsigned long delalloc_bitmap = 0; 1413 /* 1414 * Save the last found delalloc end. As the delalloc end can go beyond 1415 * page boundary, thus we cannot rely on subpage bitmap to locate the 1416 * last delalloc end. 1417 */ 1418 u64 last_delalloc_end = 0; 1419 /* 1420 * The range end (exclusive) of the last successfully finished delalloc 1421 * range. 1422 * Any range covered by ordered extent must either be manually marked 1423 * finished (error handling), or has IO submitted (and finish the 1424 * ordered extent normally). 1425 * 1426 * This records the end of ordered extent cleanup if we hit an error. 1427 */ 1428 u64 last_finished_delalloc_end = page_start; 1429 u64 delalloc_start = page_start; 1430 u64 delalloc_end = page_end; 1431 u64 delalloc_to_write = 0; 1432 int ret = 0; 1433 int bit; 1434 1435 /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ 1436 if (btrfs_is_subpage(fs_info, folio)) { 1437 ASSERT(blocks_per_folio > 1); 1438 btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); 1439 } else { 1440 bio_ctrl->submit_bitmap = 1; 1441 } 1442 1443 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1444 u64 start = page_start + (bit << fs_info->sectorsize_bits); 1445 1446 btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); 1447 } 1448 1449 /* Lock all (subpage) delalloc ranges inside the folio first. */ 1450 while (delalloc_start < page_end) { 1451 delalloc_end = page_end; 1452 if (!find_lock_delalloc_range(&inode->vfs_inode, folio, 1453 &delalloc_start, &delalloc_end)) { 1454 delalloc_start = delalloc_end + 1; 1455 continue; 1456 } 1457 set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, 1458 min(delalloc_end, page_end) + 1 - delalloc_start); 1459 last_delalloc_end = delalloc_end; 1460 delalloc_start = delalloc_end + 1; 1461 } 1462 delalloc_start = page_start; 1463 1464 if (!last_delalloc_end) 1465 goto out; 1466 1467 /* Run the delalloc ranges for the above locked ranges. */ 1468 while (delalloc_start < page_end) { 1469 u64 found_start; 1470 u32 found_len; 1471 bool found; 1472 1473 if (!is_subpage) { 1474 /* 1475 * For non-subpage case, the found delalloc range must 1476 * cover this folio and there must be only one locked 1477 * delalloc range. 1478 */ 1479 found_start = page_start; 1480 found_len = last_delalloc_end + 1 - found_start; 1481 found = true; 1482 } else { 1483 found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, 1484 delalloc_start, &found_start, &found_len); 1485 } 1486 if (!found) 1487 break; 1488 /* 1489 * The subpage range covers the last sector, the delalloc range may 1490 * end beyond the folio boundary, use the saved delalloc_end 1491 * instead. 1492 */ 1493 if (found_start + found_len >= page_end) 1494 found_len = last_delalloc_end + 1 - found_start; 1495 1496 if (ret >= 0) { 1497 /* 1498 * Some delalloc range may be created by previous folios. 1499 * Thus we still need to clean up this range during error 1500 * handling. 1501 */ 1502 last_finished_delalloc_end = found_start; 1503 /* No errors hit so far, run the current delalloc range. */ 1504 ret = btrfs_run_delalloc_range(inode, folio, 1505 found_start, 1506 found_start + found_len - 1, 1507 wbc); 1508 if (ret >= 0) 1509 last_finished_delalloc_end = found_start + found_len; 1510 if (unlikely(ret < 0)) 1511 btrfs_err_rl(fs_info, 1512 "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", 1513 btrfs_root_id(inode->root), 1514 btrfs_ino(inode), 1515 folio_pos(folio), 1516 blocks_per_folio, 1517 &bio_ctrl->submit_bitmap, 1518 found_start, found_len, ret); 1519 } else { 1520 /* 1521 * We've hit an error during previous delalloc range, 1522 * have to cleanup the remaining locked ranges. 1523 */ 1524 btrfs_unlock_extent(&inode->io_tree, found_start, 1525 found_start + found_len - 1, NULL); 1526 unlock_delalloc_folio(&inode->vfs_inode, folio, 1527 found_start, 1528 found_start + found_len - 1); 1529 } 1530 1531 /* 1532 * We have some ranges that's going to be submitted asynchronously 1533 * (compression or inline). These range have their own control 1534 * on when to unlock the pages. We should not touch them 1535 * anymore, so clear the range from the submission bitmap. 1536 */ 1537 if (ret > 0) { 1538 unsigned int start_bit = (found_start - page_start) >> 1539 fs_info->sectorsize_bits; 1540 unsigned int end_bit = (min(page_end + 1, found_start + found_len) - 1541 page_start) >> fs_info->sectorsize_bits; 1542 bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); 1543 } 1544 /* 1545 * Above btrfs_run_delalloc_range() may have unlocked the folio, 1546 * thus for the last range, we cannot touch the folio anymore. 1547 */ 1548 if (found_start + found_len >= last_delalloc_end + 1) 1549 break; 1550 1551 delalloc_start = found_start + found_len; 1552 } 1553 /* 1554 * It's possible we had some ordered extents created before we hit 1555 * an error, cleanup non-async successfully created delalloc ranges. 1556 */ 1557 if (unlikely(ret < 0)) { 1558 unsigned int bitmap_size = min( 1559 (last_finished_delalloc_end - page_start) >> 1560 fs_info->sectorsize_bits, 1561 blocks_per_folio); 1562 1563 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) 1564 btrfs_mark_ordered_io_finished(inode, folio, 1565 page_start + (bit << fs_info->sectorsize_bits), 1566 fs_info->sectorsize, false); 1567 return ret; 1568 } 1569 out: 1570 if (last_delalloc_end) 1571 delalloc_end = last_delalloc_end; 1572 else 1573 delalloc_end = page_end; 1574 /* 1575 * delalloc_end is already one less than the total length, so 1576 * we don't subtract one from PAGE_SIZE. 1577 */ 1578 delalloc_to_write += 1579 DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); 1580 1581 /* 1582 * If all ranges are submitted asynchronously, we just need to account 1583 * for them here. 1584 */ 1585 if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { 1586 wbc->nr_to_write -= delalloc_to_write; 1587 return 1; 1588 } 1589 1590 if (wbc->nr_to_write < delalloc_to_write) { 1591 int thresh = 8192; 1592 1593 if (delalloc_to_write < thresh * 2) 1594 thresh = delalloc_to_write; 1595 wbc->nr_to_write = min_t(u64, delalloc_to_write, 1596 thresh); 1597 } 1598 1599 return 0; 1600 } 1601 1602 /* 1603 * Return 0 if we have submitted or queued the sector for submission. 1604 * Return <0 for critical errors, and the sector will have its dirty flag cleared. 1605 * 1606 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1607 */ 1608 static int submit_one_sector(struct btrfs_inode *inode, 1609 struct folio *folio, 1610 u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, 1611 loff_t i_size) 1612 { 1613 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1614 struct extent_map *em; 1615 u64 block_start; 1616 u64 disk_bytenr; 1617 u64 extent_offset; 1618 u64 em_end; 1619 const u32 sectorsize = fs_info->sectorsize; 1620 1621 ASSERT(IS_ALIGNED(filepos, sectorsize)); 1622 1623 /* @filepos >= i_size case should be handled by the caller. */ 1624 ASSERT(filepos < i_size); 1625 1626 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1627 if (IS_ERR(em)) { 1628 /* 1629 * When submission failed, we should still clear the folio dirty. 1630 * Or the folio will be written back again but without any 1631 * ordered extent. 1632 */ 1633 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1634 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1635 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1636 return PTR_ERR(em); 1637 } 1638 1639 extent_offset = filepos - em->start; 1640 em_end = btrfs_extent_map_end(em); 1641 ASSERT(filepos <= em_end); 1642 ASSERT(IS_ALIGNED(em->start, sectorsize)); 1643 ASSERT(IS_ALIGNED(em->len, sectorsize)); 1644 1645 block_start = btrfs_extent_map_block_start(em); 1646 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1647 1648 ASSERT(!btrfs_extent_map_is_compressed(em)); 1649 ASSERT(block_start != EXTENT_MAP_HOLE); 1650 ASSERT(block_start != EXTENT_MAP_INLINE); 1651 1652 btrfs_free_extent_map(em); 1653 em = NULL; 1654 1655 /* 1656 * Although the PageDirty bit is cleared before entering this 1657 * function, subpage dirty bit is not cleared. 1658 * So clear subpage dirty bit here so next time we won't submit 1659 * a folio for a range already written to disk. 1660 */ 1661 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1662 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1663 /* 1664 * Above call should set the whole folio with writeback flag, even 1665 * just for a single subpage sector. 1666 * As long as the folio is properly locked and the range is correct, 1667 * we should always get the folio with writeback flag. 1668 */ 1669 ASSERT(folio_test_writeback(folio)); 1670 1671 submit_extent_folio(bio_ctrl, disk_bytenr, folio, 1672 sectorsize, filepos - folio_pos(folio), 0); 1673 return 0; 1674 } 1675 1676 /* 1677 * Helper for extent_writepage(). This calls the writepage start hooks, 1678 * and does the loop to map the page into extents and bios. 1679 * 1680 * We return 1 if the IO is started and the page is unlocked, 1681 * 0 if all went well (page still locked) 1682 * < 0 if there were errors (page still locked) 1683 */ 1684 static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, 1685 struct folio *folio, 1686 u64 start, u32 len, 1687 struct btrfs_bio_ctrl *bio_ctrl, 1688 loff_t i_size) 1689 { 1690 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1691 unsigned long range_bitmap = 0; 1692 bool submitted_io = false; 1693 int found_error = 0; 1694 const u64 folio_start = folio_pos(folio); 1695 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1696 u64 cur; 1697 int bit; 1698 int ret = 0; 1699 1700 ASSERT(start >= folio_start && 1701 start + len <= folio_start + folio_size(folio)); 1702 1703 ret = btrfs_writepage_cow_fixup(folio); 1704 if (ret == -EAGAIN) { 1705 /* Fixup worker will requeue */ 1706 folio_redirty_for_writepage(bio_ctrl->wbc, folio); 1707 folio_unlock(folio); 1708 return 1; 1709 } 1710 if (ret < 0) { 1711 btrfs_folio_clear_dirty(fs_info, folio, start, len); 1712 btrfs_folio_set_writeback(fs_info, folio, start, len); 1713 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1714 return ret; 1715 } 1716 1717 for (cur = start; cur < start + len; cur += fs_info->sectorsize) 1718 set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); 1719 bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, 1720 blocks_per_folio); 1721 1722 bio_ctrl->end_io_func = end_bbio_data_write; 1723 1724 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1725 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); 1726 1727 if (cur >= i_size) { 1728 btrfs_mark_ordered_io_finished(inode, folio, cur, 1729 start + len - cur, true); 1730 /* 1731 * This range is beyond i_size, thus we don't need to 1732 * bother writing back. 1733 * But we still need to clear the dirty subpage bit, or 1734 * the next time the folio gets dirtied, we will try to 1735 * writeback the sectors with subpage dirty bits, 1736 * causing writeback without ordered extent. 1737 */ 1738 btrfs_folio_clear_dirty(fs_info, folio, cur, 1739 start + len - cur); 1740 break; 1741 } 1742 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1743 if (unlikely(ret < 0)) { 1744 /* 1745 * bio_ctrl may contain a bio crossing several folios. 1746 * Submit it immediately so that the bio has a chance 1747 * to finish normally, other than marked as error. 1748 */ 1749 submit_one_bio(bio_ctrl); 1750 /* 1751 * Failed to grab the extent map which should be very rare. 1752 * Since there is no bio submitted to finish the ordered 1753 * extent, we have to manually finish this sector. 1754 */ 1755 btrfs_mark_ordered_io_finished(inode, folio, cur, 1756 fs_info->sectorsize, false); 1757 if (!found_error) 1758 found_error = ret; 1759 continue; 1760 } 1761 submitted_io = true; 1762 } 1763 1764 /* 1765 * If we didn't submitted any sector (>= i_size), folio dirty get 1766 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared 1767 * by folio_start_writeback() if the folio is not dirty). 1768 * 1769 * Here we set writeback and clear for the range. If the full folio 1770 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1771 * 1772 * If we hit any error, the corresponding sector will have its dirty 1773 * flag cleared and writeback finished, thus no need to handle the error case. 1774 */ 1775 if (!submitted_io && !found_error) { 1776 btrfs_folio_set_writeback(fs_info, folio, start, len); 1777 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1778 } 1779 return found_error; 1780 } 1781 1782 /* 1783 * the writepage semantics are similar to regular writepage. extent 1784 * records are inserted to lock ranges in the tree, and as dirty areas 1785 * are found, they are marked writeback. Then the lock bits are removed 1786 * and the end_io handler clears the writeback ranges 1787 * 1788 * Return 0 if everything goes well. 1789 * Return <0 for error. 1790 */ 1791 static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) 1792 { 1793 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 1794 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1795 int ret; 1796 size_t pg_offset; 1797 loff_t i_size = i_size_read(&inode->vfs_inode); 1798 const pgoff_t end_index = i_size >> PAGE_SHIFT; 1799 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1800 1801 trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); 1802 1803 WARN_ON(!folio_test_locked(folio)); 1804 1805 pg_offset = offset_in_folio(folio, i_size); 1806 if (folio->index > end_index || 1807 (folio->index == end_index && !pg_offset)) { 1808 folio_invalidate(folio, 0, folio_size(folio)); 1809 folio_unlock(folio); 1810 return 0; 1811 } 1812 1813 if (folio_contains(folio, end_index)) 1814 folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); 1815 1816 /* 1817 * Default to unlock the whole folio. 1818 * The proper bitmap can only be initialized until writepage_delalloc(). 1819 */ 1820 bio_ctrl->submit_bitmap = (unsigned long)-1; 1821 1822 /* 1823 * If the page is dirty but without private set, it's marked dirty 1824 * without informing the fs. 1825 * Nowadays that is a bug, since the introduction of 1826 * pin_user_pages*(). 1827 * 1828 * So here we check if the page has private set to rule out such 1829 * case. 1830 * But we also have a long history of relying on the COW fixup, 1831 * so here we only enable this check for experimental builds until 1832 * we're sure it's safe. 1833 */ 1834 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && 1835 unlikely(!folio_test_private(folio))) { 1836 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 1837 btrfs_err_rl(fs_info, 1838 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 1839 btrfs_root_id(inode->root), 1840 btrfs_ino(inode), folio_pos(folio)); 1841 ret = -EUCLEAN; 1842 goto done; 1843 } 1844 1845 ret = set_folio_extent_mapped(folio); 1846 if (ret < 0) 1847 goto done; 1848 1849 ret = writepage_delalloc(inode, folio, bio_ctrl); 1850 if (ret == 1) 1851 return 0; 1852 if (ret) 1853 goto done; 1854 1855 ret = extent_writepage_io(inode, folio, folio_pos(folio), 1856 folio_size(folio), bio_ctrl, i_size); 1857 if (ret == 1) 1858 return 0; 1859 if (ret < 0) 1860 btrfs_err_rl(fs_info, 1861 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", 1862 btrfs_root_id(inode->root), btrfs_ino(inode), 1863 folio_pos(folio), blocks_per_folio, 1864 &bio_ctrl->submit_bitmap, ret); 1865 1866 bio_ctrl->wbc->nr_to_write--; 1867 1868 done: 1869 if (ret < 0) 1870 mapping_set_error(folio->mapping, ret); 1871 /* 1872 * Only unlock ranges that are submitted. As there can be some async 1873 * submitted ranges inside the folio. 1874 */ 1875 btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); 1876 ASSERT(ret <= 0); 1877 return ret; 1878 } 1879 1880 /* 1881 * Lock extent buffer status and pages for writeback. 1882 * 1883 * Return %false if the extent buffer doesn't need to be submitted (e.g. the 1884 * extent buffer is not dirty) 1885 * Return %true is the extent buffer is submitted to bio. 1886 */ 1887 static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, 1888 struct writeback_control *wbc) 1889 { 1890 struct btrfs_fs_info *fs_info = eb->fs_info; 1891 bool ret = false; 1892 1893 btrfs_tree_lock(eb); 1894 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 1895 btrfs_tree_unlock(eb); 1896 if (wbc->sync_mode != WB_SYNC_ALL) 1897 return false; 1898 wait_on_extent_buffer_writeback(eb); 1899 btrfs_tree_lock(eb); 1900 } 1901 1902 /* 1903 * We need to do this to prevent races in people who check if the eb is 1904 * under IO since we can end up having no IO bits set for a short period 1905 * of time. 1906 */ 1907 spin_lock(&eb->refs_lock); 1908 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1909 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1910 unsigned long flags; 1911 1912 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 1913 spin_unlock(&eb->refs_lock); 1914 1915 xas_lock_irqsave(&xas, flags); 1916 xas_load(&xas); 1917 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 1918 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 1919 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 1920 xas_unlock_irqrestore(&xas, flags); 1921 1922 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 1923 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 1924 -eb->len, 1925 fs_info->dirty_metadata_batch); 1926 ret = true; 1927 } else { 1928 spin_unlock(&eb->refs_lock); 1929 } 1930 btrfs_tree_unlock(eb); 1931 return ret; 1932 } 1933 1934 static void set_btree_ioerr(struct extent_buffer *eb) 1935 { 1936 struct btrfs_fs_info *fs_info = eb->fs_info; 1937 1938 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 1939 1940 /* 1941 * A read may stumble upon this buffer later, make sure that it gets an 1942 * error and knows there was an error. 1943 */ 1944 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 1945 1946 /* 1947 * We need to set the mapping with the io error as well because a write 1948 * error will flip the file system readonly, and then syncfs() will 1949 * return a 0 because we are readonly if we don't modify the err seq for 1950 * the superblock. 1951 */ 1952 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); 1953 1954 /* 1955 * If writeback for a btree extent that doesn't belong to a log tree 1956 * failed, increment the counter transaction->eb_write_errors. 1957 * We do this because while the transaction is running and before it's 1958 * committing (when we call filemap_fdata[write|wait]_range against 1959 * the btree inode), we might have 1960 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 1961 * returns an error or an error happens during writeback, when we're 1962 * committing the transaction we wouldn't know about it, since the pages 1963 * can be no longer dirty nor marked anymore for writeback (if a 1964 * subsequent modification to the extent buffer didn't happen before the 1965 * transaction commit), which makes filemap_fdata[write|wait]_range not 1966 * able to find the pages which contain errors at transaction 1967 * commit time. So if this happens we must abort the transaction, 1968 * otherwise we commit a super block with btree roots that point to 1969 * btree nodes/leafs whose content on disk is invalid - either garbage 1970 * or the content of some node/leaf from a past generation that got 1971 * cowed or deleted and is no longer valid. 1972 * 1973 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 1974 * not be enough - we need to distinguish between log tree extents vs 1975 * non-log tree extents, and the next filemap_fdatawait_range() call 1976 * will catch and clear such errors in the mapping - and that call might 1977 * be from a log sync and not from a transaction commit. Also, checking 1978 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 1979 * not done and would not be reliable - the eb might have been released 1980 * from memory and reading it back again means that flag would not be 1981 * set (since it's a runtime flag, not persisted on disk). 1982 * 1983 * Using the flags below in the btree inode also makes us achieve the 1984 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 1985 * writeback for all dirty pages and before filemap_fdatawait_range() 1986 * is called, the writeback for all dirty pages had already finished 1987 * with errors - because we were not using AS_EIO/AS_ENOSPC, 1988 * filemap_fdatawait_range() would return success, as it could not know 1989 * that writeback errors happened (the pages were no longer tagged for 1990 * writeback). 1991 */ 1992 switch (eb->log_index) { 1993 case -1: 1994 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 1995 break; 1996 case 0: 1997 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 1998 break; 1999 case 1: 2000 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2001 break; 2002 default: 2003 BUG(); /* unexpected, logic error */ 2004 } 2005 } 2006 2007 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) 2008 { 2009 struct btrfs_fs_info *fs_info = eb->fs_info; 2010 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2011 unsigned long flags; 2012 2013 xas_lock_irqsave(&xas, flags); 2014 xas_load(&xas); 2015 xas_set_mark(&xas, mark); 2016 xas_unlock_irqrestore(&xas, flags); 2017 } 2018 2019 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) 2020 { 2021 struct btrfs_fs_info *fs_info = eb->fs_info; 2022 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2023 unsigned long flags; 2024 2025 xas_lock_irqsave(&xas, flags); 2026 xas_load(&xas); 2027 xas_clear_mark(&xas, mark); 2028 xas_unlock_irqrestore(&xas, flags); 2029 } 2030 2031 static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, 2032 unsigned long start, unsigned long end) 2033 { 2034 XA_STATE(xas, &fs_info->buffer_tree, start); 2035 unsigned int tagged = 0; 2036 void *eb; 2037 2038 xas_lock_irq(&xas); 2039 xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { 2040 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); 2041 if (++tagged % XA_CHECK_SCHED) 2042 continue; 2043 xas_pause(&xas); 2044 xas_unlock_irq(&xas); 2045 cond_resched(); 2046 xas_lock_irq(&xas); 2047 } 2048 xas_unlock_irq(&xas); 2049 } 2050 2051 struct eb_batch { 2052 unsigned int nr; 2053 unsigned int cur; 2054 struct extent_buffer *ebs[PAGEVEC_SIZE]; 2055 }; 2056 2057 static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) 2058 { 2059 batch->ebs[batch->nr++] = eb; 2060 return (batch->nr < PAGEVEC_SIZE); 2061 } 2062 2063 static inline void eb_batch_init(struct eb_batch *batch) 2064 { 2065 batch->nr = 0; 2066 batch->cur = 0; 2067 } 2068 2069 static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) 2070 { 2071 if (batch->cur >= batch->nr) 2072 return NULL; 2073 return batch->ebs[batch->cur++]; 2074 } 2075 2076 static inline void eb_batch_release(struct eb_batch *batch) 2077 { 2078 for (unsigned int i = 0; i < batch->nr; i++) 2079 free_extent_buffer(batch->ebs[i]); 2080 eb_batch_init(batch); 2081 } 2082 2083 static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, 2084 xa_mark_t mark) 2085 { 2086 struct extent_buffer *eb; 2087 2088 retry: 2089 eb = xas_find_marked(xas, max, mark); 2090 2091 if (xas_retry(xas, eb)) 2092 goto retry; 2093 2094 if (!eb) 2095 return NULL; 2096 2097 if (!refcount_inc_not_zero(&eb->refs)) { 2098 xas_reset(xas); 2099 goto retry; 2100 } 2101 2102 if (unlikely(eb != xas_reload(xas))) { 2103 free_extent_buffer(eb); 2104 xas_reset(xas); 2105 goto retry; 2106 } 2107 2108 return eb; 2109 } 2110 2111 static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, 2112 unsigned long *start, 2113 unsigned long end, xa_mark_t tag, 2114 struct eb_batch *batch) 2115 { 2116 XA_STATE(xas, &fs_info->buffer_tree, *start); 2117 struct extent_buffer *eb; 2118 2119 rcu_read_lock(); 2120 while ((eb = find_get_eb(&xas, end, tag)) != NULL) { 2121 if (!eb_batch_add(batch, eb)) { 2122 *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); 2123 goto out; 2124 } 2125 } 2126 if (end == ULONG_MAX) 2127 *start = ULONG_MAX; 2128 else 2129 *start = end + 1; 2130 out: 2131 rcu_read_unlock(); 2132 2133 return batch->nr; 2134 } 2135 2136 /* 2137 * The endio specific version which won't touch any unsafe spinlock in endio 2138 * context. 2139 */ 2140 static struct extent_buffer *find_extent_buffer_nolock( 2141 struct btrfs_fs_info *fs_info, u64 start) 2142 { 2143 struct extent_buffer *eb; 2144 unsigned long index = (start >> fs_info->nodesize_bits); 2145 2146 rcu_read_lock(); 2147 eb = xa_load(&fs_info->buffer_tree, index); 2148 if (eb && !refcount_inc_not_zero(&eb->refs)) 2149 eb = NULL; 2150 rcu_read_unlock(); 2151 return eb; 2152 } 2153 2154 static void end_bbio_meta_write(struct btrfs_bio *bbio) 2155 { 2156 struct extent_buffer *eb = bbio->private; 2157 struct folio_iter fi; 2158 2159 if (bbio->bio.bi_status != BLK_STS_OK) 2160 set_btree_ioerr(eb); 2161 2162 bio_for_each_folio_all(fi, &bbio->bio) { 2163 btrfs_meta_folio_clear_writeback(fi.folio, eb); 2164 } 2165 2166 buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); 2167 clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 2168 bio_put(&bbio->bio); 2169 } 2170 2171 static void prepare_eb_write(struct extent_buffer *eb) 2172 { 2173 u32 nritems; 2174 unsigned long start; 2175 unsigned long end; 2176 2177 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2178 2179 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 2180 nritems = btrfs_header_nritems(eb); 2181 if (btrfs_header_level(eb) > 0) { 2182 end = btrfs_node_key_ptr_offset(eb, nritems); 2183 memzero_extent_buffer(eb, end, eb->len - end); 2184 } else { 2185 /* 2186 * Leaf: 2187 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 2188 */ 2189 start = btrfs_item_nr_offset(eb, nritems); 2190 end = btrfs_item_nr_offset(eb, 0); 2191 if (nritems == 0) 2192 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); 2193 else 2194 end += btrfs_item_offset(eb, nritems - 1); 2195 memzero_extent_buffer(eb, start, end - start); 2196 } 2197 } 2198 2199 static noinline_for_stack void write_one_eb(struct extent_buffer *eb, 2200 struct writeback_control *wbc) 2201 { 2202 struct btrfs_fs_info *fs_info = eb->fs_info; 2203 struct btrfs_bio *bbio; 2204 2205 prepare_eb_write(eb); 2206 2207 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 2208 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), 2209 eb->fs_info, end_bbio_meta_write, eb); 2210 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 2211 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 2212 wbc_init_bio(wbc, &bbio->bio); 2213 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 2214 bbio->file_offset = eb->start; 2215 for (int i = 0; i < num_extent_folios(eb); i++) { 2216 struct folio *folio = eb->folios[i]; 2217 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 2218 u32 range_len = min_t(u64, folio_end(folio), 2219 eb->start + eb->len) - range_start; 2220 2221 folio_lock(folio); 2222 btrfs_meta_folio_clear_dirty(folio, eb); 2223 btrfs_meta_folio_set_writeback(folio, eb); 2224 if (!folio_test_dirty(folio)) 2225 wbc->nr_to_write -= folio_nr_pages(folio); 2226 bio_add_folio_nofail(&bbio->bio, folio, range_len, 2227 offset_in_folio(folio, range_start)); 2228 wbc_account_cgroup_owner(wbc, folio, range_len); 2229 folio_unlock(folio); 2230 } 2231 btrfs_submit_bbio(bbio, 0); 2232 } 2233 2234 /* 2235 * Wait for all eb writeback in the given range to finish. 2236 * 2237 * @fs_info: The fs_info for this file system. 2238 * @start: The offset of the range to start waiting on writeback. 2239 * @end: The end of the range, inclusive. This is meant to be used in 2240 * conjunction with wait_marked_extents, so this will usually be 2241 * the_next_eb->start - 1. 2242 */ 2243 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, 2244 u64 end) 2245 { 2246 struct eb_batch batch; 2247 unsigned long start_index = (start >> fs_info->nodesize_bits); 2248 unsigned long end_index = (end >> fs_info->nodesize_bits); 2249 2250 eb_batch_init(&batch); 2251 while (start_index <= end_index) { 2252 struct extent_buffer *eb; 2253 unsigned int nr_ebs; 2254 2255 nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, 2256 PAGECACHE_TAG_WRITEBACK, &batch); 2257 if (!nr_ebs) 2258 break; 2259 2260 while ((eb = eb_batch_next(&batch)) != NULL) 2261 wait_on_extent_buffer_writeback(eb); 2262 eb_batch_release(&batch); 2263 cond_resched(); 2264 } 2265 } 2266 2267 int btree_write_cache_pages(struct address_space *mapping, 2268 struct writeback_control *wbc) 2269 { 2270 struct btrfs_eb_write_context ctx = { .wbc = wbc }; 2271 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 2272 int ret = 0; 2273 int done = 0; 2274 int nr_to_write_done = 0; 2275 struct eb_batch batch; 2276 unsigned int nr_ebs; 2277 unsigned long index; 2278 unsigned long end; 2279 int scanned = 0; 2280 xa_mark_t tag; 2281 2282 eb_batch_init(&batch); 2283 if (wbc->range_cyclic) { 2284 index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); 2285 end = -1; 2286 2287 /* 2288 * Start from the beginning does not need to cycle over the 2289 * range, mark it as scanned. 2290 */ 2291 scanned = (index == 0); 2292 } else { 2293 index = (wbc->range_start >> fs_info->nodesize_bits); 2294 end = (wbc->range_end >> fs_info->nodesize_bits); 2295 2296 scanned = 1; 2297 } 2298 if (wbc->sync_mode == WB_SYNC_ALL) 2299 tag = PAGECACHE_TAG_TOWRITE; 2300 else 2301 tag = PAGECACHE_TAG_DIRTY; 2302 btrfs_zoned_meta_io_lock(fs_info); 2303 retry: 2304 if (wbc->sync_mode == WB_SYNC_ALL) 2305 buffer_tree_tag_for_writeback(fs_info, index, end); 2306 while (!done && !nr_to_write_done && (index <= end) && 2307 (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { 2308 struct extent_buffer *eb; 2309 2310 while ((eb = eb_batch_next(&batch)) != NULL) { 2311 ctx.eb = eb; 2312 2313 ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); 2314 if (ret) { 2315 if (ret == -EBUSY) 2316 ret = 0; 2317 2318 if (ret) { 2319 done = 1; 2320 break; 2321 } 2322 continue; 2323 } 2324 2325 if (!lock_extent_buffer_for_io(eb, wbc)) 2326 continue; 2327 2328 /* Implies write in zoned mode. */ 2329 if (ctx.zoned_bg) { 2330 /* Mark the last eb in the block group. */ 2331 btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); 2332 ctx.zoned_bg->meta_write_pointer += eb->len; 2333 } 2334 write_one_eb(eb, wbc); 2335 } 2336 nr_to_write_done = (wbc->nr_to_write <= 0); 2337 eb_batch_release(&batch); 2338 cond_resched(); 2339 } 2340 if (!scanned && !done) { 2341 /* 2342 * We hit the last page and there is more work to be done: wrap 2343 * back to the start of the file 2344 */ 2345 scanned = 1; 2346 index = 0; 2347 goto retry; 2348 } 2349 /* 2350 * If something went wrong, don't allow any metadata write bio to be 2351 * submitted. 2352 * 2353 * This would prevent use-after-free if we had dirty pages not 2354 * cleaned up, which can still happen by fuzzed images. 2355 * 2356 * - Bad extent tree 2357 * Allowing existing tree block to be allocated for other trees. 2358 * 2359 * - Log tree operations 2360 * Exiting tree blocks get allocated to log tree, bumps its 2361 * generation, then get cleaned in tree re-balance. 2362 * Such tree block will not be written back, since it's clean, 2363 * thus no WRITTEN flag set. 2364 * And after log writes back, this tree block is not traced by 2365 * any dirty extent_io_tree. 2366 * 2367 * - Offending tree block gets re-dirtied from its original owner 2368 * Since it has bumped generation, no WRITTEN flag, it can be 2369 * reused without COWing. This tree block will not be traced 2370 * by btrfs_transaction::dirty_pages. 2371 * 2372 * Now such dirty tree block will not be cleaned by any dirty 2373 * extent io tree. Thus we don't want to submit such wild eb 2374 * if the fs already has error. 2375 * 2376 * We can get ret > 0 from submit_extent_folio() indicating how many ebs 2377 * were submitted. Reset it to 0 to avoid false alerts for the caller. 2378 */ 2379 if (ret > 0) 2380 ret = 0; 2381 if (!ret && BTRFS_FS_ERROR(fs_info)) 2382 ret = -EROFS; 2383 2384 if (ctx.zoned_bg) 2385 btrfs_put_block_group(ctx.zoned_bg); 2386 btrfs_zoned_meta_io_unlock(fs_info); 2387 return ret; 2388 } 2389 2390 /* 2391 * Walk the list of dirty pages of the given address space and write all of them. 2392 * 2393 * @mapping: address space structure to write 2394 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2395 * @bio_ctrl: holds context for the write, namely the bio 2396 * 2397 * If a page is already under I/O, write_cache_pages() skips it, even 2398 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2399 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2400 * and msync() need to guarantee that all the data which was dirty at the time 2401 * the call was made get new I/O started against them. If wbc->sync_mode is 2402 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2403 * existing IO to complete. 2404 */ 2405 static int extent_write_cache_pages(struct address_space *mapping, 2406 struct btrfs_bio_ctrl *bio_ctrl) 2407 { 2408 struct writeback_control *wbc = bio_ctrl->wbc; 2409 struct inode *inode = mapping->host; 2410 int ret = 0; 2411 int done = 0; 2412 int nr_to_write_done = 0; 2413 struct folio_batch fbatch; 2414 unsigned int nr_folios; 2415 pgoff_t index; 2416 pgoff_t end; /* Inclusive */ 2417 pgoff_t done_index; 2418 int range_whole = 0; 2419 int scanned = 0; 2420 xa_mark_t tag; 2421 2422 /* 2423 * We have to hold onto the inode so that ordered extents can do their 2424 * work when the IO finishes. The alternative to this is failing to add 2425 * an ordered extent if the igrab() fails there and that is a huge pain 2426 * to deal with, so instead just hold onto the inode throughout the 2427 * writepages operation. If it fails here we are freeing up the inode 2428 * anyway and we'd rather not waste our time writing out stuff that is 2429 * going to be truncated anyway. 2430 */ 2431 if (!igrab(inode)) 2432 return 0; 2433 2434 folio_batch_init(&fbatch); 2435 if (wbc->range_cyclic) { 2436 index = mapping->writeback_index; /* Start from prev offset */ 2437 end = -1; 2438 /* 2439 * Start from the beginning does not need to cycle over the 2440 * range, mark it as scanned. 2441 */ 2442 scanned = (index == 0); 2443 } else { 2444 index = wbc->range_start >> PAGE_SHIFT; 2445 end = wbc->range_end >> PAGE_SHIFT; 2446 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2447 range_whole = 1; 2448 scanned = 1; 2449 } 2450 2451 /* 2452 * We do the tagged writepage as long as the snapshot flush bit is set 2453 * and we are the first one who do the filemap_flush() on this inode. 2454 * 2455 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 2456 * not race in and drop the bit. 2457 */ 2458 if (range_whole && wbc->nr_to_write == LONG_MAX && 2459 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 2460 &BTRFS_I(inode)->runtime_flags)) 2461 wbc->tagged_writepages = 1; 2462 2463 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2464 tag = PAGECACHE_TAG_TOWRITE; 2465 else 2466 tag = PAGECACHE_TAG_DIRTY; 2467 retry: 2468 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2469 tag_pages_for_writeback(mapping, index, end); 2470 done_index = index; 2471 while (!done && !nr_to_write_done && (index <= end) && 2472 (nr_folios = filemap_get_folios_tag(mapping, &index, 2473 end, tag, &fbatch))) { 2474 unsigned i; 2475 2476 for (i = 0; i < nr_folios; i++) { 2477 struct folio *folio = fbatch.folios[i]; 2478 2479 done_index = folio_next_index(folio); 2480 /* 2481 * At this point we hold neither the i_pages lock nor 2482 * the folio lock: the folio may be truncated or 2483 * invalidated (changing folio->mapping to NULL). 2484 */ 2485 if (!folio_trylock(folio)) { 2486 submit_write_bio(bio_ctrl, 0); 2487 folio_lock(folio); 2488 } 2489 2490 if (unlikely(folio->mapping != mapping)) { 2491 folio_unlock(folio); 2492 continue; 2493 } 2494 2495 if (!folio_test_dirty(folio)) { 2496 /* Someone wrote it for us. */ 2497 folio_unlock(folio); 2498 continue; 2499 } 2500 2501 /* 2502 * For subpage case, compression can lead to mixed 2503 * writeback and dirty flags, e.g: 2504 * 0 32K 64K 96K 128K 2505 * | |//////||/////| |//| 2506 * 2507 * In above case, [32K, 96K) is asynchronously submitted 2508 * for compression, and [124K, 128K) needs to be written back. 2509 * 2510 * If we didn't wait writeback for page 64K, [128K, 128K) 2511 * won't be submitted as the page still has writeback flag 2512 * and will be skipped in the next check. 2513 * 2514 * This mixed writeback and dirty case is only possible for 2515 * subpage case. 2516 * 2517 * TODO: Remove this check after migrating compression to 2518 * regular submission. 2519 */ 2520 if (wbc->sync_mode != WB_SYNC_NONE || 2521 btrfs_is_subpage(inode_to_fs_info(inode), folio)) { 2522 if (folio_test_writeback(folio)) 2523 submit_write_bio(bio_ctrl, 0); 2524 folio_wait_writeback(folio); 2525 } 2526 2527 if (folio_test_writeback(folio) || 2528 !folio_clear_dirty_for_io(folio)) { 2529 folio_unlock(folio); 2530 continue; 2531 } 2532 2533 ret = extent_writepage(folio, bio_ctrl); 2534 if (ret < 0) { 2535 done = 1; 2536 break; 2537 } 2538 2539 /* 2540 * The filesystem may choose to bump up nr_to_write. 2541 * We have to make sure to honor the new nr_to_write 2542 * at any time. 2543 */ 2544 nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && 2545 wbc->nr_to_write <= 0); 2546 } 2547 folio_batch_release(&fbatch); 2548 cond_resched(); 2549 } 2550 if (!scanned && !done) { 2551 /* 2552 * We hit the last page and there is more work to be done: wrap 2553 * back to the start of the file 2554 */ 2555 scanned = 1; 2556 index = 0; 2557 2558 /* 2559 * If we're looping we could run into a page that is locked by a 2560 * writer and that writer could be waiting on writeback for a 2561 * page in our current bio, and thus deadlock, so flush the 2562 * write bio here. 2563 */ 2564 submit_write_bio(bio_ctrl, 0); 2565 goto retry; 2566 } 2567 2568 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 2569 mapping->writeback_index = done_index; 2570 2571 btrfs_add_delayed_iput(BTRFS_I(inode)); 2572 return ret; 2573 } 2574 2575 /* 2576 * Submit the pages in the range to bio for call sites which delalloc range has 2577 * already been ran (aka, ordered extent inserted) and all pages are still 2578 * locked. 2579 */ 2580 void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, 2581 u64 start, u64 end, struct writeback_control *wbc, 2582 bool pages_dirty) 2583 { 2584 bool found_error = false; 2585 int ret = 0; 2586 struct address_space *mapping = inode->i_mapping; 2587 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2588 const u32 sectorsize = fs_info->sectorsize; 2589 loff_t i_size = i_size_read(inode); 2590 u64 cur = start; 2591 struct btrfs_bio_ctrl bio_ctrl = { 2592 .wbc = wbc, 2593 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2594 }; 2595 2596 if (wbc->no_cgroup_owner) 2597 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; 2598 2599 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 2600 2601 while (cur <= end) { 2602 u64 cur_end; 2603 u32 cur_len; 2604 struct folio *folio; 2605 2606 folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); 2607 2608 /* 2609 * This shouldn't happen, the pages are pinned and locked, this 2610 * code is just in case, but shouldn't actually be run. 2611 */ 2612 if (IS_ERR(folio)) { 2613 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2614 cur_len = cur_end + 1 - cur; 2615 btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, 2616 cur, cur_len, false); 2617 mapping_set_error(mapping, PTR_ERR(folio)); 2618 cur = cur_end; 2619 continue; 2620 } 2621 2622 cur_end = min_t(u64, folio_end(folio) - 1, end); 2623 cur_len = cur_end + 1 - cur; 2624 2625 ASSERT(folio_test_locked(folio)); 2626 if (pages_dirty && folio != locked_folio) 2627 ASSERT(folio_test_dirty(folio)); 2628 2629 /* 2630 * Set the submission bitmap to submit all sectors. 2631 * extent_writepage_io() will do the truncation correctly. 2632 */ 2633 bio_ctrl.submit_bitmap = (unsigned long)-1; 2634 ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, 2635 &bio_ctrl, i_size); 2636 if (ret == 1) 2637 goto next_page; 2638 2639 if (ret) 2640 mapping_set_error(mapping, ret); 2641 btrfs_folio_end_lock(fs_info, folio, cur, cur_len); 2642 if (ret < 0) 2643 found_error = true; 2644 next_page: 2645 folio_put(folio); 2646 cur = cur_end + 1; 2647 } 2648 2649 submit_write_bio(&bio_ctrl, found_error ? ret : 0); 2650 } 2651 2652 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 2653 { 2654 struct inode *inode = mapping->host; 2655 int ret = 0; 2656 struct btrfs_bio_ctrl bio_ctrl = { 2657 .wbc = wbc, 2658 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2659 }; 2660 2661 /* 2662 * Allow only a single thread to do the reloc work in zoned mode to 2663 * protect the write pointer updates. 2664 */ 2665 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 2666 ret = extent_write_cache_pages(mapping, &bio_ctrl); 2667 submit_write_bio(&bio_ctrl, ret); 2668 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 2669 return ret; 2670 } 2671 2672 void btrfs_readahead(struct readahead_control *rac) 2673 { 2674 struct btrfs_bio_ctrl bio_ctrl = { 2675 .opf = REQ_OP_READ | REQ_RAHEAD, 2676 .ractl = rac, 2677 .last_em_start = U64_MAX, 2678 }; 2679 struct folio *folio; 2680 struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); 2681 const u64 start = readahead_pos(rac); 2682 const u64 end = start + readahead_length(rac) - 1; 2683 struct extent_state *cached_state = NULL; 2684 struct extent_map *em_cached = NULL; 2685 2686 lock_extents_for_read(inode, start, end, &cached_state); 2687 2688 while ((folio = readahead_folio(rac)) != NULL) 2689 btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 2690 2691 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2692 2693 if (em_cached) 2694 btrfs_free_extent_map(em_cached); 2695 submit_one_bio(&bio_ctrl); 2696 } 2697 2698 /* 2699 * basic invalidate_folio code, this waits on any locked or writeback 2700 * ranges corresponding to the folio, and then deletes any extent state 2701 * records from the tree 2702 */ 2703 int extent_invalidate_folio(struct extent_io_tree *tree, 2704 struct folio *folio, size_t offset) 2705 { 2706 struct extent_state *cached_state = NULL; 2707 u64 start = folio_pos(folio); 2708 u64 end = start + folio_size(folio) - 1; 2709 size_t blocksize = folio_to_fs_info(folio)->sectorsize; 2710 2711 /* This function is only called for the btree inode */ 2712 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 2713 2714 start += ALIGN(offset, blocksize); 2715 if (start > end) 2716 return 0; 2717 2718 btrfs_lock_extent(tree, start, end, &cached_state); 2719 folio_wait_writeback(folio); 2720 2721 /* 2722 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 2723 * so here we only need to unlock the extent range to free any 2724 * existing extent state. 2725 */ 2726 btrfs_unlock_extent(tree, start, end, &cached_state); 2727 return 0; 2728 } 2729 2730 /* 2731 * A helper for struct address_space_operations::release_folio, this tests for 2732 * areas of the folio that are locked or under IO and drops the related state 2733 * bits if it is safe to drop the folio. 2734 */ 2735 static bool try_release_extent_state(struct extent_io_tree *tree, 2736 struct folio *folio) 2737 { 2738 struct extent_state *cached_state = NULL; 2739 u64 start = folio_pos(folio); 2740 u64 end = start + folio_size(folio) - 1; 2741 u32 range_bits; 2742 u32 clear_bits; 2743 bool ret = false; 2744 int ret2; 2745 2746 btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); 2747 2748 /* 2749 * We can release the folio if it's locked only for ordered extent 2750 * completion, since that doesn't require using the folio. 2751 */ 2752 if ((range_bits & EXTENT_LOCKED) && 2753 !(range_bits & EXTENT_FINISHING_ORDERED)) 2754 goto out; 2755 2756 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | 2757 EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | 2758 EXTENT_FINISHING_ORDERED); 2759 /* 2760 * At this point we can safely clear everything except the locked, 2761 * nodatasum, delalloc new and finishing ordered bits. The delalloc new 2762 * bit will be cleared by ordered extent completion. 2763 */ 2764 ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); 2765 /* 2766 * If clear_extent_bit failed for enomem reasons, we can't allow the 2767 * release to continue. 2768 */ 2769 if (ret2 == 0) 2770 ret = true; 2771 out: 2772 btrfs_free_extent_state(cached_state); 2773 2774 return ret; 2775 } 2776 2777 /* 2778 * a helper for release_folio. As long as there are no locked extents 2779 * in the range corresponding to the page, both state records and extent 2780 * map records are removed 2781 */ 2782 bool try_release_extent_mapping(struct folio *folio, gfp_t mask) 2783 { 2784 u64 start = folio_pos(folio); 2785 u64 end = start + folio_size(folio) - 1; 2786 struct btrfs_inode *inode = folio_to_inode(folio); 2787 struct extent_io_tree *io_tree = &inode->io_tree; 2788 2789 while (start <= end) { 2790 const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); 2791 const u64 len = end - start + 1; 2792 struct extent_map_tree *extent_tree = &inode->extent_tree; 2793 struct extent_map *em; 2794 2795 write_lock(&extent_tree->lock); 2796 em = btrfs_lookup_extent_mapping(extent_tree, start, len); 2797 if (!em) { 2798 write_unlock(&extent_tree->lock); 2799 break; 2800 } 2801 if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { 2802 write_unlock(&extent_tree->lock); 2803 btrfs_free_extent_map(em); 2804 break; 2805 } 2806 if (btrfs_test_range_bit_exists(io_tree, em->start, 2807 btrfs_extent_map_end(em) - 1, 2808 EXTENT_LOCKED)) 2809 goto next; 2810 /* 2811 * If it's not in the list of modified extents, used by a fast 2812 * fsync, we can remove it. If it's being logged we can safely 2813 * remove it since fsync took an extra reference on the em. 2814 */ 2815 if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) 2816 goto remove_em; 2817 /* 2818 * If it's in the list of modified extents, remove it only if 2819 * its generation is older then the current one, in which case 2820 * we don't need it for a fast fsync. Otherwise don't remove it, 2821 * we could be racing with an ongoing fast fsync that could miss 2822 * the new extent. 2823 */ 2824 if (em->generation >= cur_gen) 2825 goto next; 2826 remove_em: 2827 /* 2828 * We only remove extent maps that are not in the list of 2829 * modified extents or that are in the list but with a 2830 * generation lower then the current generation, so there is no 2831 * need to set the full fsync flag on the inode (it hurts the 2832 * fsync performance for workloads with a data size that exceeds 2833 * or is close to the system's memory). 2834 */ 2835 btrfs_remove_extent_mapping(inode, em); 2836 /* Once for the inode's extent map tree. */ 2837 btrfs_free_extent_map(em); 2838 next: 2839 start = btrfs_extent_map_end(em); 2840 write_unlock(&extent_tree->lock); 2841 2842 /* Once for us, for the lookup_extent_mapping() reference. */ 2843 btrfs_free_extent_map(em); 2844 2845 if (need_resched()) { 2846 /* 2847 * If we need to resched but we can't block just exit 2848 * and leave any remaining extent maps. 2849 */ 2850 if (!gfpflags_allow_blocking(mask)) 2851 break; 2852 2853 cond_resched(); 2854 } 2855 } 2856 return try_release_extent_state(io_tree, folio); 2857 } 2858 2859 static int extent_buffer_under_io(const struct extent_buffer *eb) 2860 { 2861 return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 2862 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2863 } 2864 2865 static bool folio_range_has_eb(struct folio *folio) 2866 { 2867 struct btrfs_folio_state *bfs; 2868 2869 lockdep_assert_held(&folio->mapping->i_private_lock); 2870 2871 if (folio_test_private(folio)) { 2872 bfs = folio_get_private(folio); 2873 if (atomic_read(&bfs->eb_refs)) 2874 return true; 2875 } 2876 return false; 2877 } 2878 2879 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) 2880 { 2881 struct btrfs_fs_info *fs_info = eb->fs_info; 2882 struct address_space *mapping = folio->mapping; 2883 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 2884 2885 /* 2886 * For mapped eb, we're going to change the folio private, which should 2887 * be done under the i_private_lock. 2888 */ 2889 if (mapped) 2890 spin_lock(&mapping->i_private_lock); 2891 2892 if (!folio_test_private(folio)) { 2893 if (mapped) 2894 spin_unlock(&mapping->i_private_lock); 2895 return; 2896 } 2897 2898 if (!btrfs_meta_is_subpage(fs_info)) { 2899 /* 2900 * We do this since we'll remove the pages after we've removed 2901 * the eb from the xarray, so we could race and have this page 2902 * now attached to the new eb. So only clear folio if it's 2903 * still connected to this eb. 2904 */ 2905 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2906 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2907 BUG_ON(folio_test_dirty(folio)); 2908 BUG_ON(folio_test_writeback(folio)); 2909 /* We need to make sure we haven't be attached to a new eb. */ 2910 folio_detach_private(folio); 2911 } 2912 if (mapped) 2913 spin_unlock(&mapping->i_private_lock); 2914 return; 2915 } 2916 2917 /* 2918 * For subpage, we can have dummy eb with folio private attached. In 2919 * this case, we can directly detach the private as such folio is only 2920 * attached to one dummy eb, no sharing. 2921 */ 2922 if (!mapped) { 2923 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2924 return; 2925 } 2926 2927 btrfs_folio_dec_eb_refs(fs_info, folio); 2928 2929 /* 2930 * We can only detach the folio private if there are no other ebs in the 2931 * page range and no unfinished IO. 2932 */ 2933 if (!folio_range_has_eb(folio)) 2934 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2935 2936 spin_unlock(&mapping->i_private_lock); 2937 } 2938 2939 /* Release all folios attached to the extent buffer */ 2940 static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) 2941 { 2942 ASSERT(!extent_buffer_under_io(eb)); 2943 2944 for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { 2945 struct folio *folio = eb->folios[i]; 2946 2947 if (!folio) 2948 continue; 2949 2950 detach_extent_buffer_folio(eb, folio); 2951 } 2952 } 2953 2954 /* 2955 * Helper for releasing the extent buffer. 2956 */ 2957 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 2958 { 2959 btrfs_release_extent_buffer_folios(eb); 2960 btrfs_leak_debug_del_eb(eb); 2961 kmem_cache_free(extent_buffer_cache, eb); 2962 } 2963 2964 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 2965 u64 start) 2966 { 2967 struct extent_buffer *eb = NULL; 2968 2969 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 2970 eb->start = start; 2971 eb->len = fs_info->nodesize; 2972 eb->fs_info = fs_info; 2973 init_rwsem(&eb->lock); 2974 2975 btrfs_leak_debug_add_eb(eb); 2976 2977 spin_lock_init(&eb->refs_lock); 2978 refcount_set(&eb->refs, 1); 2979 2980 ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); 2981 2982 return eb; 2983 } 2984 2985 /* 2986 * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() 2987 * does not call folio_put(), and we need to set the folios to NULL so that 2988 * btrfs_release_extent_buffer() will not detach them a second time. 2989 */ 2990 static void cleanup_extent_buffer_folios(struct extent_buffer *eb) 2991 { 2992 const int num_folios = num_extent_folios(eb); 2993 2994 /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */ 2995 for (int i = 0; i < num_folios; i++) { 2996 ASSERT(eb->folios[i]); 2997 detach_extent_buffer_folio(eb, eb->folios[i]); 2998 folio_put(eb->folios[i]); 2999 eb->folios[i] = NULL; 3000 } 3001 } 3002 3003 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 3004 { 3005 struct extent_buffer *new; 3006 int num_folios; 3007 int ret; 3008 3009 new = __alloc_extent_buffer(src->fs_info, src->start); 3010 if (new == NULL) 3011 return NULL; 3012 3013 /* 3014 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 3015 * btrfs_release_extent_buffer() have different behavior for 3016 * UNMAPPED subpage extent buffer. 3017 */ 3018 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 3019 3020 ret = alloc_eb_folio_array(new, false); 3021 if (ret) 3022 goto release_eb; 3023 3024 ASSERT(num_extent_folios(src) == num_extent_folios(new), 3025 "%d != %d", num_extent_folios(src), num_extent_folios(new)); 3026 /* Explicitly use the cached num_extent value from now on. */ 3027 num_folios = num_extent_folios(src); 3028 for (int i = 0; i < num_folios; i++) { 3029 struct folio *folio = new->folios[i]; 3030 3031 ret = attach_extent_buffer_folio(new, folio, NULL); 3032 if (ret < 0) 3033 goto cleanup_folios; 3034 WARN_ON(folio_test_dirty(folio)); 3035 } 3036 for (int i = 0; i < num_folios; i++) 3037 folio_put(new->folios[i]); 3038 3039 copy_extent_buffer_full(new, src); 3040 set_extent_buffer_uptodate(new); 3041 3042 return new; 3043 3044 cleanup_folios: 3045 cleanup_extent_buffer_folios(new); 3046 release_eb: 3047 btrfs_release_extent_buffer(new); 3048 return NULL; 3049 } 3050 3051 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 3052 u64 start) 3053 { 3054 struct extent_buffer *eb; 3055 int ret; 3056 3057 eb = __alloc_extent_buffer(fs_info, start); 3058 if (!eb) 3059 return NULL; 3060 3061 ret = alloc_eb_folio_array(eb, false); 3062 if (ret) 3063 goto release_eb; 3064 3065 for (int i = 0; i < num_extent_folios(eb); i++) { 3066 ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); 3067 if (ret < 0) 3068 goto cleanup_folios; 3069 } 3070 for (int i = 0; i < num_extent_folios(eb); i++) 3071 folio_put(eb->folios[i]); 3072 3073 set_extent_buffer_uptodate(eb); 3074 btrfs_set_header_nritems(eb, 0); 3075 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 3076 3077 return eb; 3078 3079 cleanup_folios: 3080 cleanup_extent_buffer_folios(eb); 3081 release_eb: 3082 btrfs_release_extent_buffer(eb); 3083 return NULL; 3084 } 3085 3086 static void check_buffer_tree_ref(struct extent_buffer *eb) 3087 { 3088 int refs; 3089 /* 3090 * The TREE_REF bit is first set when the extent_buffer is added to the 3091 * xarray. It is also reset, if unset, when a new reference is created 3092 * by find_extent_buffer. 3093 * 3094 * It is only cleared in two cases: freeing the last non-tree 3095 * reference to the extent_buffer when its STALE bit is set or 3096 * calling release_folio when the tree reference is the only reference. 3097 * 3098 * In both cases, care is taken to ensure that the extent_buffer's 3099 * pages are not under io. However, release_folio can be concurrently 3100 * called with creating new references, which is prone to race 3101 * conditions between the calls to check_buffer_tree_ref in those 3102 * codepaths and clearing TREE_REF in try_release_extent_buffer. 3103 * 3104 * The actual lifetime of the extent_buffer in the xarray is adequately 3105 * protected by the refcount, but the TREE_REF bit and its corresponding 3106 * reference are not. To protect against this class of races, we call 3107 * check_buffer_tree_ref() from the code paths which trigger io. Note that 3108 * once io is initiated, TREE_REF can no longer be cleared, so that is 3109 * the moment at which any such race is best fixed. 3110 */ 3111 refs = refcount_read(&eb->refs); 3112 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3113 return; 3114 3115 spin_lock(&eb->refs_lock); 3116 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3117 refcount_inc(&eb->refs); 3118 spin_unlock(&eb->refs_lock); 3119 } 3120 3121 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 3122 { 3123 check_buffer_tree_ref(eb); 3124 3125 for (int i = 0; i < num_extent_folios(eb); i++) 3126 folio_mark_accessed(eb->folios[i]); 3127 } 3128 3129 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 3130 u64 start) 3131 { 3132 struct extent_buffer *eb; 3133 3134 eb = find_extent_buffer_nolock(fs_info, start); 3135 if (!eb) 3136 return NULL; 3137 /* 3138 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 3139 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 3140 * another task running free_extent_buffer() might have seen that flag 3141 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 3142 * writeback flags not set) and it's still in the tree (flag 3143 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 3144 * decrementing the extent buffer's reference count twice. So here we 3145 * could race and increment the eb's reference count, clear its stale 3146 * flag, mark it as dirty and drop our reference before the other task 3147 * finishes executing free_extent_buffer, which would later result in 3148 * an attempt to free an extent buffer that is dirty. 3149 */ 3150 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 3151 spin_lock(&eb->refs_lock); 3152 spin_unlock(&eb->refs_lock); 3153 } 3154 mark_extent_buffer_accessed(eb); 3155 return eb; 3156 } 3157 3158 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 3159 u64 start) 3160 { 3161 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3162 struct extent_buffer *eb, *exists = NULL; 3163 int ret; 3164 3165 eb = find_extent_buffer(fs_info, start); 3166 if (eb) 3167 return eb; 3168 eb = alloc_dummy_extent_buffer(fs_info, start); 3169 if (!eb) 3170 return ERR_PTR(-ENOMEM); 3171 eb->fs_info = fs_info; 3172 again: 3173 xa_lock_irq(&fs_info->buffer_tree); 3174 exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, 3175 NULL, eb, GFP_NOFS); 3176 if (xa_is_err(exists)) { 3177 ret = xa_err(exists); 3178 xa_unlock_irq(&fs_info->buffer_tree); 3179 btrfs_release_extent_buffer(eb); 3180 return ERR_PTR(ret); 3181 } 3182 if (exists) { 3183 if (!refcount_inc_not_zero(&exists->refs)) { 3184 /* The extent buffer is being freed, retry. */ 3185 xa_unlock_irq(&fs_info->buffer_tree); 3186 goto again; 3187 } 3188 xa_unlock_irq(&fs_info->buffer_tree); 3189 btrfs_release_extent_buffer(eb); 3190 return exists; 3191 } 3192 xa_unlock_irq(&fs_info->buffer_tree); 3193 check_buffer_tree_ref(eb); 3194 3195 return eb; 3196 #else 3197 /* Stub to avoid linker error when compiled with optimizations turned off. */ 3198 return NULL; 3199 #endif 3200 } 3201 3202 static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, 3203 struct folio *folio) 3204 { 3205 struct extent_buffer *exists; 3206 3207 lockdep_assert_held(&folio->mapping->i_private_lock); 3208 3209 /* 3210 * For subpage case, we completely rely on xarray to ensure we don't try 3211 * to insert two ebs for the same bytenr. So here we always return NULL 3212 * and just continue. 3213 */ 3214 if (btrfs_meta_is_subpage(fs_info)) 3215 return NULL; 3216 3217 /* Page not yet attached to an extent buffer */ 3218 if (!folio_test_private(folio)) 3219 return NULL; 3220 3221 /* 3222 * We could have already allocated an eb for this folio and attached one 3223 * so lets see if we can get a ref on the existing eb, and if we can we 3224 * know it's good and we can just return that one, else we know we can 3225 * just overwrite folio private. 3226 */ 3227 exists = folio_get_private(folio); 3228 if (refcount_inc_not_zero(&exists->refs)) 3229 return exists; 3230 3231 WARN_ON(folio_test_dirty(folio)); 3232 folio_detach_private(folio); 3233 return NULL; 3234 } 3235 3236 /* 3237 * Validate alignment constraints of eb at logical address @start. 3238 */ 3239 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 3240 { 3241 const u32 nodesize = fs_info->nodesize; 3242 3243 if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { 3244 btrfs_err(fs_info, "bad tree block start %llu", start); 3245 return true; 3246 } 3247 3248 if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { 3249 btrfs_err(fs_info, 3250 "tree block is not nodesize aligned, start %llu nodesize %u", 3251 start, nodesize); 3252 return true; 3253 } 3254 if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { 3255 btrfs_err(fs_info, 3256 "tree block is not page aligned, start %llu nodesize %u", 3257 start, nodesize); 3258 return true; 3259 } 3260 if (unlikely(!IS_ALIGNED(start, nodesize) && 3261 !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { 3262 btrfs_warn(fs_info, 3263 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", 3264 start, nodesize); 3265 } 3266 return false; 3267 } 3268 3269 /* 3270 * Return 0 if eb->folios[i] is attached to btree inode successfully. 3271 * Return >0 if there is already another extent buffer for the range, 3272 * and @found_eb_ret would be updated. 3273 * Return -EAGAIN if the filemap has an existing folio but with different size 3274 * than @eb. 3275 * The caller needs to free the existing folios and retry using the same order. 3276 */ 3277 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3278 struct btrfs_folio_state *prealloc, 3279 struct extent_buffer **found_eb_ret) 3280 { 3281 3282 struct btrfs_fs_info *fs_info = eb->fs_info; 3283 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3284 const pgoff_t index = eb->start >> PAGE_SHIFT; 3285 struct folio *existing_folio; 3286 int ret; 3287 3288 ASSERT(found_eb_ret); 3289 3290 /* Caller should ensure the folio exists. */ 3291 ASSERT(eb->folios[i]); 3292 3293 retry: 3294 existing_folio = NULL; 3295 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3296 GFP_NOFS | __GFP_NOFAIL); 3297 if (!ret) 3298 goto finish; 3299 3300 existing_folio = filemap_lock_folio(mapping, index + i); 3301 /* The page cache only exists for a very short time, just retry. */ 3302 if (IS_ERR(existing_folio)) 3303 goto retry; 3304 3305 /* For now, we should only have single-page folios for btree inode. */ 3306 ASSERT(folio_nr_pages(existing_folio) == 1); 3307 3308 if (folio_size(existing_folio) != eb->folio_size) { 3309 folio_unlock(existing_folio); 3310 folio_put(existing_folio); 3311 return -EAGAIN; 3312 } 3313 3314 finish: 3315 spin_lock(&mapping->i_private_lock); 3316 if (existing_folio && btrfs_meta_is_subpage(fs_info)) { 3317 /* We're going to reuse the existing page, can drop our folio now. */ 3318 __free_page(folio_page(eb->folios[i], 0)); 3319 eb->folios[i] = existing_folio; 3320 } else if (existing_folio) { 3321 struct extent_buffer *existing_eb; 3322 3323 existing_eb = grab_extent_buffer(fs_info, existing_folio); 3324 if (existing_eb) { 3325 /* The extent buffer still exists, we can use it directly. */ 3326 *found_eb_ret = existing_eb; 3327 spin_unlock(&mapping->i_private_lock); 3328 folio_unlock(existing_folio); 3329 folio_put(existing_folio); 3330 return 1; 3331 } 3332 /* The extent buffer no longer exists, we can reuse the folio. */ 3333 __free_page(folio_page(eb->folios[i], 0)); 3334 eb->folios[i] = existing_folio; 3335 } 3336 eb->folio_size = folio_size(eb->folios[i]); 3337 eb->folio_shift = folio_shift(eb->folios[i]); 3338 /* Should not fail, as we have preallocated the memory. */ 3339 ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); 3340 ASSERT(!ret); 3341 /* 3342 * To inform we have an extra eb under allocation, so that 3343 * detach_extent_buffer_page() won't release the folio private when the 3344 * eb hasn't been inserted into the xarray yet. 3345 * 3346 * The ref will be decreased when the eb releases the page, in 3347 * detach_extent_buffer_page(). Thus needs no special handling in the 3348 * error path. 3349 */ 3350 btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); 3351 spin_unlock(&mapping->i_private_lock); 3352 return 0; 3353 } 3354 3355 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3356 u64 start, u64 owner_root, int level) 3357 { 3358 int attached = 0; 3359 struct extent_buffer *eb; 3360 struct extent_buffer *existing_eb = NULL; 3361 struct btrfs_folio_state *prealloc = NULL; 3362 u64 lockdep_owner = owner_root; 3363 bool page_contig = true; 3364 int uptodate = 1; 3365 int ret; 3366 3367 if (check_eb_alignment(fs_info, start)) 3368 return ERR_PTR(-EINVAL); 3369 3370 #if BITS_PER_LONG == 32 3371 if (start >= MAX_LFS_FILESIZE) { 3372 btrfs_err_rl(fs_info, 3373 "extent buffer %llu is beyond 32bit page cache limit", start); 3374 btrfs_err_32bit_limit(fs_info); 3375 return ERR_PTR(-EOVERFLOW); 3376 } 3377 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 3378 btrfs_warn_32bit_limit(fs_info); 3379 #endif 3380 3381 eb = find_extent_buffer(fs_info, start); 3382 if (eb) 3383 return eb; 3384 3385 eb = __alloc_extent_buffer(fs_info, start); 3386 if (!eb) 3387 return ERR_PTR(-ENOMEM); 3388 3389 /* 3390 * The reloc trees are just snapshots, so we need them to appear to be 3391 * just like any other fs tree WRT lockdep. 3392 */ 3393 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 3394 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 3395 3396 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 3397 3398 /* 3399 * Preallocate folio private for subpage case, so that we won't 3400 * allocate memory with i_private_lock nor page lock hold. 3401 * 3402 * The memory will be freed by attach_extent_buffer_page() or freed 3403 * manually if we exit earlier. 3404 */ 3405 if (btrfs_meta_is_subpage(fs_info)) { 3406 prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); 3407 if (IS_ERR(prealloc)) { 3408 ret = PTR_ERR(prealloc); 3409 goto out; 3410 } 3411 } 3412 3413 reallocate: 3414 /* Allocate all pages first. */ 3415 ret = alloc_eb_folio_array(eb, true); 3416 if (ret < 0) { 3417 btrfs_free_folio_state(prealloc); 3418 goto out; 3419 } 3420 3421 /* Attach all pages to the filemap. */ 3422 for (int i = 0; i < num_extent_folios(eb); i++) { 3423 struct folio *folio; 3424 3425 ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); 3426 if (ret > 0) { 3427 ASSERT(existing_eb); 3428 goto out; 3429 } 3430 3431 /* 3432 * TODO: Special handling for a corner case where the order of 3433 * folios mismatch between the new eb and filemap. 3434 * 3435 * This happens when: 3436 * 3437 * - the new eb is using higher order folio 3438 * 3439 * - the filemap is still using 0-order folios for the range 3440 * This can happen at the previous eb allocation, and we don't 3441 * have higher order folio for the call. 3442 * 3443 * - the existing eb has already been freed 3444 * 3445 * In this case, we have to free the existing folios first, and 3446 * re-allocate using the same order. 3447 * Thankfully this is not going to happen yet, as we're still 3448 * using 0-order folios. 3449 */ 3450 if (unlikely(ret == -EAGAIN)) { 3451 DEBUG_WARN("folio order mismatch between new eb and filemap"); 3452 goto reallocate; 3453 } 3454 attached++; 3455 3456 /* 3457 * Only after attach_eb_folio_to_filemap(), eb->folios[] is 3458 * reliable, as we may choose to reuse the existing page cache 3459 * and free the allocated page. 3460 */ 3461 folio = eb->folios[i]; 3462 WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); 3463 3464 /* 3465 * Check if the current page is physically contiguous with previous eb 3466 * page. 3467 * At this stage, either we allocated a large folio, thus @i 3468 * would only be 0, or we fall back to per-page allocation. 3469 */ 3470 if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) 3471 page_contig = false; 3472 3473 if (!btrfs_meta_folio_test_uptodate(folio, eb)) 3474 uptodate = 0; 3475 3476 /* 3477 * We can't unlock the pages just yet since the extent buffer 3478 * hasn't been properly inserted into the xarray, this opens a 3479 * race with btree_release_folio() which can free a page while we 3480 * are still filling in all pages for the buffer and we could crash. 3481 */ 3482 } 3483 if (uptodate) 3484 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3485 /* All pages are physically contiguous, can skip cross page handling. */ 3486 if (page_contig) 3487 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); 3488 again: 3489 xa_lock_irq(&fs_info->buffer_tree); 3490 existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, 3491 start >> fs_info->nodesize_bits, NULL, eb, 3492 GFP_NOFS); 3493 if (xa_is_err(existing_eb)) { 3494 ret = xa_err(existing_eb); 3495 xa_unlock_irq(&fs_info->buffer_tree); 3496 goto out; 3497 } 3498 if (existing_eb) { 3499 if (!refcount_inc_not_zero(&existing_eb->refs)) { 3500 xa_unlock_irq(&fs_info->buffer_tree); 3501 goto again; 3502 } 3503 xa_unlock_irq(&fs_info->buffer_tree); 3504 goto out; 3505 } 3506 xa_unlock_irq(&fs_info->buffer_tree); 3507 3508 /* add one reference for the tree */ 3509 check_buffer_tree_ref(eb); 3510 3511 /* 3512 * Now it's safe to unlock the pages because any calls to 3513 * btree_release_folio will correctly detect that a page belongs to a 3514 * live buffer and won't free them prematurely. 3515 */ 3516 for (int i = 0; i < num_extent_folios(eb); i++) { 3517 folio_unlock(eb->folios[i]); 3518 /* 3519 * A folio that has been added to an address_space mapping 3520 * should not continue holding the refcount from its original 3521 * allocation indefinitely. 3522 */ 3523 folio_put(eb->folios[i]); 3524 } 3525 return eb; 3526 3527 out: 3528 WARN_ON(!refcount_dec_and_test(&eb->refs)); 3529 3530 /* 3531 * Any attached folios need to be detached before we unlock them. This 3532 * is because when we're inserting our new folios into the mapping, and 3533 * then attaching our eb to that folio. If we fail to insert our folio 3534 * we'll lookup the folio for that index, and grab that EB. We do not 3535 * want that to grab this eb, as we're getting ready to free it. So we 3536 * have to detach it first and then unlock it. 3537 * 3538 * Note: the bounds is num_extent_pages() as we need to go through all slots. 3539 */ 3540 for (int i = 0; i < num_extent_pages(eb); i++) { 3541 struct folio *folio = eb->folios[i]; 3542 3543 if (i < attached) { 3544 ASSERT(folio); 3545 detach_extent_buffer_folio(eb, folio); 3546 folio_unlock(folio); 3547 } else if (!folio) { 3548 continue; 3549 } 3550 3551 folio_put(folio); 3552 eb->folios[i] = NULL; 3553 } 3554 btrfs_release_extent_buffer(eb); 3555 if (ret < 0) 3556 return ERR_PTR(ret); 3557 ASSERT(existing_eb); 3558 return existing_eb; 3559 } 3560 3561 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3562 { 3563 struct extent_buffer *eb = 3564 container_of(head, struct extent_buffer, rcu_head); 3565 3566 kmem_cache_free(extent_buffer_cache, eb); 3567 } 3568 3569 static int release_extent_buffer(struct extent_buffer *eb) 3570 __releases(&eb->refs_lock) 3571 { 3572 lockdep_assert_held(&eb->refs_lock); 3573 3574 if (refcount_dec_and_test(&eb->refs)) { 3575 struct btrfs_fs_info *fs_info = eb->fs_info; 3576 3577 spin_unlock(&eb->refs_lock); 3578 3579 /* 3580 * We're erasing, theoretically there will be no allocations, so 3581 * just use GFP_ATOMIC. 3582 * 3583 * We use cmpxchg instead of erase because we do not know if 3584 * this eb is actually in the tree or not, we could be cleaning 3585 * up an eb that we allocated but never inserted into the tree. 3586 * Thus use cmpxchg to remove it from the tree if it is there, 3587 * or leave the other entry if this isn't in the tree. 3588 * 3589 * The documentation says that putting a NULL value is the same 3590 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't 3591 * in this case. 3592 */ 3593 xa_cmpxchg_irq(&fs_info->buffer_tree, 3594 eb->start >> fs_info->nodesize_bits, eb, NULL, 3595 GFP_ATOMIC); 3596 3597 btrfs_leak_debug_del_eb(eb); 3598 /* Should be safe to release folios at this point. */ 3599 btrfs_release_extent_buffer_folios(eb); 3600 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3601 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 3602 kmem_cache_free(extent_buffer_cache, eb); 3603 return 1; 3604 } 3605 #endif 3606 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3607 return 1; 3608 } 3609 spin_unlock(&eb->refs_lock); 3610 3611 return 0; 3612 } 3613 3614 void free_extent_buffer(struct extent_buffer *eb) 3615 { 3616 int refs; 3617 if (!eb) 3618 return; 3619 3620 refs = refcount_read(&eb->refs); 3621 while (1) { 3622 if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { 3623 if (refs == 1) 3624 break; 3625 } else if (refs <= 3) { 3626 break; 3627 } 3628 3629 /* Optimization to avoid locking eb->refs_lock. */ 3630 if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) 3631 return; 3632 } 3633 3634 spin_lock(&eb->refs_lock); 3635 if (refcount_read(&eb->refs) == 2 && 3636 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 3637 !extent_buffer_under_io(eb) && 3638 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3639 refcount_dec(&eb->refs); 3640 3641 /* 3642 * I know this is terrible, but it's temporary until we stop tracking 3643 * the uptodate bits and such for the extent buffers. 3644 */ 3645 release_extent_buffer(eb); 3646 } 3647 3648 void free_extent_buffer_stale(struct extent_buffer *eb) 3649 { 3650 if (!eb) 3651 return; 3652 3653 spin_lock(&eb->refs_lock); 3654 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 3655 3656 if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 3657 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3658 refcount_dec(&eb->refs); 3659 release_extent_buffer(eb); 3660 } 3661 3662 static void btree_clear_folio_dirty_tag(struct folio *folio) 3663 { 3664 ASSERT(!folio_test_dirty(folio)); 3665 ASSERT(folio_test_locked(folio)); 3666 xa_lock_irq(&folio->mapping->i_pages); 3667 if (!folio_test_dirty(folio)) 3668 __xa_clear_mark(&folio->mapping->i_pages, folio->index, 3669 PAGECACHE_TAG_DIRTY); 3670 xa_unlock_irq(&folio->mapping->i_pages); 3671 } 3672 3673 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, 3674 struct extent_buffer *eb) 3675 { 3676 struct btrfs_fs_info *fs_info = eb->fs_info; 3677 3678 btrfs_assert_tree_write_locked(eb); 3679 3680 if (trans && btrfs_header_generation(eb) != trans->transid) 3681 return; 3682 3683 /* 3684 * Instead of clearing the dirty flag off of the buffer, mark it as 3685 * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve 3686 * write-ordering in zoned mode, without the need to later re-dirty 3687 * the extent_buffer. 3688 * 3689 * The actual zeroout of the buffer will happen later in 3690 * btree_csum_one_bio. 3691 */ 3692 if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3693 set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); 3694 return; 3695 } 3696 3697 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) 3698 return; 3699 3700 buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); 3701 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, 3702 fs_info->dirty_metadata_batch); 3703 3704 for (int i = 0; i < num_extent_folios(eb); i++) { 3705 struct folio *folio = eb->folios[i]; 3706 bool last; 3707 3708 if (!folio_test_dirty(folio)) 3709 continue; 3710 folio_lock(folio); 3711 last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); 3712 if (last) 3713 btree_clear_folio_dirty_tag(folio); 3714 folio_unlock(folio); 3715 } 3716 WARN_ON(refcount_read(&eb->refs) == 0); 3717 } 3718 3719 void set_extent_buffer_dirty(struct extent_buffer *eb) 3720 { 3721 bool was_dirty; 3722 3723 check_buffer_tree_ref(eb); 3724 3725 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3726 3727 WARN_ON(refcount_read(&eb->refs) == 0); 3728 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 3729 WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); 3730 3731 if (!was_dirty) { 3732 bool subpage = btrfs_meta_is_subpage(eb->fs_info); 3733 3734 /* 3735 * For subpage case, we can have other extent buffers in the 3736 * same page, and in clear_extent_buffer_dirty() we 3737 * have to clear page dirty without subpage lock held. 3738 * This can cause race where our page gets dirty cleared after 3739 * we just set it. 3740 * 3741 * Thankfully, clear_extent_buffer_dirty() has locked 3742 * its page for other reasons, we can use page lock to prevent 3743 * the above race. 3744 */ 3745 if (subpage) 3746 folio_lock(eb->folios[0]); 3747 for (int i = 0; i < num_extent_folios(eb); i++) 3748 btrfs_meta_folio_set_dirty(eb->folios[i], eb); 3749 buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); 3750 if (subpage) 3751 folio_unlock(eb->folios[0]); 3752 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, 3753 eb->len, 3754 eb->fs_info->dirty_metadata_batch); 3755 } 3756 #ifdef CONFIG_BTRFS_DEBUG 3757 for (int i = 0; i < num_extent_folios(eb); i++) 3758 ASSERT(folio_test_dirty(eb->folios[i])); 3759 #endif 3760 } 3761 3762 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 3763 { 3764 3765 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3766 for (int i = 0; i < num_extent_folios(eb); i++) { 3767 struct folio *folio = eb->folios[i]; 3768 3769 if (!folio) 3770 continue; 3771 3772 btrfs_meta_folio_clear_uptodate(folio, eb); 3773 } 3774 } 3775 3776 void set_extent_buffer_uptodate(struct extent_buffer *eb) 3777 { 3778 3779 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3780 for (int i = 0; i < num_extent_folios(eb); i++) 3781 btrfs_meta_folio_set_uptodate(eb->folios[i], eb); 3782 } 3783 3784 static void clear_extent_buffer_reading(struct extent_buffer *eb) 3785 { 3786 clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); 3787 } 3788 3789 static void end_bbio_meta_read(struct btrfs_bio *bbio) 3790 { 3791 struct extent_buffer *eb = bbio->private; 3792 bool uptodate = !bbio->bio.bi_status; 3793 3794 /* 3795 * If the extent buffer is marked UPTODATE before the read operation 3796 * completes, other calls to read_extent_buffer_pages() will return 3797 * early without waiting for the read to finish, causing data races. 3798 */ 3799 WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); 3800 3801 eb->read_mirror = bbio->mirror_num; 3802 3803 if (uptodate && 3804 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) 3805 uptodate = false; 3806 3807 if (uptodate) 3808 set_extent_buffer_uptodate(eb); 3809 else 3810 clear_extent_buffer_uptodate(eb); 3811 3812 clear_extent_buffer_reading(eb); 3813 free_extent_buffer(eb); 3814 3815 bio_put(&bbio->bio); 3816 } 3817 3818 int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, 3819 const struct btrfs_tree_parent_check *check) 3820 { 3821 struct btrfs_bio *bbio; 3822 3823 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3824 return 0; 3825 3826 /* 3827 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 3828 * operation, which could potentially still be in flight. In this case 3829 * we simply want to return an error. 3830 */ 3831 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 3832 return -EIO; 3833 3834 /* Someone else is already reading the buffer, just wait for it. */ 3835 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) 3836 return 0; 3837 3838 /* 3839 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above 3840 * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have 3841 * started and finished reading the same eb. In this case, UPTODATE 3842 * will now be set, and we shouldn't read it in again. 3843 */ 3844 if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { 3845 clear_extent_buffer_reading(eb); 3846 return 0; 3847 } 3848 3849 eb->read_mirror = 0; 3850 check_buffer_tree_ref(eb); 3851 refcount_inc(&eb->refs); 3852 3853 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 3854 REQ_OP_READ | REQ_META, eb->fs_info, 3855 end_bbio_meta_read, eb); 3856 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 3857 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 3858 bbio->file_offset = eb->start; 3859 memcpy(&bbio->parent_check, check, sizeof(*check)); 3860 for (int i = 0; i < num_extent_folios(eb); i++) { 3861 struct folio *folio = eb->folios[i]; 3862 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 3863 u32 range_len = min_t(u64, folio_end(folio), 3864 eb->start + eb->len) - range_start; 3865 3866 bio_add_folio_nofail(&bbio->bio, folio, range_len, 3867 offset_in_folio(folio, range_start)); 3868 } 3869 btrfs_submit_bbio(bbio, mirror_num); 3870 return 0; 3871 } 3872 3873 int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, 3874 const struct btrfs_tree_parent_check *check) 3875 { 3876 int ret; 3877 3878 ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); 3879 if (ret < 0) 3880 return ret; 3881 3882 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3883 if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) 3884 return -EIO; 3885 return 0; 3886 } 3887 3888 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 3889 unsigned long len) 3890 { 3891 btrfs_warn(eb->fs_info, 3892 "access to eb bytenr %llu len %u out of range start %lu len %lu", 3893 eb->start, eb->len, start, len); 3894 DEBUG_WARN(); 3895 3896 return true; 3897 } 3898 3899 /* 3900 * Check if the [start, start + len) range is valid before reading/writing 3901 * the eb. 3902 * NOTE: @start and @len are offset inside the eb, not logical address. 3903 * 3904 * Caller should not touch the dst/src memory if this function returns error. 3905 */ 3906 static inline int check_eb_range(const struct extent_buffer *eb, 3907 unsigned long start, unsigned long len) 3908 { 3909 unsigned long offset; 3910 3911 /* start, start + len should not go beyond eb->len nor overflow */ 3912 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 3913 return report_eb_range(eb, start, len); 3914 3915 return false; 3916 } 3917 3918 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 3919 unsigned long start, unsigned long len) 3920 { 3921 const int unit_size = eb->folio_size; 3922 size_t cur; 3923 size_t offset; 3924 char *dst = (char *)dstv; 3925 unsigned long i = get_eb_folio_index(eb, start); 3926 3927 if (check_eb_range(eb, start, len)) { 3928 /* 3929 * Invalid range hit, reset the memory, so callers won't get 3930 * some random garbage for their uninitialized memory. 3931 */ 3932 memset(dstv, 0, len); 3933 return; 3934 } 3935 3936 if (eb->addr) { 3937 memcpy(dstv, eb->addr + start, len); 3938 return; 3939 } 3940 3941 offset = get_eb_offset_in_folio(eb, start); 3942 3943 while (len > 0) { 3944 char *kaddr; 3945 3946 cur = min(len, unit_size - offset); 3947 kaddr = folio_address(eb->folios[i]); 3948 memcpy(dst, kaddr + offset, cur); 3949 3950 dst += cur; 3951 len -= cur; 3952 offset = 0; 3953 i++; 3954 } 3955 } 3956 3957 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 3958 void __user *dstv, 3959 unsigned long start, unsigned long len) 3960 { 3961 const int unit_size = eb->folio_size; 3962 size_t cur; 3963 size_t offset; 3964 char __user *dst = (char __user *)dstv; 3965 unsigned long i = get_eb_folio_index(eb, start); 3966 int ret = 0; 3967 3968 WARN_ON(start > eb->len); 3969 WARN_ON(start + len > eb->start + eb->len); 3970 3971 if (eb->addr) { 3972 if (copy_to_user_nofault(dstv, eb->addr + start, len)) 3973 ret = -EFAULT; 3974 return ret; 3975 } 3976 3977 offset = get_eb_offset_in_folio(eb, start); 3978 3979 while (len > 0) { 3980 char *kaddr; 3981 3982 cur = min(len, unit_size - offset); 3983 kaddr = folio_address(eb->folios[i]); 3984 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 3985 ret = -EFAULT; 3986 break; 3987 } 3988 3989 dst += cur; 3990 len -= cur; 3991 offset = 0; 3992 i++; 3993 } 3994 3995 return ret; 3996 } 3997 3998 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 3999 unsigned long start, unsigned long len) 4000 { 4001 const int unit_size = eb->folio_size; 4002 size_t cur; 4003 size_t offset; 4004 char *kaddr; 4005 char *ptr = (char *)ptrv; 4006 unsigned long i = get_eb_folio_index(eb, start); 4007 int ret = 0; 4008 4009 if (check_eb_range(eb, start, len)) 4010 return -EINVAL; 4011 4012 if (eb->addr) 4013 return memcmp(ptrv, eb->addr + start, len); 4014 4015 offset = get_eb_offset_in_folio(eb, start); 4016 4017 while (len > 0) { 4018 cur = min(len, unit_size - offset); 4019 kaddr = folio_address(eb->folios[i]); 4020 ret = memcmp(ptr, kaddr + offset, cur); 4021 if (ret) 4022 break; 4023 4024 ptr += cur; 4025 len -= cur; 4026 offset = 0; 4027 i++; 4028 } 4029 return ret; 4030 } 4031 4032 /* 4033 * Check that the extent buffer is uptodate. 4034 * 4035 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 4036 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 4037 */ 4038 static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) 4039 { 4040 struct btrfs_fs_info *fs_info = eb->fs_info; 4041 struct folio *folio = eb->folios[i]; 4042 4043 ASSERT(folio); 4044 4045 /* 4046 * If we are using the commit root we could potentially clear a page 4047 * Uptodate while we're using the extent buffer that we've previously 4048 * looked up. We don't want to complain in this case, as the page was 4049 * valid before, we just didn't write it out. Instead we want to catch 4050 * the case where we didn't actually read the block properly, which 4051 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. 4052 */ 4053 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4054 return; 4055 4056 if (btrfs_meta_is_subpage(fs_info)) { 4057 folio = eb->folios[0]; 4058 ASSERT(i == 0); 4059 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, 4060 eb->start, eb->len))) 4061 btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); 4062 } else { 4063 WARN_ON(!folio_test_uptodate(folio)); 4064 } 4065 } 4066 4067 static void __write_extent_buffer(const struct extent_buffer *eb, 4068 const void *srcv, unsigned long start, 4069 unsigned long len, bool use_memmove) 4070 { 4071 const int unit_size = eb->folio_size; 4072 size_t cur; 4073 size_t offset; 4074 char *kaddr; 4075 const char *src = (const char *)srcv; 4076 unsigned long i = get_eb_folio_index(eb, start); 4077 /* For unmapped (dummy) ebs, no need to check their uptodate status. */ 4078 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4079 4080 if (check_eb_range(eb, start, len)) 4081 return; 4082 4083 if (eb->addr) { 4084 if (use_memmove) 4085 memmove(eb->addr + start, srcv, len); 4086 else 4087 memcpy(eb->addr + start, srcv, len); 4088 return; 4089 } 4090 4091 offset = get_eb_offset_in_folio(eb, start); 4092 4093 while (len > 0) { 4094 if (check_uptodate) 4095 assert_eb_folio_uptodate(eb, i); 4096 4097 cur = min(len, unit_size - offset); 4098 kaddr = folio_address(eb->folios[i]); 4099 if (use_memmove) 4100 memmove(kaddr + offset, src, cur); 4101 else 4102 memcpy(kaddr + offset, src, cur); 4103 4104 src += cur; 4105 len -= cur; 4106 offset = 0; 4107 i++; 4108 } 4109 } 4110 4111 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 4112 unsigned long start, unsigned long len) 4113 { 4114 return __write_extent_buffer(eb, srcv, start, len, false); 4115 } 4116 4117 static void memset_extent_buffer(const struct extent_buffer *eb, int c, 4118 unsigned long start, unsigned long len) 4119 { 4120 const int unit_size = eb->folio_size; 4121 unsigned long cur = start; 4122 4123 if (eb->addr) { 4124 memset(eb->addr + start, c, len); 4125 return; 4126 } 4127 4128 while (cur < start + len) { 4129 unsigned long index = get_eb_folio_index(eb, cur); 4130 unsigned int offset = get_eb_offset_in_folio(eb, cur); 4131 unsigned int cur_len = min(start + len - cur, unit_size - offset); 4132 4133 assert_eb_folio_uptodate(eb, index); 4134 memset(folio_address(eb->folios[index]) + offset, c, cur_len); 4135 4136 cur += cur_len; 4137 } 4138 } 4139 4140 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 4141 unsigned long len) 4142 { 4143 if (check_eb_range(eb, start, len)) 4144 return; 4145 return memset_extent_buffer(eb, 0, start, len); 4146 } 4147 4148 void copy_extent_buffer_full(const struct extent_buffer *dst, 4149 const struct extent_buffer *src) 4150 { 4151 const int unit_size = src->folio_size; 4152 unsigned long cur = 0; 4153 4154 ASSERT(dst->len == src->len); 4155 4156 while (cur < src->len) { 4157 unsigned long index = get_eb_folio_index(src, cur); 4158 unsigned long offset = get_eb_offset_in_folio(src, cur); 4159 unsigned long cur_len = min(src->len, unit_size - offset); 4160 void *addr = folio_address(src->folios[index]) + offset; 4161 4162 write_extent_buffer(dst, addr, cur, cur_len); 4163 4164 cur += cur_len; 4165 } 4166 } 4167 4168 void copy_extent_buffer(const struct extent_buffer *dst, 4169 const struct extent_buffer *src, 4170 unsigned long dst_offset, unsigned long src_offset, 4171 unsigned long len) 4172 { 4173 const int unit_size = dst->folio_size; 4174 u64 dst_len = dst->len; 4175 size_t cur; 4176 size_t offset; 4177 char *kaddr; 4178 unsigned long i = get_eb_folio_index(dst, dst_offset); 4179 4180 if (check_eb_range(dst, dst_offset, len) || 4181 check_eb_range(src, src_offset, len)) 4182 return; 4183 4184 WARN_ON(src->len != dst_len); 4185 4186 offset = get_eb_offset_in_folio(dst, dst_offset); 4187 4188 while (len > 0) { 4189 assert_eb_folio_uptodate(dst, i); 4190 4191 cur = min(len, (unsigned long)(unit_size - offset)); 4192 4193 kaddr = folio_address(dst->folios[i]); 4194 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4195 4196 src_offset += cur; 4197 len -= cur; 4198 offset = 0; 4199 i++; 4200 } 4201 } 4202 4203 /* 4204 * Calculate the folio and offset of the byte containing the given bit number. 4205 * 4206 * @eb: the extent buffer 4207 * @start: offset of the bitmap item in the extent buffer 4208 * @nr: bit number 4209 * @folio_index: return index of the folio in the extent buffer that contains 4210 * the given bit number 4211 * @folio_offset: return offset into the folio given by folio_index 4212 * 4213 * This helper hides the ugliness of finding the byte in an extent buffer which 4214 * contains a given bit. 4215 */ 4216 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 4217 unsigned long start, unsigned long nr, 4218 unsigned long *folio_index, 4219 size_t *folio_offset) 4220 { 4221 size_t byte_offset = BIT_BYTE(nr); 4222 size_t offset; 4223 4224 /* 4225 * The byte we want is the offset of the extent buffer + the offset of 4226 * the bitmap item in the extent buffer + the offset of the byte in the 4227 * bitmap item. 4228 */ 4229 offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; 4230 4231 *folio_index = offset >> eb->folio_shift; 4232 *folio_offset = offset_in_eb_folio(eb, offset); 4233 } 4234 4235 /* 4236 * Determine whether a bit in a bitmap item is set. 4237 * 4238 * @eb: the extent buffer 4239 * @start: offset of the bitmap item in the extent buffer 4240 * @nr: bit number to test 4241 */ 4242 bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 4243 unsigned long nr) 4244 { 4245 unsigned long i; 4246 size_t offset; 4247 u8 *kaddr; 4248 4249 eb_bitmap_offset(eb, start, nr, &i, &offset); 4250 assert_eb_folio_uptodate(eb, i); 4251 kaddr = folio_address(eb->folios[i]); 4252 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 4253 } 4254 4255 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) 4256 { 4257 unsigned long index = get_eb_folio_index(eb, bytenr); 4258 4259 if (check_eb_range(eb, bytenr, 1)) 4260 return NULL; 4261 return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); 4262 } 4263 4264 /* 4265 * Set an area of a bitmap to 1. 4266 * 4267 * @eb: the extent buffer 4268 * @start: offset of the bitmap item in the extent buffer 4269 * @pos: bit number of the first bit 4270 * @len: number of bits to set 4271 */ 4272 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 4273 unsigned long pos, unsigned long len) 4274 { 4275 unsigned int first_byte = start + BIT_BYTE(pos); 4276 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4277 const bool same_byte = (first_byte == last_byte); 4278 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4279 u8 *kaddr; 4280 4281 if (same_byte) 4282 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4283 4284 /* Handle the first byte. */ 4285 kaddr = extent_buffer_get_byte(eb, first_byte); 4286 *kaddr |= mask; 4287 if (same_byte) 4288 return; 4289 4290 /* Handle the byte aligned part. */ 4291 ASSERT(first_byte + 1 <= last_byte); 4292 memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); 4293 4294 /* Handle the last byte. */ 4295 kaddr = extent_buffer_get_byte(eb, last_byte); 4296 *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); 4297 } 4298 4299 4300 /* 4301 * Clear an area of a bitmap. 4302 * 4303 * @eb: the extent buffer 4304 * @start: offset of the bitmap item in the extent buffer 4305 * @pos: bit number of the first bit 4306 * @len: number of bits to clear 4307 */ 4308 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 4309 unsigned long start, unsigned long pos, 4310 unsigned long len) 4311 { 4312 unsigned int first_byte = start + BIT_BYTE(pos); 4313 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4314 const bool same_byte = (first_byte == last_byte); 4315 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4316 u8 *kaddr; 4317 4318 if (same_byte) 4319 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4320 4321 /* Handle the first byte. */ 4322 kaddr = extent_buffer_get_byte(eb, first_byte); 4323 *kaddr &= ~mask; 4324 if (same_byte) 4325 return; 4326 4327 /* Handle the byte aligned part. */ 4328 ASSERT(first_byte + 1 <= last_byte); 4329 memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); 4330 4331 /* Handle the last byte. */ 4332 kaddr = extent_buffer_get_byte(eb, last_byte); 4333 *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); 4334 } 4335 4336 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4337 { 4338 unsigned long distance = (src > dst) ? src - dst : dst - src; 4339 return distance < len; 4340 } 4341 4342 void memcpy_extent_buffer(const struct extent_buffer *dst, 4343 unsigned long dst_offset, unsigned long src_offset, 4344 unsigned long len) 4345 { 4346 const int unit_size = dst->folio_size; 4347 unsigned long cur_off = 0; 4348 4349 if (check_eb_range(dst, dst_offset, len) || 4350 check_eb_range(dst, src_offset, len)) 4351 return; 4352 4353 if (dst->addr) { 4354 const bool use_memmove = areas_overlap(src_offset, dst_offset, len); 4355 4356 if (use_memmove) 4357 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4358 else 4359 memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); 4360 return; 4361 } 4362 4363 while (cur_off < len) { 4364 unsigned long cur_src = cur_off + src_offset; 4365 unsigned long folio_index = get_eb_folio_index(dst, cur_src); 4366 unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); 4367 unsigned long cur_len = min(src_offset + len - cur_src, 4368 unit_size - folio_off); 4369 void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; 4370 const bool use_memmove = areas_overlap(src_offset + cur_off, 4371 dst_offset + cur_off, cur_len); 4372 4373 __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, 4374 use_memmove); 4375 cur_off += cur_len; 4376 } 4377 } 4378 4379 void memmove_extent_buffer(const struct extent_buffer *dst, 4380 unsigned long dst_offset, unsigned long src_offset, 4381 unsigned long len) 4382 { 4383 unsigned long dst_end = dst_offset + len - 1; 4384 unsigned long src_end = src_offset + len - 1; 4385 4386 if (check_eb_range(dst, dst_offset, len) || 4387 check_eb_range(dst, src_offset, len)) 4388 return; 4389 4390 if (dst_offset < src_offset) { 4391 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4392 return; 4393 } 4394 4395 if (dst->addr) { 4396 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4397 return; 4398 } 4399 4400 while (len > 0) { 4401 unsigned long src_i; 4402 size_t cur; 4403 size_t dst_off_in_folio; 4404 size_t src_off_in_folio; 4405 void *src_addr; 4406 bool use_memmove; 4407 4408 src_i = get_eb_folio_index(dst, src_end); 4409 4410 dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); 4411 src_off_in_folio = get_eb_offset_in_folio(dst, src_end); 4412 4413 cur = min_t(unsigned long, len, src_off_in_folio + 1); 4414 cur = min(cur, dst_off_in_folio + 1); 4415 4416 src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - 4417 cur + 1; 4418 use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, 4419 cur); 4420 4421 __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, 4422 use_memmove); 4423 4424 dst_end -= cur; 4425 src_end -= cur; 4426 len -= cur; 4427 } 4428 } 4429 4430 static int try_release_subpage_extent_buffer(struct folio *folio) 4431 { 4432 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4433 struct extent_buffer *eb; 4434 unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); 4435 unsigned long index = start; 4436 unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; 4437 int ret; 4438 4439 rcu_read_lock(); 4440 xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4441 /* 4442 * The same as try_release_extent_buffer(), to ensure the eb 4443 * won't disappear out from under us. 4444 */ 4445 spin_lock(&eb->refs_lock); 4446 rcu_read_unlock(); 4447 4448 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4449 spin_unlock(&eb->refs_lock); 4450 rcu_read_lock(); 4451 continue; 4452 } 4453 4454 /* 4455 * If tree ref isn't set then we know the ref on this eb is a 4456 * real ref, so just return, this eb will likely be freed soon 4457 * anyway. 4458 */ 4459 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4460 spin_unlock(&eb->refs_lock); 4461 break; 4462 } 4463 4464 /* 4465 * Here we don't care about the return value, we will always 4466 * check the folio private at the end. And 4467 * release_extent_buffer() will release the refs_lock. 4468 */ 4469 release_extent_buffer(eb); 4470 rcu_read_lock(); 4471 } 4472 rcu_read_unlock(); 4473 4474 /* 4475 * Finally to check if we have cleared folio private, as if we have 4476 * released all ebs in the page, the folio private should be cleared now. 4477 */ 4478 spin_lock(&folio->mapping->i_private_lock); 4479 if (!folio_test_private(folio)) 4480 ret = 1; 4481 else 4482 ret = 0; 4483 spin_unlock(&folio->mapping->i_private_lock); 4484 return ret; 4485 } 4486 4487 int try_release_extent_buffer(struct folio *folio) 4488 { 4489 struct extent_buffer *eb; 4490 4491 if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) 4492 return try_release_subpage_extent_buffer(folio); 4493 4494 /* 4495 * We need to make sure nobody is changing folio private, as we rely on 4496 * folio private as the pointer to extent buffer. 4497 */ 4498 spin_lock(&folio->mapping->i_private_lock); 4499 if (!folio_test_private(folio)) { 4500 spin_unlock(&folio->mapping->i_private_lock); 4501 return 1; 4502 } 4503 4504 eb = folio_get_private(folio); 4505 BUG_ON(!eb); 4506 4507 /* 4508 * This is a little awful but should be ok, we need to make sure that 4509 * the eb doesn't disappear out from under us while we're looking at 4510 * this page. 4511 */ 4512 spin_lock(&eb->refs_lock); 4513 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4514 spin_unlock(&eb->refs_lock); 4515 spin_unlock(&folio->mapping->i_private_lock); 4516 return 0; 4517 } 4518 spin_unlock(&folio->mapping->i_private_lock); 4519 4520 /* 4521 * If tree ref isn't set then we know the ref on this eb is a real ref, 4522 * so just return, this page will likely be freed soon anyway. 4523 */ 4524 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4525 spin_unlock(&eb->refs_lock); 4526 return 0; 4527 } 4528 4529 return release_extent_buffer(eb); 4530 } 4531 4532 /* 4533 * Attempt to readahead a child block. 4534 * 4535 * @fs_info: the fs_info 4536 * @bytenr: bytenr to read 4537 * @owner_root: objectid of the root that owns this eb 4538 * @gen: generation for the uptodate check, can be 0 4539 * @level: level for the eb 4540 * 4541 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 4542 * normal uptodate check of the eb, without checking the generation. If we have 4543 * to read the block we will not block on anything. 4544 */ 4545 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 4546 u64 bytenr, u64 owner_root, u64 gen, int level) 4547 { 4548 struct btrfs_tree_parent_check check = { 4549 .level = level, 4550 .transid = gen 4551 }; 4552 struct extent_buffer *eb; 4553 int ret; 4554 4555 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 4556 if (IS_ERR(eb)) 4557 return; 4558 4559 if (btrfs_buffer_uptodate(eb, gen, true)) { 4560 free_extent_buffer(eb); 4561 return; 4562 } 4563 4564 ret = read_extent_buffer_pages_nowait(eb, 0, &check); 4565 if (ret < 0) 4566 free_extent_buffer_stale(eb); 4567 else 4568 free_extent_buffer(eb); 4569 } 4570 4571 /* 4572 * Readahead a node's child block. 4573 * 4574 * @node: parent node we're reading from 4575 * @slot: slot in the parent node for the child we want to read 4576 * 4577 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 4578 * the slot in the node provided. 4579 */ 4580 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 4581 { 4582 btrfs_readahead_tree_block(node->fs_info, 4583 btrfs_node_blockptr(node, slot), 4584 btrfs_header_owner(node), 4585 btrfs_node_ptr_generation(node, slot), 4586 btrfs_header_level(node) - 1); 4587 } 4588