1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "extent_io.h" 18 #include "extent-io-tree.h" 19 #include "extent_map.h" 20 #include "ctree.h" 21 #include "btrfs_inode.h" 22 #include "bio.h" 23 #include "locking.h" 24 #include "backref.h" 25 #include "disk-io.h" 26 #include "subpage.h" 27 #include "zoned.h" 28 #include "block-group.h" 29 #include "compression.h" 30 #include "fs.h" 31 #include "accessors.h" 32 #include "file-item.h" 33 #include "file.h" 34 #include "dev-replace.h" 35 #include "super.h" 36 #include "transaction.h" 37 38 static struct kmem_cache *extent_buffer_cache; 39 40 #ifdef CONFIG_BTRFS_DEBUG 41 static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) 42 { 43 struct btrfs_fs_info *fs_info = eb->fs_info; 44 unsigned long flags; 45 46 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 47 list_add(&eb->leak_list, &fs_info->allocated_ebs); 48 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 49 } 50 51 static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) 52 { 53 struct btrfs_fs_info *fs_info = eb->fs_info; 54 unsigned long flags; 55 56 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 57 list_del(&eb->leak_list); 58 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 59 } 60 61 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 62 { 63 struct extent_buffer *eb; 64 unsigned long flags; 65 66 /* 67 * If we didn't get into open_ctree our allocated_ebs will not be 68 * initialized, so just skip this. 69 */ 70 if (!fs_info->allocated_ebs.next) 71 return; 72 73 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 74 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 75 while (!list_empty(&fs_info->allocated_ebs)) { 76 eb = list_first_entry(&fs_info->allocated_ebs, 77 struct extent_buffer, leak_list); 78 btrfs_err(fs_info, 79 "buffer leak start %llu len %u refs %d bflags %lu owner %llu", 80 eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, 81 btrfs_header_owner(eb)); 82 list_del(&eb->leak_list); 83 WARN_ON_ONCE(1); 84 kmem_cache_free(extent_buffer_cache, eb); 85 } 86 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 87 } 88 #else 89 #define btrfs_leak_debug_add_eb(eb) do {} while (0) 90 #define btrfs_leak_debug_del_eb(eb) do {} while (0) 91 #endif 92 93 /* 94 * Structure to record info about the bio being assembled, and other info like 95 * how many bytes are there before stripe/ordered extent boundary. 96 */ 97 struct btrfs_bio_ctrl { 98 struct btrfs_bio *bbio; 99 /* Last byte contained in bbio + 1 . */ 100 loff_t next_file_offset; 101 enum btrfs_compression_type compress_type; 102 u32 len_to_oe_boundary; 103 blk_opf_t opf; 104 /* 105 * For data read bios, we attempt to optimize csum lookups if the extent 106 * generation is older than the current one. To make this possible, we 107 * need to track the maximum generation of an extent in a bio_ctrl to 108 * make the decision when submitting the bio. 109 * 110 * The pattern between do_readpage(), submit_one_bio() and 111 * submit_extent_folio() is quite subtle, so tracking this is tricky. 
112 * 113 * As we process extent E, we might submit a bio with existing built up 114 * extents before adding E to a new bio, or we might just add E to the 115 * bio. As a result, E's generation could apply to the current bio or 116 * to the next one, so we need to be careful to update the bio_ctrl's 117 * generation with E's only when we are sure E is added to bio_ctrl->bbio 118 * in submit_extent_folio(). 119 * 120 * See the comment in btrfs_lookup_bio_sums() for more detail on the 121 * need for this optimization. 122 */ 123 u64 generation; 124 btrfs_bio_end_io_t end_io_func; 125 struct writeback_control *wbc; 126 127 /* 128 * The sectors of the page which are going to be submitted by 129 * extent_writepage_io(). 130 * This is to avoid touching ranges covered by compression/inline. 131 */ 132 unsigned long submit_bitmap; 133 struct readahead_control *ractl; 134 135 /* 136 * The start offset of the last used extent map by a read operation. 137 * 138 * This is for proper compressed read merge. 139 * U64_MAX means we are starting the read and have made no progress yet. 140 * 141 * The current btrfs_bio_is_contig() only uses disk_bytenr as 142 * the condition to check if the read can be merged with previous 143 * bio, which is not correct. E.g. two file extents pointing to the 144 * same extent but with different offset. 145 * 146 * So here we need to do extra checks to only merge reads that are 147 * covered by the same extent map. 148 * Just extent_map::start will be enough, as they are unique 149 * inside the same inode. 150 */ 151 u64 last_em_start; 152 }; 153 154 /* 155 * Helper to set the csum search commit root option for a bio_ctrl's bbio 156 * before submitting the bio. 157 * 158 * Only for use by submit_one_bio(). 159 */ 160 static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) 161 { 162 struct btrfs_bio *bbio = bio_ctrl->bbio; 163 164 ASSERT(bbio); 165 166 if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) 167 return; 168 169 bio_ctrl->bbio->csum_search_commit_root = 170 (bio_ctrl->generation && 171 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); 172 } 173 174 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 175 { 176 struct btrfs_bio *bbio = bio_ctrl->bbio; 177 178 if (!bbio) 179 return; 180 181 /* Caller should ensure the bio has at least some range added */ 182 ASSERT(bbio->bio.bi_iter.bi_size); 183 184 bio_set_csum_search_commit_root(bio_ctrl); 185 186 if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && 187 bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 188 btrfs_submit_compressed_read(bbio); 189 else 190 btrfs_submit_bbio(bbio, 0); 191 192 /* The bbio is owned by the end_io handler now */ 193 bio_ctrl->bbio = NULL; 194 /* 195 * We used the generation to decide whether to lookup csums in the 196 * commit_root or not when we called bio_set_csum_search_commit_root() 197 * above. Now, reset the generation for the next bio. 198 */ 199 bio_ctrl->generation = 0; 200 } 201 202 /* 203 * Submit or fail the current bio in the bio_ctrl structure. 
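 * If @ret is non-zero it must be a negative errno; in that case the bio is
 * failed with that error instead of being submitted.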
204 */ 205 static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) 206 { 207 struct btrfs_bio *bbio = bio_ctrl->bbio; 208 209 if (!bbio) 210 return; 211 212 if (ret) { 213 ASSERT(ret < 0); 214 btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 215 /* The bio is owned by the end_io handler now */ 216 bio_ctrl->bbio = NULL; 217 } else { 218 submit_one_bio(bio_ctrl); 219 } 220 } 221 222 int __init extent_buffer_init_cachep(void) 223 { 224 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 225 sizeof(struct extent_buffer), 0, 0, 226 NULL); 227 if (!extent_buffer_cache) 228 return -ENOMEM; 229 230 return 0; 231 } 232 233 void __cold extent_buffer_free_cachep(void) 234 { 235 /* 236 * Make sure all delayed rcu free are flushed before we 237 * destroy caches. 238 */ 239 rcu_barrier(); 240 kmem_cache_destroy(extent_buffer_cache); 241 } 242 243 static void process_one_folio(struct btrfs_fs_info *fs_info, 244 struct folio *folio, const struct folio *locked_folio, 245 unsigned long page_ops, u64 start, u64 end) 246 { 247 u32 len; 248 249 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 250 len = end + 1 - start; 251 252 if (page_ops & PAGE_SET_ORDERED) 253 btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); 254 if (page_ops & PAGE_START_WRITEBACK) { 255 btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); 256 btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); 257 } 258 if (page_ops & PAGE_END_WRITEBACK) 259 btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); 260 261 if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) 262 btrfs_folio_end_lock(fs_info, folio, start, len); 263 } 264 265 static void __process_folios_contig(struct address_space *mapping, 266 const struct folio *locked_folio, u64 start, 267 u64 end, unsigned long page_ops) 268 { 269 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 270 pgoff_t index = start >> PAGE_SHIFT; 271 pgoff_t end_index = end >> PAGE_SHIFT; 272 struct folio_batch fbatch; 273 int i; 274 275 folio_batch_init(&fbatch); 276 while (index <= end_index) { 277 int found_folios; 278 279 found_folios = filemap_get_folios_contig(mapping, &index, 280 end_index, &fbatch); 281 for (i = 0; i < found_folios; i++) { 282 struct folio *folio = fbatch.folios[i]; 283 284 process_one_folio(fs_info, folio, locked_folio, 285 page_ops, start, end); 286 } 287 folio_batch_release(&fbatch); 288 cond_resched(); 289 } 290 } 291 292 static noinline void unlock_delalloc_folio(const struct inode *inode, 293 struct folio *locked_folio, 294 u64 start, u64 end) 295 { 296 ASSERT(locked_folio); 297 298 __process_folios_contig(inode->i_mapping, locked_folio, start, end, 299 PAGE_UNLOCK); 300 } 301 302 static noinline int lock_delalloc_folios(struct inode *inode, 303 struct folio *locked_folio, 304 u64 start, u64 end) 305 { 306 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 307 struct address_space *mapping = inode->i_mapping; 308 pgoff_t index = start >> PAGE_SHIFT; 309 pgoff_t end_index = end >> PAGE_SHIFT; 310 u64 processed_end = start; 311 struct folio_batch fbatch; 312 313 folio_batch_init(&fbatch); 314 while (index <= end_index) { 315 unsigned int found_folios, i; 316 317 found_folios = filemap_get_folios_contig(mapping, &index, 318 end_index, &fbatch); 319 if (found_folios == 0) 320 goto out; 321 322 for (i = 0; i < found_folios; i++) { 323 struct folio *folio = fbatch.folios[i]; 324 u64 range_start; 325 u32 range_len; 326 327 if (folio == locked_folio) 328 continue; 329 330 folio_lock(folio); 331 if 
(!folio_test_dirty(folio) || folio->mapping != mapping) { 332 folio_unlock(folio); 333 goto out; 334 } 335 range_start = max_t(u64, folio_pos(folio), start); 336 range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start; 337 btrfs_folio_set_lock(fs_info, folio, range_start, range_len); 338 339 processed_end = range_start + range_len - 1; 340 } 341 folio_batch_release(&fbatch); 342 cond_resched(); 343 } 344 345 return 0; 346 out: 347 folio_batch_release(&fbatch); 348 if (processed_end > start) 349 unlock_delalloc_folio(inode, locked_folio, start, processed_end); 350 return -EAGAIN; 351 } 352 353 /* 354 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 355 * more than @max_bytes. 356 * 357 * @start: The original start bytenr to search. 358 * Will store the extent range start bytenr. 359 * @end: The original end bytenr of the search range 360 * Will store the extent range end bytenr. 361 * 362 * Return true if we find a delalloc range which starts inside the original 363 * range, and @start/@end will store the delalloc range start/end. 364 * 365 * Return false if we can't find any delalloc range which starts inside the 366 * original range, and @start/@end will be the non-delalloc range start/end. 367 */ 368 EXPORT_FOR_TESTS 369 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 370 struct folio *locked_folio, 371 u64 *start, u64 *end) 372 { 373 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 375 const u64 orig_start = *start; 376 const u64 orig_end = *end; 377 /* The sanity tests may not set a valid fs_info. */ 378 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; 379 u64 delalloc_start; 380 u64 delalloc_end; 381 bool found; 382 struct extent_state *cached_state = NULL; 383 int ret; 384 int loops = 0; 385 386 /* Caller should pass a valid @end to indicate the search range end */ 387 ASSERT(orig_end > orig_start); 388 389 /* The range should at least cover part of the folio */ 390 ASSERT(!(orig_start >= folio_next_pos(locked_folio) || 391 orig_end <= folio_pos(locked_folio))); 392 again: 393 /* step one, find a bunch of delalloc bytes starting at start */ 394 delalloc_start = *start; 395 delalloc_end = 0; 396 397 /* 398 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can 399 * return early without handling any dirty ranges. 400 */ 401 ASSERT(max_bytes >= fs_info->sectorsize); 402 403 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 404 max_bytes, &cached_state); 405 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 406 *start = delalloc_start; 407 408 /* @delalloc_end can be -1, never go beyond @orig_end */ 409 *end = min(delalloc_end, orig_end); 410 btrfs_free_extent_state(cached_state); 411 return false; 412 } 413 414 /* 415 * start comes from the offset of locked_folio. 
We have to lock
 * folios in order, so we can't process delalloc bytes before
 * locked_folio
 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of folios we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the folios after the folio that contains start */
	ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
				   delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/*
		 * Some of the folios are gone, let's avoid looping by
		 * shortening the size of the delalloc range we're searching.
		 */
		btrfs_free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = fs_info->sectorsize;
			loops = 1;
			goto again;
		} else {
			found = false;
			goto out_failed;
		}
	}

	/* step three, lock the state bits for the whole range */
	btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
				   EXTENT_DELALLOC, cached_state);

	btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
	if (!ret) {
		unlock_delalloc_folio(inode, locked_folio, delalloc_start,
				      delalloc_end);
		cond_resched();
		goto again;
	}
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  const struct folio *locked_folio,
				  struct extent_state **cached,
				  u32 clear_bits, unsigned long page_ops)
{
	btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);

	__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
				end, page_ops);
}

static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	if (!fsverity_active(folio->mapping->host) ||
	    btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
	    start >= i_size_read(folio->mapping->host))
		return true;
	return fsverity_verify_folio(folio);
}

static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	ASSERT(folio_pos(folio) <= start &&
	       start + len <= folio_next_pos(folio));

	if (uptodate && btrfs_verify_folio(folio, start, len))
		btrfs_folio_set_uptodate(fs_info, folio, start, len);
	else
		btrfs_folio_clear_uptodate(fs_info, folio, start, len);

	if (!btrfs_is_subpage(fs_info, folio))
		folio_unlock(folio);
	else
		btrfs_folio_end_lock(fs_info, folio, start, len);
}

/*
 * After a write IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - clear the writeback bits in the extent tree for the range
 * - folio_end_writeback() if there is no more pending io for the folio
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
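 *
 * Errors are reported to the address space via mapping_set_error() and the
 * ordered extent covering each range is finished accordingly.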
518 */ 519 static void end_bbio_data_write(struct btrfs_bio *bbio) 520 { 521 struct btrfs_fs_info *fs_info = bbio->fs_info; 522 struct bio *bio = &bbio->bio; 523 int error = blk_status_to_errno(bio->bi_status); 524 struct folio_iter fi; 525 const u32 sectorsize = fs_info->sectorsize; 526 527 ASSERT(!bio_flagged(bio, BIO_CLONED)); 528 bio_for_each_folio_all(fi, bio) { 529 struct folio *folio = fi.folio; 530 u64 start = folio_pos(folio) + fi.offset; 531 u32 len = fi.length; 532 533 /* Our read/write should always be sector aligned. */ 534 if (!IS_ALIGNED(fi.offset, sectorsize)) 535 btrfs_err(fs_info, 536 "partial page write in btrfs with offset %zu and length %zu", 537 fi.offset, fi.length); 538 else if (!IS_ALIGNED(fi.length, sectorsize)) 539 btrfs_info(fs_info, 540 "incomplete page write with offset %zu and length %zu", 541 fi.offset, fi.length); 542 543 btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, 544 !error); 545 if (error) 546 mapping_set_error(folio->mapping, error); 547 btrfs_folio_clear_writeback(fs_info, folio, start, len); 548 } 549 550 bio_put(bio); 551 } 552 553 static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) 554 { 555 ASSERT(folio_test_locked(folio)); 556 if (!btrfs_is_subpage(fs_info, folio)) 557 return; 558 559 ASSERT(folio_test_private(folio)); 560 btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); 561 } 562 563 /* 564 * After a data read IO is done, we need to: 565 * 566 * - clear the uptodate bits on error 567 * - set the uptodate bits if things worked 568 * - set the folio up to date if all extents in the tree are uptodate 569 * - clear the lock bit in the extent tree 570 * - unlock the folio if there are no other extents locked for it 571 * 572 * Scheduling is not allowed, so the extent state tree is expected 573 * to have one and only one object corresponding to this IO. 574 */ 575 static void end_bbio_data_read(struct btrfs_bio *bbio) 576 { 577 struct btrfs_fs_info *fs_info = bbio->fs_info; 578 struct bio *bio = &bbio->bio; 579 struct folio_iter fi; 580 581 ASSERT(!bio_flagged(bio, BIO_CLONED)); 582 bio_for_each_folio_all(fi, &bbio->bio) { 583 bool uptodate = !bio->bi_status; 584 struct folio *folio = fi.folio; 585 struct inode *inode = folio->mapping->host; 586 u64 start = folio_pos(folio) + fi.offset; 587 588 btrfs_debug(fs_info, 589 "%s: bi_sector=%llu, err=%d, mirror=%u", 590 __func__, bio->bi_iter.bi_sector, bio->bi_status, 591 bbio->mirror_num); 592 593 594 if (likely(uptodate)) { 595 u64 end = start + fi.length - 1; 596 loff_t i_size = i_size_read(inode); 597 598 /* 599 * Zero out the remaining part if this range straddles 600 * i_size. 601 * 602 * Here we should only zero the range inside the folio, 603 * not touch anything else. 604 * 605 * NOTE: i_size is exclusive while end is inclusive and 606 * folio_contains() takes PAGE_SIZE units. 607 */ 608 if (folio_contains(folio, i_size >> PAGE_SHIFT) && 609 i_size <= end) { 610 u32 zero_start = max(offset_in_folio(folio, i_size), 611 offset_in_folio(folio, start)); 612 u32 zero_len = offset_in_folio(folio, end) + 1 - 613 zero_start; 614 615 folio_zero_range(folio, zero_start, zero_len); 616 } 617 } 618 619 /* Update page status and unlock. */ 620 end_folio_read(folio, uptodate, start, fi.length); 621 } 622 bio_put(bio); 623 } 624 625 /* 626 * Populate every free slot in a provided array with folios using GFP_NOFS. 
627 * 628 * @nr_folios: number of folios to allocate 629 * @order: the order of the folios to be allocated 630 * @folio_array: the array to fill with folios; any existing non-NULL entries in 631 * the array will be skipped 632 * 633 * Return: 0 if all folios were able to be allocated; 634 * -ENOMEM otherwise, the partially allocated folios would be freed and 635 * the array slots zeroed 636 */ 637 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, 638 struct folio **folio_array) 639 { 640 for (int i = 0; i < nr_folios; i++) { 641 if (folio_array[i]) 642 continue; 643 folio_array[i] = folio_alloc(GFP_NOFS, order); 644 if (!folio_array[i]) 645 goto error; 646 } 647 return 0; 648 error: 649 for (int i = 0; i < nr_folios; i++) { 650 if (folio_array[i]) 651 folio_put(folio_array[i]); 652 folio_array[i] = NULL; 653 } 654 return -ENOMEM; 655 } 656 657 /* 658 * Populate every free slot in a provided array with pages, using GFP_NOFS. 659 * 660 * @nr_pages: number of pages to allocate 661 * @page_array: the array to fill with pages; any existing non-null entries in 662 * the array will be skipped 663 * @nofail: whether using __GFP_NOFAIL flag 664 * 665 * Return: 0 if all pages were able to be allocated; 666 * -ENOMEM otherwise, the partially allocated pages would be freed and 667 * the array slots zeroed 668 */ 669 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, 670 bool nofail) 671 { 672 const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS; 673 unsigned int allocated; 674 675 for (allocated = 0; allocated < nr_pages;) { 676 unsigned int last = allocated; 677 678 allocated = alloc_pages_bulk(gfp, nr_pages, page_array); 679 if (unlikely(allocated == last)) { 680 /* No progress, fail and do cleanup. */ 681 for (int i = 0; i < allocated; i++) { 682 __free_page(page_array[i]); 683 page_array[i] = NULL; 684 } 685 return -ENOMEM; 686 } 687 } 688 return 0; 689 } 690 691 /* 692 * Populate needed folios for the extent buffer. 693 * 694 * For now, the folios populated are always in order 0 (aka, single page). 695 */ 696 static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) 697 { 698 struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; 699 int num_pages = num_extent_pages(eb); 700 int ret; 701 702 ret = btrfs_alloc_page_array(num_pages, page_array, nofail); 703 if (ret < 0) 704 return ret; 705 706 for (int i = 0; i < num_pages; i++) 707 eb->folios[i] = page_folio(page_array[i]); 708 eb->folio_size = PAGE_SIZE; 709 eb->folio_shift = PAGE_SHIFT; 710 return 0; 711 } 712 713 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, 714 u64 disk_bytenr, loff_t file_offset) 715 { 716 struct bio *bio = &bio_ctrl->bbio->bio; 717 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 718 719 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 720 /* 721 * For compression, all IO should have its logical bytenr set 722 * to the starting bytenr of the compressed extent. 723 */ 724 return bio->bi_iter.bi_sector == sector; 725 } 726 727 /* 728 * To merge into a bio both the disk sector and the logical offset in 729 * the file need to be contiguous. 
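 * bio_ctrl->next_file_offset tracks the byte right after the last range added
 * to the bio, and bio_end_sector() gives the on-disk end of the current bio.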
 */
	return bio_ctrl->next_file_offset == file_offset &&
		bio_end_sector(bio) == sector;
}

static void alloc_new_bio(struct btrfs_inode *inode,
			  struct btrfs_bio_ctrl *bio_ctrl,
			  u64 disk_bytenr, u64 file_offset)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
			       bio_ctrl->end_io_func, NULL);
	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
	bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
	bbio->inode = inode;
	bbio->file_offset = file_offset;
	bio_ctrl->bbio = bbio;
	bio_ctrl->len_to_oe_boundary = U32_MAX;
	bio_ctrl->next_file_offset = file_offset;

	/* Limit data write bios to the ordered boundary. */
	if (bio_ctrl->wbc) {
		struct btrfs_ordered_extent *ordered;

		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
		if (ordered) {
			bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
					ordered->file_offset +
					ordered->disk_num_bytes - file_offset);
			bbio->ordered = ordered;
		}

		/*
		 * Pick the last added device to support cgroup writeback. For
		 * multi-device file systems this means blk-cgroup policies have
		 * to always be set on the last added/replaced device.
		 * This is a bit odd but has been like that for a long time.
		 */
		bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
		wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
	}
}

/*
 * @disk_bytenr: logical bytenr where the write will be
 * @folio: folio to add to the bio
 * @size: portion of the folio that we want to write to
 * @pg_offset: offset within the folio where the range starts
 * @read_em_generation: generation of the extent_map we are submitting
 *                      (only used for read)
 *
 * This will either add the folio into the existing @bio_ctrl->bbio, or allocate
 * a new one in @bio_ctrl->bbio.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
				u64 disk_bytenr, struct folio *folio,
				size_t size, unsigned long pg_offset,
				u64 read_em_generation)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	loff_t file_offset = folio_pos(folio) + pg_offset;

	ASSERT(pg_offset + size <= folio_size(folio));
	ASSERT(bio_ctrl->end_io_func);

	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio)
			alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}
		/*
		 * Now that the folio is definitely added to the bio, include its
		 * generation in the max generation calculation.
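		 * This keeps bio_ctrl->generation in sync with the bbio the range
		 * was added to, as described in the comment for struct btrfs_bio_ctrl.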
826 */ 827 bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation); 828 bio_ctrl->next_file_offset += len; 829 830 if (bio_ctrl->wbc) 831 wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); 832 833 size -= len; 834 pg_offset += len; 835 disk_bytenr += len; 836 file_offset += len; 837 838 /* 839 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or 840 * sector aligned. alloc_new_bio() then sets it to the end of 841 * our ordered extent for writes into zoned devices. 842 * 843 * When len_to_oe_boundary is tracking an ordered extent, we 844 * trust the ordered extent code to align things properly, and 845 * the check above to cap our write to the ordered extent 846 * boundary is correct. 847 * 848 * When len_to_oe_boundary is U32_MAX, the cap above would 849 * result in a 4095 byte IO for the last folio right before 850 * we hit the bio limit of UINT_MAX. bio_add_folio() has all 851 * the checks required to make sure we don't overflow the bio, 852 * and we should just ignore len_to_oe_boundary completely 853 * unless we're using it to track an ordered extent. 854 * 855 * It's pretty hard to make a bio sized U32_MAX, but it can 856 * happen when the page cache is able to feed us contiguous 857 * folios for large extents. 858 */ 859 if (bio_ctrl->len_to_oe_boundary != U32_MAX) 860 bio_ctrl->len_to_oe_boundary -= len; 861 862 /* Ordered extent boundary: move on to a new bio. */ 863 if (bio_ctrl->len_to_oe_boundary == 0) 864 submit_one_bio(bio_ctrl); 865 } while (size); 866 } 867 868 static int attach_extent_buffer_folio(struct extent_buffer *eb, 869 struct folio *folio, 870 struct btrfs_folio_state *prealloc) 871 { 872 struct btrfs_fs_info *fs_info = eb->fs_info; 873 int ret = 0; 874 875 /* 876 * If the page is mapped to btree inode, we should hold the private 877 * lock to prevent race. 878 * For cloned or dummy extent buffers, their pages are not mapped and 879 * will not race with any other ebs. 
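 * Such folios have a NULL mapping, which is why the lockdep assertion below
 * is done only when folio->mapping is set.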
880 */ 881 if (folio->mapping) 882 lockdep_assert_held(&folio->mapping->i_private_lock); 883 884 if (!btrfs_meta_is_subpage(fs_info)) { 885 if (!folio_test_private(folio)) 886 folio_attach_private(folio, eb); 887 else 888 WARN_ON(folio_get_private(folio) != eb); 889 return 0; 890 } 891 892 /* Already mapped, just free prealloc */ 893 if (folio_test_private(folio)) { 894 btrfs_free_folio_state(prealloc); 895 return 0; 896 } 897 898 if (prealloc) 899 /* Has preallocated memory for subpage */ 900 folio_attach_private(folio, prealloc); 901 else 902 /* Do new allocation to attach subpage */ 903 ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 904 return ret; 905 } 906 907 int set_folio_extent_mapped(struct folio *folio) 908 { 909 struct btrfs_fs_info *fs_info; 910 911 ASSERT(folio->mapping); 912 913 if (folio_test_private(folio)) 914 return 0; 915 916 fs_info = folio_to_fs_info(folio); 917 918 if (btrfs_is_subpage(fs_info, folio)) 919 return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 920 921 folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); 922 return 0; 923 } 924 925 void clear_folio_extent_mapped(struct folio *folio) 926 { 927 struct btrfs_fs_info *fs_info; 928 929 ASSERT(folio->mapping); 930 931 if (!folio_test_private(folio)) 932 return; 933 934 fs_info = folio_to_fs_info(folio); 935 if (btrfs_is_subpage(fs_info, folio)) 936 return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); 937 938 folio_detach_private(folio); 939 } 940 941 static struct extent_map *get_extent_map(struct btrfs_inode *inode, 942 struct folio *folio, u64 start, 943 u64 len, struct extent_map **em_cached) 944 { 945 struct extent_map *em; 946 947 ASSERT(em_cached); 948 949 if (*em_cached) { 950 em = *em_cached; 951 if (btrfs_extent_map_in_tree(em) && start >= em->start && 952 start < btrfs_extent_map_end(em)) { 953 refcount_inc(&em->refs); 954 return em; 955 } 956 957 btrfs_free_extent_map(em); 958 *em_cached = NULL; 959 } 960 961 em = btrfs_get_extent(inode, folio, start, len); 962 if (!IS_ERR(em)) { 963 BUG_ON(*em_cached); 964 refcount_inc(&em->refs); 965 *em_cached = em; 966 } 967 968 return em; 969 } 970 971 static void btrfs_readahead_expand(struct readahead_control *ractl, 972 const struct extent_map *em) 973 { 974 const u64 ra_pos = readahead_pos(ractl); 975 const u64 ra_end = ra_pos + readahead_length(ractl); 976 const u64 em_end = em->start + em->len; 977 978 /* No expansion for holes and inline extents. */ 979 if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) 980 return; 981 982 ASSERT(em_end >= ra_pos, 983 "extent_map %llu %llu ends before current readahead position %llu", 984 em->start, em->len, ra_pos); 985 if (em_end > ra_end) 986 readahead_expand(ractl, ra_pos, em_end - ra_pos); 987 } 988 989 /* 990 * basic readpage implementation. 
Locked extent state structs are inserted 991 * into the tree that are removed when the IO is done (by the end_io 992 * handlers) 993 * XXX JDM: This needs looking at to ensure proper page locking 994 * return 0 on success, otherwise return error 995 */ 996 static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, 997 struct btrfs_bio_ctrl *bio_ctrl) 998 { 999 struct inode *inode = folio->mapping->host; 1000 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 1001 u64 start = folio_pos(folio); 1002 const u64 end = start + folio_size(folio) - 1; 1003 u64 extent_offset; 1004 u64 last_byte = i_size_read(inode); 1005 struct extent_map *em; 1006 int ret = 0; 1007 const size_t blocksize = fs_info->sectorsize; 1008 1009 ret = set_folio_extent_mapped(folio); 1010 if (ret < 0) { 1011 folio_unlock(folio); 1012 return ret; 1013 } 1014 1015 if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { 1016 size_t zero_offset = offset_in_folio(folio, last_byte); 1017 1018 if (zero_offset) 1019 folio_zero_range(folio, zero_offset, 1020 folio_size(folio) - zero_offset); 1021 } 1022 bio_ctrl->end_io_func = end_bbio_data_read; 1023 begin_folio_read(fs_info, folio); 1024 for (u64 cur = start; cur <= end; cur += blocksize) { 1025 enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; 1026 unsigned long pg_offset = offset_in_folio(folio, cur); 1027 bool force_bio_submit = false; 1028 u64 disk_bytenr; 1029 u64 block_start; 1030 u64 em_gen; 1031 1032 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 1033 if (cur >= last_byte) { 1034 folio_zero_range(folio, pg_offset, end - cur + 1); 1035 end_folio_read(folio, true, cur, end - cur + 1); 1036 break; 1037 } 1038 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1039 end_folio_read(folio, true, cur, blocksize); 1040 continue; 1041 } 1042 em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); 1043 if (IS_ERR(em)) { 1044 end_folio_read(folio, false, cur, end + 1 - cur); 1045 return PTR_ERR(em); 1046 } 1047 extent_offset = cur - em->start; 1048 BUG_ON(btrfs_extent_map_end(em) <= cur); 1049 BUG_ON(end < cur); 1050 1051 compress_type = btrfs_extent_map_compression(em); 1052 1053 /* 1054 * Only expand readahead for extents which are already creating 1055 * the pages anyway in add_ra_bio_pages, which is compressed 1056 * extents in the non subpage case. 1057 */ 1058 if (bio_ctrl->ractl && 1059 !btrfs_is_subpage(fs_info, folio) && 1060 compress_type != BTRFS_COMPRESS_NONE) 1061 btrfs_readahead_expand(bio_ctrl->ractl, em); 1062 1063 if (compress_type != BTRFS_COMPRESS_NONE) 1064 disk_bytenr = em->disk_bytenr; 1065 else 1066 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1067 1068 if (em->flags & EXTENT_FLAG_PREALLOC) 1069 block_start = EXTENT_MAP_HOLE; 1070 else 1071 block_start = btrfs_extent_map_block_start(em); 1072 1073 /* 1074 * If we have a file range that points to a compressed extent 1075 * and it's followed by a consecutive file range that points 1076 * to the same compressed extent (possibly with a different 1077 * offset and/or length, so it either points to the whole extent 1078 * or only part of it), we must make sure we do not submit a 1079 * single bio to populate the folios for the 2 ranges because 1080 * this makes the compressed extent read zero out the folios 1081 * belonging to the 2nd range. 
Imagine the following scenario: 1082 * 1083 * File layout 1084 * [0 - 8K] [8K - 24K] 1085 * | | 1086 * | | 1087 * points to extent X, points to extent X, 1088 * offset 4K, length of 8K offset 0, length 16K 1089 * 1090 * [extent X, compressed length = 4K uncompressed length = 16K] 1091 * 1092 * If the bio to read the compressed extent covers both ranges, 1093 * it will decompress extent X into the folios belonging to the 1094 * first range and then it will stop, zeroing out the remaining 1095 * folios that belong to the other range that points to extent X. 1096 * So here we make sure we submit 2 bios, one for the first 1097 * range and another one for the third range. Both will target 1098 * the same physical extent from disk, but we can't currently 1099 * make the compressed bio endio callback populate the folios 1100 * for both ranges because each compressed bio is tightly 1101 * coupled with a single extent map, and each range can have 1102 * an extent map with a different offset value relative to the 1103 * uncompressed data of our extent and different lengths. This 1104 * is a corner case so we prioritize correctness over 1105 * non-optimal behavior (submitting 2 bios for the same extent). 1106 */ 1107 if (compress_type != BTRFS_COMPRESS_NONE && 1108 bio_ctrl->last_em_start != U64_MAX && 1109 bio_ctrl->last_em_start != em->start) 1110 force_bio_submit = true; 1111 1112 bio_ctrl->last_em_start = em->start; 1113 1114 em_gen = em->generation; 1115 btrfs_free_extent_map(em); 1116 em = NULL; 1117 1118 /* we've found a hole, just zero and go on */ 1119 if (block_start == EXTENT_MAP_HOLE) { 1120 folio_zero_range(folio, pg_offset, blocksize); 1121 end_folio_read(folio, true, cur, blocksize); 1122 continue; 1123 } 1124 /* the get_extent function already copied into the folio */ 1125 if (block_start == EXTENT_MAP_INLINE) { 1126 end_folio_read(folio, true, cur, blocksize); 1127 continue; 1128 } 1129 1130 if (bio_ctrl->compress_type != compress_type) { 1131 submit_one_bio(bio_ctrl); 1132 bio_ctrl->compress_type = compress_type; 1133 } 1134 1135 if (force_bio_submit) 1136 submit_one_bio(bio_ctrl); 1137 submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, 1138 pg_offset, em_gen); 1139 } 1140 return 0; 1141 } 1142 1143 /* 1144 * Check if we can skip waiting the @ordered extent covering the block at @fileoff. 1145 * 1146 * @fileoff: Both input and output. 1147 * Input as the file offset where the check should start at. 1148 * Output as where the next check should start at, 1149 * if the function returns true. 1150 * 1151 * Return true if we can skip to @fileoff. The caller needs to check the new 1152 * @fileoff value to make sure it covers the full range, before skipping the 1153 * full OE. 1154 * 1155 * Return false if we must wait for the ordered extent. 1156 */ 1157 static bool can_skip_one_ordered_range(struct btrfs_inode *inode, 1158 struct btrfs_ordered_extent *ordered, 1159 u64 *fileoff) 1160 { 1161 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 1162 struct folio *folio; 1163 const u32 blocksize = fs_info->sectorsize; 1164 u64 cur = *fileoff; 1165 bool ret; 1166 1167 folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); 1168 1169 /* 1170 * We should have locked the folio(s) for range [start, end], thus 1171 * there must be a folio and it must be locked. 
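 * filemap_get_folio() returns an error only when no folio is present, so an
 * error here would mean that expectation was violated.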
 */
	ASSERT(!IS_ERR(folio));
	ASSERT(folio_test_locked(folio));

	/*
	 * There are several cases for the folio and OE combination:
	 *
	 * 1) Folio has no private flag
	 *    The OE has all its IO done but not yet finished, and folio got
	 *    invalidated.
	 *
	 *    Here we have to wait for the OE to finish, as it may contain the
	 *    to-be-inserted data checksum.
	 *    Without the data checksum inserted into the csum tree, read will
	 *    just fail with missing csum.
	 */
	if (!folio_test_private(folio)) {
		ret = false;
		goto out;
	}

	/*
	 * 2) The first block is DIRTY.
	 *
	 *    This means the OE is created by some other folios whose file pos is
	 *    before this one. And since we are holding the folio lock, the
	 *    writeback of this folio cannot start.
	 *
	 *    We must skip the whole OE, because it will never start until we
	 *    finish our folio read and unlock the folio.
	 */
	if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
		u64 range_len = umin(folio_next_pos(folio),
				     ordered->file_offset + ordered->num_bytes) - cur;

		ret = true;
		/*
		 * At least inside the folio, all the remaining blocks should
		 * also be dirty.
		 */
		ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
		*fileoff = ordered->file_offset + ordered->num_bytes;
		goto out;
	}

	/*
	 * 3) The first block is uptodate.
	 *
	 *    At least the first block can be skipped, but we are still not fully
	 *    sure. E.g. if the OE has some other folios in the range that cannot
	 *    be skipped.
	 *    So we return true and update @fileoff to the OE/folio boundary.
	 */
	if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
		u64 range_len = umin(folio_next_pos(folio),
				     ordered->file_offset + ordered->num_bytes) - cur;

		/*
		 * The whole range to the OE end or folio boundary should also
		 * be uptodate.
		 */
		ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
		ret = true;
		*fileoff = cur + range_len;
		goto out;
	}

	/*
	 * 4) The first block is not uptodate.
	 *
	 *    This means the folio is invalidated after the writeback was finished,
	 *    but by some other operations (e.g. block aligned buffered write) the
	 *    folio is inserted into the filemap.
	 *    Very much the same as case 1).
	 */
	ret = false;
out:
	folio_put(folio);
	return ret;
}

static bool can_skip_ordered_extent(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent *ordered,
				    u64 start, u64 end)
{
	const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
	u64 cur = max(start, ordered->file_offset);

	while (cur < range_end) {
		bool can_skip;

		can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
		if (!can_skip)
			return false;
	}
	return true;
}

/*
 * Locking helper to make sure we get a stable view of extent maps for the
 * involved range.
 *
 * This is for folio read paths (read and readahead), thus the involved range
 * should have all the folios locked.
 */
static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	u64 cur_pos;

	/* Caller must provide a valid @cached_state.
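	 * The cached extent state lets the later unlock by the caller avoid
	 * searching the io tree again.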
*/ 1283 ASSERT(cached_state); 1284 1285 /* The range must at least be page aligned, as all read paths are folio based. */ 1286 ASSERT(IS_ALIGNED(start, PAGE_SIZE)); 1287 ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); 1288 1289 again: 1290 btrfs_lock_extent(&inode->io_tree, start, end, cached_state); 1291 cur_pos = start; 1292 while (cur_pos < end) { 1293 struct btrfs_ordered_extent *ordered; 1294 1295 ordered = btrfs_lookup_ordered_range(inode, cur_pos, 1296 end - cur_pos + 1); 1297 /* 1298 * No ordered extents in the range, and we hold the extent lock, 1299 * no one can modify the extent maps in the range, we're safe to return. 1300 */ 1301 if (!ordered) 1302 break; 1303 1304 /* Check if we can skip waiting for the whole OE. */ 1305 if (can_skip_ordered_extent(inode, ordered, start, end)) { 1306 cur_pos = min(ordered->file_offset + ordered->num_bytes, 1307 end + 1); 1308 btrfs_put_ordered_extent(ordered); 1309 continue; 1310 } 1311 1312 /* Now wait for the OE to finish. */ 1313 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1314 btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); 1315 btrfs_put_ordered_extent(ordered); 1316 /* We have unlocked the whole range, restart from the beginning. */ 1317 goto again; 1318 } 1319 } 1320 1321 int btrfs_read_folio(struct file *file, struct folio *folio) 1322 { 1323 struct btrfs_inode *inode = folio_to_inode(folio); 1324 const u64 start = folio_pos(folio); 1325 const u64 end = start + folio_size(folio) - 1; 1326 struct extent_state *cached_state = NULL; 1327 struct btrfs_bio_ctrl bio_ctrl = { 1328 .opf = REQ_OP_READ, 1329 .last_em_start = U64_MAX, 1330 }; 1331 struct extent_map *em_cached = NULL; 1332 int ret; 1333 1334 lock_extents_for_read(inode, start, end, &cached_state); 1335 ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 1336 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 1337 1338 btrfs_free_extent_map(em_cached); 1339 1340 /* 1341 * If btrfs_do_readpage() failed we will want to submit the assembled 1342 * bio to do the cleanup. 
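 * submit_one_bio() is a no-op if nothing was added to the bio_ctrl.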
1343 */ 1344 submit_one_bio(&bio_ctrl); 1345 return ret; 1346 } 1347 1348 static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, 1349 u64 start, u32 len) 1350 { 1351 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1352 const u64 folio_start = folio_pos(folio); 1353 unsigned int start_bit; 1354 unsigned int nbits; 1355 1356 ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); 1357 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1358 nbits = len >> fs_info->sectorsize_bits; 1359 ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); 1360 bitmap_set(delalloc_bitmap, start_bit, nbits); 1361 } 1362 1363 static bool find_next_delalloc_bitmap(struct folio *folio, 1364 unsigned long *delalloc_bitmap, u64 start, 1365 u64 *found_start, u32 *found_len) 1366 { 1367 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1368 const u64 folio_start = folio_pos(folio); 1369 const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); 1370 unsigned int start_bit; 1371 unsigned int first_zero; 1372 unsigned int first_set; 1373 1374 ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); 1375 1376 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1377 first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); 1378 if (first_set >= bitmap_size) 1379 return false; 1380 1381 *found_start = folio_start + (first_set << fs_info->sectorsize_bits); 1382 first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); 1383 *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; 1384 return true; 1385 } 1386 1387 /* 1388 * Do all of the delayed allocation setup. 1389 * 1390 * Return >0 if all the dirty blocks are submitted async (compression) or inlined. 1391 * The @folio should no longer be touched (treat it as already unlocked). 1392 * 1393 * Return 0 if there is still dirty block that needs to be submitted through 1394 * extent_writepage_io(). 1395 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be 1396 * submitted, and @folio is still kept locked. 1397 * 1398 * Return <0 if there is any error hit. 1399 * Any allocated ordered extent range covering this folio will be marked 1400 * finished (IOERR), and @folio is still kept locked. 1401 */ 1402 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 1403 struct folio *folio, 1404 struct btrfs_bio_ctrl *bio_ctrl) 1405 { 1406 struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); 1407 struct writeback_control *wbc = bio_ctrl->wbc; 1408 const bool is_subpage = btrfs_is_subpage(fs_info, folio); 1409 const u64 page_start = folio_pos(folio); 1410 const u64 page_end = page_start + folio_size(folio) - 1; 1411 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1412 unsigned long delalloc_bitmap = 0; 1413 /* 1414 * Save the last found delalloc end. As the delalloc end can go beyond 1415 * page boundary, thus we cannot rely on subpage bitmap to locate the 1416 * last delalloc end. 1417 */ 1418 u64 last_delalloc_end = 0; 1419 /* 1420 * The range end (exclusive) of the last successfully finished delalloc 1421 * range. 1422 * Any range covered by ordered extent must either be manually marked 1423 * finished (error handling), or has IO submitted (and finish the 1424 * ordered extent normally). 1425 * 1426 * This records the end of ordered extent cleanup if we hit an error. 
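 * Ranges before this offset already have their ordered extents allocated, and
 * those are what the error path below has to finish manually.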
1427 */ 1428 u64 last_finished_delalloc_end = page_start; 1429 u64 delalloc_start = page_start; 1430 u64 delalloc_end = page_end; 1431 u64 delalloc_to_write = 0; 1432 int ret = 0; 1433 int bit; 1434 1435 /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ 1436 if (btrfs_is_subpage(fs_info, folio)) { 1437 ASSERT(blocks_per_folio > 1); 1438 btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); 1439 } else { 1440 bio_ctrl->submit_bitmap = 1; 1441 } 1442 1443 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1444 u64 start = page_start + (bit << fs_info->sectorsize_bits); 1445 1446 btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); 1447 } 1448 1449 /* Lock all (subpage) delalloc ranges inside the folio first. */ 1450 while (delalloc_start < page_end) { 1451 delalloc_end = page_end; 1452 if (!find_lock_delalloc_range(&inode->vfs_inode, folio, 1453 &delalloc_start, &delalloc_end)) { 1454 delalloc_start = delalloc_end + 1; 1455 continue; 1456 } 1457 set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, 1458 min(delalloc_end, page_end) + 1 - delalloc_start); 1459 last_delalloc_end = delalloc_end; 1460 delalloc_start = delalloc_end + 1; 1461 } 1462 delalloc_start = page_start; 1463 1464 if (!last_delalloc_end) 1465 goto out; 1466 1467 /* Run the delalloc ranges for the above locked ranges. */ 1468 while (delalloc_start < page_end) { 1469 u64 found_start; 1470 u32 found_len; 1471 bool found; 1472 1473 if (!is_subpage) { 1474 /* 1475 * For non-subpage case, the found delalloc range must 1476 * cover this folio and there must be only one locked 1477 * delalloc range. 1478 */ 1479 found_start = page_start; 1480 found_len = last_delalloc_end + 1 - found_start; 1481 found = true; 1482 } else { 1483 found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, 1484 delalloc_start, &found_start, &found_len); 1485 } 1486 if (!found) 1487 break; 1488 /* 1489 * The subpage range covers the last sector, the delalloc range may 1490 * end beyond the folio boundary, use the saved delalloc_end 1491 * instead. 1492 */ 1493 if (found_start + found_len >= page_end) 1494 found_len = last_delalloc_end + 1 - found_start; 1495 1496 if (ret >= 0) { 1497 /* 1498 * Some delalloc range may be created by previous folios. 1499 * Thus we still need to clean up this range during error 1500 * handling. 1501 */ 1502 last_finished_delalloc_end = found_start; 1503 /* No errors hit so far, run the current delalloc range. */ 1504 ret = btrfs_run_delalloc_range(inode, folio, 1505 found_start, 1506 found_start + found_len - 1, 1507 wbc); 1508 if (ret >= 0) 1509 last_finished_delalloc_end = found_start + found_len; 1510 if (unlikely(ret < 0)) 1511 btrfs_err_rl(fs_info, 1512 "failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", 1513 btrfs_root_id(inode->root), 1514 btrfs_ino(inode), 1515 folio_pos(folio), 1516 blocks_per_folio, 1517 &bio_ctrl->submit_bitmap, 1518 found_start, found_len, ret); 1519 } else { 1520 /* 1521 * We've hit an error during previous delalloc range, 1522 * have to cleanup the remaining locked ranges. 1523 */ 1524 btrfs_unlock_extent(&inode->io_tree, found_start, 1525 found_start + found_len - 1, NULL); 1526 unlock_delalloc_folio(&inode->vfs_inode, folio, 1527 found_start, 1528 found_start + found_len - 1); 1529 } 1530 1531 /* 1532 * We have some ranges that's going to be submitted asynchronously 1533 * (compression or inline). 
These range have their own control 1534 * on when to unlock the pages. We should not touch them 1535 * anymore, so clear the range from the submission bitmap. 1536 */ 1537 if (ret > 0) { 1538 unsigned int start_bit = (found_start - page_start) >> 1539 fs_info->sectorsize_bits; 1540 unsigned int end_bit = (min(page_end + 1, found_start + found_len) - 1541 page_start) >> fs_info->sectorsize_bits; 1542 bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); 1543 } 1544 /* 1545 * Above btrfs_run_delalloc_range() may have unlocked the folio, 1546 * thus for the last range, we cannot touch the folio anymore. 1547 */ 1548 if (found_start + found_len >= last_delalloc_end + 1) 1549 break; 1550 1551 delalloc_start = found_start + found_len; 1552 } 1553 /* 1554 * It's possible we had some ordered extents created before we hit 1555 * an error, cleanup non-async successfully created delalloc ranges. 1556 */ 1557 if (unlikely(ret < 0)) { 1558 unsigned int bitmap_size = min( 1559 (last_finished_delalloc_end - page_start) >> 1560 fs_info->sectorsize_bits, 1561 blocks_per_folio); 1562 1563 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) 1564 btrfs_mark_ordered_io_finished(inode, folio, 1565 page_start + (bit << fs_info->sectorsize_bits), 1566 fs_info->sectorsize, false); 1567 return ret; 1568 } 1569 out: 1570 if (last_delalloc_end) 1571 delalloc_end = last_delalloc_end; 1572 else 1573 delalloc_end = page_end; 1574 /* 1575 * delalloc_end is already one less than the total length, so 1576 * we don't subtract one from PAGE_SIZE. 1577 */ 1578 delalloc_to_write += 1579 DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); 1580 1581 /* 1582 * If all ranges are submitted asynchronously, we just need to account 1583 * for them here. 1584 */ 1585 if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { 1586 wbc->nr_to_write -= delalloc_to_write; 1587 return 1; 1588 } 1589 1590 if (wbc->nr_to_write < delalloc_to_write) { 1591 int thresh = 8192; 1592 1593 if (delalloc_to_write < thresh * 2) 1594 thresh = delalloc_to_write; 1595 wbc->nr_to_write = min_t(u64, delalloc_to_write, 1596 thresh); 1597 } 1598 1599 return 0; 1600 } 1601 1602 /* 1603 * Return 0 if we have submitted or queued the sector for submission. 1604 * Return <0 for critical errors, and the sector will have its dirty flag cleared. 1605 * 1606 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1607 */ 1608 static int submit_one_sector(struct btrfs_inode *inode, 1609 struct folio *folio, 1610 u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, 1611 loff_t i_size) 1612 { 1613 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1614 struct extent_map *em; 1615 u64 block_start; 1616 u64 disk_bytenr; 1617 u64 extent_offset; 1618 u64 em_end; 1619 const u32 sectorsize = fs_info->sectorsize; 1620 1621 ASSERT(IS_ALIGNED(filepos, sectorsize)); 1622 1623 /* @filepos >= i_size case should be handled by the caller. */ 1624 ASSERT(filepos < i_size); 1625 1626 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1627 if (IS_ERR(em)) { 1628 /* 1629 * When submission failed, we should still clear the folio dirty. 1630 * Or the folio will be written back again but without any 1631 * ordered extent. 
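 * Setting and immediately clearing writeback below accounts the block as
 * written back even though no bio is submitted for it.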
1632 */ 1633 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1634 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1635 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1636 return PTR_ERR(em); 1637 } 1638 1639 extent_offset = filepos - em->start; 1640 em_end = btrfs_extent_map_end(em); 1641 ASSERT(filepos <= em_end); 1642 ASSERT(IS_ALIGNED(em->start, sectorsize)); 1643 ASSERT(IS_ALIGNED(em->len, sectorsize)); 1644 1645 block_start = btrfs_extent_map_block_start(em); 1646 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1647 1648 ASSERT(!btrfs_extent_map_is_compressed(em)); 1649 ASSERT(block_start != EXTENT_MAP_HOLE); 1650 ASSERT(block_start != EXTENT_MAP_INLINE); 1651 1652 btrfs_free_extent_map(em); 1653 em = NULL; 1654 1655 /* 1656 * Although the PageDirty bit is cleared before entering this 1657 * function, subpage dirty bit is not cleared. 1658 * So clear subpage dirty bit here so next time we won't submit 1659 * a folio for a range already written to disk. 1660 */ 1661 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1662 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1663 /* 1664 * Above call should set the whole folio with writeback flag, even 1665 * just for a single subpage sector. 1666 * As long as the folio is properly locked and the range is correct, 1667 * we should always get the folio with writeback flag. 1668 */ 1669 ASSERT(folio_test_writeback(folio)); 1670 1671 submit_extent_folio(bio_ctrl, disk_bytenr, folio, 1672 sectorsize, filepos - folio_pos(folio), 0); 1673 return 0; 1674 } 1675 1676 /* 1677 * Helper for extent_writepage(). This calls the writepage start hooks, 1678 * and does the loop to map the page into extents and bios. 1679 * 1680 * We return 1 if the IO is started and the page is unlocked, 1681 * 0 if all went well (page still locked) 1682 * < 0 if there were errors (page still locked) 1683 */ 1684 static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, 1685 struct folio *folio, 1686 u64 start, u32 len, 1687 struct btrfs_bio_ctrl *bio_ctrl, 1688 loff_t i_size) 1689 { 1690 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1691 unsigned long range_bitmap = 0; 1692 bool submitted_io = false; 1693 int found_error = 0; 1694 const u64 folio_start = folio_pos(folio); 1695 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1696 u64 cur; 1697 int bit; 1698 int ret = 0; 1699 1700 ASSERT(start >= folio_start && 1701 start + len <= folio_start + folio_size(folio)); 1702 1703 ret = btrfs_writepage_cow_fixup(folio); 1704 if (ret == -EAGAIN) { 1705 /* Fixup worker will requeue */ 1706 folio_redirty_for_writepage(bio_ctrl->wbc, folio); 1707 folio_unlock(folio); 1708 return 1; 1709 } 1710 if (ret < 0) { 1711 btrfs_folio_clear_dirty(fs_info, folio, start, len); 1712 btrfs_folio_set_writeback(fs_info, folio, start, len); 1713 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1714 return ret; 1715 } 1716 1717 for (cur = start; cur < start + len; cur += fs_info->sectorsize) 1718 set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); 1719 bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, 1720 blocks_per_folio); 1721 1722 bio_ctrl->end_io_func = end_bbio_data_write; 1723 1724 for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1725 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); 1726 1727 if (cur >= i_size) { 1728 btrfs_mark_ordered_io_finished(inode, folio, cur, 
1729 start + len - cur, true); 1730 /* 1731 * This range is beyond i_size, thus we don't need to 1732 * bother writing back. 1733 * But we still need to clear the dirty subpage bit, or 1734 * the next time the folio gets dirtied, we will try to 1735 * write back the sectors with subpage dirty bits, 1736 * causing writeback without an ordered extent. 1737 */ 1738 btrfs_folio_clear_dirty(fs_info, folio, cur, 1739 start + len - cur); 1740 break; 1741 } 1742 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1743 if (unlikely(ret < 0)) { 1744 /* 1745 * bio_ctrl may contain a bio crossing several folios. 1746 * Submit it immediately so that the bio has a chance 1747 * to finish normally, rather than being marked as an error. 1748 */ 1749 submit_one_bio(bio_ctrl); 1750 /* 1751 * We failed to grab the extent map, which should be very rare. 1752 * Since there is no bio submitted to finish the ordered 1753 * extent, we have to manually finish this sector. 1754 */ 1755 btrfs_mark_ordered_io_finished(inode, folio, cur, 1756 fs_info->sectorsize, false); 1757 if (!found_error) 1758 found_error = ret; 1759 continue; 1760 } 1761 submitted_io = true; 1762 } 1763 1764 /* 1765 * If we didn't submit any sector (>= i_size), the folio dirty flag gets 1766 * cleared but PAGECACHE_TAG_DIRTY is not cleared (it is only cleared 1767 * by folio_start_writeback() if the folio is not dirty). 1768 * 1769 * Here we set and clear writeback for the range. If the full folio 1770 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1771 * 1772 * If we hit any error, the corresponding sector will have its dirty 1773 * flag cleared and writeback finished, thus there is no need to handle the error case. 1774 */ 1775 if (!submitted_io && !found_error) { 1776 btrfs_folio_set_writeback(fs_info, folio, start, len); 1777 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1778 } 1779 return found_error; 1780 } 1781 1782 /* 1783 * The writepage semantics are similar to regular writepage. Extent 1784 * records are inserted to lock ranges in the tree, and as dirty areas 1785 * are found, they are marked writeback. Then the lock bits are removed 1786 * and the end_io handler clears the writeback ranges. 1787 * 1788 * Return 0 if everything goes well. 1789 * Return <0 for error. 1790 */ 1791 static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) 1792 { 1793 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 1794 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1795 int ret; 1796 size_t pg_offset; 1797 loff_t i_size = i_size_read(&inode->vfs_inode); 1798 const pgoff_t end_index = i_size >> PAGE_SHIFT; 1799 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1800 1801 trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); 1802 1803 WARN_ON(!folio_test_locked(folio)); 1804 1805 pg_offset = offset_in_folio(folio, i_size); 1806 if (folio->index > end_index || 1807 (folio->index == end_index && !pg_offset)) { 1808 folio_invalidate(folio, 0, folio_size(folio)); 1809 folio_unlock(folio); 1810 return 0; 1811 } 1812 1813 if (folio_contains(folio, end_index)) 1814 folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); 1815 1816 /* 1817 * Default to unlocking the whole folio. 1818 * The proper bitmap cannot be initialized until writepage_delalloc(). 1819 */ 1820 bio_ctrl->submit_bitmap = (unsigned long)-1; 1821 1822 /* 1823 * If the page is dirty but without private set, it's marked dirty 1824 * without informing the fs.
1825 * Nowadays that is a bug, since the introduction of 1826 * pin_user_pages*(). 1827 * 1828 * So here we check if the page has private set to rule out such 1829 * case. 1830 * But we also have a long history of relying on the COW fixup, 1831 * so here we only enable this check for experimental builds until 1832 * we're sure it's safe. 1833 */ 1834 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && 1835 unlikely(!folio_test_private(folio))) { 1836 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 1837 btrfs_err_rl(fs_info, 1838 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 1839 btrfs_root_id(inode->root), 1840 btrfs_ino(inode), folio_pos(folio)); 1841 ret = -EUCLEAN; 1842 goto done; 1843 } 1844 1845 ret = set_folio_extent_mapped(folio); 1846 if (ret < 0) 1847 goto done; 1848 1849 ret = writepage_delalloc(inode, folio, bio_ctrl); 1850 if (ret == 1) 1851 return 0; 1852 if (ret) 1853 goto done; 1854 1855 ret = extent_writepage_io(inode, folio, folio_pos(folio), 1856 folio_size(folio), bio_ctrl, i_size); 1857 if (ret == 1) 1858 return 0; 1859 if (ret < 0) 1860 btrfs_err_rl(fs_info, 1861 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", 1862 btrfs_root_id(inode->root), btrfs_ino(inode), 1863 folio_pos(folio), blocks_per_folio, 1864 &bio_ctrl->submit_bitmap, ret); 1865 1866 bio_ctrl->wbc->nr_to_write--; 1867 1868 done: 1869 if (ret < 0) 1870 mapping_set_error(folio->mapping, ret); 1871 /* 1872 * Only unlock ranges that are submitted. As there can be some async 1873 * submitted ranges inside the folio. 1874 */ 1875 btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); 1876 ASSERT(ret <= 0); 1877 return ret; 1878 } 1879 1880 /* 1881 * Lock extent buffer status and pages for writeback. 1882 * 1883 * Return %false if the extent buffer doesn't need to be submitted (e.g. the 1884 * extent buffer is not dirty) 1885 * Return %true is the extent buffer is submitted to bio. 1886 */ 1887 static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, 1888 struct writeback_control *wbc) 1889 { 1890 struct btrfs_fs_info *fs_info = eb->fs_info; 1891 bool ret = false; 1892 1893 btrfs_tree_lock(eb); 1894 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 1895 btrfs_tree_unlock(eb); 1896 if (wbc->sync_mode != WB_SYNC_ALL) 1897 return false; 1898 wait_on_extent_buffer_writeback(eb); 1899 btrfs_tree_lock(eb); 1900 } 1901 1902 /* 1903 * We need to do this to prevent races in people who check if the eb is 1904 * under IO since we can end up having no IO bits set for a short period 1905 * of time. 
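* (Specifically, EXTENT_BUFFER_DIRTY is cleared before EXTENT_BUFFER_WRITEBACK is set below, so without eb->refs_lock a concurrent checker could briefly observe neither bit set.)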
1906 */ 1907 spin_lock(&eb->refs_lock); 1908 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1909 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1910 unsigned long flags; 1911 1912 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 1913 spin_unlock(&eb->refs_lock); 1914 1915 xas_lock_irqsave(&xas, flags); 1916 xas_load(&xas); 1917 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 1918 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 1919 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 1920 xas_unlock_irqrestore(&xas, flags); 1921 1922 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 1923 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 1924 -eb->len, 1925 fs_info->dirty_metadata_batch); 1926 ret = true; 1927 } else { 1928 spin_unlock(&eb->refs_lock); 1929 } 1930 btrfs_tree_unlock(eb); 1931 return ret; 1932 } 1933 1934 static void set_btree_ioerr(struct extent_buffer *eb) 1935 { 1936 struct btrfs_fs_info *fs_info = eb->fs_info; 1937 1938 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 1939 1940 /* 1941 * A read may stumble upon this buffer later, make sure that it gets an 1942 * error and knows there was an error. 1943 */ 1944 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 1945 1946 /* 1947 * We need to set the mapping with the io error as well because a write 1948 * error will flip the file system readonly, and then syncfs() will 1949 * return a 0 because we are readonly if we don't modify the err seq for 1950 * the superblock. 1951 */ 1952 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); 1953 1954 /* 1955 * If writeback for a btree extent that doesn't belong to a log tree 1956 * failed, increment the counter transaction->eb_write_errors. 1957 * We do this because while the transaction is running and before it's 1958 * committing (when we call filemap_fdata[write|wait]_range against 1959 * the btree inode), we might have 1960 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 1961 * returns an error or an error happens during writeback, when we're 1962 * committing the transaction we wouldn't know about it, since the pages 1963 * can be no longer dirty nor marked anymore for writeback (if a 1964 * subsequent modification to the extent buffer didn't happen before the 1965 * transaction commit), which makes filemap_fdata[write|wait]_range not 1966 * able to find the pages which contain errors at transaction 1967 * commit time. So if this happens we must abort the transaction, 1968 * otherwise we commit a super block with btree roots that point to 1969 * btree nodes/leafs whose content on disk is invalid - either garbage 1970 * or the content of some node/leaf from a past generation that got 1971 * cowed or deleted and is no longer valid. 1972 * 1973 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 1974 * not be enough - we need to distinguish between log tree extents vs 1975 * non-log tree extents, and the next filemap_fdatawait_range() call 1976 * will catch and clear such errors in the mapping - and that call might 1977 * be from a log sync and not from a transaction commit. Also, checking 1978 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 1979 * not done and would not be reliable - the eb might have been released 1980 * from memory and reading it back again means that flag would not be 1981 * set (since it's a runtime flag, not persisted on disk). 
1982 * 1983 * Using the flags below in the btree inode also makes us achieve the 1984 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 1985 * writeback for all dirty pages and before filemap_fdatawait_range() 1986 * is called, the writeback for all dirty pages had already finished 1987 * with errors - because we were not using AS_EIO/AS_ENOSPC, 1988 * filemap_fdatawait_range() would return success, as it could not know 1989 * that writeback errors happened (the pages were no longer tagged for 1990 * writeback). 1991 */ 1992 switch (eb->log_index) { 1993 case -1: 1994 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 1995 break; 1996 case 0: 1997 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 1998 break; 1999 case 1: 2000 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2001 break; 2002 default: 2003 BUG(); /* unexpected, logic error */ 2004 } 2005 } 2006 2007 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) 2008 { 2009 struct btrfs_fs_info *fs_info = eb->fs_info; 2010 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2011 unsigned long flags; 2012 2013 xas_lock_irqsave(&xas, flags); 2014 xas_load(&xas); 2015 xas_set_mark(&xas, mark); 2016 xas_unlock_irqrestore(&xas, flags); 2017 } 2018 2019 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) 2020 { 2021 struct btrfs_fs_info *fs_info = eb->fs_info; 2022 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2023 unsigned long flags; 2024 2025 xas_lock_irqsave(&xas, flags); 2026 xas_load(&xas); 2027 xas_clear_mark(&xas, mark); 2028 xas_unlock_irqrestore(&xas, flags); 2029 } 2030 2031 static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, 2032 unsigned long start, unsigned long end) 2033 { 2034 XA_STATE(xas, &fs_info->buffer_tree, start); 2035 unsigned int tagged = 0; 2036 void *eb; 2037 2038 xas_lock_irq(&xas); 2039 xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { 2040 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); 2041 if (++tagged % XA_CHECK_SCHED) 2042 continue; 2043 xas_pause(&xas); 2044 xas_unlock_irq(&xas); 2045 cond_resched(); 2046 xas_lock_irq(&xas); 2047 } 2048 xas_unlock_irq(&xas); 2049 } 2050 2051 struct eb_batch { 2052 unsigned int nr; 2053 unsigned int cur; 2054 struct extent_buffer *ebs[PAGEVEC_SIZE]; 2055 }; 2056 2057 static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) 2058 { 2059 batch->ebs[batch->nr++] = eb; 2060 return (batch->nr < PAGEVEC_SIZE); 2061 } 2062 2063 static inline void eb_batch_init(struct eb_batch *batch) 2064 { 2065 batch->nr = 0; 2066 batch->cur = 0; 2067 } 2068 2069 static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) 2070 { 2071 if (batch->cur >= batch->nr) 2072 return NULL; 2073 return batch->ebs[batch->cur++]; 2074 } 2075 2076 static inline void eb_batch_release(struct eb_batch *batch) 2077 { 2078 for (unsigned int i = 0; i < batch->nr; i++) 2079 free_extent_buffer(batch->ebs[i]); 2080 eb_batch_init(batch); 2081 } 2082 2083 static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, 2084 xa_mark_t mark) 2085 { 2086 struct extent_buffer *eb; 2087 2088 retry: 2089 eb = xas_find_marked(xas, max, mark); 2090 2091 if (xas_retry(xas, eb)) 2092 goto retry; 2093 2094 if (!eb) 2095 return NULL; 2096 2097 if (!refcount_inc_not_zero(&eb->refs)) { 2098 xas_reset(xas); 2099 goto retry; 2100 } 2101 2102 if (unlikely(eb != xas_reload(xas))) { 2103 free_extent_buffer(eb); 2104 xas_reset(xas); 2105 goto 
retry; 2106 } 2107 2108 return eb; 2109 } 2110 2111 static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, 2112 unsigned long *start, 2113 unsigned long end, xa_mark_t tag, 2114 struct eb_batch *batch) 2115 { 2116 XA_STATE(xas, &fs_info->buffer_tree, *start); 2117 struct extent_buffer *eb; 2118 2119 rcu_read_lock(); 2120 while ((eb = find_get_eb(&xas, end, tag)) != NULL) { 2121 if (!eb_batch_add(batch, eb)) { 2122 *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); 2123 goto out; 2124 } 2125 } 2126 if (end == ULONG_MAX) 2127 *start = ULONG_MAX; 2128 else 2129 *start = end + 1; 2130 out: 2131 rcu_read_unlock(); 2132 2133 return batch->nr; 2134 } 2135 2136 /* 2137 * The endio specific version which won't touch any unsafe spinlock in endio 2138 * context. 2139 */ 2140 static struct extent_buffer *find_extent_buffer_nolock( 2141 struct btrfs_fs_info *fs_info, u64 start) 2142 { 2143 struct extent_buffer *eb; 2144 unsigned long index = (start >> fs_info->nodesize_bits); 2145 2146 rcu_read_lock(); 2147 eb = xa_load(&fs_info->buffer_tree, index); 2148 if (eb && !refcount_inc_not_zero(&eb->refs)) 2149 eb = NULL; 2150 rcu_read_unlock(); 2151 return eb; 2152 } 2153 2154 static void end_bbio_meta_write(struct btrfs_bio *bbio) 2155 { 2156 struct extent_buffer *eb = bbio->private; 2157 struct folio_iter fi; 2158 2159 if (bbio->bio.bi_status != BLK_STS_OK) 2160 set_btree_ioerr(eb); 2161 2162 bio_for_each_folio_all(fi, &bbio->bio) { 2163 btrfs_meta_folio_clear_writeback(fi.folio, eb); 2164 } 2165 2166 buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); 2167 clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 2168 bio_put(&bbio->bio); 2169 } 2170 2171 static void prepare_eb_write(struct extent_buffer *eb) 2172 { 2173 u32 nritems; 2174 unsigned long start; 2175 unsigned long end; 2176 2177 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2178 2179 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 2180 nritems = btrfs_header_nritems(eb); 2181 if (btrfs_header_level(eb) > 0) { 2182 end = btrfs_node_key_ptr_offset(eb, nritems); 2183 memzero_extent_buffer(eb, end, eb->len - end); 2184 } else { 2185 /* 2186 * Leaf: 2187 * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 2188 */ 2189 start = btrfs_item_nr_offset(eb, nritems); 2190 end = btrfs_item_nr_offset(eb, 0); 2191 if (nritems == 0) 2192 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); 2193 else 2194 end += btrfs_item_offset(eb, nritems - 1); 2195 memzero_extent_buffer(eb, start, end - start); 2196 } 2197 } 2198 2199 static noinline_for_stack void write_one_eb(struct extent_buffer *eb, 2200 struct writeback_control *wbc) 2201 { 2202 struct btrfs_fs_info *fs_info = eb->fs_info; 2203 struct btrfs_bio *bbio; 2204 2205 prepare_eb_write(eb); 2206 2207 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 2208 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), 2209 eb->fs_info, end_bbio_meta_write, eb); 2210 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 2211 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 2212 wbc_init_bio(wbc, &bbio->bio); 2213 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 2214 bbio->file_offset = eb->start; 2215 for (int i = 0; i < num_extent_folios(eb); i++) { 2216 struct folio *folio = eb->folios[i]; 2217 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 2218 u32 range_len = min_t(u64, folio_next_pos(folio), 2219 eb->start + eb->len) - range_start; 2220 2221 folio_lock(folio); 2222 btrfs_meta_folio_clear_dirty(folio, eb); 2223 btrfs_meta_folio_set_writeback(folio, eb); 2224 if (!folio_test_dirty(folio)) 2225 wbc->nr_to_write -= folio_nr_pages(folio); 2226 bio_add_folio_nofail(&bbio->bio, folio, range_len, 2227 offset_in_folio(folio, range_start)); 2228 wbc_account_cgroup_owner(wbc, folio, range_len); 2229 folio_unlock(folio); 2230 } 2231 /* 2232 * If the fs is already in error status, do not submit any writeback 2233 * but immediately finish it. 2234 */ 2235 if (unlikely(BTRFS_FS_ERROR(fs_info))) { 2236 btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); 2237 return; 2238 } 2239 btrfs_submit_bbio(bbio, 0); 2240 } 2241 2242 /* 2243 * Wait for all eb writeback in the given range to finish. 2244 * 2245 * @fs_info: The fs_info for this file system. 2246 * @start: The offset of the range to start waiting on writeback. 2247 * @end: The end of the range, inclusive. This is meant to be used in 2248 * conjunction with wait_marked_extents, so this will usually be 2249 * the_next_eb->start - 1. 
2250 */ 2251 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, 2252 u64 end) 2253 { 2254 struct eb_batch batch; 2255 unsigned long start_index = (start >> fs_info->nodesize_bits); 2256 unsigned long end_index = (end >> fs_info->nodesize_bits); 2257 2258 eb_batch_init(&batch); 2259 while (start_index <= end_index) { 2260 struct extent_buffer *eb; 2261 unsigned int nr_ebs; 2262 2263 nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, 2264 PAGECACHE_TAG_WRITEBACK, &batch); 2265 if (!nr_ebs) 2266 break; 2267 2268 while ((eb = eb_batch_next(&batch)) != NULL) 2269 wait_on_extent_buffer_writeback(eb); 2270 eb_batch_release(&batch); 2271 cond_resched(); 2272 } 2273 } 2274 2275 int btree_write_cache_pages(struct address_space *mapping, 2276 struct writeback_control *wbc) 2277 { 2278 struct btrfs_eb_write_context ctx = { .wbc = wbc }; 2279 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 2280 int ret = 0; 2281 int done = 0; 2282 int nr_to_write_done = 0; 2283 struct eb_batch batch; 2284 unsigned int nr_ebs; 2285 unsigned long index; 2286 unsigned long end; 2287 int scanned = 0; 2288 xa_mark_t tag; 2289 2290 eb_batch_init(&batch); 2291 if (wbc->range_cyclic) { 2292 index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); 2293 end = -1; 2294 2295 /* 2296 * Start from the beginning does not need to cycle over the 2297 * range, mark it as scanned. 2298 */ 2299 scanned = (index == 0); 2300 } else { 2301 index = (wbc->range_start >> fs_info->nodesize_bits); 2302 end = (wbc->range_end >> fs_info->nodesize_bits); 2303 2304 scanned = 1; 2305 } 2306 if (wbc->sync_mode == WB_SYNC_ALL) 2307 tag = PAGECACHE_TAG_TOWRITE; 2308 else 2309 tag = PAGECACHE_TAG_DIRTY; 2310 btrfs_zoned_meta_io_lock(fs_info); 2311 retry: 2312 if (wbc->sync_mode == WB_SYNC_ALL) 2313 buffer_tree_tag_for_writeback(fs_info, index, end); 2314 while (!done && !nr_to_write_done && (index <= end) && 2315 (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { 2316 struct extent_buffer *eb; 2317 2318 while ((eb = eb_batch_next(&batch)) != NULL) { 2319 ctx.eb = eb; 2320 2321 ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); 2322 if (ret) { 2323 if (ret == -EBUSY) 2324 ret = 0; 2325 2326 if (ret) { 2327 done = 1; 2328 break; 2329 } 2330 continue; 2331 } 2332 2333 if (!lock_extent_buffer_for_io(eb, wbc)) 2334 continue; 2335 2336 /* Implies write in zoned mode. */ 2337 if (ctx.zoned_bg) { 2338 /* Mark the last eb in the block group. */ 2339 btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); 2340 ctx.zoned_bg->meta_write_pointer += eb->len; 2341 } 2342 write_one_eb(eb, wbc); 2343 } 2344 nr_to_write_done = (wbc->nr_to_write <= 0); 2345 eb_batch_release(&batch); 2346 cond_resched(); 2347 } 2348 if (!scanned && !done) { 2349 /* 2350 * We hit the last page and there is more work to be done: wrap 2351 * back to the start of the file 2352 */ 2353 scanned = 1; 2354 index = 0; 2355 goto retry; 2356 } 2357 /* 2358 * If something went wrong, don't allow any metadata write bio to be 2359 * submitted. 2360 * 2361 * This would prevent use-after-free if we had dirty pages not 2362 * cleaned up, which can still happen by fuzzed images. 2363 * 2364 * - Bad extent tree 2365 * Allowing existing tree block to be allocated for other trees. 2366 * 2367 * - Log tree operations 2368 * Exiting tree blocks get allocated to log tree, bumps its 2369 * generation, then get cleaned in tree re-balance. 
2370 * Such tree block will not be written back, since it's clean, 2371 * thus no WRITTEN flag set. 2372 * And after log writes back, this tree block is not traced by 2373 * any dirty extent_io_tree. 2374 * 2375 * - Offending tree block gets re-dirtied from its original owner 2376 * Since it has bumped generation, no WRITTEN flag, it can be 2377 * reused without COWing. This tree block will not be traced 2378 * by btrfs_transaction::dirty_pages. 2379 * 2380 * Now such dirty tree block will not be cleaned by any dirty 2381 * extent io tree. Thus we don't want to submit such wild eb 2382 * if the fs already has error. 2383 * 2384 * We can get ret > 0 from submit_extent_folio() indicating how many ebs 2385 * were submitted. Reset it to 0 to avoid false alerts for the caller. 2386 */ 2387 if (ret > 0) 2388 ret = 0; 2389 if (!ret && BTRFS_FS_ERROR(fs_info)) 2390 ret = -EROFS; 2391 2392 if (ctx.zoned_bg) 2393 btrfs_put_block_group(ctx.zoned_bg); 2394 btrfs_zoned_meta_io_unlock(fs_info); 2395 return ret; 2396 } 2397 2398 /* 2399 * Walk the list of dirty pages of the given address space and write all of them. 2400 * 2401 * @mapping: address space structure to write 2402 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2403 * @bio_ctrl: holds context for the write, namely the bio 2404 * 2405 * If a page is already under I/O, write_cache_pages() skips it, even 2406 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2407 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2408 * and msync() need to guarantee that all the data which was dirty at the time 2409 * the call was made get new I/O started against them. If wbc->sync_mode is 2410 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2411 * existing IO to complete. 2412 */ 2413 static int extent_write_cache_pages(struct address_space *mapping, 2414 struct btrfs_bio_ctrl *bio_ctrl) 2415 { 2416 struct writeback_control *wbc = bio_ctrl->wbc; 2417 struct inode *inode = mapping->host; 2418 int ret = 0; 2419 int done = 0; 2420 int nr_to_write_done = 0; 2421 struct folio_batch fbatch; 2422 unsigned int nr_folios; 2423 pgoff_t index; 2424 pgoff_t end; /* Inclusive */ 2425 pgoff_t done_index; 2426 int range_whole = 0; 2427 int scanned = 0; 2428 xa_mark_t tag; 2429 2430 /* 2431 * We have to hold onto the inode so that ordered extents can do their 2432 * work when the IO finishes. The alternative to this is failing to add 2433 * an ordered extent if the igrab() fails there and that is a huge pain 2434 * to deal with, so instead just hold onto the inode throughout the 2435 * writepages operation. If it fails here we are freeing up the inode 2436 * anyway and we'd rather not waste our time writing out stuff that is 2437 * going to be truncated anyway. 2438 */ 2439 if (!igrab(inode)) 2440 return 0; 2441 2442 folio_batch_init(&fbatch); 2443 if (wbc->range_cyclic) { 2444 index = mapping->writeback_index; /* Start from prev offset */ 2445 end = -1; 2446 /* 2447 * Start from the beginning does not need to cycle over the 2448 * range, mark it as scanned. 2449 */ 2450 scanned = (index == 0); 2451 } else { 2452 index = wbc->range_start >> PAGE_SHIFT; 2453 end = wbc->range_end >> PAGE_SHIFT; 2454 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2455 range_whole = 1; 2456 scanned = 1; 2457 } 2458 2459 /* 2460 * We do the tagged writepage as long as the snapshot flush bit is set 2461 * and we are the first one who do the filemap_flush() on this inode. 
2462 * 2463 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 2464 * not race in and drop the bit. 2465 */ 2466 if (range_whole && wbc->nr_to_write == LONG_MAX && 2467 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 2468 &BTRFS_I(inode)->runtime_flags)) 2469 wbc->tagged_writepages = 1; 2470 2471 tag = wbc_to_tag(wbc); 2472 retry: 2473 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2474 tag_pages_for_writeback(mapping, index, end); 2475 done_index = index; 2476 while (!done && !nr_to_write_done && (index <= end) && 2477 (nr_folios = filemap_get_folios_tag(mapping, &index, 2478 end, tag, &fbatch))) { 2479 unsigned i; 2480 2481 for (i = 0; i < nr_folios; i++) { 2482 struct folio *folio = fbatch.folios[i]; 2483 2484 done_index = folio_next_index(folio); 2485 /* 2486 * At this point we hold neither the i_pages lock nor 2487 * the folio lock: the folio may be truncated or 2488 * invalidated (changing folio->mapping to NULL). 2489 */ 2490 if (!folio_trylock(folio)) { 2491 submit_write_bio(bio_ctrl, 0); 2492 folio_lock(folio); 2493 } 2494 2495 if (unlikely(folio->mapping != mapping)) { 2496 folio_unlock(folio); 2497 continue; 2498 } 2499 2500 if (!folio_test_dirty(folio)) { 2501 /* Someone wrote it for us. */ 2502 folio_unlock(folio); 2503 continue; 2504 } 2505 2506 /* 2507 * For subpage case, compression can lead to mixed 2508 * writeback and dirty flags, e.g: 2509 * 0 32K 64K 96K 128K 2510 * | |//////||/////| |//| 2511 * 2512 * In above case, [32K, 96K) is asynchronously submitted 2513 * for compression, and [124K, 128K) needs to be written back. 2514 * 2515 * If we didn't wait writeback for page 64K, [128K, 128K) 2516 * won't be submitted as the page still has writeback flag 2517 * and will be skipped in the next check. 2518 * 2519 * This mixed writeback and dirty case is only possible for 2520 * subpage case. 2521 * 2522 * TODO: Remove this check after migrating compression to 2523 * regular submission. 2524 */ 2525 if (wbc->sync_mode != WB_SYNC_NONE || 2526 btrfs_is_subpage(inode_to_fs_info(inode), folio)) { 2527 if (folio_test_writeback(folio)) 2528 submit_write_bio(bio_ctrl, 0); 2529 folio_wait_writeback(folio); 2530 } 2531 2532 if (folio_test_writeback(folio) || 2533 !folio_clear_dirty_for_io(folio)) { 2534 folio_unlock(folio); 2535 continue; 2536 } 2537 2538 ret = extent_writepage(folio, bio_ctrl); 2539 if (ret < 0) { 2540 done = 1; 2541 break; 2542 } 2543 2544 /* 2545 * The filesystem may choose to bump up nr_to_write. 2546 * We have to make sure to honor the new nr_to_write 2547 * at any time. 2548 */ 2549 nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && 2550 wbc->nr_to_write <= 0); 2551 } 2552 folio_batch_release(&fbatch); 2553 cond_resched(); 2554 } 2555 if (!scanned && !done) { 2556 /* 2557 * We hit the last page and there is more work to be done: wrap 2558 * back to the start of the file 2559 */ 2560 scanned = 1; 2561 index = 0; 2562 2563 /* 2564 * If we're looping we could run into a page that is locked by a 2565 * writer and that writer could be waiting on writeback for a 2566 * page in our current bio, and thus deadlock, so flush the 2567 * write bio here. 
2568 */ 2569 submit_write_bio(bio_ctrl, 0); 2570 goto retry; 2571 } 2572 2573 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 2574 mapping->writeback_index = done_index; 2575 2576 btrfs_add_delayed_iput(BTRFS_I(inode)); 2577 return ret; 2578 } 2579 2580 /* 2581 * Submit the pages in the range to bio for call sites where delalloc for the 2582 * range has already been run (i.e. an ordered extent has been inserted) and all 2583 * pages are still locked. 2584 */ 2585 void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, 2586 u64 start, u64 end, struct writeback_control *wbc, 2587 bool pages_dirty) 2588 { 2589 bool found_error = false; 2590 int ret = 0; 2591 struct address_space *mapping = inode->i_mapping; 2592 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2593 const u32 sectorsize = fs_info->sectorsize; 2594 loff_t i_size = i_size_read(inode); 2595 u64 cur = start; 2596 struct btrfs_bio_ctrl bio_ctrl = { 2597 .wbc = wbc, 2598 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2599 }; 2600 2601 if (wbc->no_cgroup_owner) 2602 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; 2603 2604 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 2605 2606 while (cur <= end) { 2607 u64 cur_end; 2608 u32 cur_len; 2609 struct folio *folio; 2610 2611 folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); 2612 2613 /* 2614 * This shouldn't happen, as the pages are pinned and locked. This 2615 * code is just in case, and shouldn't actually be run. 2616 */ 2617 if (IS_ERR(folio)) { 2618 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2619 cur_len = cur_end + 1 - cur; 2620 btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, 2621 cur, cur_len, false); 2622 mapping_set_error(mapping, PTR_ERR(folio)); 2623 cur = cur_end; 2624 continue; 2625 } 2626 2627 cur_end = min_t(u64, folio_next_pos(folio) - 1, end); 2628 cur_len = cur_end + 1 - cur; 2629 2630 ASSERT(folio_test_locked(folio)); 2631 if (pages_dirty && folio != locked_folio) 2632 ASSERT(folio_test_dirty(folio)); 2633 2634 /* 2635 * Set the submission bitmap to submit all sectors. 2636 * extent_writepage_io() will do the truncation correctly. 2637 */ 2638 bio_ctrl.submit_bitmap = (unsigned long)-1; 2639 ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, 2640 &bio_ctrl, i_size); 2641 if (ret == 1) 2642 goto next_page; 2643 2644 if (ret) 2645 mapping_set_error(mapping, ret); 2646 btrfs_folio_end_lock(fs_info, folio, cur, cur_len); 2647 if (ret < 0) 2648 found_error = true; 2649 next_page: 2650 folio_put(folio); 2651 cur = cur_end + 1; 2652 } 2653 2654 submit_write_bio(&bio_ctrl, found_error ? ret : 0); 2655 } 2656 2657 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 2658 { 2659 struct inode *inode = mapping->host; 2660 int ret = 0; 2661 struct btrfs_bio_ctrl bio_ctrl = { 2662 .wbc = wbc, 2663 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2664 }; 2665 2666 /* 2667 * Allow only a single thread to do the reloc work in zoned mode to 2668 * protect the write pointer updates.
2669 */ 2670 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 2671 ret = extent_write_cache_pages(mapping, &bio_ctrl); 2672 submit_write_bio(&bio_ctrl, ret); 2673 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 2674 return ret; 2675 } 2676 2677 void btrfs_readahead(struct readahead_control *rac) 2678 { 2679 struct btrfs_bio_ctrl bio_ctrl = { 2680 .opf = REQ_OP_READ | REQ_RAHEAD, 2681 .ractl = rac, 2682 .last_em_start = U64_MAX, 2683 }; 2684 struct folio *folio; 2685 struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); 2686 const u64 start = readahead_pos(rac); 2687 const u64 end = start + readahead_length(rac) - 1; 2688 struct extent_state *cached_state = NULL; 2689 struct extent_map *em_cached = NULL; 2690 2691 lock_extents_for_read(inode, start, end, &cached_state); 2692 2693 while ((folio = readahead_folio(rac)) != NULL) 2694 btrfs_do_readpage(folio, &em_cached, &bio_ctrl); 2695 2696 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2697 2698 if (em_cached) 2699 btrfs_free_extent_map(em_cached); 2700 submit_one_bio(&bio_ctrl); 2701 } 2702 2703 /* 2704 * basic invalidate_folio code, this waits on any locked or writeback 2705 * ranges corresponding to the folio, and then deletes any extent state 2706 * records from the tree 2707 */ 2708 int extent_invalidate_folio(struct extent_io_tree *tree, 2709 struct folio *folio, size_t offset) 2710 { 2711 struct extent_state *cached_state = NULL; 2712 u64 start = folio_pos(folio); 2713 u64 end = start + folio_size(folio) - 1; 2714 size_t blocksize = folio_to_fs_info(folio)->sectorsize; 2715 2716 /* This function is only called for the btree inode */ 2717 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 2718 2719 start += ALIGN(offset, blocksize); 2720 if (start > end) 2721 return 0; 2722 2723 btrfs_lock_extent(tree, start, end, &cached_state); 2724 folio_wait_writeback(folio); 2725 2726 /* 2727 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 2728 * so here we only need to unlock the extent range to free any 2729 * existing extent state. 2730 */ 2731 btrfs_unlock_extent(tree, start, end, &cached_state); 2732 return 0; 2733 } 2734 2735 /* 2736 * A helper for struct address_space_operations::release_folio, this tests for 2737 * areas of the folio that are locked or under IO and drops the related state 2738 * bits if it is safe to drop the folio. 2739 */ 2740 static bool try_release_extent_state(struct extent_io_tree *tree, 2741 struct folio *folio) 2742 { 2743 struct extent_state *cached_state = NULL; 2744 u64 start = folio_pos(folio); 2745 u64 end = start + folio_size(folio) - 1; 2746 u32 range_bits; 2747 u32 clear_bits; 2748 bool ret = false; 2749 int ret2; 2750 2751 btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); 2752 2753 /* 2754 * We can release the folio if it's locked only for ordered extent 2755 * completion, since that doesn't require using the folio. 2756 */ 2757 if ((range_bits & EXTENT_LOCKED) && 2758 !(range_bits & EXTENT_FINISHING_ORDERED)) 2759 goto out; 2760 2761 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | 2762 EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | 2763 EXTENT_FINISHING_ORDERED); 2764 /* 2765 * At this point we can safely clear everything except the locked, 2766 * nodatasum, delalloc new and finishing ordered bits. The delalloc new 2767 * bit will be cleared by ordered extent completion. 
2768 */ 2769 ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); 2770 /* 2771 * If btrfs_clear_extent_bit() failed for -ENOMEM reasons, we can't allow the 2772 * release to continue. 2773 */ 2774 if (ret2 == 0) 2775 ret = true; 2776 out: 2777 btrfs_free_extent_state(cached_state); 2778 2779 return ret; 2780 } 2781 2782 /* 2783 * A helper for release_folio. As long as there are no locked extents 2784 * in the range corresponding to the folio, both state records and extent 2785 * map records are removed. 2786 */ 2787 bool try_release_extent_mapping(struct folio *folio, gfp_t mask) 2788 { 2789 u64 start = folio_pos(folio); 2790 u64 end = start + folio_size(folio) - 1; 2791 struct btrfs_inode *inode = folio_to_inode(folio); 2792 struct extent_io_tree *io_tree = &inode->io_tree; 2793 2794 while (start <= end) { 2795 const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); 2796 const u64 len = end - start + 1; 2797 struct extent_map_tree *extent_tree = &inode->extent_tree; 2798 struct extent_map *em; 2799 2800 write_lock(&extent_tree->lock); 2801 em = btrfs_lookup_extent_mapping(extent_tree, start, len); 2802 if (!em) { 2803 write_unlock(&extent_tree->lock); 2804 break; 2805 } 2806 if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { 2807 write_unlock(&extent_tree->lock); 2808 btrfs_free_extent_map(em); 2809 break; 2810 } 2811 if (btrfs_test_range_bit_exists(io_tree, em->start, 2812 btrfs_extent_map_end(em) - 1, 2813 EXTENT_LOCKED)) 2814 goto next; 2815 /* 2816 * If it's not in the list of modified extents (used by fast 2817 * fsync), we can remove it. If it's being logged we can safely 2818 * remove it since fsync took an extra reference on the em. 2819 */ 2820 if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) 2821 goto remove_em; 2822 /* 2823 * If it's in the list of modified extents, remove it only if 2824 * its generation is older than the current one, in which case 2825 * we don't need it for a fast fsync. Otherwise don't remove it, 2826 * we could be racing with an ongoing fast fsync that could miss 2827 * the new extent. 2828 */ 2829 if (em->generation >= cur_gen) 2830 goto next; 2831 remove_em: 2832 /* 2833 * We only remove extent maps that are not in the list of 2834 * modified extents or that are in the list but with a 2835 * generation lower than the current generation, so there is no 2836 * need to set the full fsync flag on the inode (it hurts the 2837 * fsync performance for workloads with a data size that exceeds 2838 * or is close to the system's memory). 2839 */ 2840 btrfs_remove_extent_mapping(inode, em); 2841 /* Once for the inode's extent map tree. */ 2842 btrfs_free_extent_map(em); 2843 next: 2844 start = btrfs_extent_map_end(em); 2845 write_unlock(&extent_tree->lock); 2846 2847 /* Once for us, for the lookup_extent_mapping() reference. */ 2848 btrfs_free_extent_map(em); 2849 2850 if (need_resched()) { 2851 /* 2852 * If we need to resched but we can't block, just exit 2853 * and leave any remaining extent maps.
2854 */ 2855 if (!gfpflags_allow_blocking(mask)) 2856 break; 2857 2858 cond_resched(); 2859 } 2860 } 2861 return try_release_extent_state(io_tree, folio); 2862 } 2863 2864 static int extent_buffer_under_io(const struct extent_buffer *eb) 2865 { 2866 return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 2867 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2868 } 2869 2870 static bool folio_range_has_eb(struct folio *folio) 2871 { 2872 struct btrfs_folio_state *bfs; 2873 2874 lockdep_assert_held(&folio->mapping->i_private_lock); 2875 2876 if (folio_test_private(folio)) { 2877 bfs = folio_get_private(folio); 2878 if (atomic_read(&bfs->eb_refs)) 2879 return true; 2880 } 2881 return false; 2882 } 2883 2884 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) 2885 { 2886 struct btrfs_fs_info *fs_info = eb->fs_info; 2887 struct address_space *mapping = folio->mapping; 2888 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 2889 2890 /* 2891 * For mapped eb, we're going to change the folio private, which should 2892 * be done under the i_private_lock. 2893 */ 2894 if (mapped) 2895 spin_lock(&mapping->i_private_lock); 2896 2897 if (!folio_test_private(folio)) { 2898 if (mapped) 2899 spin_unlock(&mapping->i_private_lock); 2900 return; 2901 } 2902 2903 if (!btrfs_meta_is_subpage(fs_info)) { 2904 /* 2905 * We do this since we'll remove the pages after we've removed 2906 * the eb from the xarray, so we could race and have this page 2907 * now attached to the new eb. So only clear folio if it's 2908 * still connected to this eb. 2909 */ 2910 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2911 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2912 BUG_ON(folio_test_dirty(folio)); 2913 BUG_ON(folio_test_writeback(folio)); 2914 /* We need to make sure we haven't be attached to a new eb. */ 2915 folio_detach_private(folio); 2916 } 2917 if (mapped) 2918 spin_unlock(&mapping->i_private_lock); 2919 return; 2920 } 2921 2922 /* 2923 * For subpage, we can have dummy eb with folio private attached. In 2924 * this case, we can directly detach the private as such folio is only 2925 * attached to one dummy eb, no sharing. 2926 */ 2927 if (!mapped) { 2928 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2929 return; 2930 } 2931 2932 btrfs_folio_dec_eb_refs(fs_info, folio); 2933 2934 /* 2935 * We can only detach the folio private if there are no other ebs in the 2936 * page range and no unfinished IO. 2937 */ 2938 if (!folio_range_has_eb(folio)) 2939 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2940 2941 spin_unlock(&mapping->i_private_lock); 2942 } 2943 2944 /* Release all folios attached to the extent buffer */ 2945 static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) 2946 { 2947 ASSERT(!extent_buffer_under_io(eb)); 2948 2949 for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { 2950 struct folio *folio = eb->folios[i]; 2951 2952 if (!folio) 2953 continue; 2954 2955 detach_extent_buffer_folio(eb, folio); 2956 } 2957 } 2958 2959 /* 2960 * Helper for releasing the extent buffer. 
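* Detaches all folios from the buffer, drops its leak-debug tracking and frees the extent_buffer structure itself.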
2961 */ 2962 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 2963 { 2964 btrfs_release_extent_buffer_folios(eb); 2965 btrfs_leak_debug_del_eb(eb); 2966 kmem_cache_free(extent_buffer_cache, eb); 2967 } 2968 2969 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 2970 u64 start) 2971 { 2972 struct extent_buffer *eb = NULL; 2973 2974 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 2975 eb->start = start; 2976 eb->len = fs_info->nodesize; 2977 eb->fs_info = fs_info; 2978 init_rwsem(&eb->lock); 2979 2980 btrfs_leak_debug_add_eb(eb); 2981 2982 spin_lock_init(&eb->refs_lock); 2983 refcount_set(&eb->refs, 1); 2984 2985 ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); 2986 2987 return eb; 2988 } 2989 2990 /* 2991 * For use in eb allocation error cleanup paths: btrfs_release_extent_buffer() 2992 * does not call folio_put(), so we must do that here, and we need to set the 2993 * folios to NULL so that btrfs_release_extent_buffer() will not detach them a second time. 2994 */ 2995 static void cleanup_extent_buffer_folios(struct extent_buffer *eb) 2996 { 2997 const int num_folios = num_extent_folios(eb); 2998 2999 /* We cannot use num_extent_folios() as the loop bound, as eb->folios changes inside the loop. */ 3000 for (int i = 0; i < num_folios; i++) { 3001 ASSERT(eb->folios[i]); 3002 detach_extent_buffer_folio(eb, eb->folios[i]); 3003 folio_put(eb->folios[i]); 3004 eb->folios[i] = NULL; 3005 } 3006 } 3007 3008 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 3009 { 3010 struct extent_buffer *new; 3011 int num_folios; 3012 int ret; 3013 3014 new = __alloc_extent_buffer(src->fs_info, src->start); 3015 if (new == NULL) 3016 return NULL; 3017 3018 /* 3019 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 3020 * btrfs_release_extent_buffer() has different behavior for 3021 * UNMAPPED subpage extent buffers. 3022 */ 3023 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 3024 3025 ret = alloc_eb_folio_array(new, false); 3026 if (ret) 3027 goto release_eb; 3028 3029 ASSERT(num_extent_folios(src) == num_extent_folios(new), 3030 "%d != %d", num_extent_folios(src), num_extent_folios(new)); 3031 /* Explicitly use the cached num_folios value from now on.
*/ 3032 num_folios = num_extent_folios(src); 3033 for (int i = 0; i < num_folios; i++) { 3034 struct folio *folio = new->folios[i]; 3035 3036 ret = attach_extent_buffer_folio(new, folio, NULL); 3037 if (ret < 0) 3038 goto cleanup_folios; 3039 WARN_ON(folio_test_dirty(folio)); 3040 } 3041 for (int i = 0; i < num_folios; i++) 3042 folio_put(new->folios[i]); 3043 3044 copy_extent_buffer_full(new, src); 3045 set_extent_buffer_uptodate(new); 3046 3047 return new; 3048 3049 cleanup_folios: 3050 cleanup_extent_buffer_folios(new); 3051 release_eb: 3052 btrfs_release_extent_buffer(new); 3053 return NULL; 3054 } 3055 3056 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 3057 u64 start) 3058 { 3059 struct extent_buffer *eb; 3060 int ret; 3061 3062 eb = __alloc_extent_buffer(fs_info, start); 3063 if (!eb) 3064 return NULL; 3065 3066 ret = alloc_eb_folio_array(eb, false); 3067 if (ret) 3068 goto release_eb; 3069 3070 for (int i = 0; i < num_extent_folios(eb); i++) { 3071 ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); 3072 if (ret < 0) 3073 goto cleanup_folios; 3074 } 3075 for (int i = 0; i < num_extent_folios(eb); i++) 3076 folio_put(eb->folios[i]); 3077 3078 set_extent_buffer_uptodate(eb); 3079 btrfs_set_header_nritems(eb, 0); 3080 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 3081 3082 return eb; 3083 3084 cleanup_folios: 3085 cleanup_extent_buffer_folios(eb); 3086 release_eb: 3087 btrfs_release_extent_buffer(eb); 3088 return NULL; 3089 } 3090 3091 static void check_buffer_tree_ref(struct extent_buffer *eb) 3092 { 3093 int refs; 3094 /* 3095 * The TREE_REF bit is first set when the extent_buffer is added to the 3096 * xarray. It is also reset, if unset, when a new reference is created 3097 * by find_extent_buffer. 3098 * 3099 * It is only cleared in two cases: freeing the last non-tree 3100 * reference to the extent_buffer when its STALE bit is set or 3101 * calling release_folio when the tree reference is the only reference. 3102 * 3103 * In both cases, care is taken to ensure that the extent_buffer's 3104 * pages are not under io. However, release_folio can be concurrently 3105 * called with creating new references, which is prone to race 3106 * conditions between the calls to check_buffer_tree_ref in those 3107 * codepaths and clearing TREE_REF in try_release_extent_buffer. 3108 * 3109 * The actual lifetime of the extent_buffer in the xarray is adequately 3110 * protected by the refcount, but the TREE_REF bit and its corresponding 3111 * reference are not. To protect against this class of races, we call 3112 * check_buffer_tree_ref() from the code paths which trigger io. Note that 3113 * once io is initiated, TREE_REF can no longer be cleared, so that is 3114 * the moment at which any such race is best fixed. 
3115 */ 3116 refs = refcount_read(&eb->refs); 3117 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3118 return; 3119 3120 spin_lock(&eb->refs_lock); 3121 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3122 refcount_inc(&eb->refs); 3123 spin_unlock(&eb->refs_lock); 3124 } 3125 3126 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 3127 { 3128 check_buffer_tree_ref(eb); 3129 3130 for (int i = 0; i < num_extent_folios(eb); i++) 3131 folio_mark_accessed(eb->folios[i]); 3132 } 3133 3134 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 3135 u64 start) 3136 { 3137 struct extent_buffer *eb; 3138 3139 eb = find_extent_buffer_nolock(fs_info, start); 3140 if (!eb) 3141 return NULL; 3142 /* 3143 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 3144 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 3145 * another task running free_extent_buffer() might have seen that flag 3146 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 3147 * writeback flags not set) and it's still in the tree (flag 3148 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 3149 * decrementing the extent buffer's reference count twice. So here we 3150 * could race and increment the eb's reference count, clear its stale 3151 * flag, mark it as dirty and drop our reference before the other task 3152 * finishes executing free_extent_buffer, which would later result in 3153 * an attempt to free an extent buffer that is dirty. 3154 */ 3155 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 3156 spin_lock(&eb->refs_lock); 3157 spin_unlock(&eb->refs_lock); 3158 } 3159 mark_extent_buffer_accessed(eb); 3160 return eb; 3161 } 3162 3163 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 3164 u64 start) 3165 { 3166 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3167 struct extent_buffer *eb, *exists = NULL; 3168 int ret; 3169 3170 eb = find_extent_buffer(fs_info, start); 3171 if (eb) 3172 return eb; 3173 eb = alloc_dummy_extent_buffer(fs_info, start); 3174 if (!eb) 3175 return ERR_PTR(-ENOMEM); 3176 eb->fs_info = fs_info; 3177 again: 3178 xa_lock_irq(&fs_info->buffer_tree); 3179 exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, 3180 NULL, eb, GFP_NOFS); 3181 if (xa_is_err(exists)) { 3182 ret = xa_err(exists); 3183 xa_unlock_irq(&fs_info->buffer_tree); 3184 btrfs_release_extent_buffer(eb); 3185 return ERR_PTR(ret); 3186 } 3187 if (exists) { 3188 if (!refcount_inc_not_zero(&exists->refs)) { 3189 /* The extent buffer is being freed, retry. */ 3190 xa_unlock_irq(&fs_info->buffer_tree); 3191 goto again; 3192 } 3193 xa_unlock_irq(&fs_info->buffer_tree); 3194 btrfs_release_extent_buffer(eb); 3195 return exists; 3196 } 3197 xa_unlock_irq(&fs_info->buffer_tree); 3198 check_buffer_tree_ref(eb); 3199 3200 return eb; 3201 #else 3202 /* Stub to avoid linker error when compiled with optimizations turned off. */ 3203 return NULL; 3204 #endif 3205 } 3206 3207 static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, 3208 struct folio *folio) 3209 { 3210 struct extent_buffer *exists; 3211 3212 lockdep_assert_held(&folio->mapping->i_private_lock); 3213 3214 /* 3215 * For subpage case, we completely rely on xarray to ensure we don't try 3216 * to insert two ebs for the same bytenr. So here we always return NULL 3217 * and just continue. 
3218 */ 3219 if (btrfs_meta_is_subpage(fs_info)) 3220 return NULL; 3221 3222 /* Page not yet attached to an extent buffer */ 3223 if (!folio_test_private(folio)) 3224 return NULL; 3225 3226 /* 3227 * We could have already allocated an eb for this folio and attached one 3228 * so lets see if we can get a ref on the existing eb, and if we can we 3229 * know it's good and we can just return that one, else we know we can 3230 * just overwrite folio private. 3231 */ 3232 exists = folio_get_private(folio); 3233 if (refcount_inc_not_zero(&exists->refs)) 3234 return exists; 3235 3236 WARN_ON(folio_test_dirty(folio)); 3237 folio_detach_private(folio); 3238 return NULL; 3239 } 3240 3241 /* 3242 * Validate alignment constraints of eb at logical address @start. 3243 */ 3244 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 3245 { 3246 const u32 nodesize = fs_info->nodesize; 3247 3248 if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { 3249 btrfs_err(fs_info, "bad tree block start %llu", start); 3250 return true; 3251 } 3252 3253 if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { 3254 btrfs_err(fs_info, 3255 "tree block is not nodesize aligned, start %llu nodesize %u", 3256 start, nodesize); 3257 return true; 3258 } 3259 if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { 3260 btrfs_err(fs_info, 3261 "tree block is not page aligned, start %llu nodesize %u", 3262 start, nodesize); 3263 return true; 3264 } 3265 if (unlikely(!IS_ALIGNED(start, nodesize) && 3266 !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { 3267 btrfs_warn(fs_info, 3268 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", 3269 start, nodesize); 3270 } 3271 return false; 3272 } 3273 3274 /* 3275 * Return 0 if eb->folios[i] is attached to btree inode successfully. 3276 * Return >0 if there is already another extent buffer for the range, 3277 * and @found_eb_ret would be updated. 3278 * Return -EAGAIN if the filemap has an existing folio but with different size 3279 * than @eb. 3280 * The caller needs to free the existing folios and retry using the same order. 3281 */ 3282 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3283 struct btrfs_folio_state *prealloc, 3284 struct extent_buffer **found_eb_ret) 3285 { 3286 3287 struct btrfs_fs_info *fs_info = eb->fs_info; 3288 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3289 const pgoff_t index = eb->start >> PAGE_SHIFT; 3290 struct folio *existing_folio; 3291 int ret; 3292 3293 ASSERT(found_eb_ret); 3294 3295 /* Caller should ensure the folio exists. */ 3296 ASSERT(eb->folios[i]); 3297 3298 retry: 3299 existing_folio = NULL; 3300 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3301 GFP_NOFS | __GFP_NOFAIL); 3302 if (!ret) 3303 goto finish; 3304 3305 existing_folio = filemap_lock_folio(mapping, index + i); 3306 /* The page cache only exists for a very short time, just retry. */ 3307 if (IS_ERR(existing_folio)) 3308 goto retry; 3309 3310 /* For now, we should only have single-page folios for btree inode. */ 3311 ASSERT(folio_nr_pages(existing_folio) == 1); 3312 3313 if (folio_size(existing_folio) != eb->folio_size) { 3314 folio_unlock(existing_folio); 3315 folio_put(existing_folio); 3316 return -EAGAIN; 3317 } 3318 3319 finish: 3320 spin_lock(&mapping->i_private_lock); 3321 if (existing_folio && btrfs_meta_is_subpage(fs_info)) { 3322 /* We're going to reuse the existing page, can drop our folio now. 
*/ 3323 __free_page(folio_page(eb->folios[i], 0)); 3324 eb->folios[i] = existing_folio; 3325 } else if (existing_folio) { 3326 struct extent_buffer *existing_eb; 3327 3328 existing_eb = grab_extent_buffer(fs_info, existing_folio); 3329 if (existing_eb) { 3330 /* The extent buffer still exists, we can use it directly. */ 3331 *found_eb_ret = existing_eb; 3332 spin_unlock(&mapping->i_private_lock); 3333 folio_unlock(existing_folio); 3334 folio_put(existing_folio); 3335 return 1; 3336 } 3337 /* The extent buffer no longer exists, we can reuse the folio. */ 3338 __free_page(folio_page(eb->folios[i], 0)); 3339 eb->folios[i] = existing_folio; 3340 } 3341 eb->folio_size = folio_size(eb->folios[i]); 3342 eb->folio_shift = folio_shift(eb->folios[i]); 3343 /* Should not fail, as we have preallocated the memory. */ 3344 ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); 3345 ASSERT(!ret); 3346 /* 3347 * To inform we have an extra eb under allocation, so that 3348 * detach_extent_buffer_page() won't release the folio private when the 3349 * eb hasn't been inserted into the xarray yet. 3350 * 3351 * The ref will be decreased when the eb releases the page, in 3352 * detach_extent_buffer_page(). Thus needs no special handling in the 3353 * error path. 3354 */ 3355 btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); 3356 spin_unlock(&mapping->i_private_lock); 3357 return 0; 3358 } 3359 3360 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3361 u64 start, u64 owner_root, int level) 3362 { 3363 int attached = 0; 3364 struct extent_buffer *eb; 3365 struct extent_buffer *existing_eb = NULL; 3366 struct btrfs_folio_state *prealloc = NULL; 3367 u64 lockdep_owner = owner_root; 3368 bool page_contig = true; 3369 int uptodate = 1; 3370 int ret; 3371 3372 if (check_eb_alignment(fs_info, start)) 3373 return ERR_PTR(-EINVAL); 3374 3375 #if BITS_PER_LONG == 32 3376 if (start >= MAX_LFS_FILESIZE) { 3377 btrfs_err_rl(fs_info, 3378 "extent buffer %llu is beyond 32bit page cache limit", start); 3379 btrfs_err_32bit_limit(fs_info); 3380 return ERR_PTR(-EOVERFLOW); 3381 } 3382 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 3383 btrfs_warn_32bit_limit(fs_info); 3384 #endif 3385 3386 eb = find_extent_buffer(fs_info, start); 3387 if (eb) 3388 return eb; 3389 3390 eb = __alloc_extent_buffer(fs_info, start); 3391 if (!eb) 3392 return ERR_PTR(-ENOMEM); 3393 3394 /* 3395 * The reloc trees are just snapshots, so we need them to appear to be 3396 * just like any other fs tree WRT lockdep. 3397 */ 3398 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 3399 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 3400 3401 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 3402 3403 /* 3404 * Preallocate folio private for subpage case, so that we won't 3405 * allocate memory with i_private_lock nor page lock hold. 3406 * 3407 * The memory will be freed by attach_extent_buffer_page() or freed 3408 * manually if we exit earlier. 3409 */ 3410 if (btrfs_meta_is_subpage(fs_info)) { 3411 prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); 3412 if (IS_ERR(prealloc)) { 3413 ret = PTR_ERR(prealloc); 3414 goto out; 3415 } 3416 } 3417 3418 reallocate: 3419 /* Allocate all pages first. */ 3420 ret = alloc_eb_folio_array(eb, true); 3421 if (ret < 0) { 3422 btrfs_free_folio_state(prealloc); 3423 goto out; 3424 } 3425 3426 /* Attach all pages to the filemap. 
*/ 3427 for (int i = 0; i < num_extent_folios(eb); i++) { 3428 struct folio *folio; 3429 3430 ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); 3431 if (ret > 0) { 3432 ASSERT(existing_eb); 3433 goto out; 3434 } 3435 3436 /* 3437 * TODO: Special handling for a corner case where the order of 3438 * folios mismatch between the new eb and filemap. 3439 * 3440 * This happens when: 3441 * 3442 * - the new eb is using higher order folio 3443 * 3444 * - the filemap is still using 0-order folios for the range 3445 * This can happen at the previous eb allocation, and we don't 3446 * have higher order folio for the call. 3447 * 3448 * - the existing eb has already been freed 3449 * 3450 * In this case, we have to free the existing folios first, and 3451 * re-allocate using the same order. 3452 * Thankfully this is not going to happen yet, as we're still 3453 * using 0-order folios. 3454 */ 3455 if (unlikely(ret == -EAGAIN)) { 3456 DEBUG_WARN("folio order mismatch between new eb and filemap"); 3457 goto reallocate; 3458 } 3459 attached++; 3460 3461 /* 3462 * Only after attach_eb_folio_to_filemap(), eb->folios[] is 3463 * reliable, as we may choose to reuse the existing page cache 3464 * and free the allocated page. 3465 */ 3466 folio = eb->folios[i]; 3467 WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); 3468 3469 /* 3470 * Check if the current page is physically contiguous with previous eb 3471 * page. 3472 * At this stage, either we allocated a large folio, thus @i 3473 * would only be 0, or we fall back to per-page allocation. 3474 */ 3475 if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) 3476 page_contig = false; 3477 3478 if (!btrfs_meta_folio_test_uptodate(folio, eb)) 3479 uptodate = 0; 3480 3481 /* 3482 * We can't unlock the pages just yet since the extent buffer 3483 * hasn't been properly inserted into the xarray, this opens a 3484 * race with btree_release_folio() which can free a page while we 3485 * are still filling in all pages for the buffer and we could crash. 3486 */ 3487 } 3488 if (uptodate) 3489 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3490 /* All pages are physically contiguous, can skip cross page handling. */ 3491 if (page_contig) 3492 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); 3493 again: 3494 xa_lock_irq(&fs_info->buffer_tree); 3495 existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, 3496 start >> fs_info->nodesize_bits, NULL, eb, 3497 GFP_NOFS); 3498 if (xa_is_err(existing_eb)) { 3499 ret = xa_err(existing_eb); 3500 xa_unlock_irq(&fs_info->buffer_tree); 3501 goto out; 3502 } 3503 if (existing_eb) { 3504 if (!refcount_inc_not_zero(&existing_eb->refs)) { 3505 xa_unlock_irq(&fs_info->buffer_tree); 3506 goto again; 3507 } 3508 xa_unlock_irq(&fs_info->buffer_tree); 3509 goto out; 3510 } 3511 xa_unlock_irq(&fs_info->buffer_tree); 3512 3513 /* add one reference for the tree */ 3514 check_buffer_tree_ref(eb); 3515 3516 /* 3517 * Now it's safe to unlock the pages because any calls to 3518 * btree_release_folio will correctly detect that a page belongs to a 3519 * live buffer and won't free them prematurely. 3520 */ 3521 for (int i = 0; i < num_extent_folios(eb); i++) { 3522 folio_unlock(eb->folios[i]); 3523 /* 3524 * A folio that has been added to an address_space mapping 3525 * should not continue holding the refcount from its original 3526 * allocation indefinitely. 
3527 */ 3528 folio_put(eb->folios[i]); 3529 } 3530 return eb; 3531 3532 out: 3533 WARN_ON(!refcount_dec_and_test(&eb->refs)); 3534 3535 /* 3536 * Any attached folios need to be detached before we unlock them. This 3537 * is because when we're inserting our new folios into the mapping, and 3538 * then attaching our eb to that folio. If we fail to insert our folio 3539 * we'll lookup the folio for that index, and grab that EB. We do not 3540 * want that to grab this eb, as we're getting ready to free it. So we 3541 * have to detach it first and then unlock it. 3542 * 3543 * Note: the bounds is num_extent_pages() as we need to go through all slots. 3544 */ 3545 for (int i = 0; i < num_extent_pages(eb); i++) { 3546 struct folio *folio = eb->folios[i]; 3547 3548 if (i < attached) { 3549 ASSERT(folio); 3550 detach_extent_buffer_folio(eb, folio); 3551 folio_unlock(folio); 3552 } else if (!folio) { 3553 continue; 3554 } 3555 3556 folio_put(folio); 3557 eb->folios[i] = NULL; 3558 } 3559 btrfs_release_extent_buffer(eb); 3560 if (ret < 0) 3561 return ERR_PTR(ret); 3562 ASSERT(existing_eb); 3563 return existing_eb; 3564 } 3565 3566 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3567 { 3568 struct extent_buffer *eb = 3569 container_of(head, struct extent_buffer, rcu_head); 3570 3571 kmem_cache_free(extent_buffer_cache, eb); 3572 } 3573 3574 static int release_extent_buffer(struct extent_buffer *eb) 3575 __releases(&eb->refs_lock) 3576 { 3577 lockdep_assert_held(&eb->refs_lock); 3578 3579 if (refcount_dec_and_test(&eb->refs)) { 3580 struct btrfs_fs_info *fs_info = eb->fs_info; 3581 3582 spin_unlock(&eb->refs_lock); 3583 3584 /* 3585 * We're erasing, theoretically there will be no allocations, so 3586 * just use GFP_ATOMIC. 3587 * 3588 * We use cmpxchg instead of erase because we do not know if 3589 * this eb is actually in the tree or not, we could be cleaning 3590 * up an eb that we allocated but never inserted into the tree. 3591 * Thus use cmpxchg to remove it from the tree if it is there, 3592 * or leave the other entry if this isn't in the tree. 3593 * 3594 * The documentation says that putting a NULL value is the same 3595 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't 3596 * in this case. 3597 */ 3598 xa_cmpxchg_irq(&fs_info->buffer_tree, 3599 eb->start >> fs_info->nodesize_bits, eb, NULL, 3600 GFP_ATOMIC); 3601 3602 btrfs_leak_debug_del_eb(eb); 3603 /* Should be safe to release folios at this point. */ 3604 btrfs_release_extent_buffer_folios(eb); 3605 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3606 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 3607 kmem_cache_free(extent_buffer_cache, eb); 3608 return 1; 3609 } 3610 #endif 3611 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 3612 return 1; 3613 } 3614 spin_unlock(&eb->refs_lock); 3615 3616 return 0; 3617 } 3618 3619 void free_extent_buffer(struct extent_buffer *eb) 3620 { 3621 int refs; 3622 if (!eb) 3623 return; 3624 3625 refs = refcount_read(&eb->refs); 3626 while (1) { 3627 if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { 3628 if (refs == 1) 3629 break; 3630 } else if (refs <= 3) { 3631 break; 3632 } 3633 3634 /* Optimization to avoid locking eb->refs_lock. 
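 *
 * If the count is still above the values checked above (1 for unmapped
 * ebs, 3 otherwise), this put cannot be the one that has to make a release
 * decision, so a lockless cmpxchg based decrement is enough. On failure
 * @refs is updated to the current count and the loop re-evaluates; once
 * the count is low enough we fall through to the locked slow path below.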
*/ 3635 if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) 3636 return; 3637 } 3638 3639 spin_lock(&eb->refs_lock); 3640 if (refcount_read(&eb->refs) == 2 && 3641 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 3642 !extent_buffer_under_io(eb) && 3643 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3644 refcount_dec(&eb->refs); 3645 3646 /* 3647 * I know this is terrible, but it's temporary until we stop tracking 3648 * the uptodate bits and such for the extent buffers. 3649 */ 3650 release_extent_buffer(eb); 3651 } 3652 3653 void free_extent_buffer_stale(struct extent_buffer *eb) 3654 { 3655 if (!eb) 3656 return; 3657 3658 spin_lock(&eb->refs_lock); 3659 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 3660 3661 if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 3662 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 3663 refcount_dec(&eb->refs); 3664 release_extent_buffer(eb); 3665 } 3666 3667 static void btree_clear_folio_dirty_tag(struct folio *folio) 3668 { 3669 ASSERT(!folio_test_dirty(folio)); 3670 ASSERT(folio_test_locked(folio)); 3671 xa_lock_irq(&folio->mapping->i_pages); 3672 if (!folio_test_dirty(folio)) 3673 __xa_clear_mark(&folio->mapping->i_pages, folio->index, 3674 PAGECACHE_TAG_DIRTY); 3675 xa_unlock_irq(&folio->mapping->i_pages); 3676 } 3677 3678 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, 3679 struct extent_buffer *eb) 3680 { 3681 struct btrfs_fs_info *fs_info = eb->fs_info; 3682 3683 btrfs_assert_tree_write_locked(eb); 3684 3685 if (trans && btrfs_header_generation(eb) != trans->transid) 3686 return; 3687 3688 /* 3689 * Instead of clearing the dirty flag off of the buffer, mark it as 3690 * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve 3691 * write-ordering in zoned mode, without the need to later re-dirty 3692 * the extent_buffer. 3693 * 3694 * The actual zeroout of the buffer will happen later in 3695 * btree_csum_one_bio. 3696 */ 3697 if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3698 set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); 3699 return; 3700 } 3701 3702 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) 3703 return; 3704 3705 buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); 3706 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, 3707 fs_info->dirty_metadata_batch); 3708 3709 for (int i = 0; i < num_extent_folios(eb); i++) { 3710 struct folio *folio = eb->folios[i]; 3711 bool last; 3712 3713 if (!folio_test_dirty(folio)) 3714 continue; 3715 folio_lock(folio); 3716 last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); 3717 if (last) 3718 btree_clear_folio_dirty_tag(folio); 3719 folio_unlock(folio); 3720 } 3721 WARN_ON(refcount_read(&eb->refs) == 0); 3722 } 3723 3724 void set_extent_buffer_dirty(struct extent_buffer *eb) 3725 { 3726 bool was_dirty; 3727 3728 check_buffer_tree_ref(eb); 3729 3730 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3731 3732 WARN_ON(refcount_read(&eb->refs) == 0); 3733 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 3734 WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); 3735 3736 if (!was_dirty) { 3737 bool subpage = btrfs_meta_is_subpage(eb->fs_info); 3738 3739 /* 3740 * For subpage case, we can have other extent buffers in the 3741 * same page, and in clear_extent_buffer_dirty() we 3742 * have to clear page dirty without subpage lock held. 3743 * This can cause race where our page gets dirty cleared after 3744 * we just set it. 
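 *
 * (In the subpage case the eb never crosses a folio boundary, so
 * folios[0] below is the only folio that has to be locked.)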
3745 * 3746 * Thankfully, clear_extent_buffer_dirty() has locked 3747 * its page for other reasons, we can use page lock to prevent 3748 * the above race. 3749 */ 3750 if (subpage) 3751 folio_lock(eb->folios[0]); 3752 for (int i = 0; i < num_extent_folios(eb); i++) 3753 btrfs_meta_folio_set_dirty(eb->folios[i], eb); 3754 buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); 3755 if (subpage) 3756 folio_unlock(eb->folios[0]); 3757 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, 3758 eb->len, 3759 eb->fs_info->dirty_metadata_batch); 3760 } 3761 #ifdef CONFIG_BTRFS_DEBUG 3762 for (int i = 0; i < num_extent_folios(eb); i++) 3763 ASSERT(folio_test_dirty(eb->folios[i])); 3764 #endif 3765 } 3766 3767 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 3768 { 3769 3770 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3771 for (int i = 0; i < num_extent_folios(eb); i++) { 3772 struct folio *folio = eb->folios[i]; 3773 3774 if (!folio) 3775 continue; 3776 3777 btrfs_meta_folio_clear_uptodate(folio, eb); 3778 } 3779 } 3780 3781 void set_extent_buffer_uptodate(struct extent_buffer *eb) 3782 { 3783 3784 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3785 for (int i = 0; i < num_extent_folios(eb); i++) 3786 btrfs_meta_folio_set_uptodate(eb->folios[i], eb); 3787 } 3788 3789 static void clear_extent_buffer_reading(struct extent_buffer *eb) 3790 { 3791 clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); 3792 } 3793 3794 static void end_bbio_meta_read(struct btrfs_bio *bbio) 3795 { 3796 struct extent_buffer *eb = bbio->private; 3797 bool uptodate = !bbio->bio.bi_status; 3798 3799 /* 3800 * If the extent buffer is marked UPTODATE before the read operation 3801 * completes, other calls to read_extent_buffer_pages() will return 3802 * early without waiting for the read to finish, causing data races. 3803 */ 3804 WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); 3805 3806 eb->read_mirror = bbio->mirror_num; 3807 3808 if (uptodate && 3809 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) 3810 uptodate = false; 3811 3812 if (uptodate) 3813 set_extent_buffer_uptodate(eb); 3814 else 3815 clear_extent_buffer_uptodate(eb); 3816 3817 clear_extent_buffer_reading(eb); 3818 free_extent_buffer(eb); 3819 3820 bio_put(&bbio->bio); 3821 } 3822 3823 int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, 3824 const struct btrfs_tree_parent_check *check) 3825 { 3826 struct btrfs_bio *bbio; 3827 3828 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3829 return 0; 3830 3831 /* 3832 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 3833 * operation, which could potentially still be in flight. In this case 3834 * we simply want to return an error. 3835 */ 3836 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 3837 return -EIO; 3838 3839 /* Someone else is already reading the buffer, just wait for it. */ 3840 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) 3841 return 0; 3842 3843 /* 3844 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above 3845 * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have 3846 * started and finished reading the same eb. In this case, UPTODATE 3847 * will now be set, and we shouldn't read it in again. 
3848 */ 3849 if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { 3850 clear_extent_buffer_reading(eb); 3851 return 0; 3852 } 3853 3854 eb->read_mirror = 0; 3855 check_buffer_tree_ref(eb); 3856 refcount_inc(&eb->refs); 3857 3858 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 3859 REQ_OP_READ | REQ_META, eb->fs_info, 3860 end_bbio_meta_read, eb); 3861 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 3862 bbio->inode = BTRFS_I(eb->fs_info->btree_inode); 3863 bbio->file_offset = eb->start; 3864 memcpy(&bbio->parent_check, check, sizeof(*check)); 3865 for (int i = 0; i < num_extent_folios(eb); i++) { 3866 struct folio *folio = eb->folios[i]; 3867 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 3868 u32 range_len = min_t(u64, folio_next_pos(folio), 3869 eb->start + eb->len) - range_start; 3870 3871 bio_add_folio_nofail(&bbio->bio, folio, range_len, 3872 offset_in_folio(folio, range_start)); 3873 } 3874 btrfs_submit_bbio(bbio, mirror_num); 3875 return 0; 3876 } 3877 3878 int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, 3879 const struct btrfs_tree_parent_check *check) 3880 { 3881 int ret; 3882 3883 ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); 3884 if (ret < 0) 3885 return ret; 3886 3887 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3888 if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) 3889 return -EIO; 3890 return 0; 3891 } 3892 3893 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 3894 unsigned long len) 3895 { 3896 btrfs_warn(eb->fs_info, 3897 "access to eb bytenr %llu len %u out of range start %lu len %lu", 3898 eb->start, eb->len, start, len); 3899 DEBUG_WARN(); 3900 3901 return true; 3902 } 3903 3904 /* 3905 * Check if the [start, start + len) range is valid before reading/writing 3906 * the eb. 3907 * NOTE: @start and @len are offset inside the eb, not logical address. 3908 * 3909 * Caller should not touch the dst/src memory if this function returns error. 3910 */ 3911 static inline int check_eb_range(const struct extent_buffer *eb, 3912 unsigned long start, unsigned long len) 3913 { 3914 unsigned long offset; 3915 3916 /* start, start + len should not go beyond eb->len nor overflow */ 3917 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 3918 return report_eb_range(eb, start, len); 3919 3920 return false; 3921 } 3922 3923 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 3924 unsigned long start, unsigned long len) 3925 { 3926 const int unit_size = eb->folio_size; 3927 size_t cur; 3928 size_t offset; 3929 char *dst = (char *)dstv; 3930 unsigned long i = get_eb_folio_index(eb, start); 3931 3932 if (check_eb_range(eb, start, len)) { 3933 /* 3934 * Invalid range hit, reset the memory, so callers won't get 3935 * some random garbage for their uninitialized memory. 
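 *
 * For example, a caller doing something like
 *
 *	struct btrfs_disk_key key;
 *
 *	read_extent_buffer(eb, &key, offset, sizeof(key));
 *
 * with a corrupted @offset would otherwise continue with whatever garbage
 * happened to be in @key on its stack.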
3936 */ 3937 memset(dstv, 0, len); 3938 return; 3939 } 3940 3941 if (eb->addr) { 3942 memcpy(dstv, eb->addr + start, len); 3943 return; 3944 } 3945 3946 offset = get_eb_offset_in_folio(eb, start); 3947 3948 while (len > 0) { 3949 char *kaddr; 3950 3951 cur = min(len, unit_size - offset); 3952 kaddr = folio_address(eb->folios[i]); 3953 memcpy(dst, kaddr + offset, cur); 3954 3955 dst += cur; 3956 len -= cur; 3957 offset = 0; 3958 i++; 3959 } 3960 } 3961 3962 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 3963 void __user *dstv, 3964 unsigned long start, unsigned long len) 3965 { 3966 const int unit_size = eb->folio_size; 3967 size_t cur; 3968 size_t offset; 3969 char __user *dst = (char __user *)dstv; 3970 unsigned long i = get_eb_folio_index(eb, start); 3971 int ret = 0; 3972 3973 WARN_ON(start > eb->len); 3974 WARN_ON(start + len > eb->start + eb->len); 3975 3976 if (eb->addr) { 3977 if (copy_to_user_nofault(dstv, eb->addr + start, len)) 3978 ret = -EFAULT; 3979 return ret; 3980 } 3981 3982 offset = get_eb_offset_in_folio(eb, start); 3983 3984 while (len > 0) { 3985 char *kaddr; 3986 3987 cur = min(len, unit_size - offset); 3988 kaddr = folio_address(eb->folios[i]); 3989 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 3990 ret = -EFAULT; 3991 break; 3992 } 3993 3994 dst += cur; 3995 len -= cur; 3996 offset = 0; 3997 i++; 3998 } 3999 4000 return ret; 4001 } 4002 4003 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 4004 unsigned long start, unsigned long len) 4005 { 4006 const int unit_size = eb->folio_size; 4007 size_t cur; 4008 size_t offset; 4009 char *kaddr; 4010 char *ptr = (char *)ptrv; 4011 unsigned long i = get_eb_folio_index(eb, start); 4012 int ret = 0; 4013 4014 if (check_eb_range(eb, start, len)) 4015 return -EINVAL; 4016 4017 if (eb->addr) 4018 return memcmp(ptrv, eb->addr + start, len); 4019 4020 offset = get_eb_offset_in_folio(eb, start); 4021 4022 while (len > 0) { 4023 cur = min(len, unit_size - offset); 4024 kaddr = folio_address(eb->folios[i]); 4025 ret = memcmp(ptr, kaddr + offset, cur); 4026 if (ret) 4027 break; 4028 4029 ptr += cur; 4030 len -= cur; 4031 offset = 0; 4032 i++; 4033 } 4034 return ret; 4035 } 4036 4037 /* 4038 * Check that the extent buffer is uptodate. 4039 * 4040 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 4041 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 4042 */ 4043 static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) 4044 { 4045 struct btrfs_fs_info *fs_info = eb->fs_info; 4046 struct folio *folio = eb->folios[i]; 4047 4048 ASSERT(folio); 4049 4050 /* 4051 * If we are using the commit root we could potentially clear a page 4052 * Uptodate while we're using the extent buffer that we've previously 4053 * looked up. We don't want to complain in this case, as the page was 4054 * valid before, we just didn't write it out. Instead we want to catch 4055 * the case where we didn't actually read the block properly, which 4056 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. 
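 * So once EXTENT_BUFFER_WRITE_ERR is set the folio may legitimately be
 * !uptodate, and the check below returns early instead of warning.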
4057 */ 4058 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4059 return; 4060 4061 if (btrfs_meta_is_subpage(fs_info)) { 4062 folio = eb->folios[0]; 4063 ASSERT(i == 0); 4064 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, 4065 eb->start, eb->len))) 4066 btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); 4067 } else { 4068 WARN_ON(!folio_test_uptodate(folio)); 4069 } 4070 } 4071 4072 static void __write_extent_buffer(const struct extent_buffer *eb, 4073 const void *srcv, unsigned long start, 4074 unsigned long len, bool use_memmove) 4075 { 4076 const int unit_size = eb->folio_size; 4077 size_t cur; 4078 size_t offset; 4079 char *kaddr; 4080 const char *src = (const char *)srcv; 4081 unsigned long i = get_eb_folio_index(eb, start); 4082 /* For unmapped (dummy) ebs, no need to check their uptodate status. */ 4083 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4084 4085 if (check_eb_range(eb, start, len)) 4086 return; 4087 4088 if (eb->addr) { 4089 if (use_memmove) 4090 memmove(eb->addr + start, srcv, len); 4091 else 4092 memcpy(eb->addr + start, srcv, len); 4093 return; 4094 } 4095 4096 offset = get_eb_offset_in_folio(eb, start); 4097 4098 while (len > 0) { 4099 if (check_uptodate) 4100 assert_eb_folio_uptodate(eb, i); 4101 4102 cur = min(len, unit_size - offset); 4103 kaddr = folio_address(eb->folios[i]); 4104 if (use_memmove) 4105 memmove(kaddr + offset, src, cur); 4106 else 4107 memcpy(kaddr + offset, src, cur); 4108 4109 src += cur; 4110 len -= cur; 4111 offset = 0; 4112 i++; 4113 } 4114 } 4115 4116 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 4117 unsigned long start, unsigned long len) 4118 { 4119 return __write_extent_buffer(eb, srcv, start, len, false); 4120 } 4121 4122 static void memset_extent_buffer(const struct extent_buffer *eb, int c, 4123 unsigned long start, unsigned long len) 4124 { 4125 const int unit_size = eb->folio_size; 4126 unsigned long cur = start; 4127 4128 if (eb->addr) { 4129 memset(eb->addr + start, c, len); 4130 return; 4131 } 4132 4133 while (cur < start + len) { 4134 unsigned long index = get_eb_folio_index(eb, cur); 4135 unsigned int offset = get_eb_offset_in_folio(eb, cur); 4136 unsigned int cur_len = min(start + len - cur, unit_size - offset); 4137 4138 assert_eb_folio_uptodate(eb, index); 4139 memset(folio_address(eb->folios[index]) + offset, c, cur_len); 4140 4141 cur += cur_len; 4142 } 4143 } 4144 4145 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 4146 unsigned long len) 4147 { 4148 if (check_eb_range(eb, start, len)) 4149 return; 4150 return memset_extent_buffer(eb, 0, start, len); 4151 } 4152 4153 void copy_extent_buffer_full(const struct extent_buffer *dst, 4154 const struct extent_buffer *src) 4155 { 4156 const int unit_size = src->folio_size; 4157 unsigned long cur = 0; 4158 4159 ASSERT(dst->len == src->len); 4160 4161 while (cur < src->len) { 4162 unsigned long index = get_eb_folio_index(src, cur); 4163 unsigned long offset = get_eb_offset_in_folio(src, cur); 4164 unsigned long cur_len = min(src->len, unit_size - offset); 4165 void *addr = folio_address(src->folios[index]) + offset; 4166 4167 write_extent_buffer(dst, addr, cur, cur_len); 4168 4169 cur += cur_len; 4170 } 4171 } 4172 4173 void copy_extent_buffer(const struct extent_buffer *dst, 4174 const struct extent_buffer *src, 4175 unsigned long dst_offset, unsigned long src_offset, 4176 unsigned long len) 4177 { 4178 const int unit_size = dst->folio_size; 4179 u64 dst_len = 
dst->len; 4180 size_t cur; 4181 size_t offset; 4182 char *kaddr; 4183 unsigned long i = get_eb_folio_index(dst, dst_offset); 4184 4185 if (check_eb_range(dst, dst_offset, len) || 4186 check_eb_range(src, src_offset, len)) 4187 return; 4188 4189 WARN_ON(src->len != dst_len); 4190 4191 offset = get_eb_offset_in_folio(dst, dst_offset); 4192 4193 while (len > 0) { 4194 assert_eb_folio_uptodate(dst, i); 4195 4196 cur = min(len, (unsigned long)(unit_size - offset)); 4197 4198 kaddr = folio_address(dst->folios[i]); 4199 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4200 4201 src_offset += cur; 4202 len -= cur; 4203 offset = 0; 4204 i++; 4205 } 4206 } 4207 4208 /* 4209 * Calculate the folio and offset of the byte containing the given bit number. 4210 * 4211 * @eb: the extent buffer 4212 * @start: offset of the bitmap item in the extent buffer 4213 * @nr: bit number 4214 * @folio_index: return index of the folio in the extent buffer that contains 4215 * the given bit number 4216 * @folio_offset: return offset into the folio given by folio_index 4217 * 4218 * This helper hides the ugliness of finding the byte in an extent buffer which 4219 * contains a given bit. 4220 */ 4221 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 4222 unsigned long start, unsigned long nr, 4223 unsigned long *folio_index, 4224 size_t *folio_offset) 4225 { 4226 size_t byte_offset = BIT_BYTE(nr); 4227 size_t offset; 4228 4229 /* 4230 * The byte we want is the offset of the extent buffer + the offset of 4231 * the bitmap item in the extent buffer + the offset of the byte in the 4232 * bitmap item. 4233 */ 4234 offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; 4235 4236 *folio_index = offset >> eb->folio_shift; 4237 *folio_offset = offset_in_eb_folio(eb, offset); 4238 } 4239 4240 /* 4241 * Determine whether a bit in a bitmap item is set. 4242 * 4243 * @eb: the extent buffer 4244 * @start: offset of the bitmap item in the extent buffer 4245 * @nr: bit number to test 4246 */ 4247 bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 4248 unsigned long nr) 4249 { 4250 unsigned long i; 4251 size_t offset; 4252 u8 *kaddr; 4253 4254 eb_bitmap_offset(eb, start, nr, &i, &offset); 4255 assert_eb_folio_uptodate(eb, i); 4256 kaddr = folio_address(eb->folios[i]); 4257 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 4258 } 4259 4260 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) 4261 { 4262 unsigned long index = get_eb_folio_index(eb, bytenr); 4263 4264 if (check_eb_range(eb, bytenr, 1)) 4265 return NULL; 4266 return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); 4267 } 4268 4269 /* 4270 * Set an area of a bitmap to 1. 4271 * 4272 * @eb: the extent buffer 4273 * @start: offset of the bitmap item in the extent buffer 4274 * @pos: bit number of the first bit 4275 * @len: number of bits to set 4276 */ 4277 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 4278 unsigned long pos, unsigned long len) 4279 { 4280 unsigned int first_byte = start + BIT_BYTE(pos); 4281 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4282 const bool same_byte = (first_byte == last_byte); 4283 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4284 u8 *kaddr; 4285 4286 if (same_byte) 4287 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4288 4289 /* Handle the first byte. 
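 *
 * For example, assuming the usual first/last byte mask semantics, pos = 3
 * and len = 2 stay within a single byte and the mask ends up as
 * BITMAP_FIRST_BYTE_MASK(3) & BITMAP_LAST_BYTE_MASK(5) == 0xf8 & 0x1f == 0x18,
 * i.e. exactly bits 3 and 4.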
*/ 4290 kaddr = extent_buffer_get_byte(eb, first_byte); 4291 *kaddr |= mask; 4292 if (same_byte) 4293 return; 4294 4295 /* Handle the byte aligned part. */ 4296 ASSERT(first_byte + 1 <= last_byte); 4297 memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); 4298 4299 /* Handle the last byte. */ 4300 kaddr = extent_buffer_get_byte(eb, last_byte); 4301 *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); 4302 } 4303 4304 4305 /* 4306 * Clear an area of a bitmap. 4307 * 4308 * @eb: the extent buffer 4309 * @start: offset of the bitmap item in the extent buffer 4310 * @pos: bit number of the first bit 4311 * @len: number of bits to clear 4312 */ 4313 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 4314 unsigned long start, unsigned long pos, 4315 unsigned long len) 4316 { 4317 unsigned int first_byte = start + BIT_BYTE(pos); 4318 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4319 const bool same_byte = (first_byte == last_byte); 4320 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4321 u8 *kaddr; 4322 4323 if (same_byte) 4324 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4325 4326 /* Handle the first byte. */ 4327 kaddr = extent_buffer_get_byte(eb, first_byte); 4328 *kaddr &= ~mask; 4329 if (same_byte) 4330 return; 4331 4332 /* Handle the byte aligned part. */ 4333 ASSERT(first_byte + 1 <= last_byte); 4334 memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); 4335 4336 /* Handle the last byte. */ 4337 kaddr = extent_buffer_get_byte(eb, last_byte); 4338 *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); 4339 } 4340 4341 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4342 { 4343 unsigned long distance = (src > dst) ? src - dst : dst - src; 4344 return distance < len; 4345 } 4346 4347 void memcpy_extent_buffer(const struct extent_buffer *dst, 4348 unsigned long dst_offset, unsigned long src_offset, 4349 unsigned long len) 4350 { 4351 const int unit_size = dst->folio_size; 4352 unsigned long cur_off = 0; 4353 4354 if (check_eb_range(dst, dst_offset, len) || 4355 check_eb_range(dst, src_offset, len)) 4356 return; 4357 4358 if (dst->addr) { 4359 const bool use_memmove = areas_overlap(src_offset, dst_offset, len); 4360 4361 if (use_memmove) 4362 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4363 else 4364 memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); 4365 return; 4366 } 4367 4368 while (cur_off < len) { 4369 unsigned long cur_src = cur_off + src_offset; 4370 unsigned long folio_index = get_eb_folio_index(dst, cur_src); 4371 unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); 4372 unsigned long cur_len = min(src_offset + len - cur_src, 4373 unit_size - folio_off); 4374 void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; 4375 const bool use_memmove = areas_overlap(src_offset + cur_off, 4376 dst_offset + cur_off, cur_len); 4377 4378 __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, 4379 use_memmove); 4380 cur_off += cur_len; 4381 } 4382 } 4383 4384 void memmove_extent_buffer(const struct extent_buffer *dst, 4385 unsigned long dst_offset, unsigned long src_offset, 4386 unsigned long len) 4387 { 4388 unsigned long dst_end = dst_offset + len - 1; 4389 unsigned long src_end = src_offset + len - 1; 4390 4391 if (check_eb_range(dst, dst_offset, len) || 4392 check_eb_range(dst, src_offset, len)) 4393 return; 4394 4395 if (dst_offset < src_offset) { 4396 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4397 return; 4398 } 4399 4400 if 
(dst->addr) { 4401 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4402 return; 4403 } 4404 4405 while (len > 0) { 4406 unsigned long src_i; 4407 size_t cur; 4408 size_t dst_off_in_folio; 4409 size_t src_off_in_folio; 4410 void *src_addr; 4411 bool use_memmove; 4412 4413 src_i = get_eb_folio_index(dst, src_end); 4414 4415 dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); 4416 src_off_in_folio = get_eb_offset_in_folio(dst, src_end); 4417 4418 cur = min_t(unsigned long, len, src_off_in_folio + 1); 4419 cur = min(cur, dst_off_in_folio + 1); 4420 4421 src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - 4422 cur + 1; 4423 use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, 4424 cur); 4425 4426 __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, 4427 use_memmove); 4428 4429 dst_end -= cur; 4430 src_end -= cur; 4431 len -= cur; 4432 } 4433 } 4434 4435 static int try_release_subpage_extent_buffer(struct folio *folio) 4436 { 4437 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4438 struct extent_buffer *eb; 4439 unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); 4440 unsigned long index = start; 4441 unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; 4442 int ret; 4443 4444 rcu_read_lock(); 4445 xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4446 /* 4447 * The same as try_release_extent_buffer(), to ensure the eb 4448 * won't disappear out from under us. 4449 */ 4450 spin_lock(&eb->refs_lock); 4451 rcu_read_unlock(); 4452 4453 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4454 spin_unlock(&eb->refs_lock); 4455 rcu_read_lock(); 4456 continue; 4457 } 4458 4459 /* 4460 * If tree ref isn't set then we know the ref on this eb is a 4461 * real ref, so just return, this eb will likely be freed soon 4462 * anyway. 4463 */ 4464 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4465 spin_unlock(&eb->refs_lock); 4466 break; 4467 } 4468 4469 /* 4470 * Here we don't care about the return value, we will always 4471 * check the folio private at the end. And 4472 * release_extent_buffer() will release the refs_lock. 4473 */ 4474 release_extent_buffer(eb); 4475 rcu_read_lock(); 4476 } 4477 rcu_read_unlock(); 4478 4479 /* 4480 * Finally to check if we have cleared folio private, as if we have 4481 * released all ebs in the page, the folio private should be cleared now. 4482 */ 4483 spin_lock(&folio->mapping->i_private_lock); 4484 if (!folio_test_private(folio)) 4485 ret = 1; 4486 else 4487 ret = 0; 4488 spin_unlock(&folio->mapping->i_private_lock); 4489 return ret; 4490 } 4491 4492 int try_release_extent_buffer(struct folio *folio) 4493 { 4494 struct extent_buffer *eb; 4495 4496 if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) 4497 return try_release_subpage_extent_buffer(folio); 4498 4499 /* 4500 * We need to make sure nobody is changing folio private, as we rely on 4501 * folio private as the pointer to extent buffer. 4502 */ 4503 spin_lock(&folio->mapping->i_private_lock); 4504 if (!folio_test_private(folio)) { 4505 spin_unlock(&folio->mapping->i_private_lock); 4506 return 1; 4507 } 4508 4509 eb = folio_get_private(folio); 4510 BUG_ON(!eb); 4511 4512 /* 4513 * This is a little awful but should be ok, we need to make sure that 4514 * the eb doesn't disappear out from under us while we're looking at 4515 * this page. 
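 * Holding i_private_lock keeps the folio private (and thus the eb pointer
 * we just read) attached, and taking eb->refs_lock before dropping
 * i_private_lock below keeps release_extent_buffer() from freeing the eb
 * underneath us.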
4516 */ 4517 spin_lock(&eb->refs_lock); 4518 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4519 spin_unlock(&eb->refs_lock); 4520 spin_unlock(&folio->mapping->i_private_lock); 4521 return 0; 4522 } 4523 spin_unlock(&folio->mapping->i_private_lock); 4524 4525 /* 4526 * If tree ref isn't set then we know the ref on this eb is a real ref, 4527 * so just return, this page will likely be freed soon anyway. 4528 */ 4529 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4530 spin_unlock(&eb->refs_lock); 4531 return 0; 4532 } 4533 4534 return release_extent_buffer(eb); 4535 } 4536 4537 /* 4538 * Attempt to readahead a child block. 4539 * 4540 * @fs_info: the fs_info 4541 * @bytenr: bytenr to read 4542 * @owner_root: objectid of the root that owns this eb 4543 * @gen: generation for the uptodate check, can be 0 4544 * @level: level for the eb 4545 * 4546 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 4547 * normal uptodate check of the eb, without checking the generation. If we have 4548 * to read the block we will not block on anything. 4549 */ 4550 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 4551 u64 bytenr, u64 owner_root, u64 gen, int level) 4552 { 4553 struct btrfs_tree_parent_check check = { 4554 .level = level, 4555 .transid = gen 4556 }; 4557 struct extent_buffer *eb; 4558 int ret; 4559 4560 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 4561 if (IS_ERR(eb)) 4562 return; 4563 4564 if (btrfs_buffer_uptodate(eb, gen, true)) { 4565 free_extent_buffer(eb); 4566 return; 4567 } 4568 4569 ret = read_extent_buffer_pages_nowait(eb, 0, &check); 4570 if (ret < 0) 4571 free_extent_buffer_stale(eb); 4572 else 4573 free_extent_buffer(eb); 4574 } 4575 4576 /* 4577 * Readahead a node's child block. 4578 * 4579 * @node: parent node we're reading from 4580 * @slot: slot in the parent node for the child we want to read 4581 * 4582 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 4583 * the slot in the node provided. 4584 */ 4585 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 4586 { 4587 btrfs_readahead_tree_block(node->fs_info, 4588 btrfs_node_blockptr(node, slot), 4589 btrfs_header_owner(node), 4590 btrfs_node_ptr_generation(node, slot), 4591 btrfs_header_level(node) - 1); 4592 } 4593
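/*
 * Usage note (illustrative sketch, not part of the original source): the
 * readahead helpers above are meant for tree walking code that wants to
 * start reads for child blocks before descending into them, e.g. roughly:
 *
 *	for (int slot = 0; slot < btrfs_header_nritems(node); slot++)
 *		btrfs_readahead_node_child(node, slot);
 *
 * Because the reads go through read_extent_buffer_pages_nowait(), nothing
 * here waits for I/O completion, and blocks that are already uptodate for
 * the requested generation are skipped via btrfs_buffer_uptodate().
 */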