// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/folio_batch.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include <linux/lockdep.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "locking.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "dev-replace.h"
#include "super.h"
#include "transaction.h"

/*
 * Slab cache for struct extent_buffer objects.  Created by
 * extent_buffer_init_cachep() and destroyed by extent_buffer_free_cachep().
 */
static struct kmem_cache *extent_buffer_cache;

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug only: link @eb into fs_info->allocated_ebs, under eb_leak_lock with
 * interrupts disabled, so that it can be reported by
 * btrfs_extent_buffer_leak_debug_check() if it is never removed.
 */
static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	unsigned long flags;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	list_add(&eb->leak_list, &fs_info->allocated_ebs);
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

/*
 * Debug only: unlink @eb from fs_info->allocated_ebs, under eb_leak_lock with
 * interrupts disabled.
 */
static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	unsigned long flags;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

/*
 * Debug only: warn about, log and free every extent buffer that is still
 * linked on fs_info->allocated_ebs.
 */
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	WARN_ON(!list_empty(&fs_info->allocated_ebs));
	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		btrfs_err(fs_info,
		       "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
		       eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		WARN_ON_ONCE(1);
		/* The object goes straight back to the slab cache. */
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
#else
/* Leak tracking compiles away when CONFIG_BTRFS_DEBUG is not set. */
#define btrfs_leak_debug_add_eb(eb)			do {} while (0)
#define btrfs_leak_debug_del_eb(eb)			do {} while (0)
#endif

/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* The bio currently being built, NULL if there is none. */
	struct btrfs_bio *bbio;
	/* Last byte contained in bbio + 1 . */
	loff_t next_file_offset;
	/*
	 * When not BTRFS_COMPRESS_NONE, read bios are submitted through
	 * btrfs_submit_compressed_read() by submit_one_bio().
	 */
	enum btrfs_compression_type compress_type;
	/*
	 * Bytes that can still be added before the ordered extent boundary.
	 * U32_MAX means no ordered extent is being tracked, see
	 * alloc_new_bio() and submit_extent_folio().
	 */
	u32 len_to_oe_boundary;
	/* Operation flags handed to btrfs_bio_alloc() for every new bio. */
	blk_opf_t opf;
	/*
	 * For data read bios, we attempt to optimize csum lookups if the extent
	 * generation is older than the current one. To make this possible, we
	 * need to track the maximum generation of an extent in a bio_ctrl to
	 * make the decision when submitting the bio.
	 *
	 * The pattern between do_readpage(), submit_one_bio() and
	 * submit_extent_folio() is quite subtle, so tracking this is tricky.
	 *
	 * As we process extent E, we might submit a bio with existing built up
	 * extents before adding E to a new bio, or we might just add E to the
	 * bio. As a result, E's generation could apply to the current bio or
	 * to the next one, so we need to be careful to update the bio_ctrl's
	 * generation with E's only when we are sure E is added to bio_ctrl->bbio
	 * in submit_extent_folio().
	 *
	 * See the comment in btrfs_lookup_bio_sums() for more detail on the
	 * need for this optimization.
	 */
	u64 generation;
	/* Completion handler handed to btrfs_bio_alloc() for every new bio. */
	btrfs_bio_end_io_t end_io_func;
	/*
	 * Writeback control.  When set, alloc_new_bio() treats the bio as a
	 * data write and attaches an ordered extent to it.
	 */
	struct writeback_control *wbc;

	/*
	 * The sectors of the page which are going to be submitted by
	 * extent_writepage_io().
	 * This is to avoid touching ranges covered by compression/inline.
	 */
	unsigned long submit_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)];

	/* Readahead control, NULL when the read is not part of readahead. */
	struct readahead_control *ractl;

	/*
	 * The start offset of the last used extent map by a read operation.
	 *
	 * This is for proper compressed read merge.
	 * U64_MAX means we are starting the read and have made no progress yet.
	 *
	 * The current btrfs_bio_is_contig() only uses disk_bytenr as
	 * the condition to check if the read can be merged with previous
	 * bio, which is not correct. E.g. two file extents pointing to the
	 * same extent but with different offset.
	 *
	 * So here we need to do extra checks to only merge reads that are
	 * covered by the same extent map.
	 * Just extent_map::start will be enough, as they are unique
	 * inside the same inode.
	 */
	u64 last_em_start;
};

/*
 * Helper to set the csum search commit root option for a bio_ctrl's bbio
 * before submitting the bio.
 *
 * Only for use by submit_one_bio().
161 */ 162 static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl) 163 { 164 struct btrfs_bio *bbio = bio_ctrl->bbio; 165 166 ASSERT(bbio); 167 168 if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode))) 169 return; 170 171 bio_ctrl->bbio->csum_search_commit_root = 172 (bio_ctrl->generation && 173 bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info)); 174 } 175 176 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 177 { 178 struct btrfs_bio *bbio = bio_ctrl->bbio; 179 180 if (!bbio) 181 return; 182 183 /* Caller should ensure the bio has at least some range added */ 184 ASSERT(bbio->bio.bi_iter.bi_size); 185 186 bio_set_csum_search_commit_root(bio_ctrl); 187 188 if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && 189 bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 190 btrfs_submit_compressed_read(bbio); 191 else 192 btrfs_submit_bbio(bbio, 0); 193 194 /* The bbio is owned by the end_io handler now */ 195 bio_ctrl->bbio = NULL; 196 /* 197 * We used the generation to decide whether to lookup csums in the 198 * commit_root or not when we called bio_set_csum_search_commit_root() 199 * above. Now, reset the generation for the next bio. 200 */ 201 bio_ctrl->generation = 0; 202 } 203 204 /* 205 * Submit or fail the current bio in the bio_ctrl structure. 
 *
 * @ret:	0 to submit the bio, or a negative errno to end the bio with
 *		that error instead of submitting it.
 *
 * Does nothing if no bio is being assembled.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
	struct btrfs_bio *bbio = bio_ctrl->bbio;

	if (!bbio)
		return;

	if (ret) {
		ASSERT(ret < 0);
		btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
		/* The bio is owned by the end_io handler now */
		bio_ctrl->bbio = NULL;
	} else {
		submit_one_bio(bio_ctrl);
	}
}

/*
 * Create the slab cache used for struct extent_buffer.
 *
 * Return 0 on success, -ENOMEM if the cache could not be created.
 */
int __init extent_buffer_init_cachep(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
						sizeof(struct extent_buffer), 0, 0,
						NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	return 0;
}

/* Destroy the extent buffer slab cache. */
void __cold extent_buffer_free_cachep(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
}

/*
 * Apply the operations selected in @page_ops to @folio for the inclusive byte
 * range [@start, @end].
 *
 * PAGE_START_WRITEBACK clears dirty and sets writeback, PAGE_END_WRITEBACK
 * clears writeback, PAGE_UNLOCK ends the lock on the range.  @locked_folio is
 * never unlocked here.
 */
static void process_one_folio(struct btrfs_fs_info *fs_info,
			      struct folio *folio, const struct folio *locked_folio,
			      unsigned long page_ops, u64 start, u64 end)
{
	u32 len;

	/* The length has to be non-zero and fit the u32 it is stored in. */
	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
	len = end + 1 - start;

	if (page_ops & PAGE_START_WRITEBACK) {
		btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
		btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
	}
	if (page_ops & PAGE_END_WRITEBACK)
		btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);

	if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
		btrfs_folio_end_lock(fs_info, folio, start, len);
}

/*
 * Run process_one_folio() with @page_ops on every folio that
 * filemap_get_folios_contig() returns for the page index range covering
 * [@start, @end] in @mapping.
 */
static void __process_folios_contig(struct address_space *mapping,
				    const struct folio *locked_folio, u64 start,
				    u64 end, unsigned long page_ops)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
	pgoff_t index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	struct folio_batch fbatch;
	int i;

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		int found_folios;

		found_folios = filemap_get_folios_contig(mapping, &index,
				end_index, &fbatch);
		for (i = 0; i < found_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			/* The full byte range is passed for every folio. */
			process_one_folio(fs_info, folio, locked_folio,
					  page_ops, start, end);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/*
 * Unlock the folios in the inclusive range [@start, @end], leaving
 * @locked_folio (which must not be NULL) locked.
 */
static noinline void unlock_delalloc_folio(const struct inode *inode,
					   struct folio *locked_folio,
					   u64 start, u64 end)
{
	ASSERT(locked_folio);

	__process_folios_contig(inode->i_mapping, locked_folio, start, end,
				PAGE_UNLOCK);
}

/*
 * Lock the folios covering the inclusive range [@start, @end], skipping
 * @locked_folio.
 *
 * Return 0 if every folio was locked.  Return -EAGAIN if no folio was found
 * at the next index, or if a folio turned out to be clean or to no longer
 * belong to this mapping once locked; whatever range had been recorded as
 * processed by then is unlocked again before returning.
 */
static noinline int lock_delalloc_folios(struct inode *inode,
					 struct folio *locked_folio,
					 u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	u64 processed_end = start;
	struct folio_batch fbatch;

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		unsigned int found_folios, i;

		found_folios = filemap_get_folios_contig(mapping, &index,
				end_index, &fbatch);
		if (found_folios == 0)
			goto out;

		for (i = 0; i < found_folios; i++) {
			struct folio *folio = fbatch.folios[i];
			u64 range_start;
			u32 range_len;

			if (folio == locked_folio)
				continue;

			folio_lock(folio);
			if (!folio_test_dirty(folio) || folio->mapping != mapping) {
				folio_unlock(folio);
				goto out;
			}
			/* Clamp the locked range to the part of the folio inside [start, end]. */
			range_start = max_t(u64, folio_pos(folio), start);
			range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
			btrfs_folio_set_lock(fs_info, folio, range_start, range_len);

			processed_end = range_start + range_len - 1;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	return 0;
out:
	folio_batch_release(&fbatch);
	if (processed_end > start)
		unlock_delalloc_folio(inode, locked_folio, start, processed_end);
	return -EAGAIN;
}

/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes (a local that starts as fs_info->max_extent_size).
 *
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
						 struct folio *locked_folio,
						 u64 *start, u64 *end)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	const u64 orig_start = *start;
	const u64 orig_end = *end;
	u64 max_bytes = fs_info->max_extent_size;
	u64 delalloc_start;
	u64 delalloc_end;
	bool found;
	struct extent_state *cached_state = NULL;
	int ret;
	bool loops = false;

	/* Caller should pass a valid @end to indicate the search range end */
	ASSERT(orig_end > orig_start);

	/* The range should at least cover part of the folio */
	ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
		 orig_end <= folio_pos(locked_folio)));
again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;

	/*
	 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
	 * return early without handling any dirty ranges.
	 */
	ASSERT(max_bytes >= fs_info->sectorsize);

	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
					  max_bytes, &cached_state);
	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
		*start = delalloc_start;

		/* @delalloc_end can be -1, never go beyond @orig_end */
		*end = min(delalloc_end, orig_end);
		btrfs_free_extent_state(cached_state);
		return false;
	}

	/*
	 * start comes from the offset of locked_folio.  We have to lock
	 * folios in order, so we can't process delalloc bytes before
	 * locked_folio
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of folios we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the folios after the folios that has start */
	ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
				   delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/*
		 * Some of the folios are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching.
		 */
		btrfs_free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			/* Retry exactly once, limited to a single block. */
			max_bytes = fs_info->sectorsize;
			loops = true;
			goto again;
		} else {
			return false;
		}
	}

	/* step three, lock the state bits for the whole range */
	btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
				   EXTENT_DELALLOC, cached_state);

	btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
	if (!ret) {
		/* Not all delalloc any more: drop the folio locks and search again. */
		unlock_delalloc_folio(inode, locked_folio, delalloc_start,
				      delalloc_end);
		cond_resched();
		goto again;
	}
	*start = delalloc_start;
	*end = delalloc_end;

	return found;
}

/*
 * Clear @clear_bits from the inode's io_tree for the inclusive range
 * [@start, @end], then apply @page_ops to the folios of that range (see
 * process_one_folio() for the meaning of @page_ops and @locked_folio).
 */
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  const struct folio *locked_folio,
				  struct extent_state **cached,
				  u32 clear_bits, unsigned long page_ops)
{
	btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);

	__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
				end, page_ops);
}

/*
 * Return true if @folio needs no fsverity check (no verity info, or the range
 * is already uptodate), otherwise the result of fsverity_verify_folio().
 */
static bool btrfs_verify_folio(struct fsverity_info *vi, struct folio *folio,
			       u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	if (!vi || btrfs_folio_test_uptodate(fs_info, folio, start, len))
		return true;
	return fsverity_verify_folio(vi, folio);
}

/*
 * Finish the read of the range [@start, @start + @len) inside @folio: update
 * the uptodate state of the range and release the lock taken for the read.
 *
 * The range is only marked uptodate when @uptodate is true and
 * btrfs_verify_folio() passes; otherwise its uptodate state is cleared.
 */
static void end_folio_read(struct fsverity_info *vi, struct folio *folio,
			   bool uptodate, u64 start, u32 len)
{
	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);

	ASSERT(folio_pos(folio) <= start &&
	       start + len <= folio_next_pos(folio));

	if (uptodate && btrfs_verify_folio(vi, folio, start, len))
		btrfs_folio_set_uptodate(fs_info, folio, start, len);
	else
		btrfs_folio_clear_uptodate(fs_info, folio, start, len);

	/* Non-subpage folios are unlocked whole, subpage ones per range. */
	if (!btrfs_is_subpage(fs_info, folio))
		folio_unlock(folio);
	else
		btrfs_folio_end_lock(fs_info, folio, start, len);
}

/*
 * After a write IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - clear the writeback bits in the extent tree for the range
 * - folio_end_writeback() if there is no more pending io for the folio
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bbio_data_write(struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct bio *bio = &bbio->bio;
	int error = blk_status_to_errno(bio->bi_status);
	struct folio_iter fi;
	u32 bio_size = 0;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_folio_all(fi, bio) {
		struct folio *folio = fi.folio;
		u64 start = folio_pos(folio) + fi.offset;
		u32 len = fi.length;

		/* Sum up the bio length, it is reported to the ordered extent below. */
		bio_size += len;
		btrfs_folio_clear_writeback(fs_info, folio, start, len);
	}

	if (error)
		mapping_set_error(bbio->inode->vfs_inode.i_mapping, error);

	btrfs_finish_ordered_extent(bbio->ordered, bbio->file_offset, bio_size, !error);
	bio_put(bio);
}

/*
 * Prepare a locked @folio for a read.  For subpage folios the whole folio
 * range is marked as locked so that end_folio_read() can release it range by
 * range; nothing is needed otherwise.
 */
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
	ASSERT(folio_test_locked(folio));
	if (!btrfs_is_subpage(fs_info, folio))
		return;

	ASSERT(folio_test_private(folio));
	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}

/*
 * After a data read IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - set the uptodate bits if things worked
 * - set the folio up to date if all extents in the tree are uptodate
 * - clear the lock bit in the extent tree
 * - unlock the folio if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is
expected 562 * to have one and only one object corresponding to this IO. 563 */ 564 static void end_bbio_data_read(struct btrfs_bio *bbio) 565 { 566 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 567 struct inode *inode = &bbio->inode->vfs_inode; 568 struct bio *bio = &bbio->bio; 569 struct fsverity_info *vi = NULL; 570 struct folio_iter fi; 571 572 ASSERT(!bio_flagged(bio, BIO_CLONED)); 573 574 if (bbio->file_offset < i_size_read(inode)) 575 vi = fsverity_get_info(inode); 576 577 bio_for_each_folio_all(fi, &bbio->bio) { 578 bool uptodate = !bio->bi_status; 579 struct folio *folio = fi.folio; 580 u64 start = folio_pos(folio) + fi.offset; 581 582 btrfs_debug(fs_info, 583 "%s: bi_sector=%llu, err=%d, mirror=%u", 584 __func__, bio->bi_iter.bi_sector, bio->bi_status, 585 bbio->mirror_num); 586 587 588 if (likely(uptodate)) { 589 u64 end = start + fi.length - 1; 590 loff_t i_size = i_size_read(inode); 591 592 /* 593 * Zero out the remaining part if this range straddles 594 * i_size. 595 * 596 * Here we should only zero the range inside the folio, 597 * not touch anything else. 598 * 599 * NOTE: i_size is exclusive while end is inclusive and 600 * folio_contains() takes PAGE_SIZE units. 601 */ 602 if (folio_contains(folio, i_size >> PAGE_SHIFT) && 603 i_size <= end) { 604 u32 zero_start = max(offset_in_folio(folio, i_size), 605 offset_in_folio(folio, start)); 606 u32 zero_len = offset_in_folio(folio, end) + 1 - 607 zero_start; 608 609 folio_zero_range(folio, zero_start, zero_len); 610 } 611 } 612 613 /* Update page status and unlock. */ 614 end_folio_read(vi, folio, uptodate, start, fi.length); 615 } 616 bio_put(bio); 617 } 618 619 /* 620 * Populate every free slot in a provided array with folios. 
621 * 622 * @nr_folios: number of folios to allocate 623 * @order: folio order 624 * @folio_array: array to fill with folios; non-NULL entries are skipped 625 * @gfp: GFP flags for the allocation 626 * 627 * Return: 0 if all folios were able to be allocated; 628 * -ENOMEM otherwise, the partially allocated folios would be freed and 629 * the array slots zeroed 630 */ 631 int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order, 632 struct folio **folio_array, gfp_t gfp) 633 { 634 for (int i = 0; i < nr_folios; i++) { 635 if (folio_array[i]) 636 continue; 637 folio_array[i] = folio_alloc(gfp, order); 638 if (!folio_array[i]) 639 goto error; 640 } 641 return 0; 642 error: 643 for (int i = 0; i < nr_folios; i++) { 644 if (folio_array[i]) 645 folio_put(folio_array[i]); 646 folio_array[i] = NULL; 647 } 648 return -ENOMEM; 649 } 650 651 /* 652 * Populate every free slot in a provided array with pages. 653 * 654 * @nr_pages: number of pages to allocate 655 * @page_array: array to fill; non-NULL entries are skipped 656 * @gfp: GFP flags for the allocation 657 * 658 * Return: 0 if all pages were able to be allocated; 659 * -ENOMEM otherwise, the partially allocated pages would be freed and 660 * the array slots zeroed 661 */ 662 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t gfp) 663 { 664 unsigned int allocated; 665 666 for (allocated = 0; allocated < nr_pages;) { 667 unsigned int last = allocated; 668 669 allocated = alloc_pages_bulk(gfp, nr_pages, page_array); 670 if (unlikely(allocated == last)) { 671 /* No progress, fail and do cleanup. */ 672 for (int i = 0; i < allocated; i++) { 673 __free_page(page_array[i]); 674 page_array[i] = NULL; 675 } 676 return -ENOMEM; 677 } 678 } 679 return 0; 680 } 681 682 /* 683 * Populate needed folios for the extent buffer. 684 * 685 * For now, the folios populated are always in order 0 (aka, single page). 
686 */ 687 static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t gfp) 688 { 689 struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; 690 int num_pages = num_extent_pages(eb); 691 int ret; 692 693 ret = btrfs_alloc_page_array(num_pages, page_array, gfp); 694 if (ret < 0) 695 return ret; 696 697 for (int i = 0; i < num_pages; i++) 698 eb->folios[i] = page_folio(page_array[i]); 699 eb->folio_size = PAGE_SIZE; 700 eb->folio_shift = PAGE_SHIFT; 701 return 0; 702 } 703 704 static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, 705 u64 disk_bytenr, loff_t file_offset) 706 { 707 struct bio *bio = &bio_ctrl->bbio->bio; 708 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 709 710 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 711 /* 712 * For compression, all IO should have its logical bytenr set 713 * to the starting bytenr of the compressed extent. 714 */ 715 return bio->bi_iter.bi_sector == sector; 716 } 717 718 /* 719 * To merge into a bio both the disk sector and the logical offset in 720 * the file need to be contiguous. 721 */ 722 return bio_ctrl->next_file_offset == file_offset && 723 bio_end_sector(bio) == sector; 724 } 725 726 static int alloc_new_bio(struct btrfs_inode *inode, 727 struct btrfs_bio_ctrl *bio_ctrl, 728 u64 disk_bytenr, u64 file_offset) 729 { 730 struct btrfs_fs_info *fs_info = inode->root->fs_info; 731 struct btrfs_bio *bbio; 732 733 bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode, 734 file_offset, bio_ctrl->end_io_func, NULL); 735 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 736 bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; 737 bio_ctrl->bbio = bbio; 738 bio_ctrl->len_to_oe_boundary = U32_MAX; 739 bio_ctrl->next_file_offset = file_offset; 740 741 /* Limit data write bios to the ordered boundary. */ 742 if (bio_ctrl->wbc) { 743 struct btrfs_ordered_extent *ordered; 744 745 /* This must be a write for data inodes. 
*/ 746 ASSERT(btrfs_op(&bio_ctrl->bbio->bio) == BTRFS_MAP_WRITE); 747 ASSERT(is_data_inode(inode)); 748 749 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 750 if (unlikely(!ordered)) { 751 bio_ctrl->bbio = NULL; 752 bio_ctrl->next_file_offset = 0; 753 bio_put(&bbio->bio); 754 btrfs_err_rl(fs_info, 755 "root %lld ino %llu file offset %llu is marked dirty without notifying the fs", 756 btrfs_root_id(inode->root), btrfs_ino(inode), 757 file_offset); 758 return -EUCLEAN; 759 } 760 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 761 ordered->file_offset + 762 ordered->disk_num_bytes - file_offset); 763 bbio->ordered = ordered; 764 765 /* 766 * Pick the last added device to support cgroup writeback. For 767 * multi-device file systems this means blk-cgroup policies have 768 * to always be set on the last added/replaced device. 769 * This is a bit odd but has been like that for a long time. 770 */ 771 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 772 wbc_init_bio(bio_ctrl->wbc, &bbio->bio); 773 } 774 return 0; 775 } 776 777 /* 778 * @disk_bytenr: logical bytenr where the write will be 779 * @page: page to add to the bio 780 * @size: portion of page that we want to write to 781 * @pg_offset: offset of the new bio or to check whether we are adding 782 * a contiguous page to the previous one 783 * @read_em_generation: generation of the extent_map we are submitting 784 * (only used for read) 785 * 786 * The will either add the page into the existing @bio_ctrl->bbio, or allocate a 787 * new one in @bio_ctrl->bbio. 788 * The mirror number for this IO should already be initialized in 789 * @bio_ctrl->mirror_num. 790 * 791 * Return the number of bytes that are queued into a bio. 792 * If the returned bytes is smaller than @size, it means we hit a critical error 793 * for data write, where there is no ordered extent for the range. 
 */
static unsigned int submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
					u64 disk_bytenr, struct folio *folio,
					size_t size, unsigned long pg_offset,
					u64 read_em_generation)
{
	struct btrfs_inode *inode = folio_to_inode(folio);
	loff_t file_offset = folio_pos(folio) + pg_offset;
	unsigned int queued = 0;

	ASSERT(pg_offset + size <= folio_size(folio));
	ASSERT(bio_ctrl->end_io_func);

	/* The pending bio can't take this range, send it off first. */
	if (bio_ctrl->bbio &&
	    !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
		submit_one_bio(bio_ctrl);

	do {
		u32 len = size;

		/* Allocate new bio if needed */
		if (!bio_ctrl->bbio) {
			int ret;

			ret = alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
			/* Give up; the caller sees this as queued < size. */
			if (ret < 0)
				break;
		}

		/* Cap to the current ordered extent boundary if there is one. */
		if (len > bio_ctrl->len_to_oe_boundary) {
			ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
			ASSERT(is_data_inode(inode));
			len = bio_ctrl->len_to_oe_boundary;
		}

		if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
			/* bio full: move on to a new one */
			submit_one_bio(bio_ctrl);
			continue;
		}
		/*
		 * Now that the folio is definitely added to the bio, include its
		 * generation in the max generation calculation.
		 */
		bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
		bio_ctrl->next_file_offset += len;

		if (bio_ctrl->wbc)
			wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);

		/* Advance all cursors past the part that was just queued. */
		size -= len;
		pg_offset += len;
		disk_bytenr += len;
		file_offset += len;
		queued += len;

		/*
		 * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
		 * sector aligned.  alloc_new_bio() then sets it to the end of
		 * our ordered extent for writes into zoned devices.
		 *
		 * When len_to_oe_boundary is tracking an ordered extent, we
		 * trust the ordered extent code to align things properly, and
		 * the check above to cap our write to the ordered extent
		 * boundary is correct.
		 *
		 * When len_to_oe_boundary is U32_MAX, the cap above would
		 * result in a 4095 byte IO for the last folio right before
		 * we hit the bio limit of UINT_MAX.  bio_add_folio() has all
		 * the checks required to make sure we don't overflow the bio,
		 * and we should just ignore len_to_oe_boundary completely
		 * unless we're using it to track an ordered extent.
		 *
		 * It's pretty hard to make a bio sized U32_MAX, but it can
		 * happen when the page cache is able to feed us contiguous
		 * folios for large extents.
		 */
		if (bio_ctrl->len_to_oe_boundary != U32_MAX)
			bio_ctrl->len_to_oe_boundary -= len;

		/* Ordered extent boundary: move on to a new bio. */
		if (bio_ctrl->len_to_oe_boundary == 0)
			submit_one_bio(bio_ctrl);
		/*
		 * If we have accumulated decent amount of IO, send it to the
		 * block layer so that IO can run while we are accumulating
		 * more folios to write.
		 */
		else if (bio_ctrl->wbc &&
			 bio_ctrl->bbio->bio.bi_iter.bi_size >=
			 inode->root->fs_info->writeback_bio_size)
			submit_one_bio(bio_ctrl);

	} while (size);
	return queued;
}

/*
 * Attach the private data an extent buffer folio needs.
 *
 * Without subpage metadata the folio private points to @eb itself.  With
 * subpage metadata the folio gets a btrfs_folio_state instead, either
 * @prealloc or a newly attached one; @prealloc is freed when the folio
 * already has private data.
 *
 * Return 0 on success, or the error from btrfs_attach_folio_state().
 */
static int attach_extent_buffer_folio(struct extent_buffer *eb,
				      struct folio *folio,
				      struct btrfs_folio_state *prealloc)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;

	/*
	 * If the page is mapped to btree inode, we should hold the private
	 * lock to prevent race.
	 * For cloned or dummy extent buffers, their pages are not mapped and
	 * will not race with any other ebs.
	 */
	if (folio->mapping)
		lockdep_assert_held(&folio->mapping->i_private_lock);

	if (!btrfs_meta_is_subpage(fs_info)) {
		if (!folio_test_private(folio))
			folio_attach_private(folio, eb);
		else
			WARN_ON(folio_get_private(folio) != eb);
		return 0;
	}

	/* Already mapped, just free prealloc */
	if (folio_test_private(folio)) {
		btrfs_free_folio_state(prealloc);
		return 0;
	}

	if (prealloc)
		/* Has preallocated memory for subpage */
		folio_attach_private(folio, prealloc);
	else
		/* Do new allocation to attach subpage */
		ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
	return ret;
}

/*
 * Make sure a data folio carries btrfs private data.
 *
 * Nothing is done when the folio already has private data.  Otherwise a
 * subpage folio gets a btrfs_folio_state attached and any other folio gets
 * the EXTENT_FOLIO_PRIVATE marker.
 *
 * Return 0 on success, or the error from btrfs_attach_folio_state().
 */
int set_folio_extent_mapped(struct folio *folio)
{
	struct btrfs_fs_info *fs_info;

	ASSERT(folio->mapping);

	if (folio_test_private(folio))
		return 0;

	fs_info = folio_to_fs_info(folio);

	if (btrfs_is_subpage(fs_info, folio))
		return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);

	folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
	return 0;
}

/*
 * Remove the btrfs private data from a data folio, if it has any.  This is
 * the counterpart of set_folio_extent_mapped().
 */
void clear_folio_extent_mapped(struct folio *folio)
{
	struct btrfs_fs_info *fs_info;

	ASSERT(folio->mapping);
	/*
	 * The folio should not have writeback nor dirty flag set.
	 *
	 * If dirty flag is set, the folio can be written back again and we
	 * expect the private flag set for the folio.
	 *
	 * If writeback flag is set, the endio may need to utilize the
	 * private for btrfs_folio_state.
	 */
	ASSERT(!folio_test_dirty(folio));
	ASSERT(!folio_test_writeback(folio));

	if (!folio_test_private(folio))
		return;

	fs_info = folio_to_fs_info(folio);
	if (btrfs_is_subpage(fs_info, folio))
		return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);

	folio_detach_private(folio);
}

/*
 * Get the extent map covering @start, going through the one-entry cache in
 * @em_cached.
 *
 * A cached map is reused when it is still in the tree and contains @start;
 * otherwise it is dropped and btrfs_get_extent() is called, with a successful
 * result stored in @em_cached.
 *
 * The returned map carries a reference of its own in addition to the one
 * held by the cache.  The return value of btrfs_get_extent() is passed on
 * unchanged, so it has to be checked with IS_ERR().
 */
static struct extent_map *get_extent_map(struct btrfs_inode *inode,
					 struct folio *folio, u64 start,
					 u64 len, struct extent_map **em_cached)
{
	struct extent_map *em;

	ASSERT(em_cached);

	if (*em_cached) {
		em = *em_cached;
		if (btrfs_extent_map_in_tree(em) && start >= em->start &&
		    start < btrfs_extent_map_end(em)) {
			refcount_inc(&em->refs);
			return em;
		}

		/* The cached map does not help any more, drop it. */
		btrfs_free_extent_map(em);
		*em_cached = NULL;
	}

	em = btrfs_get_extent(inode, folio, start, len);
	if (!IS_ERR(em)) {
		BUG_ON(*em_cached);
		/* One more reference for the copy kept in the cache. */
		refcount_inc(&em->refs);
		*em_cached = em;
	}

	return em;
}

/*
 * Expand the readahead window of @ractl up to the end of @em when the extent
 * map ends beyond the current window.
 */
static void btrfs_readahead_expand(struct readahead_control *ractl,
				   const struct extent_map *em)
{
	const u64 ra_pos = readahead_pos(ractl);
	const u64 ra_end = ra_pos + readahead_length(ractl);
	const u64 em_end = btrfs_extent_map_end(em);

	/* No expansion for holes and inline extents. */
	if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
		return;

	ASSERT(em_end >= ra_pos,
	       "extent_map %llu %llu ends before current readahead position %llu",
	       em->start, em->len, ra_pos);
	if (em_end > ra_end)
		readahead_expand(ractl, ra_pos, em_end - ra_pos);
}

/*
 * basic readpage implementation.
 * Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 * XXX JDM: This needs looking at to ensure proper page locking
 *
 * @folio:     the folio to read; it is unlocked here only if attaching the
 *             private data fails
 * @em_cached: in/out extent map cache slot handed to get_extent_map()
 * @bio_ctrl:  the read bio being assembled; its end_io_func is set to
 *             end_bbio_data_read
 * @vi:        fsverity info forwarded to end_folio_read()
 *
 * return 0 on success, otherwise return error (from
 * set_folio_extent_mapped() or from the extent map lookup)
 */
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
			     struct btrfs_bio_ctrl *bio_ctrl,
			     struct fsverity_info *vi)
{
	struct inode *inode = folio->mapping->host;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	u64 start = folio_pos(folio);
	const u64 end = start + folio_size(folio) - 1;
	u64 extent_offset;
	u64 locked_end;
	u64 last_byte = i_size_read(inode);
	struct extent_map *em;
	int ret = 0;
	const size_t blocksize = fs_info->sectorsize;

	/*
	 * For readahead the extent map search range is the whole readahead
	 * window, otherwise it is just this folio.
	 */
	if (bio_ctrl->ractl)
		locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1;
	else
		locked_end = end;

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		return ret;
	}

	/* Zero the part of the folio that lies beyond i_size. */
	if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
		size_t zero_offset = offset_in_folio(folio, last_byte);

		if (zero_offset)
			folio_zero_range(folio, zero_offset,
					 folio_size(folio) - zero_offset);
	}
	bio_ctrl->end_io_func = end_bbio_data_read;
	begin_folio_read(fs_info, folio);
	/* Walk the folio one block at a time. */
	for (u64 cur = start; cur <= end; cur += blocksize) {
		enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
		unsigned long pg_offset = offset_in_folio(folio, cur);
		bool force_bio_submit = false;
		u64 disk_bytenr;
		u64 block_start;
		u64 em_gen;
		unsigned int queued;

		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
		/* Everything from here to the folio end is beyond i_size. */
		if (cur >= last_byte) {
			folio_zero_range(folio, pg_offset, end - cur + 1);
			end_folio_read(vi, folio, true, cur, end - cur + 1);
			break;
		}
		/* Nothing to read for a block that is already uptodate. */
		if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
			end_folio_read(vi, folio, true, cur, blocksize);
			continue;
		}
		/*
		 * Search extent map for the whole locked range.
		 * This will allow btrfs_get_extent() to return a larger hole
		 * when possible.
		 * This can reduce duplicated btrfs_get_extent() calls for large
		 * holes.
		 */
		em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached);
		if (IS_ERR(em)) {
			/* Fail the read for the rest of the folio. */
			end_folio_read(vi, folio, false, cur, end + 1 - cur);
			return PTR_ERR(em);
		}
		extent_offset = cur - em->start;
		BUG_ON(btrfs_extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		compress_type = btrfs_extent_map_compression(em);

		/*
		 * Only expand readahead for extents which are already creating
		 * the pages anyway in add_ra_bio_pages, which is compressed
		 * extents in the non subpage case.
		 */
		if (bio_ctrl->ractl &&
		    !btrfs_is_subpage(fs_info, folio) &&
		    compress_type != BTRFS_COMPRESS_NONE)
			btrfs_readahead_expand(bio_ctrl->ractl, em);

		/*
		 * A compressed extent is read from its start on disk, an
		 * uncompressed one from the offset matching @cur.
		 */
		if (compress_type != BTRFS_COMPRESS_NONE)
			disk_bytenr = em->disk_bytenr;
		else
			disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;

		/* Preallocated extents are read as holes. */
		if (em->flags & EXTENT_FLAG_PREALLOC)
			block_start = EXTENT_MAP_HOLE;
		else
			block_start = btrfs_extent_map_block_start(em);

		/*
		 * If we have a file range that points to a compressed extent
		 * and it's followed by a consecutive file range that points
		 * to the same compressed extent (possibly with a different
		 * offset and/or length, so it either points to the whole extent
		 * or only part of it), we must make sure we do not submit a
		 * single bio to populate the folios for the 2 ranges because
		 * this makes the compressed extent read zero out the folios
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the folios belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * folios that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the second range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the folios
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (compress_type != BTRFS_COMPRESS_NONE &&
		    bio_ctrl->last_em_start != U64_MAX &&
		    bio_ctrl->last_em_start != em->start)
			force_bio_submit = true;

		bio_ctrl->last_em_start = em->start;

		/* Save what we still need from the extent map, then drop it. */
		em_gen = em->generation;
		btrfs_free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			folio_zero_range(folio, pg_offset, blocksize);
			end_folio_read(vi, folio, true, cur, blocksize);
			continue;
		}
		/* the get_extent function already copied into the folio */
		if (block_start == EXTENT_MAP_INLINE) {
			end_folio_read(vi, folio, true, cur, blocksize);
			continue;
		}

		/* A bio never mixes compression types, flush on change. */
		if (bio_ctrl->compress_type != compress_type) {
			submit_one_bio(bio_ctrl);
			bio_ctrl->compress_type = compress_type;
		}

		if (force_bio_submit)
			submit_one_bio(bio_ctrl);
		queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
					     pg_offset, em_gen);
		/* Read submission should not fail. */
		ASSERT(queued == blocksize);
	}
	return 0;
}

/*
 * Check if we can skip waiting for the @ordered extent covering the block at
 * @fileoff.
 *
 * @fileoff: Both input and output.
 *           Input as the file offset where the check should start at.
 *           Output as where the next check should start at,
 *           if the function returns true.
 *
 * Return true if we can skip to @fileoff. The caller needs to check the new
 * @fileoff value to make sure it covers the full range, before skipping the
 * full OE.
 *
 * Return false if we must wait for the ordered extent.
1208 */ 1209 static bool can_skip_one_ordered_range(struct btrfs_inode *inode, 1210 struct btrfs_ordered_extent *ordered, 1211 u64 *fileoff) 1212 { 1213 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 1214 struct folio *folio; 1215 const u32 blocksize = fs_info->sectorsize; 1216 u64 cur = *fileoff; 1217 bool ret; 1218 1219 folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); 1220 1221 /* 1222 * We should have locked the folio(s) for range [start, end], thus 1223 * there must be a folio and it must be locked. 1224 */ 1225 ASSERT(!IS_ERR(folio)); 1226 ASSERT(folio_test_locked(folio)); 1227 1228 /* 1229 * There are several cases for the folio and OE combination: 1230 * 1231 * 1) Folio has no private flag 1232 * The OE has all its IO done but not yet finished, and folio got 1233 * invalidated. 1234 * 1235 * Have we have to wait for the OE to finish, as it may contain the 1236 * to-be-inserted data checksum. 1237 * Without the data checksum inserted into the csum tree, read will 1238 * just fail with missing csum. 1239 */ 1240 if (!folio_test_private(folio)) { 1241 ret = false; 1242 goto out; 1243 } 1244 1245 /* 1246 * 2) The first block is DIRTY. 1247 * 1248 * This means the OE is created by some other folios whose file pos is 1249 * before this one. And since we are holding the folio lock, the writeback 1250 * of this folio cannot start. 1251 * 1252 * We must skip the whole OE, because it will never start until we 1253 * finished our folio read and unlocked the folio. 1254 */ 1255 if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { 1256 u64 range_len = umin(folio_next_pos(folio), 1257 ordered->file_offset + ordered->num_bytes) - cur; 1258 1259 ret = true; 1260 /* 1261 * At least inside the folio, all the remaining blocks should 1262 * also be dirty. 
1263 */ 1264 ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); 1265 *fileoff = ordered->file_offset + ordered->num_bytes; 1266 goto out; 1267 } 1268 1269 /* 1270 * 3) The first block is uptodate. 1271 * 1272 * At least the first block can be skipped, but we are still not fully 1273 * sure. E.g. if the OE has some other folios in the range that cannot 1274 * be skipped. 1275 * So we return true and update @next_ret to the OE/folio boundary. 1276 */ 1277 if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { 1278 u64 range_len = umin(folio_next_pos(folio), 1279 ordered->file_offset + ordered->num_bytes) - cur; 1280 1281 /* 1282 * The whole range to the OE end or folio boundary should also 1283 * be uptodate. 1284 */ 1285 ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); 1286 ret = true; 1287 *fileoff = cur + range_len; 1288 goto out; 1289 } 1290 1291 /* 1292 * 4) The first block is not uptodate. 1293 * 1294 * This means the folio is invalidated after the writeback was finished, 1295 * but by some other operations (e.g. block aligned buffered write) the 1296 * folio is inserted into filemap. 1297 * Very much the same as case 1). 1298 */ 1299 ret = false; 1300 out: 1301 folio_put(folio); 1302 return ret; 1303 } 1304 1305 static bool can_skip_ordered_extent(struct btrfs_inode *inode, 1306 struct btrfs_ordered_extent *ordered, 1307 u64 start, u64 end) 1308 { 1309 const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); 1310 u64 cur = max(start, ordered->file_offset); 1311 1312 while (cur < range_end) { 1313 bool can_skip; 1314 1315 can_skip = can_skip_one_ordered_range(inode, ordered, &cur); 1316 if (!can_skip) 1317 return false; 1318 } 1319 return true; 1320 } 1321 1322 /* 1323 * Locking helper to make sure we get a stable view of extent maps for the 1324 * involved range. 1325 * 1326 * This is for folio read paths (read and readahead), thus the involved range 1327 * should have all the folios locked. 
1328 */ 1329 static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, 1330 struct extent_state **cached_state) 1331 { 1332 u64 cur_pos; 1333 1334 /* Caller must provide a valid @cached_state. */ 1335 ASSERT(cached_state); 1336 1337 /* The range must at least be page aligned, as all read paths are folio based. */ 1338 ASSERT(IS_ALIGNED(start, PAGE_SIZE)); 1339 ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); 1340 1341 again: 1342 btrfs_lock_extent(&inode->io_tree, start, end, cached_state); 1343 cur_pos = start; 1344 while (cur_pos < end) { 1345 struct btrfs_ordered_extent *ordered; 1346 1347 ordered = btrfs_lookup_ordered_range(inode, cur_pos, 1348 end - cur_pos + 1); 1349 /* 1350 * No ordered extents in the range, and we hold the extent lock, 1351 * no one can modify the extent maps in the range, we're safe to return. 1352 */ 1353 if (!ordered) 1354 break; 1355 1356 /* Check if we can skip waiting for the whole OE. */ 1357 if (can_skip_ordered_extent(inode, ordered, start, end)) { 1358 cur_pos = min(ordered->file_offset + ordered->num_bytes, 1359 end + 1); 1360 btrfs_put_ordered_extent(ordered); 1361 continue; 1362 } 1363 1364 /* Now wait for the OE to finish. */ 1365 btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); 1366 btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); 1367 btrfs_put_ordered_extent(ordered); 1368 /* We have unlocked the whole range, restart from the beginning. 
*/ 1369 goto again; 1370 } 1371 } 1372 1373 int btrfs_read_folio(struct file *file, struct folio *folio) 1374 { 1375 struct inode *vfs_inode = folio->mapping->host; 1376 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 1377 const u64 start = folio_pos(folio); 1378 const u64 end = start + folio_size(folio) - 1; 1379 struct extent_state *cached_state = NULL; 1380 struct btrfs_bio_ctrl bio_ctrl = { 1381 .opf = REQ_OP_READ, 1382 .last_em_start = U64_MAX, 1383 }; 1384 struct extent_map *em_cached = NULL; 1385 struct fsverity_info *vi = NULL; 1386 int ret; 1387 1388 lock_extents_for_read(inode, start, end, &cached_state); 1389 if (folio_pos(folio) < i_size_read(vfs_inode)) 1390 vi = fsverity_get_info(vfs_inode); 1391 ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, vi); 1392 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 1393 1394 btrfs_free_extent_map(em_cached); 1395 1396 /* 1397 * If btrfs_do_readpage() failed we will want to submit the assembled 1398 * bio to do the cleanup. 
1399 */ 1400 submit_one_bio(&bio_ctrl); 1401 return ret; 1402 } 1403 1404 static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, 1405 u64 start, u32 len) 1406 { 1407 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1408 const u64 folio_start = folio_pos(folio); 1409 unsigned int start_bit; 1410 unsigned int nbits; 1411 1412 ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); 1413 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1414 nbits = len >> fs_info->sectorsize_bits; 1415 ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); 1416 bitmap_set(delalloc_bitmap, start_bit, nbits); 1417 } 1418 1419 static bool find_next_delalloc_bitmap(struct folio *folio, 1420 unsigned long *delalloc_bitmap, u64 start, 1421 u64 *found_start, u32 *found_len) 1422 { 1423 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 1424 const u64 folio_start = folio_pos(folio); 1425 const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); 1426 unsigned int start_bit; 1427 unsigned int first_zero; 1428 unsigned int first_set; 1429 1430 ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); 1431 1432 start_bit = (start - folio_start) >> fs_info->sectorsize_bits; 1433 first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); 1434 if (first_set >= bitmap_size) 1435 return false; 1436 1437 *found_start = folio_start + (first_set << fs_info->sectorsize_bits); 1438 first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); 1439 *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; 1440 return true; 1441 } 1442 1443 /* 1444 * Do all of the delayed allocation setup. 1445 * 1446 * Return >0 if all the dirty blocks are submitted async (compression) or inlined. 1447 * The @folio should no longer be touched (treat it as already unlocked). 
 *
 * Return 0 if there is still dirty block that needs to be submitted through
 * extent_writepage_io().
 * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
 * submitted, and @folio is still kept locked.
 *
 * Return <0 if there is any error hit.
 * Any allocated ordered extent range covering this folio will be marked
 * finished (IOERR), and @folio is still kept locked.
 *
 * @inode:    inode the folio belongs to
 * @folio:    the folio being written back
 * @bio_ctrl: provides the writeback control (wbc, whose nr_to_write is
 *            adjusted here) and the submit bitmap
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
						 struct folio *folio,
						 struct btrfs_bio_ctrl *bio_ctrl)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
	struct writeback_control *wbc = bio_ctrl->wbc;
	const bool is_subpage = btrfs_is_subpage(fs_info, folio);
	const u64 page_start = folio_pos(folio);
	const u64 page_end = page_start + folio_size(folio) - 1;
	const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
	/* One bit per block of the folio that is covered by a locked delalloc range. */
	unsigned long delalloc_bitmap[BITS_TO_LONGS(BTRFS_MAX_BLOCKS_PER_FOLIO)] = { 0 };
	/*
	 * Save the last found delalloc end. As the delalloc end can go beyond
	 * page boundary, thus we cannot rely on subpage bitmap to locate the
	 * last delalloc end.
	 */
	u64 last_delalloc_end = 0;
	/*
	 * The range end (exclusive) of the last successfully finished delalloc
	 * range.
	 * Any range covered by ordered extent must either be manually marked
	 * finished (error handling), or has IO submitted (and finish the
	 * ordered extent normally).
	 *
	 * This records the end of ordered extent cleanup if we hit an error.
	 */
	u64 last_finished_delalloc_end = page_start;
	u64 delalloc_start = page_start;
	u64 delalloc_end = page_end;
	u64 delalloc_to_write = 0;
	unsigned int start_bit;
	unsigned int end_bit;
	int ret = 0;

	/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
	btrfs_copy_subpage_dirty_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);

	/* Set the lock for every dirty block range of the folio. */
	for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
			      blocks_per_folio) {
		u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
		u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;

		btrfs_folio_set_lock(fs_info, folio, start, len);
	}

	/* Lock all (subpage) delalloc ranges inside the folio first. */
	while (delalloc_start < page_end) {
		delalloc_end = page_end;
		if (!find_lock_delalloc_range(&inode->vfs_inode, folio,
					      &delalloc_start, &delalloc_end)) {
			delalloc_start = delalloc_end + 1;
			continue;
		}
		/* Only the part inside the folio is recorded in the bitmap. */
		set_delalloc_bitmap(folio, delalloc_bitmap, delalloc_start,
				    min(delalloc_end, page_end) + 1 - delalloc_start);
		last_delalloc_end = delalloc_end;
		delalloc_start = delalloc_end + 1;
	}
	delalloc_start = page_start;

	/* No delalloc range was found and locked, nothing to run. */
	if (!last_delalloc_end)
		goto out;

	/* Run the delalloc ranges for the above locked ranges. */
	while (delalloc_start < page_end) {
		u64 found_start;
		u32 found_len;
		bool found;

		if (!is_subpage) {
			/*
			 * For non-subpage case, the found delalloc range must
			 * cover this folio and there must be only one locked
			 * delalloc range.
			 */
			found_start = page_start;
			found_len = last_delalloc_end + 1 - found_start;
			found = true;
		} else {
			found = find_next_delalloc_bitmap(folio, delalloc_bitmap,
					delalloc_start, &found_start, &found_len);
		}
		if (!found)
			break;
		/*
		 * The subpage range covers the last sector, the delalloc range may
		 * end beyond the folio boundary, use the saved delalloc_end
		 * instead.
		 */
		if (found_start + found_len >= page_end)
			found_len = last_delalloc_end + 1 - found_start;

		if (ret >= 0) {
			/*
			 * Some delalloc range may be created by previous folios.
			 * Thus we still need to clean up this range during error
			 * handling.
			 */
			last_finished_delalloc_end = found_start;
			/* No errors hit so far, run the current delalloc range. */
			ret = btrfs_run_delalloc_range(inode, folio,
						       found_start,
						       found_start + found_len - 1,
						       wbc);
			if (ret >= 0)
				last_finished_delalloc_end = found_start + found_len;
			if (unlikely(ret < 0))
				btrfs_err_rl(fs_info,
"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
					     btrfs_root_id(inode->root),
					     btrfs_ino(inode),
					     folio_pos(folio),
					     blocks_per_folio,
					     bio_ctrl->submit_bitmap,
					     found_start, found_len, ret);
		} else {
			/*
			 * We've hit an error during previous delalloc range,
			 * have to cleanup the remaining locked ranges.
			 */
			btrfs_unlock_extent(&inode->io_tree, found_start,
					    found_start + found_len - 1, NULL);
			unlock_delalloc_folio(&inode->vfs_inode, folio,
					      found_start,
					      found_start + found_len - 1);
		}

		/*
		 * We have some ranges that's going to be submitted asynchronously
		 * (compression or inline).  These range have their own control
		 * on when to unlock the pages.  We should not touch them
		 * anymore, so clear the range from the submission bitmap.
		 */
		if (ret > 0) {
			/*
			 * NOTE(review): these two locals shadow the
			 * function-level start_bit/end_bit.
			 */
			unsigned int start_bit = (found_start - page_start) >>
						 fs_info->sectorsize_bits;
			unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
						page_start) >> fs_info->sectorsize_bits;
			bitmap_clear(bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
		}
		/*
		 * Above btrfs_run_delalloc_range() may have unlocked the folio,
		 * thus for the last range, we cannot touch the folio anymore.
		 */
		if (found_start + found_len >= last_delalloc_end + 1)
			break;

		delalloc_start = found_start + found_len;
	}
	/*
	 * It's possible we had some ordered extents created before we hit
	 * an error, cleanup non-async successfully created delalloc ranges.
	 */
	if (unlikely(ret < 0)) {
		/* Only look at the blocks before last_finished_delalloc_end. */
		unsigned int bitmap_size = min(
				(last_finished_delalloc_end - page_start) >>
				fs_info->sectorsize_bits,
				blocks_per_folio);

		for_each_set_bitrange(start_bit, end_bit, bio_ctrl->submit_bitmap,
				      bitmap_size) {
			u64 start = page_start + (start_bit << fs_info->sectorsize_bits);
			u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits;

			btrfs_mark_ordered_io_finished(inode, start, len, false);
		}
		return ret;
	}
out:
	if (last_delalloc_end)
		delalloc_end = last_delalloc_end;
	else
		delalloc_end = page_end;
	/*
	 * delalloc_end is already one less than the total length, so
	 * we don't subtract one from PAGE_SIZE.
	 */
	delalloc_to_write +=
		DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);

	/*
	 * If all ranges are submitted asynchronously, we just need to account
	 * for them here.
	 */
	if (bitmap_empty(bio_ctrl->submit_bitmap, blocks_per_folio)) {
		wbc->nr_to_write -= delalloc_to_write;
		return 1;
	}

	/*
	 * Raise nr_to_write if it is smaller than the number of pages the
	 * delalloc range covers, limited by the threshold below.
	 */
	if (wbc->nr_to_write < delalloc_to_write) {
		int thresh = 8192;

		if (delalloc_to_write < thresh * 2)
			thresh = delalloc_to_write;
		wbc->nr_to_write = min_t(u64, delalloc_to_write,
					 thresh);
	}

	return 0;
}

/*
 * Submit one block (sector) of @folio for writeback.
 *
 * Return 0 if we have submitted or queued the sector for submission.
 * Return <0 for critical errors, and the involved sector will be cleaned up.
 *
 * Caller should make sure filepos < i_size and handle filepos >= i_size case.
1664 */ 1665 static int submit_one_sector(struct btrfs_inode *inode, 1666 struct folio *folio, 1667 u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, 1668 loff_t i_size) 1669 { 1670 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1671 struct extent_map *em; 1672 u64 block_start; 1673 u64 disk_bytenr; 1674 u64 extent_offset; 1675 u64 em_end; 1676 const u32 sectorsize = fs_info->sectorsize; 1677 unsigned int queued; 1678 1679 ASSERT(IS_ALIGNED(filepos, sectorsize)); 1680 1681 /* @filepos >= i_size case should be handled by the caller. */ 1682 ASSERT(filepos < i_size); 1683 1684 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1685 if (IS_ERR(em)) { 1686 /* 1687 * bio_ctrl may contain a bio crossing several folios. 1688 * Submit it immediately so that the bio has a chance 1689 * to finish normally, other than marked as error. 1690 */ 1691 submit_one_bio(bio_ctrl); 1692 1693 /* 1694 * When submission failed, we should still clear the folio dirty. 1695 * Or the folio will be written back again but without any 1696 * ordered extent. 1697 */ 1698 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1699 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1700 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1701 1702 /* 1703 * Since there is no bio submitted to finish the ordered 1704 * extent, we have to manually finish this sector. 
1705 */ 1706 btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize, 1707 false); 1708 return PTR_ERR(em); 1709 } 1710 1711 extent_offset = filepos - em->start; 1712 em_end = btrfs_extent_map_end(em); 1713 ASSERT(filepos <= em_end); 1714 ASSERT(IS_ALIGNED(em->start, sectorsize)); 1715 ASSERT(IS_ALIGNED(em->len, sectorsize)); 1716 1717 block_start = btrfs_extent_map_block_start(em); 1718 disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; 1719 1720 ASSERT(!btrfs_extent_map_is_compressed(em)); 1721 ASSERT(block_start != EXTENT_MAP_HOLE); 1722 ASSERT(block_start != EXTENT_MAP_INLINE); 1723 1724 btrfs_free_extent_map(em); 1725 em = NULL; 1726 1727 /* 1728 * Although the PageDirty bit is cleared before entering this 1729 * function, subpage dirty bit is not cleared. 1730 * So clear subpage dirty bit here so next time we won't submit 1731 * a folio for a range already written to disk. 1732 */ 1733 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1734 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1735 /* 1736 * Above call should set the whole folio with writeback flag, even 1737 * just for a single subpage sector. 1738 * As long as the folio is properly locked and the range is correct, 1739 * we should always get the folio with writeback flag. 1740 */ 1741 ASSERT(folio_test_writeback(folio)); 1742 1743 queued = submit_extent_folio(bio_ctrl, disk_bytenr, folio, 1744 sectorsize, filepos - folio_pos(folio), 0); 1745 if (unlikely(queued < sectorsize)) { 1746 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1747 btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize, 1748 false); 1749 return -EUCLEAN; 1750 } 1751 return 0; 1752 } 1753 1754 /* 1755 * Helper for extent_writepage(). This calls the writepage start hooks, 1756 * and does the loop to map the page into extents and bios. 
1757 * 1758 * We return 1 if the IO is started and the page is unlocked, 1759 * 0 if all went well (page still locked) 1760 * < 0 if there were errors (page still locked) 1761 */ 1762 static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, 1763 struct folio *folio, 1764 u64 start, u32 len, 1765 struct btrfs_bio_ctrl *bio_ctrl, 1766 loff_t i_size) 1767 { 1768 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1769 bool submitted_io = false; 1770 int found_error = 0; 1771 const u64 end = start + len; 1772 const u64 folio_start = folio_pos(folio); 1773 const u64 folio_end = folio_start + folio_size(folio); 1774 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1775 u64 cur; 1776 int bit; 1777 int ret = 0; 1778 1779 ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start); 1780 ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu", 1781 start, len, folio_start, folio_size(folio)); 1782 1783 /* Truncate the submit bitmap to the current range. */ 1784 if (start > folio_start) 1785 bitmap_clear(bio_ctrl->submit_bitmap, 0, 1786 (start - folio_start) >> fs_info->sectorsize_bits); 1787 if (start + len < folio_end) 1788 bitmap_clear(bio_ctrl->submit_bitmap, 1789 (end - folio_start) >> fs_info->sectorsize_bits, 1790 (folio_end - end) >> fs_info->sectorsize_bits); 1791 1792 bio_ctrl->end_io_func = end_bbio_data_write; 1793 1794 for_each_set_bit(bit, bio_ctrl->submit_bitmap, blocks_per_folio) { 1795 cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); 1796 1797 if (cur >= i_size) { 1798 struct btrfs_ordered_extent *ordered; 1799 1800 ordered = btrfs_lookup_first_ordered_range(inode, cur, 1801 fs_info->sectorsize); 1802 /* 1803 * We have just run delalloc before getting here, so 1804 * there must be an ordered extent. 
1805 */ 1806 ASSERT(ordered != NULL); 1807 spin_lock(&inode->ordered_tree_lock); 1808 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 1809 ordered->truncated_len = min(ordered->truncated_len, 1810 cur - ordered->file_offset); 1811 spin_unlock(&inode->ordered_tree_lock); 1812 btrfs_put_ordered_extent(ordered); 1813 1814 btrfs_mark_ordered_io_finished(inode, cur, fs_info->sectorsize, true); 1815 /* 1816 * This range is beyond i_size, thus we don't need to 1817 * bother writing back. 1818 * But we still need to clear the dirty subpage bit, or 1819 * the next time the folio gets dirtied, we will try to 1820 * writeback the sectors with subpage dirty bits, 1821 * causing writeback without ordered extent. 1822 */ 1823 btrfs_folio_clear_dirty(fs_info, folio, cur, fs_info->sectorsize); 1824 continue; 1825 } 1826 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1827 if (unlikely(ret < 0)) { 1828 if (!found_error) 1829 found_error = ret; 1830 continue; 1831 } 1832 submitted_io = true; 1833 } 1834 1835 /* 1836 * If we didn't submitted any sector (>= i_size), folio dirty get 1837 * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared 1838 * by folio_start_writeback() if the folio is not dirty). 1839 * 1840 * Here we set writeback and clear for the range. If the full folio 1841 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1842 * 1843 * If we hit any error, the corresponding sector will have its dirty 1844 * flag cleared and writeback finished, thus no need to handle the error case. 
1845 */ 1846 if (!submitted_io && !found_error) { 1847 btrfs_folio_set_writeback(fs_info, folio, start, len); 1848 btrfs_folio_clear_writeback(fs_info, folio, start, len); 1849 } 1850 return found_error; 1851 } 1852 1853 static void bio_ctrl_init_submit_bitmap(struct btrfs_fs_info *fs_info, 1854 struct folio *folio, 1855 struct btrfs_bio_ctrl *bio_ctrl) 1856 { 1857 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1858 1859 ASSERT(blocks_per_folio <= BTRFS_MAX_BLOCKS_PER_FOLIO); 1860 1861 /* 1862 * Default to unlock the whole folio. 1863 * The proper bitmap is not initialized until writepage_delalloc(). 1864 * 1865 * We're safe just to set the bitmap range [0, blocks_per_folio), as 1866 * all later usage of the bitmap will follow the same range limit. 1867 * Any bits beyond blocks_per_folio will be ignored. 1868 */ 1869 bitmap_set(bio_ctrl->submit_bitmap, 0, blocks_per_folio); 1870 } 1871 1872 /* 1873 * the writepage semantics are similar to regular writepage. extent 1874 * records are inserted to lock ranges in the tree, and as dirty areas 1875 * are found, they are marked writeback. Then the lock bits are removed 1876 * and the end_io handler clears the writeback ranges 1877 * 1878 * Return 0 if everything goes well. 1879 * Return <0 for error. 
1880 */ 1881 static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) 1882 { 1883 struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); 1884 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1885 int ret; 1886 size_t pg_offset; 1887 loff_t i_size = i_size_read(&inode->vfs_inode); 1888 const pgoff_t end_index = i_size >> PAGE_SHIFT; 1889 const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); 1890 1891 trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); 1892 1893 WARN_ON(!folio_test_locked(folio)); 1894 1895 pg_offset = offset_in_folio(folio, i_size); 1896 if (folio->index > end_index || 1897 (folio->index == end_index && !pg_offset)) { 1898 folio_invalidate(folio, 0, folio_size(folio)); 1899 folio_unlock(folio); 1900 return 0; 1901 } 1902 1903 if (folio_contains(folio, end_index)) 1904 folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); 1905 1906 bio_ctrl_init_submit_bitmap(fs_info, folio, bio_ctrl); 1907 /* 1908 * If the page is dirty but without private set, it's marked dirty 1909 * without informing the fs. 1910 * Nowadays that is a bug, since the introduction of 1911 * pin_user_pages*(). 1912 * 1913 * So here we check if the page has private set to rule out such 1914 * case. 
1915 */ 1916 if (unlikely(!folio_test_private(folio))) { 1917 DEBUG_WARN(); 1918 btrfs_err_rl(fs_info, 1919 "root %lld ino %llu folio %llu is marked dirty without notifying the fs", 1920 btrfs_root_id(inode->root), 1921 btrfs_ino(inode), folio_pos(folio)); 1922 ret = -EUCLEAN; 1923 goto done; 1924 } 1925 1926 ret = set_folio_extent_mapped(folio); 1927 if (ret < 0) 1928 goto done; 1929 1930 ret = writepage_delalloc(inode, folio, bio_ctrl); 1931 if (ret == 1) 1932 return 0; 1933 if (ret) 1934 goto done; 1935 1936 ret = extent_writepage_io(inode, folio, folio_pos(folio), 1937 folio_size(folio), bio_ctrl, i_size); 1938 if (ret == 1) 1939 return 0; 1940 if (unlikely(ret < 0)) 1941 btrfs_err_rl(fs_info, 1942 "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", 1943 btrfs_root_id(inode->root), btrfs_ino(inode), 1944 folio_pos(folio), blocks_per_folio, 1945 bio_ctrl->submit_bitmap, ret); 1946 1947 bio_ctrl->wbc->nr_to_write--; 1948 1949 done: 1950 if (ret < 0) 1951 mapping_set_error(folio->mapping, ret); 1952 /* 1953 * Only unlock ranges that are submitted. As there can be some async 1954 * submitted ranges inside the folio. 1955 */ 1956 btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); 1957 ASSERT(ret <= 0); 1958 return ret; 1959 } 1960 1961 /* 1962 * Lock extent buffer status and pages for writeback. 1963 * 1964 * Return %false if the extent buffer doesn't need to be submitted (e.g. the 1965 * extent buffer is not dirty) 1966 * Return %true is the extent buffer is submitted to bio. 
1967 */ 1968 static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, 1969 struct writeback_control *wbc) 1970 { 1971 struct btrfs_fs_info *fs_info = eb->fs_info; 1972 bool ret = false; 1973 1974 btrfs_tree_lock(eb); 1975 while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 1976 btrfs_tree_unlock(eb); 1977 if (wbc->sync_mode != WB_SYNC_ALL) 1978 return false; 1979 wait_on_extent_buffer_writeback(eb); 1980 btrfs_tree_lock(eb); 1981 } 1982 1983 /* 1984 * We need to do this to prevent races in people who check if the eb is 1985 * under IO since we can end up having no IO bits set for a short period 1986 * of time. 1987 */ 1988 spin_lock(&eb->refs_lock); 1989 if ((wbc->sync_mode == WB_SYNC_ALL || 1990 atomic_read(&eb->writeback_inhibitors) == 0) && 1991 test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1992 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1993 unsigned long flags; 1994 1995 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 1996 spin_unlock(&eb->refs_lock); 1997 1998 xas_lock_irqsave(&xas, flags); 1999 xas_load(&xas); 2000 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 2001 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 2002 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 2003 xas_unlock_irqrestore(&xas, flags); 2004 2005 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2006 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 2007 -eb->len, 2008 fs_info->dirty_metadata_batch); 2009 ret = true; 2010 } else { 2011 spin_unlock(&eb->refs_lock); 2012 } 2013 btrfs_tree_unlock(eb); 2014 return ret; 2015 } 2016 2017 static void set_btree_ioerr(struct extent_buffer *eb) 2018 { 2019 struct btrfs_fs_info *fs_info = eb->fs_info; 2020 2021 set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2022 2023 /* 2024 * A read may stumble upon this buffer later, make sure that it gets an 2025 * error and knows there was an error. 
2026 */ 2027 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 2028 2029 /* 2030 * We need to set the mapping with the io error as well because a write 2031 * error will flip the file system readonly, and then syncfs() will 2032 * return a 0 because we are readonly if we don't modify the err seq for 2033 * the superblock. 2034 */ 2035 mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); 2036 2037 /* 2038 * If writeback for a btree extent that doesn't belong to a log tree 2039 * failed, increment the counter transaction->eb_write_errors. 2040 * We do this because while the transaction is running and before it's 2041 * committing (when we call filemap_fdata[write|wait]_range against 2042 * the btree inode), we might have 2043 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 2044 * returns an error or an error happens during writeback, when we're 2045 * committing the transaction we wouldn't know about it, since the pages 2046 * can be no longer dirty nor marked anymore for writeback (if a 2047 * subsequent modification to the extent buffer didn't happen before the 2048 * transaction commit), which makes filemap_fdata[write|wait]_range not 2049 * able to find the pages which contain errors at transaction 2050 * commit time. So if this happens we must abort the transaction, 2051 * otherwise we commit a super block with btree roots that point to 2052 * btree nodes/leafs whose content on disk is invalid - either garbage 2053 * or the content of some node/leaf from a past generation that got 2054 * cowed or deleted and is no longer valid. 2055 * 2056 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 2057 * not be enough - we need to distinguish between log tree extents vs 2058 * non-log tree extents, and the next filemap_fdatawait_range() call 2059 * will catch and clear such errors in the mapping - and that call might 2060 * be from a log sync and not from a transaction commit. 
Also, checking 2061 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 2062 * not done and would not be reliable - the eb might have been released 2063 * from memory and reading it back again means that flag would not be 2064 * set (since it's a runtime flag, not persisted on disk). 2065 * 2066 * Using the flags below in the btree inode also makes us achieve the 2067 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 2068 * writeback for all dirty pages and before filemap_fdatawait_range() 2069 * is called, the writeback for all dirty pages had already finished 2070 * with errors - because we were not using AS_EIO/AS_ENOSPC, 2071 * filemap_fdatawait_range() would return success, as it could not know 2072 * that writeback errors happened (the pages were no longer tagged for 2073 * writeback). 2074 */ 2075 switch (eb->log_index) { 2076 case -1: 2077 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 2078 break; 2079 case 0: 2080 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 2081 break; 2082 case 1: 2083 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 2084 break; 2085 default: 2086 BUG(); /* unexpected, logic error */ 2087 } 2088 } 2089 2090 static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) 2091 { 2092 struct btrfs_fs_info *fs_info = eb->fs_info; 2093 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2094 unsigned long flags; 2095 2096 xas_lock_irqsave(&xas, flags); 2097 xas_load(&xas); 2098 xas_set_mark(&xas, mark); 2099 xas_unlock_irqrestore(&xas, flags); 2100 } 2101 2102 static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) 2103 { 2104 struct btrfs_fs_info *fs_info = eb->fs_info; 2105 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 2106 unsigned long flags; 2107 2108 xas_lock_irqsave(&xas, flags); 2109 xas_load(&xas); 2110 xas_clear_mark(&xas, mark); 2111 xas_unlock_irqrestore(&xas, flags); 2112 } 2113 2114 static void 
buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, 2115 unsigned long start, unsigned long end) 2116 { 2117 XA_STATE(xas, &fs_info->buffer_tree, start); 2118 unsigned int tagged = 0; 2119 void *eb; 2120 2121 xas_lock_irq(&xas); 2122 xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { 2123 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); 2124 if (++tagged % XA_CHECK_SCHED) 2125 continue; 2126 xas_pause(&xas); 2127 xas_unlock_irq(&xas); 2128 cond_resched(); 2129 xas_lock_irq(&xas); 2130 } 2131 xas_unlock_irq(&xas); 2132 } 2133 2134 struct eb_batch { 2135 unsigned int nr; 2136 unsigned int cur; 2137 struct extent_buffer *ebs[FOLIO_BATCH_SIZE]; 2138 }; 2139 2140 static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) 2141 { 2142 batch->ebs[batch->nr++] = eb; 2143 return (batch->nr < FOLIO_BATCH_SIZE); 2144 } 2145 2146 static inline void eb_batch_init(struct eb_batch *batch) 2147 { 2148 batch->nr = 0; 2149 batch->cur = 0; 2150 } 2151 2152 static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) 2153 { 2154 if (batch->cur >= batch->nr) 2155 return NULL; 2156 return batch->ebs[batch->cur++]; 2157 } 2158 2159 static inline void eb_batch_release(struct eb_batch *batch) 2160 { 2161 for (unsigned int i = 0; i < batch->nr; i++) 2162 free_extent_buffer(batch->ebs[i]); 2163 eb_batch_init(batch); 2164 } 2165 2166 static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, 2167 xa_mark_t mark) 2168 { 2169 struct extent_buffer *eb; 2170 2171 retry: 2172 eb = xas_find_marked(xas, max, mark); 2173 2174 if (xas_retry(xas, eb)) 2175 goto retry; 2176 2177 if (!eb) 2178 return NULL; 2179 2180 if (!refcount_inc_not_zero(&eb->refs)) { 2181 xas_reset(xas); 2182 goto retry; 2183 } 2184 2185 if (unlikely(eb != xas_reload(xas))) { 2186 free_extent_buffer(eb); 2187 xas_reset(xas); 2188 goto retry; 2189 } 2190 2191 return eb; 2192 } 2193 2194 static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info 
*fs_info, 2195 unsigned long *start, 2196 unsigned long end, xa_mark_t tag, 2197 struct eb_batch *batch) 2198 { 2199 XA_STATE(xas, &fs_info->buffer_tree, *start); 2200 struct extent_buffer *eb; 2201 2202 rcu_read_lock(); 2203 while ((eb = find_get_eb(&xas, end, tag)) != NULL) { 2204 if (!eb_batch_add(batch, eb)) { 2205 *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); 2206 goto out; 2207 } 2208 } 2209 if (end == ULONG_MAX) 2210 *start = ULONG_MAX; 2211 else 2212 *start = end + 1; 2213 out: 2214 rcu_read_unlock(); 2215 2216 return batch->nr; 2217 } 2218 2219 /* 2220 * The endio specific version which won't touch any unsafe spinlock in endio 2221 * context. 2222 */ 2223 static struct extent_buffer *find_extent_buffer_nolock( 2224 struct btrfs_fs_info *fs_info, u64 start) 2225 { 2226 struct extent_buffer *eb; 2227 unsigned long index = (start >> fs_info->nodesize_bits); 2228 2229 rcu_read_lock(); 2230 eb = xa_load(&fs_info->buffer_tree, index); 2231 if (eb && !refcount_inc_not_zero(&eb->refs)) 2232 eb = NULL; 2233 rcu_read_unlock(); 2234 return eb; 2235 } 2236 2237 static void end_bbio_meta_write(struct btrfs_bio *bbio) 2238 { 2239 struct extent_buffer *eb = bbio->private; 2240 struct folio_iter fi; 2241 2242 if (bbio->bio.bi_status != BLK_STS_OK) 2243 set_btree_ioerr(eb); 2244 2245 bio_for_each_folio_all(fi, &bbio->bio) { 2246 btrfs_meta_folio_clear_writeback(fi.folio, eb); 2247 } 2248 2249 buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); 2250 clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 2251 bio_put(&bbio->bio); 2252 } 2253 2254 static void prepare_eb_write(struct extent_buffer *eb) 2255 { 2256 u32 nritems; 2257 unsigned long start; 2258 unsigned long end; 2259 2260 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 2261 2262 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 2263 nritems = btrfs_header_nritems(eb); 2264 if (btrfs_header_level(eb) > 0) { 2265 end = btrfs_node_key_ptr_offset(eb, nritems); 2266 
memzero_extent_buffer(eb, end, eb->len - end); 2267 } else { 2268 /* 2269 * Leaf: 2270 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 2271 */ 2272 start = btrfs_item_nr_offset(eb, nritems); 2273 end = btrfs_item_nr_offset(eb, 0); 2274 if (nritems == 0) 2275 end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); 2276 else 2277 end += btrfs_item_offset(eb, nritems - 1); 2278 memzero_extent_buffer(eb, start, end - start); 2279 } 2280 } 2281 2282 static noinline_for_stack void write_one_eb(struct extent_buffer *eb, 2283 struct writeback_control *wbc) 2284 { 2285 struct btrfs_fs_info *fs_info = eb->fs_info; 2286 struct btrfs_bio *bbio; 2287 2288 prepare_eb_write(eb); 2289 2290 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 2291 REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), 2292 BTRFS_I(fs_info->btree_inode), eb->start, 2293 end_bbio_meta_write, eb); 2294 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 2295 bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); 2296 wbc_init_bio(wbc, &bbio->bio); 2297 for (int i = 0; i < num_extent_folios(eb); i++) { 2298 struct folio *folio = eb->folios[i]; 2299 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 2300 u32 range_len = min_t(u64, folio_next_pos(folio), 2301 eb->start + eb->len) - range_start; 2302 2303 folio_lock(folio); 2304 btrfs_meta_folio_clear_dirty(folio, eb); 2305 btrfs_meta_folio_set_writeback(folio, eb); 2306 if (!folio_test_dirty(folio)) 2307 wbc->nr_to_write -= folio_nr_pages(folio); 2308 bio_add_folio_nofail(&bbio->bio, folio, range_len, 2309 offset_in_folio(folio, range_start)); 2310 wbc_account_cgroup_owner(wbc, folio, range_len); 2311 folio_unlock(folio); 2312 } 2313 /* 2314 * If the fs is already in error status, do not submit any writeback 2315 * but immediately finish it. 
2316 */ 2317 if (unlikely(BTRFS_FS_ERROR(fs_info))) { 2318 btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); 2319 return; 2320 } 2321 btrfs_submit_bbio(bbio, 0); 2322 } 2323 2324 /* 2325 * Wait for all eb writeback in the given range to finish. 2326 * 2327 * @fs_info: The fs_info for this file system. 2328 * @start: The offset of the range to start waiting on writeback. 2329 * @end: The end of the range, inclusive. This is meant to be used in 2330 * conjunction with wait_marked_extents, so this will usually be 2331 * the_next_eb->start - 1. 2332 */ 2333 void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, 2334 u64 end) 2335 { 2336 struct eb_batch batch; 2337 unsigned long start_index = (start >> fs_info->nodesize_bits); 2338 unsigned long end_index = (end >> fs_info->nodesize_bits); 2339 2340 eb_batch_init(&batch); 2341 while (start_index <= end_index) { 2342 struct extent_buffer *eb; 2343 unsigned int nr_ebs; 2344 2345 nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, 2346 PAGECACHE_TAG_WRITEBACK, &batch); 2347 if (!nr_ebs) 2348 break; 2349 2350 while ((eb = eb_batch_next(&batch)) != NULL) 2351 wait_on_extent_buffer_writeback(eb); 2352 eb_batch_release(&batch); 2353 cond_resched(); 2354 } 2355 } 2356 2357 int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) 2358 { 2359 struct btrfs_eb_write_context ctx = { .wbc = wbc }; 2360 struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); 2361 int ret = 0; 2362 bool done = false; 2363 int nr_to_write_done = 0; 2364 struct eb_batch batch; 2365 unsigned int nr_ebs; 2366 unsigned long index; 2367 unsigned long end; 2368 bool scanned = false; 2369 xa_mark_t tag; 2370 2371 eb_batch_init(&batch); 2372 if (wbc->range_cyclic) { 2373 index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); 2374 end = -1; 2375 2376 /* 2377 * Start from the beginning does not need to cycle over the 2378 * range, mark it as 
scanned. 2379 */ 2380 scanned = (index == 0); 2381 } else { 2382 index = (wbc->range_start >> fs_info->nodesize_bits); 2383 end = (wbc->range_end >> fs_info->nodesize_bits); 2384 2385 scanned = true; 2386 } 2387 if (wbc->sync_mode == WB_SYNC_ALL) 2388 tag = PAGECACHE_TAG_TOWRITE; 2389 else 2390 tag = PAGECACHE_TAG_DIRTY; 2391 btrfs_zoned_meta_io_lock(fs_info); 2392 retry: 2393 if (wbc->sync_mode == WB_SYNC_ALL) 2394 buffer_tree_tag_for_writeback(fs_info, index, end); 2395 while (!done && !nr_to_write_done && (index <= end) && 2396 (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { 2397 struct extent_buffer *eb; 2398 2399 while ((eb = eb_batch_next(&batch)) != NULL) { 2400 ctx.eb = eb; 2401 2402 ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); 2403 if (ret) { 2404 if (ret == -EBUSY) 2405 ret = 0; 2406 2407 if (ret) { 2408 done = true; 2409 break; 2410 } 2411 continue; 2412 } 2413 2414 if (!lock_extent_buffer_for_io(eb, wbc)) 2415 continue; 2416 2417 /* Implies write in zoned mode. */ 2418 if (ctx.zoned_bg) { 2419 /* Mark the last eb in the block group. */ 2420 btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); 2421 ctx.zoned_bg->meta_write_pointer += eb->len; 2422 } 2423 write_one_eb(eb, wbc); 2424 } 2425 nr_to_write_done = (wbc->nr_to_write <= 0); 2426 eb_batch_release(&batch); 2427 cond_resched(); 2428 } 2429 if (!scanned && !done) { 2430 /* 2431 * We hit the last page and there is more work to be done: wrap 2432 * back to the start of the file 2433 */ 2434 scanned = true; 2435 index = 0; 2436 goto retry; 2437 } 2438 2439 /* 2440 * Only btrfs_check_meta_write_pointer() can update @ret, 2441 * and it only returns 0 or errors. 
2442 */ 2443 ASSERT(ret <= 0); 2444 if (unlikely(!ret && BTRFS_FS_ERROR(fs_info))) 2445 ret = -EROFS; 2446 2447 if (ctx.zoned_bg) 2448 btrfs_put_block_group(ctx.zoned_bg); 2449 btrfs_zoned_meta_io_unlock(fs_info); 2450 return ret; 2451 } 2452 2453 /* 2454 * Walk the list of dirty pages of the given address space and write all of them. 2455 * 2456 * @mapping: address space structure to write 2457 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2458 * @bio_ctrl: holds context for the write, namely the bio 2459 * 2460 * If a page is already under I/O, write_cache_pages() skips it, even 2461 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2462 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2463 * and msync() need to guarantee that all the data which was dirty at the time 2464 * the call was made get new I/O started against them. If wbc->sync_mode is 2465 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2466 * existing IO to complete. 2467 */ 2468 static int extent_write_cache_pages(struct address_space *mapping, 2469 struct btrfs_bio_ctrl *bio_ctrl) 2470 { 2471 struct writeback_control *wbc = bio_ctrl->wbc; 2472 struct inode *inode = mapping->host; 2473 int ret = 0; 2474 bool done = false; 2475 int nr_to_write_done = 0; 2476 struct folio_batch fbatch; 2477 unsigned int nr_folios; 2478 pgoff_t index; 2479 pgoff_t end; /* Inclusive */ 2480 pgoff_t done_index; 2481 bool range_whole = false; 2482 bool scanned = false; 2483 xa_mark_t tag; 2484 2485 /* 2486 * We have to hold onto the inode so that ordered extents can do their 2487 * work when the IO finishes. The alternative to this is failing to add 2488 * an ordered extent if the igrab() fails there and that is a huge pain 2489 * to deal with, so instead just hold onto the inode throughout the 2490 * writepages operation. 
/*
 * Walk the list of dirty pages of the given address space and write all of them.
 *
 * @mapping:   address space structure to write
 * @bio_ctrl:  holds context for the write, namely the bio; the writeback
 *             control is taken from @bio_ctrl->wbc and the number of written
 *             pages is subtracted from its nr_to_write
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * Return 0, or the negative errno of the first extent_writepage() failure,
 * which also stops the walk.  Return 0 without doing anything if the inode
 * can not be grabbed.
 */
static int extent_write_cache_pages(struct address_space *mapping,
			     struct btrfs_bio_ctrl *bio_ctrl)
{
	struct writeback_control *wbc = bio_ctrl->wbc;
	struct inode *inode = mapping->host;
	int ret = 0;
	bool done = false;
	int nr_to_write_done = 0;
	struct folio_batch fbatch;
	unsigned int nr_folios;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	bool range_whole = false;
	bool scanned = false;
	xa_mark_t tag;

	/*
	 * We have to hold onto the inode so that ordered extents can do their
	 * work when the IO finishes.  The alternative to this is failing to add
	 * an ordered extent if the igrab() fails there and that is a huge pain
	 * to deal with, so instead just hold onto the inode throughout the
	 * writepages operation.  If it fails here we are freeing up the inode
	 * anyway and we'd rather not waste our time writing out stuff that is
	 * going to be truncated anyway.
	 */
	if (!igrab(inode))
		return 0;

	folio_batch_init(&fbatch);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		/*
		 * Start from the beginning does not need to cycle over the
		 * range, mark it as scanned.
		 */
		scanned = (index == 0);
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = true;
		scanned = true;
	}

	/*
	 * We do the tagged writepage as long as the snapshot flush bit is set
	 * and we are the first one who do the filemap_flush() on this inode.
	 *
	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
	 * not race in and drop the bit.
	 */
	if (range_whole && wbc->nr_to_write == LONG_MAX &&
	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
			       &BTRFS_I(inode)->runtime_flags))
		wbc->tagged_writepages = 1;

	tag = wbc_to_tag(wbc);
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && !nr_to_write_done && (index <= end) &&
			(nr_folios = filemap_get_folios_tag(mapping, &index,
							end, tag, &fbatch))) {
		unsigned i;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			/* Where a later cyclic pass will resume from. */
			done_index = folio_next_index(folio);
			/*
			 * At this point we hold neither the i_pages lock nor
			 * the folio lock: the folio may be truncated or
			 * invalidated (changing folio->mapping to NULL).
			 *
			 * Before sleeping on the folio lock, submit the bio
			 * built up so far.
			 */
			if (!folio_trylock(folio)) {
				submit_write_bio(bio_ctrl, 0);
				folio_lock(folio);
			}

			/* Truncated or invalidated while it was unlocked. */
			if (unlikely(folio->mapping != mapping)) {
				folio_unlock(folio);
				continue;
			}

			if (!folio_test_dirty(folio)) {
				/* Someone wrote it for us. */
				folio_unlock(folio);
				continue;
			}

			/*
			 * For subpage case, compression can lead to mixed
			 * writeback and dirty flags, e.g:
			 * 0     32K    64K    96K    128K
			 * |     |//////||/////|   |//|
			 *
			 * In above case, [32K, 96K) is asynchronously submitted
			 * for compression, and [124K, 128K) needs to be written back.
			 *
			 * If we didn't wait writeback for page 64K, [124K, 128K)
			 * won't be submitted as the page still has writeback flag
			 * and will be skipped in the next check.
			 *
			 * This mixed writeback and dirty case is only possible for
			 * subpage case.
			 *
			 * TODO: Remove this check after migrating compression to
			 * regular submission.
			 */
			if (wbc->sync_mode != WB_SYNC_NONE ||
			    btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
				/*
				 * Submit our own pending bio before waiting for
				 * writeback on this folio.
				 */
				if (folio_test_writeback(folio))
					submit_write_bio(bio_ctrl, 0);
				folio_wait_writeback(folio);
			}

			/*
			 * Skip folios still under writeback or no longer
			 * dirty.  The dirty flag is only tested here, this
			 * function does not clear it.
			 */
			if (folio_test_writeback(folio) ||
			    !folio_test_dirty(folio)) {
				folio_unlock(folio);
				continue;
			}

			ret = extent_writepage(folio, bio_ctrl);
			if (ret < 0) {
				done = true;
				break;
			}

			/*
			 * The filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time.
			 */
			nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
					    wbc->nr_to_write <= 0);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = true;
		index = 0;

		/*
		 * If we're looping we could run into a page that is locked by a
		 * writer and that writer could be waiting on writeback for a
		 * page in our current bio, and thus deadlock, so flush the
		 * write bio here.
		 */
		submit_write_bio(bio_ctrl, 0);
		goto retry;
	}

	if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
		mapping->writeback_index = done_index;

	/* Drop the inode reference taken by igrab() above. */
	btrfs_add_delayed_iput(BTRFS_I(inode));
	return ret;
}
2639 */ 2640 void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, 2641 u64 start, u64 end, struct writeback_control *wbc, 2642 bool pages_dirty) 2643 { 2644 bool found_error = false; 2645 int ret = 0; 2646 struct address_space *mapping = inode->i_mapping; 2647 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2648 const u32 sectorsize = fs_info->sectorsize; 2649 loff_t i_size = i_size_read(inode); 2650 u64 cur = start; 2651 struct btrfs_bio_ctrl bio_ctrl = { 2652 .wbc = wbc, 2653 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2654 }; 2655 2656 if (wbc->no_cgroup_owner) 2657 bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; 2658 2659 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 2660 2661 while (cur <= end) { 2662 u64 cur_end; 2663 u32 cur_len; 2664 struct folio *folio; 2665 2666 folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); 2667 2668 /* 2669 * This shouldn't happen, the pages are pinned and locked, this 2670 * code is just in case, but shouldn't actually be run. 2671 */ 2672 if (IS_ERR(folio)) { 2673 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2674 cur_len = cur_end + 1 - cur; 2675 btrfs_mark_ordered_io_finished(BTRFS_I(inode), cur, cur_len, false); 2676 mapping_set_error(mapping, PTR_ERR(folio)); 2677 cur = cur_end; 2678 continue; 2679 } 2680 2681 cur_end = min_t(u64, folio_next_pos(folio) - 1, end); 2682 cur_len = cur_end + 1 - cur; 2683 2684 ASSERT(folio_test_locked(folio)); 2685 if (pages_dirty && folio != locked_folio) 2686 ASSERT(folio_test_dirty(folio)); 2687 2688 /* 2689 * Set the submission bitmap to submit all sectors. 2690 * extent_writepage_io() will do the truncation correctly. 
2691 */ 2692 bio_ctrl_init_submit_bitmap(fs_info, folio, &bio_ctrl); 2693 ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, 2694 &bio_ctrl, i_size); 2695 if (ret == 1) 2696 goto next_page; 2697 2698 if (ret) 2699 mapping_set_error(mapping, ret); 2700 btrfs_folio_end_lock(fs_info, folio, cur, cur_len); 2701 if (ret < 0) 2702 found_error = true; 2703 next_page: 2704 folio_put(folio); 2705 cur = cur_end + 1; 2706 } 2707 2708 submit_write_bio(&bio_ctrl, found_error ? ret : 0); 2709 } 2710 2711 int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 2712 { 2713 struct inode *inode = mapping->host; 2714 int ret = 0; 2715 struct btrfs_bio_ctrl bio_ctrl = { 2716 .wbc = wbc, 2717 .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), 2718 }; 2719 2720 /* 2721 * Allow only a single thread to do the reloc work in zoned mode to 2722 * protect the write pointer updates. 2723 */ 2724 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 2725 ret = extent_write_cache_pages(mapping, &bio_ctrl); 2726 submit_write_bio(&bio_ctrl, ret); 2727 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 2728 return ret; 2729 } 2730 2731 void btrfs_readahead(struct readahead_control *rac) 2732 { 2733 struct btrfs_bio_ctrl bio_ctrl = { 2734 .opf = REQ_OP_READ | REQ_RAHEAD, 2735 .ractl = rac, 2736 .last_em_start = U64_MAX, 2737 }; 2738 struct folio *folio; 2739 struct inode *vfs_inode = rac->mapping->host; 2740 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 2741 const u64 start = readahead_pos(rac); 2742 const u64 end = start + readahead_length(rac) - 1; 2743 struct extent_state *cached_state = NULL; 2744 struct extent_map *em_cached = NULL; 2745 struct fsverity_info *vi = NULL; 2746 2747 lock_extents_for_read(inode, start, end, &cached_state); 2748 if (start < i_size_read(vfs_inode)) 2749 vi = fsverity_get_info(vfs_inode); 2750 while ((folio = readahead_folio(rac)) != NULL) 2751 btrfs_do_readpage(folio, &em_cached, &bio_ctrl, vi); 2752 2753 btrfs_unlock_extent(&inode->io_tree, 
start, end, &cached_state); 2754 2755 if (em_cached) 2756 btrfs_free_extent_map(em_cached); 2757 submit_one_bio(&bio_ctrl); 2758 } 2759 2760 /* 2761 * A helper for struct address_space_operations::release_folio, this tests for 2762 * areas of the folio that are locked or under IO and drops the related state 2763 * bits if it is safe to drop the folio. 2764 */ 2765 static bool try_release_extent_state(struct extent_io_tree *tree, 2766 struct folio *folio) 2767 { 2768 struct extent_state *cached_state = NULL; 2769 u64 start = folio_pos(folio); 2770 u64 end = start + folio_size(folio) - 1; 2771 u32 range_bits; 2772 u32 clear_bits; 2773 bool ret = false; 2774 int ret2; 2775 2776 btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); 2777 2778 /* 2779 * We can release the folio if it's locked only for ordered extent 2780 * completion, since that doesn't require using the folio. 2781 */ 2782 if ((range_bits & EXTENT_LOCKED) && 2783 !(range_bits & EXTENT_FINISHING_ORDERED)) 2784 goto out; 2785 2786 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | 2787 EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | 2788 EXTENT_FINISHING_ORDERED); 2789 /* 2790 * At this point we can safely clear everything except the locked, 2791 * nodatasum, delalloc new and finishing ordered bits. The delalloc new 2792 * bit will be cleared by ordered extent completion. 2793 */ 2794 ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); 2795 /* 2796 * If clear_extent_bit failed for enomem reasons, we can't allow the 2797 * release to continue. 2798 */ 2799 if (ret2 == 0) 2800 ret = true; 2801 out: 2802 btrfs_free_extent_state(cached_state); 2803 2804 return ret; 2805 } 2806 2807 /* 2808 * a helper for release_folio. 
As long as there are no locked extents 2809 * in the range corresponding to the page, both state records and extent 2810 * map records are removed 2811 */ 2812 bool try_release_extent_mapping(struct folio *folio, gfp_t mask) 2813 { 2814 u64 start = folio_pos(folio); 2815 u64 end = start + folio_size(folio) - 1; 2816 struct btrfs_inode *inode = folio_to_inode(folio); 2817 struct extent_io_tree *io_tree = &inode->io_tree; 2818 2819 while (start <= end) { 2820 const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); 2821 const u64 len = end - start + 1; 2822 struct extent_map_tree *extent_tree = &inode->extent_tree; 2823 struct extent_map *em; 2824 2825 write_lock(&extent_tree->lock); 2826 em = btrfs_lookup_extent_mapping(extent_tree, start, len); 2827 if (!em) { 2828 write_unlock(&extent_tree->lock); 2829 break; 2830 } 2831 if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { 2832 write_unlock(&extent_tree->lock); 2833 btrfs_free_extent_map(em); 2834 break; 2835 } 2836 if (btrfs_test_range_bit_exists(io_tree, em->start, 2837 btrfs_extent_map_end(em) - 1, 2838 EXTENT_LOCKED)) 2839 goto next; 2840 /* 2841 * If it's not in the list of modified extents, used by a fast 2842 * fsync, we can remove it. If it's being logged we can safely 2843 * remove it since fsync took an extra reference on the em. 2844 */ 2845 if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) 2846 goto remove_em; 2847 /* 2848 * If it's in the list of modified extents, remove it only if 2849 * its generation is older then the current one, in which case 2850 * we don't need it for a fast fsync. Otherwise don't remove it, 2851 * we could be racing with an ongoing fast fsync that could miss 2852 * the new extent. 
2853 */ 2854 if (em->generation >= cur_gen) 2855 goto next; 2856 remove_em: 2857 /* 2858 * We only remove extent maps that are not in the list of 2859 * modified extents or that are in the list but with a 2860 * generation lower then the current generation, so there is no 2861 * need to set the full fsync flag on the inode (it hurts the 2862 * fsync performance for workloads with a data size that exceeds 2863 * or is close to the system's memory). 2864 */ 2865 btrfs_remove_extent_mapping(inode, em); 2866 /* Once for the inode's extent map tree. */ 2867 btrfs_free_extent_map(em); 2868 next: 2869 start = btrfs_extent_map_end(em); 2870 write_unlock(&extent_tree->lock); 2871 2872 /* Once for us, for the lookup_extent_mapping() reference. */ 2873 btrfs_free_extent_map(em); 2874 2875 if (need_resched()) { 2876 /* 2877 * If we need to resched but we can't block just exit 2878 * and leave any remaining extent maps. 2879 */ 2880 if (!gfpflags_allow_blocking(mask)) 2881 break; 2882 2883 cond_resched(); 2884 } 2885 } 2886 return try_release_extent_state(io_tree, folio); 2887 } 2888 2889 static bool folio_range_has_eb(struct folio *folio) 2890 { 2891 struct btrfs_folio_state *bfs; 2892 2893 lockdep_assert_held(&folio->mapping->i_private_lock); 2894 2895 if (folio_test_private(folio)) { 2896 bfs = folio_get_private(folio); 2897 if (atomic_read(&bfs->eb_refs)) 2898 return true; 2899 } 2900 return false; 2901 } 2902 2903 static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) 2904 { 2905 struct btrfs_fs_info *fs_info = eb->fs_info; 2906 struct address_space *mapping = folio->mapping; 2907 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 2908 2909 /* 2910 * For mapped eb, we're going to change the folio private, which should 2911 * be done under the i_private_lock. 
2912 */ 2913 if (mapped) 2914 spin_lock(&mapping->i_private_lock); 2915 2916 if (!folio_test_private(folio)) { 2917 if (mapped) 2918 spin_unlock(&mapping->i_private_lock); 2919 return; 2920 } 2921 2922 if (!btrfs_meta_is_subpage(fs_info)) { 2923 /* 2924 * We do this since we'll remove the pages after we've removed 2925 * the eb from the xarray, so we could race and have this page 2926 * now attached to the new eb. So only clear folio if it's 2927 * still connected to this eb. 2928 */ 2929 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2930 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 2931 BUG_ON(folio_test_dirty(folio)); 2932 BUG_ON(folio_test_writeback(folio)); 2933 /* We need to make sure we haven't be attached to a new eb. */ 2934 folio_detach_private(folio); 2935 } 2936 if (mapped) 2937 spin_unlock(&mapping->i_private_lock); 2938 return; 2939 } 2940 2941 /* 2942 * For subpage, we can have dummy eb with folio private attached. In 2943 * this case, we can directly detach the private as such folio is only 2944 * attached to one dummy eb, no sharing. 2945 */ 2946 if (!mapped) { 2947 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2948 return; 2949 } 2950 2951 btrfs_folio_dec_eb_refs(fs_info, folio); 2952 2953 /* 2954 * We can only detach the folio private if there are no other ebs in the 2955 * page range and no unfinished IO. 
2956 */ 2957 if (!folio_range_has_eb(folio)) 2958 btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); 2959 2960 spin_unlock(&mapping->i_private_lock); 2961 } 2962 2963 /* Release all folios attached to the extent buffer */ 2964 static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) 2965 { 2966 ASSERT(!extent_buffer_under_io(eb)); 2967 2968 for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { 2969 struct folio *folio = eb->folios[i]; 2970 2971 if (!folio) 2972 continue; 2973 2974 detach_extent_buffer_folio(eb, folio); 2975 } 2976 } 2977 2978 /* 2979 * Helper for releasing the extent buffer. 2980 */ 2981 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 2982 { 2983 btrfs_release_extent_buffer_folios(eb); 2984 btrfs_leak_debug_del_eb(eb); 2985 kmem_cache_free(extent_buffer_cache, eb); 2986 } 2987 2988 /* 2989 * Inhibit writeback on buffer during transaction. 2990 * 2991 * @trans: transaction handle that will own the inhibitor 2992 * @eb: extent buffer to inhibit writeback on 2993 * 2994 * Attempt to track this extent buffer in the transaction's inhibited set. If 2995 * memory allocation fails, the buffer is simply not tracked. It may be written 2996 * back and need re-COW, which is the original behavior. This is acceptable 2997 * since inhibiting writeback is an optimization. 2998 */ 2999 void btrfs_inhibit_eb_writeback(struct btrfs_trans_handle *trans, struct extent_buffer *eb) 3000 { 3001 unsigned long index = eb->start >> trans->fs_info->nodesize_bits; 3002 void *old; 3003 3004 lockdep_assert_held(&eb->lock); 3005 /* Check if already inhibited by this handle. */ 3006 old = xa_load(&trans->writeback_inhibited_ebs, index); 3007 if (old == eb) 3008 return; 3009 3010 /* Take reference for the xarray entry. */ 3011 refcount_inc(&eb->refs); 3012 3013 old = xa_store(&trans->writeback_inhibited_ebs, index, eb, GFP_NOFS); 3014 if (xa_is_err(old)) { 3015 /* Allocation failed, just skip inhibiting this buffer. 
*/ 3016 free_extent_buffer(eb); 3017 return; 3018 } 3019 3020 /* Handle replacement of different eb at same index. */ 3021 if (old && old != eb) { 3022 struct extent_buffer *old_eb = old; 3023 3024 atomic_dec(&old_eb->writeback_inhibitors); 3025 free_extent_buffer(old_eb); 3026 } 3027 3028 atomic_inc(&eb->writeback_inhibitors); 3029 } 3030 3031 /* 3032 * Uninhibit writeback on all extent buffers. 3033 */ 3034 void btrfs_uninhibit_all_eb_writeback(struct btrfs_trans_handle *trans) 3035 { 3036 struct extent_buffer *eb; 3037 unsigned long index; 3038 3039 xa_for_each(&trans->writeback_inhibited_ebs, index, eb) { 3040 atomic_dec(&eb->writeback_inhibitors); 3041 free_extent_buffer(eb); 3042 } 3043 xa_destroy(&trans->writeback_inhibited_ebs); 3044 } 3045 3046 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 3047 u64 start) 3048 { 3049 struct extent_buffer *eb = NULL; 3050 3051 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 3052 eb->start = start; 3053 eb->len = fs_info->nodesize; 3054 eb->fs_info = fs_info; 3055 init_rwsem(&eb->lock); 3056 atomic_set(&eb->writeback_inhibitors, 0); 3057 3058 btrfs_leak_debug_add_eb(eb); 3059 3060 spin_lock_init(&eb->refs_lock); 3061 refcount_set(&eb->refs, 1); 3062 3063 ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); 3064 3065 return eb; 3066 } 3067 3068 /* 3069 * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() 3070 * does not call folio_put(), and we need to set the folios to NULL so that 3071 * btrfs_release_extent_buffer() will not detach them a second time. 3072 */ 3073 static void cleanup_extent_buffer_folios(struct extent_buffer *eb) 3074 { 3075 const int num_folios = num_extent_folios(eb); 3076 3077 /* We cannot use num_extent_folios() as loop bound as eb->folios changes. 
/*
 * Create a private copy of @src.
 *
 * A new extent buffer is allocated for the same fs_info and start as @src,
 * flagged EXTENT_BUFFER_UNMAPPED, given its own folios, filled with the full
 * contents of @src and marked uptodate.
 *
 * Return the new extent buffer, or NULL if allocating the buffer, allocating
 * its folios or attaching them failed.
 */
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
	struct extent_buffer *new;
	int num_folios;
	int ret;

	new = __alloc_extent_buffer(src->fs_info, src->start);
	if (new == NULL)
		return NULL;

	/*
	 * Set UNMAPPED before any call to btrfs_release_extent_buffer(), as
	 * btrfs_release_extent_buffer() behaves differently for an UNMAPPED
	 * subpage extent buffer.
	 */
	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);

	ret = alloc_eb_folio_array(new, GFP_NOFS);
	if (ret)
		goto release_eb;

	ASSERT(num_extent_folios(src) == num_extent_folios(new),
	       "%d != %d", num_extent_folios(src), num_extent_folios(new));
	/* Use the cached folio count from here on. */
	num_folios = num_extent_folios(src);
	for (int i = 0; i < num_folios; i++) {
		struct folio *folio = new->folios[i];

		ret = attach_extent_buffer_folio(new, folio, NULL);
		if (ret < 0)
			goto cleanup_folios;
		/* A freshly allocated folio is not expected to be dirty. */
		WARN_ON(folio_test_dirty(folio));
	}
	/*
	 * NOTE(review): presumably the attach step holds its own folio
	 * reference, so this only drops the allocation reference - confirm
	 * against attach_extent_buffer_folio().
	 */
	for (int i = 0; i < num_folios; i++)
		folio_put(new->folios[i]);

	copy_extent_buffer_full(new, src);
	set_extent_buffer_uptodate(new);

	return new;

cleanup_folios:
	/* Detach and put the folios, and clear the eb->folios[] slots. */
	cleanup_extent_buffer_folios(new);
release_eb:
	btrfs_release_extent_buffer(new);
	return NULL;
}
/*
 * Make sure @eb carries its tree reference: if EXTENT_BUFFER_TREE_REF is not
 * set, set it and take the matching reference, under eb->refs_lock.
 */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/*
	 * The TREE_REF bit is first set when the extent_buffer is added to the
	 * xarray.  It is also set again, if found unset, when a new reference
	 * is created by find_extent_buffer().
	 *
	 * It is only cleared in two cases: freeing the last non-tree
	 * reference to the extent_buffer when its STALE bit is set, or
	 * calling release_folio when the tree reference is the only reference.
	 *
	 * In both cases, care is taken to ensure that the extent_buffer's
	 * pages are not under IO.  However, release_folio can be called
	 * concurrently with the creation of new references, which is prone to
	 * races between the calls to check_buffer_tree_ref() in those
	 * code paths and the clearing of TREE_REF in
	 * try_release_extent_buffer().
	 *
	 * The actual lifetime of the extent_buffer in the xarray is adequately
	 * protected by the refcount, but the TREE_REF bit and its corresponding
	 * reference are not.  To protect against this class of races, we call
	 * check_buffer_tree_ref() from the code paths which trigger IO.  Note
	 * that once IO is initiated, TREE_REF can no longer be cleared, so that
	 * is the moment at which any such race is best fixed.
	 */
	refs = refcount_read(&eb->refs);
	/* Lockless fast path: the tree reference is already in place. */
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	/* Only take the extra reference if we are the one setting the bit. */
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		refcount_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

/*
 * Ensure @eb has its tree reference and mark each of its folios as accessed.
 */
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
	check_buffer_tree_ref(eb);

	for (int i = 0; i < num_extent_folios(eb); i++)
		folio_mark_accessed(eb->folios[i]);
}
/*
 * Find or create an extent buffer for @start, for use by the sanity tests.
 *
 * With CONFIG_BTRFS_FS_RUN_SANITY_TESTS: return the buffer found by
 * find_extent_buffer() if there is one; otherwise allocate a dummy extent
 * buffer and try to insert it into fs_info->buffer_tree.  If another buffer
 * is already stored at that index, a reference is taken on it and it is
 * returned instead, and the new buffer is released.  Returns
 * ERR_PTR(-ENOMEM) if the dummy buffer cannot be allocated, or the xarray
 * error wrapped in ERR_PTR() if the insertion fails.
 *
 * Without CONFIG_BTRFS_FS_RUN_SANITY_TESTS this is a stub returning NULL.
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	struct extent_buffer *eb, *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;
again:
	xa_lock_irq(&fs_info->buffer_tree);
	/* Store @eb only if the slot is currently empty. */
	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
			      NULL, eb, GFP_NOFS);
	if (xa_is_err(exists)) {
		ret = xa_err(exists);
		xa_unlock_irq(&fs_info->buffer_tree);
		btrfs_release_extent_buffer(eb);
		return ERR_PTR(ret);
	}
	if (exists) {
		if (!refcount_inc_not_zero(&exists->refs)) {
			/* The extent buffer is being freed, retry. */
			xa_unlock_irq(&fs_info->buffer_tree);
			goto again;
		}
		/* Lost the race: use the buffer already in the tree. */
		xa_unlock_irq(&fs_info->buffer_tree);
		btrfs_release_extent_buffer(eb);
		return exists;
	}
	xa_unlock_irq(&fs_info->buffer_tree);
	/* Our buffer is now in the tree, make sure it has the tree reference. */
	check_buffer_tree_ref(eb);

	return eb;
#else
	/* Stub to avoid linker error when compiled with optimizations turned off. */
	return NULL;
#endif
}
3296 */ 3297 if (btrfs_meta_is_subpage(fs_info)) 3298 return NULL; 3299 3300 /* Page not yet attached to an extent buffer */ 3301 if (!folio_test_private(folio)) 3302 return NULL; 3303 3304 /* 3305 * We could have already allocated an eb for this folio and attached one 3306 * so lets see if we can get a ref on the existing eb, and if we can we 3307 * know it's good and we can just return that one, else we know we can 3308 * just overwrite folio private. 3309 */ 3310 exists = folio_get_private(folio); 3311 if (refcount_inc_not_zero(&exists->refs)) 3312 return exists; 3313 3314 WARN_ON(folio_test_dirty(folio)); 3315 folio_detach_private(folio); 3316 return NULL; 3317 } 3318 3319 /* 3320 * Validate alignment constraints of eb at logical address @start. 3321 */ 3322 static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 3323 { 3324 const u32 nodesize = fs_info->nodesize; 3325 3326 if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) { 3327 btrfs_err(fs_info, "bad tree block start %llu", start); 3328 return true; 3329 } 3330 3331 if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) { 3332 btrfs_err(fs_info, 3333 "tree block is not nodesize aligned, start %llu nodesize %u", 3334 start, nodesize); 3335 return true; 3336 } 3337 if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) { 3338 btrfs_err(fs_info, 3339 "tree block is not page aligned, start %llu nodesize %u", 3340 start, nodesize); 3341 return true; 3342 } 3343 if (unlikely(!IS_ALIGNED(start, nodesize) && 3344 !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) { 3345 btrfs_warn(fs_info, 3346 "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", 3347 start, nodesize); 3348 } 3349 return false; 3350 } 3351 3352 /* 3353 * Return 0 if eb->folios[i] is attached to btree inode successfully. 3354 * Return >0 if there is already another extent buffer for the range, 3355 * and @found_eb_ret would be updated. 
3356 * Return -EAGAIN if the filemap has an existing folio but with different size 3357 * than @eb. 3358 * The caller needs to free the existing folios and retry using the same order. 3359 */ 3360 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3361 struct btrfs_folio_state *prealloc, 3362 struct extent_buffer **found_eb_ret) 3363 { 3364 3365 struct btrfs_fs_info *fs_info = eb->fs_info; 3366 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3367 const pgoff_t index = eb->start >> PAGE_SHIFT; 3368 struct folio *existing_folio; 3369 int ret; 3370 3371 ASSERT(found_eb_ret); 3372 3373 /* Caller should ensure the folio exists. */ 3374 ASSERT(eb->folios[i]); 3375 3376 retry: 3377 existing_folio = NULL; 3378 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3379 GFP_NOFS | __GFP_NOFAIL); 3380 if (!ret) 3381 goto finish; 3382 3383 existing_folio = filemap_lock_folio(mapping, index + i); 3384 /* The page cache only exists for a very short time, just retry. */ 3385 if (IS_ERR(existing_folio)) 3386 goto retry; 3387 3388 /* For now, we should only have single-page folios for btree inode. */ 3389 ASSERT(folio_nr_pages(existing_folio) == 1); 3390 3391 if (folio_size(existing_folio) != eb->folio_size) { 3392 folio_unlock(existing_folio); 3393 folio_put(existing_folio); 3394 return -EAGAIN; 3395 } 3396 3397 finish: 3398 spin_lock(&mapping->i_private_lock); 3399 if (existing_folio && btrfs_meta_is_subpage(fs_info)) { 3400 /* We're going to reuse the existing folio, can drop our folio now. */ 3401 folio_put(eb->folios[i]); 3402 eb->folios[i] = existing_folio; 3403 } else if (existing_folio) { 3404 struct extent_buffer *existing_eb; 3405 3406 existing_eb = grab_extent_buffer(fs_info, existing_folio); 3407 if (existing_eb) { 3408 /* The extent buffer still exists, we can use it directly. 
/*
 * Find or create the extent buffer for the tree block at @start.
 *
 * @fs_info:    filesystem the tree block belongs to
 * @start:      logical address of the tree block
 * @owner_root: root the block belongs to; here it is only used to select the
 *              lockdep class of the buffer lock
 * @level:      tree level, passed on to the lockdep class selection
 *
 * If find_extent_buffer() already knows a buffer for @start it is returned.
 * Otherwise a new buffer is allocated, its folios are attached to the btree
 * inode's mapping and the buffer is inserted into fs_info->buffer_tree.  If
 * another buffer for the same range is found while doing so, the new buffer
 * is torn down and the existing one is returned.
 *
 * Return the extent buffer on success.  On failure return ERR_PTR(-EINVAL)
 * for a misaligned @start, ERR_PTR(-EOVERFLOW) on 32-bit kernels when @start
 * is beyond the page cache limit, ERR_PTR(-ENOMEM) if the buffer cannot be
 * allocated, or the negative error reported by one of the helpers.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	/* Number of folios attached so far, needed by the error path. */
	int attached = 0;
	struct extent_buffer *eb;
	struct extent_buffer *existing_eb = NULL;
	struct btrfs_folio_state *prealloc = NULL;
	u64 lockdep_owner = owner_root;
	bool page_contig = true;
	bool uptodate = true;
	int ret;

	if (check_eb_alignment(fs_info, start))
		return ERR_PTR(-EINVAL);

#if BITS_PER_LONG == 32
	if (start >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
		"extent buffer %llu is beyond 32bit page cache limit", start);
		btrfs_err_32bit_limit(fs_info);
		return ERR_PTR(-EOVERFLOW);
	}
	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
#endif

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);

	/*
	 * The reloc trees are just snapshots, so we need them to appear to be
	 * just like any other fs tree WRT lockdep.
	 */
	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
		lockdep_owner = BTRFS_FS_TREE_OBJECTID;

	btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);

	/*
	 * Preallocate the folio private for the subpage case, so that we won't
	 * allocate memory while holding i_private_lock or the folio lock.
	 *
	 * The memory will be freed by attach_extent_buffer_folio() or freed
	 * manually if we exit earlier.
	 */
	if (btrfs_meta_is_subpage(fs_info)) {
		prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
		if (IS_ERR(prealloc)) {
			ret = PTR_ERR(prealloc);
			goto out;
		}
	}

reallocate:
	/*
	 * Allocate all folios first.  These will be attached to
	 * btree_inode->i_mapping below (added to LRU, served by
	 * btree_migrate_folio), so request __GFP_MOVABLE so the page allocator
	 * places them in MOVABLE pageblocks.
	 */
	ret = alloc_eb_folio_array(eb, GFP_NOFS | __GFP_NOFAIL | __GFP_MOVABLE);
	if (ret < 0) {
		btrfs_free_folio_state(prealloc);
		goto out;
	}

	/* Attach all folios to the filemap. */
	for (int i = 0; i < num_extent_folios(eb); i++) {
		struct folio *folio;

		ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
		if (ret > 0) {
			/* Another extent buffer already covers this range. */
			ASSERT(existing_eb);
			goto out;
		}

		/*
		 * TODO: Special handling for a corner case where the order of
		 * folios mismatch between the new eb and filemap.
		 *
		 * This happens when:
		 *
		 * - the new eb is using higher order folio
		 *
		 * - the filemap is still using 0-order folios for the range
		 *   This can happen at the previous eb allocation, and we don't
		 *   have higher order folio for the call.
		 *
		 * - the existing eb has already been freed
		 *
		 * In this case, we have to free the existing folios first, and
		 * re-allocate using the same order.
		 * Thankfully this is not going to happen yet, as we're still
		 * using 0-order folios.
		 */
		if (unlikely(ret == -EAGAIN)) {
			DEBUG_WARN("folio order mismatch between new eb and filemap");
			goto reallocate;
		}
		attached++;

		/*
		 * Only after attach_eb_folio_to_filemap(), eb->folios[] is
		 * reliable, as we may choose to reuse the existing page cache
		 * and free the allocated folio.
		 */
		folio = eb->folios[i];
		WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));

		/*
		 * Check if the current folio is physically contiguous with the
		 * previous folio of the eb.
		 * At this stage, either we allocated a large folio, thus @i
		 * would only be 0, or we fall back to per-page allocation.
		 */
		if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
			page_contig = false;

		if (!btrfs_meta_folio_test_uptodate(folio, eb))
			uptodate = false;

		/*
		 * We can't unlock the folios just yet since the extent buffer
		 * hasn't been properly inserted into the xarray, this opens a
		 * race with btree_release_folio() which can free a folio while
		 * we are still filling in all folios for the buffer and we
		 * could crash.
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	/* All folios are physically contiguous, can skip cross page handling. */
	if (page_contig)
		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
	xa_lock_irq(&fs_info->buffer_tree);
	/* Store @eb only if no buffer is registered for this tree block yet. */
	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
				   start >> fs_info->nodesize_bits, NULL, eb,
				   GFP_NOFS);
	if (xa_is_err(existing_eb)) {
		ret = xa_err(existing_eb);
		xa_unlock_irq(&fs_info->buffer_tree);
		goto out;
	}
	if (existing_eb) {
		/* A zero refcount means that buffer is going away, so retry. */
		if (!refcount_inc_not_zero(&existing_eb->refs)) {
			xa_unlock_irq(&fs_info->buffer_tree);
			goto again;
		}
		xa_unlock_irq(&fs_info->buffer_tree);
		goto out;
	}
	xa_unlock_irq(&fs_info->buffer_tree);

	/* Add one reference for the tree. */
	check_buffer_tree_ref(eb);

	/*
	 * Now it's safe to unlock the folios because any calls to
	 * btree_release_folio() will correctly detect that a folio belongs to
	 * a live buffer and won't free them prematurely.
	 */
	for (int i = 0; i < num_extent_folios(eb); i++) {
		folio_unlock(eb->folios[i]);
		/*
		 * A folio that has been added to an address_space mapping
		 * should not continue holding the refcount from its original
		 * allocation indefinitely.
		 */
		folio_put(eb->folios[i]);
	}
	return eb;

out:
	/* We are expected to hold the only reference to the new eb here. */
	WARN_ON(!refcount_dec_and_test(&eb->refs));

	/*
	 * Any attached folios need to be detached before we unlock them.  This
	 * is because when we're inserting our new folios into the mapping, and
	 * then attaching our eb to that folio.  If we fail to insert our folio
	 * we'll lookup the folio for that index, and grab that EB.  We do not
	 * want that to grab this eb, as we're getting ready to free it.  So we
	 * have to detach it first and then unlock it.
	 *
	 * Note: the bound is num_extent_pages() as we need to go through all
	 * slots.
	 */
	for (int i = 0; i < num_extent_pages(eb); i++) {
		struct folio *folio = eb->folios[i];

		if (i < attached) {
			ASSERT(folio);
			detach_extent_buffer_folio(eb, folio);
			folio_unlock(folio);
		} else if (!folio) {
			continue;
		}

		folio_put(folio);
		/* Keep btrfs_release_extent_buffer() from detaching it again. */
		eb->folios[i] = NULL;
	}
	btrfs_release_extent_buffer(eb);
	if (ret < 0)
		return ERR_PTR(ret);
	ASSERT(existing_eb);
	return existing_eb;
}

/*
 * RCU callback: return the extent buffer embedding @head to the slab cache.
 */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	kmem_cache_free(extent_buffer_cache, eb);
}
/*
 * Drop one reference on @eb.  Passing NULL is allowed and does nothing.
 *
 * While the reference count is above the threshold checked below (1 for an
 * UNMAPPED buffer, 3 otherwise) the reference is dropped with a lockless
 * compare-and-exchange.  Otherwise eb->refs_lock is taken, the tree reference
 * is dropped as well if the buffer is STALE, not under IO and only two
 * references remain, and the final put is left to release_extent_buffer().
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;

	if (!eb)
		return;

	refs = refcount_read(&eb->refs);
	while (1) {
		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
			if (refs == 1)
				break;
		} else if (refs <= 3) {
			break;
		}

		/*
		 * Optimization to avoid locking eb->refs_lock.  If the
		 * exchange fails, the checks above are run again.
		 */
		if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
			return;
	}

	spin_lock(&eb->refs_lock);
	/*
	 * Only our reference and the tree reference are left on a stale
	 * buffer that is not under IO: give up the tree reference too.
	 */
	if (refcount_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		refcount_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}

/*
 * Mark @eb as stale and drop one reference on it.  Passing NULL is allowed
 * and does nothing.
 *
 * EXTENT_BUFFER_STALE is set under eb->refs_lock.  If only two references
 * remain and the buffer is not under IO, the tree reference is dropped as
 * well before the final put in release_extent_buffer().
 */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		refcount_dec(&eb->refs);
	release_extent_buffer(eb);
}
3767 */ 3768 if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3769 set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); 3770 return; 3771 } 3772 3773 if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) 3774 return; 3775 3776 buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); 3777 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, 3778 fs_info->dirty_metadata_batch); 3779 3780 for (int i = 0; i < num_extent_folios(eb); i++) { 3781 struct folio *folio = eb->folios[i]; 3782 bool last; 3783 3784 if (!folio_test_dirty(folio)) 3785 continue; 3786 folio_lock(folio); 3787 last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); 3788 if (last) 3789 btrfs_clear_folio_dirty_tag(folio); 3790 folio_unlock(folio); 3791 } 3792 WARN_ON(refcount_read(&eb->refs) == 0); 3793 } 3794 3795 void set_extent_buffer_dirty(struct extent_buffer *eb) 3796 { 3797 bool was_dirty; 3798 3799 check_buffer_tree_ref(eb); 3800 3801 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3802 3803 WARN_ON(refcount_read(&eb->refs) == 0); 3804 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 3805 WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); 3806 3807 if (!was_dirty) { 3808 bool subpage = btrfs_meta_is_subpage(eb->fs_info); 3809 3810 /* 3811 * For subpage case, we can have other extent buffers in the 3812 * same page, and in clear_extent_buffer_dirty() we 3813 * have to clear page dirty without subpage lock held. 3814 * This can cause race where our page gets dirty cleared after 3815 * we just set it. 3816 * 3817 * Thankfully, clear_extent_buffer_dirty() has locked 3818 * its page for other reasons, we can use page lock to prevent 3819 * the above race. 
3820 */ 3821 if (subpage) 3822 folio_lock(eb->folios[0]); 3823 for (int i = 0; i < num_extent_folios(eb); i++) 3824 btrfs_meta_folio_set_dirty(eb->folios[i], eb); 3825 buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); 3826 if (subpage) 3827 folio_unlock(eb->folios[0]); 3828 percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, 3829 eb->len, 3830 eb->fs_info->dirty_metadata_batch); 3831 } 3832 #ifdef CONFIG_BTRFS_DEBUG 3833 for (int i = 0; i < num_extent_folios(eb); i++) 3834 ASSERT(folio_test_dirty(eb->folios[i])); 3835 #endif 3836 } 3837 3838 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 3839 { 3840 3841 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3842 for (int i = 0; i < num_extent_folios(eb); i++) { 3843 struct folio *folio = eb->folios[i]; 3844 3845 if (!folio) 3846 continue; 3847 3848 btrfs_meta_folio_clear_uptodate(folio, eb); 3849 } 3850 } 3851 3852 void set_extent_buffer_uptodate(struct extent_buffer *eb) 3853 { 3854 3855 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3856 for (int i = 0; i < num_extent_folios(eb); i++) 3857 btrfs_meta_folio_set_uptodate(eb->folios[i], eb); 3858 } 3859 3860 static void clear_extent_buffer_reading(struct extent_buffer *eb) 3861 { 3862 clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); 3863 } 3864 3865 static void end_bbio_meta_read(struct btrfs_bio *bbio) 3866 { 3867 struct extent_buffer *eb = bbio->private; 3868 bool uptodate = !bbio->bio.bi_status; 3869 3870 /* 3871 * If the extent buffer is marked UPTODATE before the read operation 3872 * completes, other calls to read_extent_buffer_pages() will return 3873 * early without waiting for the read to finish, causing data races. 
3874 */ 3875 WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); 3876 3877 eb->read_mirror = bbio->mirror_num; 3878 3879 if (uptodate && 3880 btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) 3881 uptodate = false; 3882 3883 if (uptodate) 3884 set_extent_buffer_uptodate(eb); 3885 else 3886 clear_extent_buffer_uptodate(eb); 3887 3888 clear_extent_buffer_reading(eb); 3889 free_extent_buffer(eb); 3890 3891 bio_put(&bbio->bio); 3892 } 3893 3894 int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, 3895 const struct btrfs_tree_parent_check *check) 3896 { 3897 struct btrfs_fs_info *fs_info = eb->fs_info; 3898 struct btrfs_bio *bbio; 3899 3900 if (extent_buffer_uptodate(eb)) { 3901 int ret; 3902 3903 ret = btrfs_buffer_uptodate(eb, 0, check); 3904 if (unlikely(ret <= 0)) { 3905 if (ret == 0) 3906 ret = -EIO; 3907 return ret; 3908 } 3909 return 0; 3910 } 3911 3912 /* 3913 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 3914 * operation, which could potentially still be in flight. In this case 3915 * we simply want to return an error. 3916 */ 3917 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 3918 return -EIO; 3919 3920 /* Someone else is already reading the buffer, just wait for it. */ 3921 if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) 3922 return 0; 3923 3924 /* 3925 * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above 3926 * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have 3927 * started and finished reading the same eb. In this case, UPTODATE 3928 * will now be set, and we shouldn't read it in again. 
3929 */ 3930 if (unlikely(extent_buffer_uptodate(eb))) { 3931 int ret; 3932 3933 clear_extent_buffer_reading(eb); 3934 ret = btrfs_buffer_uptodate(eb, 0, check); 3935 if (unlikely(ret <= 0)) { 3936 if (ret == 0) 3937 ret = -EIO; 3938 return ret; 3939 } 3940 return 0; 3941 } 3942 3943 eb->read_mirror = 0; 3944 check_buffer_tree_ref(eb); 3945 refcount_inc(&eb->refs); 3946 3947 bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, 3948 REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode), 3949 eb->start, end_bbio_meta_read, eb); 3950 bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; 3951 memcpy(&bbio->parent_check, check, sizeof(*check)); 3952 for (int i = 0; i < num_extent_folios(eb); i++) { 3953 struct folio *folio = eb->folios[i]; 3954 u64 range_start = max_t(u64, eb->start, folio_pos(folio)); 3955 u32 range_len = min_t(u64, folio_next_pos(folio), 3956 eb->start + eb->len) - range_start; 3957 3958 bio_add_folio_nofail(&bbio->bio, folio, range_len, 3959 offset_in_folio(folio, range_start)); 3960 } 3961 btrfs_submit_bbio(bbio, mirror_num); 3962 return 0; 3963 } 3964 3965 int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, 3966 const struct btrfs_tree_parent_check *check) 3967 { 3968 int ret; 3969 3970 ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); 3971 if (ret < 0) 3972 return ret; 3973 3974 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3975 if (unlikely(!extent_buffer_uptodate(eb))) 3976 return -EIO; 3977 return 0; 3978 } 3979 3980 /* Never inlined to decrease code size, as this is called in a cold path. */ 3981 static noinline void report_eb_range(const struct extent_buffer *eb, 3982 unsigned long start, unsigned long len) 3983 { 3984 btrfs_warn(eb->fs_info, 3985 "access to eb bytenr %llu len %u out of range start %lu len %lu", 3986 eb->start, eb->len, start, len); 3987 DEBUG_WARN(); 3988 } 3989 3990 /* 3991 * Check if the [start, start + len) range is valid before reading/writing 3992 * the eb. 
3993 * NOTE: @start and @len are offset inside the eb, not logical address. 3994 * 3995 * Caller should not touch the dst/src memory if this function returns error. 3996 */ 3997 static inline bool check_eb_range(const struct extent_buffer *eb, 3998 unsigned long start, unsigned long len) 3999 { 4000 unsigned long offset; 4001 4002 /* start, start + len should not go beyond eb->len nor overflow */ 4003 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) { 4004 report_eb_range(eb, start, len); 4005 return true; 4006 } 4007 4008 return false; 4009 } 4010 4011 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 4012 unsigned long start, unsigned long len) 4013 { 4014 const int unit_size = eb->folio_size; 4015 size_t cur; 4016 size_t offset; 4017 char *dst = (char *)dstv; 4018 unsigned long i; 4019 4020 if (check_eb_range(eb, start, len)) { 4021 /* 4022 * Invalid range hit, reset the memory, so callers won't get 4023 * some random garbage for their uninitialized memory. 
4024 */ 4025 memset(dstv, 0, len); 4026 return; 4027 } 4028 4029 if (eb->addr) { 4030 memcpy(dstv, eb->addr + start, len); 4031 return; 4032 } 4033 4034 offset = get_eb_offset_in_folio(eb, start); 4035 i = get_eb_folio_index(eb, start); 4036 while (len > 0) { 4037 char *kaddr; 4038 4039 cur = min(len, unit_size - offset); 4040 kaddr = folio_address(eb->folios[i]); 4041 memcpy(dst, kaddr + offset, cur); 4042 4043 dst += cur; 4044 len -= cur; 4045 offset = 0; 4046 i++; 4047 } 4048 } 4049 4050 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 4051 void __user *dstv, 4052 unsigned long start, unsigned long len) 4053 { 4054 const int unit_size = eb->folio_size; 4055 size_t cur; 4056 size_t offset; 4057 char __user *dst = (char __user *)dstv; 4058 unsigned long i; 4059 int ret = 0; 4060 4061 if (check_eb_range(eb, start, len)) 4062 return -EINVAL; 4063 4064 if (eb->addr) { 4065 if (copy_to_user_nofault(dstv, eb->addr + start, len)) 4066 ret = -EFAULT; 4067 return ret; 4068 } 4069 4070 offset = get_eb_offset_in_folio(eb, start); 4071 i = get_eb_folio_index(eb, start); 4072 while (len > 0) { 4073 char *kaddr; 4074 4075 cur = min(len, unit_size - offset); 4076 kaddr = folio_address(eb->folios[i]); 4077 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 4078 ret = -EFAULT; 4079 break; 4080 } 4081 4082 dst += cur; 4083 len -= cur; 4084 offset = 0; 4085 i++; 4086 } 4087 4088 return ret; 4089 } 4090 4091 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 4092 unsigned long start, unsigned long len) 4093 { 4094 const int unit_size = eb->folio_size; 4095 size_t cur; 4096 size_t offset; 4097 char *kaddr; 4098 char *ptr = (char *)ptrv; 4099 unsigned long i; 4100 int ret = 0; 4101 4102 if (check_eb_range(eb, start, len)) 4103 return -EINVAL; 4104 4105 if (eb->addr) 4106 return memcmp(ptrv, eb->addr + start, len); 4107 4108 offset = get_eb_offset_in_folio(eb, start); 4109 i = get_eb_folio_index(eb, start); 4110 while (len > 0) { 4111 
cur = min(len, unit_size - offset); 4112 kaddr = folio_address(eb->folios[i]); 4113 ret = memcmp(ptr, kaddr + offset, cur); 4114 if (ret) 4115 break; 4116 4117 ptr += cur; 4118 len -= cur; 4119 offset = 0; 4120 i++; 4121 } 4122 return ret; 4123 } 4124 4125 /* 4126 * Check that the extent buffer is uptodate. 4127 * 4128 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 4129 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 4130 */ 4131 static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) 4132 { 4133 struct btrfs_fs_info *fs_info = eb->fs_info; 4134 struct folio *folio = eb->folios[i]; 4135 4136 ASSERT(folio); 4137 4138 /* 4139 * If we are using the commit root we could potentially clear a page 4140 * Uptodate while we're using the extent buffer that we've previously 4141 * looked up. We don't want to complain in this case, as the page was 4142 * valid before, we just didn't write it out. Instead we want to catch 4143 * the case where we didn't actually read the block properly, which 4144 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. 4145 */ 4146 if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4147 return; 4148 4149 if (btrfs_meta_is_subpage(fs_info)) { 4150 folio = eb->folios[0]; 4151 ASSERT(i == 0); 4152 if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, 4153 eb->start, eb->len))) 4154 btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); 4155 } else { 4156 WARN_ON(!folio_test_uptodate(folio)); 4157 } 4158 } 4159 4160 static void __write_extent_buffer(const struct extent_buffer *eb, 4161 const void *srcv, unsigned long start, 4162 unsigned long len, bool use_memmove) 4163 { 4164 const int unit_size = eb->folio_size; 4165 size_t cur; 4166 size_t offset; 4167 char *kaddr; 4168 const char *src = (const char *)srcv; 4169 unsigned long i; 4170 /* For unmapped (dummy) ebs, no need to check their uptodate status. 
*/ 4171 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4172 4173 if (check_eb_range(eb, start, len)) 4174 return; 4175 4176 if (eb->addr) { 4177 if (use_memmove) 4178 memmove(eb->addr + start, srcv, len); 4179 else 4180 memcpy(eb->addr + start, srcv, len); 4181 return; 4182 } 4183 4184 offset = get_eb_offset_in_folio(eb, start); 4185 i = get_eb_folio_index(eb, start); 4186 while (len > 0) { 4187 if (check_uptodate) 4188 assert_eb_folio_uptodate(eb, i); 4189 4190 cur = min(len, unit_size - offset); 4191 kaddr = folio_address(eb->folios[i]); 4192 if (use_memmove) 4193 memmove(kaddr + offset, src, cur); 4194 else 4195 memcpy(kaddr + offset, src, cur); 4196 4197 src += cur; 4198 len -= cur; 4199 offset = 0; 4200 i++; 4201 } 4202 } 4203 4204 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 4205 unsigned long start, unsigned long len) 4206 { 4207 return __write_extent_buffer(eb, srcv, start, len, false); 4208 } 4209 4210 static void memset_extent_buffer(const struct extent_buffer *eb, int c, 4211 unsigned long start, unsigned long len) 4212 { 4213 const int unit_size = eb->folio_size; 4214 unsigned long cur = start; 4215 4216 if (eb->addr) { 4217 memset(eb->addr + start, c, len); 4218 return; 4219 } 4220 4221 while (cur < start + len) { 4222 unsigned long index = get_eb_folio_index(eb, cur); 4223 unsigned int offset = get_eb_offset_in_folio(eb, cur); 4224 unsigned int cur_len = min(start + len - cur, unit_size - offset); 4225 4226 assert_eb_folio_uptodate(eb, index); 4227 memset(folio_address(eb->folios[index]) + offset, c, cur_len); 4228 4229 cur += cur_len; 4230 } 4231 } 4232 4233 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 4234 unsigned long len) 4235 { 4236 if (check_eb_range(eb, start, len)) 4237 return; 4238 return memset_extent_buffer(eb, 0, start, len); 4239 } 4240 4241 void copy_extent_buffer_full(const struct extent_buffer *dst, 4242 const struct extent_buffer *src) 4243 
{ 4244 const int unit_size = src->folio_size; 4245 unsigned long cur = 0; 4246 4247 ASSERT(dst->len == src->len); 4248 4249 while (cur < src->len) { 4250 unsigned long index = get_eb_folio_index(src, cur); 4251 unsigned long offset = get_eb_offset_in_folio(src, cur); 4252 unsigned long cur_len = min(src->len, unit_size - offset); 4253 void *addr = folio_address(src->folios[index]) + offset; 4254 4255 write_extent_buffer(dst, addr, cur, cur_len); 4256 4257 cur += cur_len; 4258 } 4259 } 4260 4261 void copy_extent_buffer(const struct extent_buffer *dst, 4262 const struct extent_buffer *src, 4263 unsigned long dst_offset, unsigned long src_offset, 4264 unsigned long len) 4265 { 4266 const int unit_size = dst->folio_size; 4267 u64 dst_len = dst->len; 4268 size_t cur; 4269 size_t offset; 4270 char *kaddr; 4271 unsigned long i; 4272 4273 if (check_eb_range(dst, dst_offset, len) || 4274 check_eb_range(src, src_offset, len)) 4275 return; 4276 4277 WARN_ON(src->len != dst_len); 4278 4279 offset = get_eb_offset_in_folio(dst, dst_offset); 4280 4281 i = get_eb_folio_index(dst, dst_offset); 4282 while (len > 0) { 4283 assert_eb_folio_uptodate(dst, i); 4284 4285 cur = min(len, (unsigned long)(unit_size - offset)); 4286 4287 kaddr = folio_address(dst->folios[i]); 4288 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4289 4290 src_offset += cur; 4291 len -= cur; 4292 offset = 0; 4293 i++; 4294 } 4295 } 4296 4297 /* 4298 * Calculate the folio and offset of the byte containing the given bit number. 4299 * 4300 * @eb: the extent buffer 4301 * @start: offset of the bitmap item in the extent buffer 4302 * @nr: bit number 4303 * @folio_index: return index of the folio in the extent buffer that contains 4304 * the given bit number 4305 * @folio_offset: return offset into the folio given by folio_index 4306 * 4307 * This helper hides the ugliness of finding the byte in an extent buffer which 4308 * contains a given bit. 
4309 */ 4310 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 4311 unsigned long start, unsigned long nr, 4312 unsigned long *folio_index, 4313 size_t *folio_offset) 4314 { 4315 size_t byte_offset = BIT_BYTE(nr); 4316 size_t offset; 4317 4318 /* 4319 * The byte we want is the offset of the extent buffer + the offset of 4320 * the bitmap item in the extent buffer + the offset of the byte in the 4321 * bitmap item. 4322 */ 4323 offset = start + offset_in_eb_folio(eb, eb->start) + byte_offset; 4324 4325 *folio_index = offset >> eb->folio_shift; 4326 *folio_offset = offset_in_eb_folio(eb, offset); 4327 } 4328 4329 /* 4330 * Determine whether a bit in a bitmap item is set. 4331 * 4332 * @eb: the extent buffer 4333 * @start: offset of the bitmap item in the extent buffer 4334 * @nr: bit number to test 4335 */ 4336 bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 4337 unsigned long nr) 4338 { 4339 unsigned long i; 4340 size_t offset; 4341 u8 *kaddr; 4342 4343 eb_bitmap_offset(eb, start, nr, &i, &offset); 4344 assert_eb_folio_uptodate(eb, i); 4345 kaddr = folio_address(eb->folios[i]); 4346 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 4347 } 4348 4349 static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) 4350 { 4351 unsigned long index = get_eb_folio_index(eb, bytenr); 4352 4353 if (check_eb_range(eb, bytenr, 1)) 4354 return NULL; 4355 return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); 4356 } 4357 4358 /* 4359 * Set an area of a bitmap to 1. 
4360 * 4361 * @eb: the extent buffer 4362 * @start: offset of the bitmap item in the extent buffer 4363 * @pos: bit number of the first bit 4364 * @len: number of bits to set 4365 */ 4366 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 4367 unsigned long pos, unsigned long len) 4368 { 4369 unsigned int first_byte = start + BIT_BYTE(pos); 4370 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4371 const bool same_byte = (first_byte == last_byte); 4372 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4373 u8 *kaddr; 4374 4375 if (same_byte) 4376 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4377 4378 /* Handle the first byte. */ 4379 kaddr = extent_buffer_get_byte(eb, first_byte); 4380 *kaddr |= mask; 4381 if (same_byte) 4382 return; 4383 4384 /* Handle the byte aligned part. */ 4385 ASSERT(first_byte + 1 <= last_byte); 4386 memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); 4387 4388 /* Handle the last byte. */ 4389 kaddr = extent_buffer_get_byte(eb, last_byte); 4390 *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); 4391 } 4392 4393 4394 /* 4395 * Clear an area of a bitmap. 4396 * 4397 * @eb: the extent buffer 4398 * @start: offset of the bitmap item in the extent buffer 4399 * @pos: bit number of the first bit 4400 * @len: number of bits to clear 4401 */ 4402 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 4403 unsigned long start, unsigned long pos, 4404 unsigned long len) 4405 { 4406 unsigned int first_byte = start + BIT_BYTE(pos); 4407 unsigned int last_byte = start + BIT_BYTE(pos + len - 1); 4408 const bool same_byte = (first_byte == last_byte); 4409 u8 mask = BITMAP_FIRST_BYTE_MASK(pos); 4410 u8 *kaddr; 4411 4412 if (same_byte) 4413 mask &= BITMAP_LAST_BYTE_MASK(pos + len); 4414 4415 /* Handle the first byte. */ 4416 kaddr = extent_buffer_get_byte(eb, first_byte); 4417 *kaddr &= ~mask; 4418 if (same_byte) 4419 return; 4420 4421 /* Handle the byte aligned part. 
*/ 4422 ASSERT(first_byte + 1 <= last_byte); 4423 memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); 4424 4425 /* Handle the last byte. */ 4426 kaddr = extent_buffer_get_byte(eb, last_byte); 4427 *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); 4428 } 4429 4430 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4431 { 4432 unsigned long distance = (src > dst) ? src - dst : dst - src; 4433 return distance < len; 4434 } 4435 4436 void memcpy_extent_buffer(const struct extent_buffer *dst, 4437 unsigned long dst_offset, unsigned long src_offset, 4438 unsigned long len) 4439 { 4440 const int unit_size = dst->folio_size; 4441 unsigned long cur_off = 0; 4442 4443 if (check_eb_range(dst, dst_offset, len) || 4444 check_eb_range(dst, src_offset, len)) 4445 return; 4446 4447 if (dst->addr) { 4448 const bool use_memmove = areas_overlap(src_offset, dst_offset, len); 4449 4450 if (use_memmove) 4451 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4452 else 4453 memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); 4454 return; 4455 } 4456 4457 while (cur_off < len) { 4458 unsigned long cur_src = cur_off + src_offset; 4459 unsigned long folio_index = get_eb_folio_index(dst, cur_src); 4460 unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); 4461 unsigned long cur_len = min(src_offset + len - cur_src, 4462 unit_size - folio_off); 4463 void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; 4464 const bool use_memmove = areas_overlap(src_offset + cur_off, 4465 dst_offset + cur_off, cur_len); 4466 4467 __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, 4468 use_memmove); 4469 cur_off += cur_len; 4470 } 4471 } 4472 4473 void memmove_extent_buffer(const struct extent_buffer *dst, 4474 unsigned long dst_offset, unsigned long src_offset, 4475 unsigned long len) 4476 { 4477 unsigned long dst_end = dst_offset + len - 1; 4478 unsigned long src_end = src_offset + len - 1; 
4479 4480 if (check_eb_range(dst, dst_offset, len) || 4481 check_eb_range(dst, src_offset, len)) 4482 return; 4483 4484 if (dst_offset < src_offset) { 4485 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4486 return; 4487 } 4488 4489 if (dst->addr) { 4490 memmove(dst->addr + dst_offset, dst->addr + src_offset, len); 4491 return; 4492 } 4493 4494 while (len > 0) { 4495 unsigned long src_i; 4496 size_t cur; 4497 size_t dst_off_in_folio; 4498 size_t src_off_in_folio; 4499 void *src_addr; 4500 bool use_memmove; 4501 4502 src_i = get_eb_folio_index(dst, src_end); 4503 4504 dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); 4505 src_off_in_folio = get_eb_offset_in_folio(dst, src_end); 4506 4507 cur = min_t(unsigned long, len, src_off_in_folio + 1); 4508 cur = min(cur, dst_off_in_folio + 1); 4509 4510 src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio - 4511 cur + 1; 4512 use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, 4513 cur); 4514 4515 __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, 4516 use_memmove); 4517 4518 dst_end -= cur; 4519 src_end -= cur; 4520 len -= cur; 4521 } 4522 } 4523 4524 static int try_release_subpage_extent_buffer(struct folio *folio) 4525 { 4526 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4527 struct extent_buffer *eb; 4528 unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); 4529 unsigned long index = start; 4530 unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; 4531 int ret; 4532 4533 rcu_read_lock(); 4534 xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4535 /* 4536 * The same as try_release_extent_buffer(), to ensure the eb 4537 * won't disappear out from under us. 
4538 */ 4539 spin_lock(&eb->refs_lock); 4540 rcu_read_unlock(); 4541 4542 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4543 spin_unlock(&eb->refs_lock); 4544 rcu_read_lock(); 4545 continue; 4546 } 4547 4548 /* 4549 * If tree ref isn't set then we know the ref on this eb is a 4550 * real ref, so just return, this eb will likely be freed soon 4551 * anyway. 4552 */ 4553 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4554 spin_unlock(&eb->refs_lock); 4555 rcu_read_lock(); 4556 break; 4557 } 4558 4559 /* 4560 * Here we don't care about the return value, we will always 4561 * check the folio private at the end. And 4562 * release_extent_buffer() will release the refs_lock. 4563 */ 4564 release_extent_buffer(eb); 4565 rcu_read_lock(); 4566 } 4567 rcu_read_unlock(); 4568 4569 /* 4570 * Finally to check if we have cleared folio private, as if we have 4571 * released all ebs in the page, the folio private should be cleared now. 4572 */ 4573 spin_lock(&folio->mapping->i_private_lock); 4574 if (!folio_test_private(folio)) 4575 ret = 1; 4576 else 4577 ret = 0; 4578 spin_unlock(&folio->mapping->i_private_lock); 4579 return ret; 4580 } 4581 4582 int try_release_extent_buffer(struct folio *folio) 4583 { 4584 struct extent_buffer *eb; 4585 4586 if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) 4587 return try_release_subpage_extent_buffer(folio); 4588 4589 /* 4590 * We need to make sure nobody is changing folio private, as we rely on 4591 * folio private as the pointer to extent buffer. 4592 */ 4593 spin_lock(&folio->mapping->i_private_lock); 4594 if (!folio_test_private(folio)) { 4595 spin_unlock(&folio->mapping->i_private_lock); 4596 return 1; 4597 } 4598 4599 eb = folio_get_private(folio); 4600 BUG_ON(!eb); 4601 4602 /* 4603 * This is a little awful but should be ok, we need to make sure that 4604 * the eb doesn't disappear out from under us while we're looking at 4605 * this page. 
4606 */ 4607 spin_lock(&eb->refs_lock); 4608 if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4609 spin_unlock(&eb->refs_lock); 4610 spin_unlock(&folio->mapping->i_private_lock); 4611 return 0; 4612 } 4613 spin_unlock(&folio->mapping->i_private_lock); 4614 4615 /* 4616 * If tree ref isn't set then we know the ref on this eb is a real ref, 4617 * so just return, this page will likely be freed soon anyway. 4618 */ 4619 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 4620 spin_unlock(&eb->refs_lock); 4621 return 0; 4622 } 4623 4624 return release_extent_buffer(eb); 4625 } 4626 4627 /* 4628 * Attempt to readahead a child block. 4629 * 4630 * @fs_info: the fs_info 4631 * @bytenr: bytenr to read 4632 * @owner_root: objectid of the root that owns this eb 4633 * @gen: generation for the uptodate check, can be 0 4634 * @level: level for the eb 4635 * 4636 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 4637 * normal uptodate check of the eb, without checking the generation. If we have 4638 * to read the block we will not block on anything. 4639 */ 4640 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 4641 u64 bytenr, u64 owner_root, u64 gen, int level, 4642 const struct btrfs_key *first_key) 4643 { 4644 struct btrfs_tree_parent_check check = { 4645 .level = level, 4646 .transid = gen 4647 }; 4648 struct extent_buffer *eb; 4649 int ret; 4650 4651 if (first_key) { 4652 memcpy(&check.first_key, first_key, sizeof(struct btrfs_key)); 4653 check.has_first_key = true; 4654 } 4655 4656 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 4657 if (IS_ERR(eb)) 4658 return; 4659 4660 if (btrfs_buffer_uptodate(eb, gen, &check)) { 4661 free_extent_buffer(eb); 4662 return; 4663 } 4664 4665 ret = read_extent_buffer_pages_nowait(eb, 0, &check); 4666 if (ret < 0) 4667 free_extent_buffer_stale(eb); 4668 else 4669 free_extent_buffer(eb); 4670 } 4671 4672 /* 4673 * Readahead a node's child block. 
4674 * 4675 * @node: parent node we're reading from 4676 * @slot: slot in the parent node for the child we want to read 4677 * 4678 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 4679 * the slot in the node provided. 4680 */ 4681 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 4682 { 4683 struct btrfs_key node_key; 4684 4685 btrfs_node_key_to_cpu(node, &node_key, slot); 4686 btrfs_readahead_tree_block(node->fs_info, 4687 btrfs_node_blockptr(node, slot), 4688 btrfs_header_owner(node), 4689 btrfs_node_ptr_generation(node, slot), 4690 btrfs_header_level(node) - 1, 4691 &node_key); 4692 } 4693