// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/buffer.c
 *
 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/blk-crypto.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include <linux/sched/isolation.h>

#include "internal.h"

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
			  enum rw_hint hint, struct writeback_control *wbc);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void touch_buffer(struct buffer_head *bh)
{
	trace_block_touch_buffer(bh);
	folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_atomic();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Returns if the folio has dirty or writeback buffers. If all the buffers
 * are unlocked and clean then the folio_test_dirty information is stale. If
 * any of the buffers are locked, it is assumed they are locked for IO.
 */
void buffer_check_dirty_writeback(struct folio *folio,
				  bool *dirty, bool *writeback)
{
	struct buffer_head *head, *bh;
	*dirty = false;
	*writeback = false;

	BUG_ON(!folio_test_locked(folio));

	head = folio_buffers(folio);
	if (!head)
		return;

	if (folio_test_writeback(folio))
		*writeback = true;

	bh = head;
	do {
		if (buffer_locked(bh))
			*writeback = true;

		if (buffer_dirty(bh))
			*dirty = true;

		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
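 *
 * For illustration only (a common pattern, mirroring __bread_slow() later in
 * this file): a caller that has submitted its own read typically waits and
 * then checks the uptodate bit, e.g.
 *
 *	lock_buffer(bh);
 *	get_bh(bh);
 *	bh->b_end_io = end_buffer_read_sync;
 *	submit_bh(REQ_OP_READ, bh);
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		// the read failed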
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

static void buffer_io_error(struct buffer_head *bh, char *msg)
{
	if (!test_bit(BH_Quiet, &bh->b_state))
		printk_ratelimited(KERN_ERR
			"Buffer I/O error on dev %pg, logical block %llu%s\n",
			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.  Just mark it up-to-date and
 * unlock the buffer.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
	struct address_space *bd_mapping = bdev->bd_mapping;
	const int blkbits = bd_mapping->host->i_blkbits;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct folio *folio;
	int all_mapped = 1;
	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

	index = ((loff_t)block << blkbits) / PAGE_SIZE;
	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
	if (IS_ERR(folio))
		goto out;

	/*
	 * Folio lock protects the buffers. Callers that cannot block
	 * will fallback to serializing vs try_to_free_buffers() via
	 * the i_private_lock.
	 */
	if (atomic)
		spin_lock(&bd_mapping->i_private_lock);
	else
		folio_lock(folio);

	head = folio_buffers(folio);
	if (!head)
		goto out_unlock;
	/*
	 * Upon a noref migration, the folio lock serializes here;
	 * otherwise bail.
	 */
	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
		WARN_ON(!atomic);
		goto out_unlock;
	}

	bh = head;
	do {
		if (!buffer_mapped(bh))
			all_mapped = 0;
		else if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
	if (all_mapped && __ratelimit(&last_warned)) {
		printk("__find_get_block_slow() failed. block=%llu, "
		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
		       "device %pg blocksize: %d\n",
		       (unsigned long long)block,
		       (unsigned long long)bh->b_blocknr,
		       bh->b_state, bh->b_size, bdev,
		       1 << blkbits);
	}
out_unlock:
	if (atomic)
		spin_unlock(&bd_mapping->i_private_lock);
	else
		folio_unlock(folio);
	folio_put(folio);
out:
	return ret;
}

static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;
	int folio_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		buffer_io_error(bh, ", async page read");
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			folio_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

	folio_end_read(folio, folio_uptodate);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
}

struct postprocess_bh_ctx {
	struct work_struct work;
	struct buffer_head *bh;
	struct fsverity_info *vi;
};

static void verify_bh(struct work_struct *work)
{
	struct postprocess_bh_ctx *ctx =
		container_of(work, struct postprocess_bh_ctx, work);
	struct buffer_head *bh = ctx->bh;
	bool valid;

	valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
				       bh_offset(bh));
	end_buffer_async_read(bh, valid);
	kfree(ctx);
}

static void decrypt_bh(struct work_struct *work)
{
	struct postprocess_bh_ctx *ctx =
		container_of(work, struct postprocess_bh_ctx, work);
	struct buffer_head *bh = ctx->bh;
	int err;

	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
					       bh_offset(bh));
	if (err == 0 && ctx->vi) {
		/*
		 * We use different work queues for decryption and for verity
		 * because verity may require reading metadata pages that need
		 * decryption, and we shouldn't recurse to the same workqueue.
		 */
		INIT_WORK(&ctx->work, verify_bh);
		fsverity_enqueue_verify_work(&ctx->work);
		return;
	}
	end_buffer_async_read(bh, err == 0);
	kfree(ctx);
}

/*
 * I/O completion handler for block_read_full_folio() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_folio->mapping->host;
	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
	struct fsverity_info *vi = NULL;

	/* needed by ext4 */
	if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
		vi = fsverity_get_info(inode);

	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
	if (uptodate && (decrypt || vi)) {
		struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);

		if (ctx) {
			ctx->bh = bh;
			ctx->vi = vi;
			if (decrypt) {
				INIT_WORK(&ctx->work, decrypt_bh);
				fscrypt_enqueue_decrypt_work(&ctx->work);
			} else {
				INIT_WORK(&ctx->work, verify_bh);
				fsverity_enqueue_verify_work(&ctx->work);
			}
			return;
		}
		uptodate = 0;
	}
	end_buffer_async_read(bh, uptodate);
}

/*
 * Completion handler for block_write_full_folio() - folios which are unlocked
 * during I/O, and which have the writeback flag cleared upon I/O completion.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;

	BUG_ON(!buffer_async_write(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost async page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}

	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	folio_end_writeback(folio);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
}

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O against any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read_io;
	set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
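 *
 * For illustration (hypothetical filesystem code, not taken from this file):
 * such a filesystem marks its dependent metadata with
 *
 *	mark_buffer_dirty_inode(bh, inode);
 *
 * as it dirties e.g. an indirect block, and then calls
 *
 *	sync_mapping_buffers(inode->i_mapping);
 *
 * from its ->fsync method to write out and wait upon those buffers.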
 *
 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for i_private_list is via the i_private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
 * mapping->i_private_list will always be protected by the backing blockdev's
 * ->i_private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->i_private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->i_private_list via these
 * utility functions are free to use i_private_lock and i_private_list for
 * whatever they want.  The only requirement is that list_empty(i_private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's i_private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.i_private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
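 *
 * Illustrative sketch of that pattern (fsync_buffers_list() below finishes
 * with exactly such an osync pass over its list):
 *
 *	write_dirty_buffer(bh, REQ_SYNC);	- queue the write
 *	...
 *	osync_buffers_list(lock, list);		- wait for already-submitted IO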
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->i_private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->i_private_data;

	if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->i_private_lock,
				  &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/**
 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
				  bool datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;
	int ret;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	ret = sync_mapping_buffers(inode->i_mapping);
	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
		goto out;
	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (ret == 0)
		ret = err;

out:
	/* check and advance again to catch errors after syncing out buffers */
	err = file_check_and_advance_wb_err(file);
	if (ret == 0)
		ret = err;
	return ret;
}
EXPORT_SYMBOL(generic_buffers_fsync_noflush);

/**
 * generic_buffers_fsync - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.  This also makes sure that
 * a device cache flush operation is called at the end.
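 *
 * A minimal, illustrative use (hypothetical "foo" filesystem, not part of
 * this file): because ->fsync takes an int @datasync, a small wrapper is
 * typically used:
 *
 *	static int foo_fsync(struct file *file, loff_t start, loff_t end,
 *			     int datasync)
 *	{
 *		return generic_buffers_fsync(file, start, end, datasync);
 *	}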
 */
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
			  bool datasync)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
	return ret;
}
EXPORT_SYMBOL(generic_buffers_fsync);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			  sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh;

	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			write_dirty_buffer(bh, 0);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_folio->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->i_private_data) {
		mapping->i_private_data = buffer_mapping;
	} else {
		BUG_ON(mapping->i_private_data != buffer_mapping);
	}
	if (!bh->b_assoc_map) {
		spin_lock(&buffer_mapping->i_private_lock);
		list_move_tail(&bh->b_assoc_buffers,
			       &mapping->i_private_list);
		bh->b_assoc_map = mapping;
		spin_unlock(&buffer_mapping->i_private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/**
 * block_dirty_folio - Mark a folio as dirty.
 * @mapping: The address space containing this folio.
 * @folio: The folio to mark dirty.
 *
 * Filesystems which use buffer_heads can use this function as their
 * ->dirty_folio implementation.  Some filesystems need to do a little
 * work before calling this function.  Filesystems which do not use
 * buffer_heads should call filemap_dirty_folio() instead.
 *
 * If the folio has buffers, the uptodate buffers are set dirty, to
 * preserve dirty-state coherency between the folio and the buffers.
 * Buffers added to a dirty folio are created dirty.
 *
 * The buffers are dirtied before the folio is dirtied.  There's a small
 * race window in which writeback may see the folio cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the folio
 * dirty before the buffers, writeback could clear the folio dirty flag,
 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * folio on the dirty folio list.
 *
 * We use i_private_lock to lock against try_to_free_buffers() while
 * using the folio's buffer list.  This also prevents clean buffers
 * being added to the folio after it was set dirty.
 *
 * Context: May only be called from process context.  Does not sleep.
 * Caller must ensure that @folio cannot be truncated during this call,
 * typically by holding the folio lock or having a page in the folio
 * mapped and holding the page table lock.
 *
 * Return: True if the folio was dirtied; false if it was already dirtied.
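 *
 * Illustrative only (hypothetical "foo" filesystem): a buffer_head based
 * filesystem typically wires this up in its address_space_operations:
 *
 *	const struct address_space_operations foo_aops = {
 *		.dirty_folio		= block_dirty_folio,
 *		.invalidate_folio	= block_invalidate_folio,
 *		...
 *	};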
 */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct buffer_head *head;
	bool newly_dirty;

	spin_lock(&mapping->i_private_lock);
	head = folio_buffers(folio);
	if (head) {
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	/*
	 * Lock out page's memcg migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	newly_dirty = !folio_test_set_dirty(folio);
	spin_unlock(&mapping->i_private_lock);

	if (newly_dirty)
		__folio_mark_dirty(folio, mapping, 1);

	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	return newly_dirty;
}
EXPORT_SYMBOL(block_dirty_folio);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct address_space *mapping;
	int err = 0, err2;
	struct blk_plug plug;
	LIST_HEAD(tmp);

	blk_start_plug(&plug);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, REQ_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
817 */ 818 brelse(bh); 819 spin_lock(lock); 820 } 821 } 822 } 823 824 spin_unlock(lock); 825 blk_finish_plug(&plug); 826 spin_lock(lock); 827 828 while (!list_empty(&tmp)) { 829 bh = BH_ENTRY(tmp.prev); 830 get_bh(bh); 831 mapping = bh->b_assoc_map; 832 __remove_assoc_queue(bh); 833 /* Avoid race with mark_buffer_dirty_inode() which does 834 * a lockless check and we rely on seeing the dirty bit */ 835 smp_mb(); 836 if (buffer_dirty(bh)) { 837 list_add(&bh->b_assoc_buffers, 838 &mapping->i_private_list); 839 bh->b_assoc_map = mapping; 840 } 841 spin_unlock(lock); 842 wait_on_buffer(bh); 843 if (!buffer_uptodate(bh)) 844 err = -EIO; 845 brelse(bh); 846 spin_lock(lock); 847 } 848 849 spin_unlock(lock); 850 err2 = osync_buffers_list(lock, list); 851 if (err) 852 return err; 853 else 854 return err2; 855 } 856 857 /* 858 * Invalidate any and all dirty buffers on a given inode. We are 859 * probably unmounting the fs, but that doesn't mean we have already 860 * done a sync(). Just drop the buffers from the inode list. 861 * 862 * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which 863 * assumes that all the buffers are against the blockdev. 864 */ 865 void invalidate_inode_buffers(struct inode *inode) 866 { 867 if (inode_has_buffers(inode)) { 868 struct address_space *mapping = &inode->i_data; 869 struct list_head *list = &mapping->i_private_list; 870 struct address_space *buffer_mapping = mapping->i_private_data; 871 872 spin_lock(&buffer_mapping->i_private_lock); 873 while (!list_empty(list)) 874 __remove_assoc_queue(BH_ENTRY(list->next)); 875 spin_unlock(&buffer_mapping->i_private_lock); 876 } 877 } 878 EXPORT_SYMBOL(invalidate_inode_buffers); 879 880 /* 881 * Remove any clean buffers from the inode's buffer list. This is called 882 * when we're trying to free the inode itself. Those buffers can pin it. 883 * 884 * Returns true if all buffers were removed. 885 */ 886 int remove_inode_buffers(struct inode *inode) 887 { 888 int ret = 1; 889 890 if (inode_has_buffers(inode)) { 891 struct address_space *mapping = &inode->i_data; 892 struct list_head *list = &mapping->i_private_list; 893 struct address_space *buffer_mapping = mapping->i_private_data; 894 895 spin_lock(&buffer_mapping->i_private_lock); 896 while (!list_empty(list)) { 897 struct buffer_head *bh = BH_ENTRY(list->next); 898 if (buffer_dirty(bh)) { 899 ret = 0; 900 break; 901 } 902 __remove_assoc_queue(bh); 903 } 904 spin_unlock(&buffer_mapping->i_private_lock); 905 } 906 return ret; 907 } 908 909 /* 910 * Create the appropriate buffers when given a folio for data area and 911 * the size of each buffer.. Use the bh->b_this_page linked list to 912 * follow the buffers created. Return NULL if unable to create more 913 * buffers. 914 * 915 * The retry flag is used to differentiate async IO (paging, swapping) 916 * which may not fail from ordinary buffer allocations. 
917 */ 918 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, 919 gfp_t gfp) 920 { 921 struct buffer_head *bh, *head; 922 long offset; 923 struct mem_cgroup *memcg, *old_memcg; 924 925 /* The folio lock pins the memcg */ 926 memcg = folio_memcg(folio); 927 old_memcg = set_active_memcg(memcg); 928 929 head = NULL; 930 offset = folio_size(folio); 931 while ((offset -= size) >= 0) { 932 bh = alloc_buffer_head(gfp); 933 if (!bh) 934 goto no_grow; 935 936 bh->b_this_page = head; 937 bh->b_blocknr = -1; 938 head = bh; 939 940 bh->b_size = size; 941 942 /* Link the buffer to its folio */ 943 folio_set_bh(bh, folio, offset); 944 } 945 out: 946 set_active_memcg(old_memcg); 947 return head; 948 /* 949 * In case anything failed, we just free everything we got. 950 */ 951 no_grow: 952 if (head) { 953 do { 954 bh = head; 955 head = head->b_this_page; 956 free_buffer_head(bh); 957 } while (head); 958 } 959 960 goto out; 961 } 962 EXPORT_SYMBOL_GPL(folio_alloc_buffers); 963 964 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) 965 { 966 gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; 967 968 return folio_alloc_buffers(page_folio(page), size, gfp); 969 } 970 EXPORT_SYMBOL_GPL(alloc_page_buffers); 971 972 static inline void link_dev_buffers(struct folio *folio, 973 struct buffer_head *head) 974 { 975 struct buffer_head *bh, *tail; 976 977 bh = head; 978 do { 979 tail = bh; 980 bh = bh->b_this_page; 981 } while (bh); 982 tail->b_this_page = head; 983 folio_attach_private(folio, head); 984 } 985 986 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) 987 { 988 sector_t retval = ~((sector_t)0); 989 loff_t sz = bdev_nr_bytes(bdev); 990 991 if (sz) { 992 unsigned int sizebits = blksize_bits(size); 993 retval = (sz >> sizebits); 994 } 995 return retval; 996 } 997 998 /* 999 * Initialise the state of a blockdev folio's buffers. 1000 */ 1001 static sector_t folio_init_buffers(struct folio *folio, 1002 struct block_device *bdev, unsigned size) 1003 { 1004 struct buffer_head *head = folio_buffers(folio); 1005 struct buffer_head *bh = head; 1006 bool uptodate = folio_test_uptodate(folio); 1007 sector_t block = div_u64(folio_pos(folio), size); 1008 sector_t end_block = blkdev_max_block(bdev, size); 1009 1010 do { 1011 if (!buffer_mapped(bh)) { 1012 bh->b_end_io = NULL; 1013 bh->b_private = NULL; 1014 bh->b_bdev = bdev; 1015 bh->b_blocknr = block; 1016 if (uptodate) 1017 set_buffer_uptodate(bh); 1018 if (block < end_block) 1019 set_buffer_mapped(bh); 1020 } 1021 block++; 1022 bh = bh->b_this_page; 1023 } while (bh != head); 1024 1025 /* 1026 * Caller needs to validate requested block against end of device. 1027 */ 1028 return end_block; 1029 } 1030 1031 /* 1032 * Create the page-cache folio that contains the requested block. 1033 * 1034 * This is used purely for blockdev mappings. 1035 * 1036 * Returns false if we have a failure which cannot be cured by retrying 1037 * without sleeping. Returns true if we succeeded, or the caller should retry. 
1038 */ 1039 static bool grow_dev_folio(struct block_device *bdev, sector_t block, 1040 pgoff_t index, unsigned size, gfp_t gfp) 1041 { 1042 struct address_space *mapping = bdev->bd_mapping; 1043 struct folio *folio; 1044 struct buffer_head *bh; 1045 sector_t end_block = 0; 1046 1047 folio = __filemap_get_folio(mapping, index, 1048 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1049 if (IS_ERR(folio)) 1050 return false; 1051 1052 bh = folio_buffers(folio); 1053 if (bh) { 1054 if (bh->b_size == size) { 1055 end_block = folio_init_buffers(folio, bdev, size); 1056 goto unlock; 1057 } 1058 1059 /* 1060 * Retrying may succeed; for example the folio may finish 1061 * writeback, or buffers may be cleaned. This should not 1062 * happen very often; maybe we have old buffers attached to 1063 * this blockdev's page cache and we're trying to change 1064 * the block size? 1065 */ 1066 if (!try_to_free_buffers(folio)) { 1067 end_block = ~0ULL; 1068 goto unlock; 1069 } 1070 } 1071 1072 bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); 1073 if (!bh) 1074 goto unlock; 1075 1076 /* 1077 * Link the folio to the buffers and initialise them. Take the 1078 * lock to be atomic wrt __find_get_block(), which does not 1079 * run under the folio lock. 1080 */ 1081 spin_lock(&mapping->i_private_lock); 1082 link_dev_buffers(folio, bh); 1083 end_block = folio_init_buffers(folio, bdev, size); 1084 spin_unlock(&mapping->i_private_lock); 1085 unlock: 1086 folio_unlock(folio); 1087 folio_put(folio); 1088 return block < end_block; 1089 } 1090 1091 /* 1092 * Create buffers for the specified block device block's folio. If 1093 * that folio was dirty, the buffers are set dirty also. Returns false 1094 * if we've hit a permanent error. 1095 */ 1096 static bool grow_buffers(struct block_device *bdev, sector_t block, 1097 unsigned size, gfp_t gfp) 1098 { 1099 loff_t pos; 1100 1101 /* 1102 * Check for a block which lies outside our maximum possible 1103 * pagecache index. 1104 */ 1105 if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { 1106 printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n", 1107 __func__, (unsigned long long)block, 1108 bdev); 1109 return false; 1110 } 1111 1112 /* Create a folio with the proper size buffers */ 1113 return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp); 1114 } 1115 1116 static struct buffer_head * 1117 __getblk_slow(struct block_device *bdev, sector_t block, 1118 unsigned size, gfp_t gfp) 1119 { 1120 bool blocking = gfpflags_allow_blocking(gfp); 1121 1122 if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { 1123 printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", 1124 size, bdev_logical_block_size(bdev)); 1125 return NULL; 1126 } 1127 1128 for (;;) { 1129 struct buffer_head *bh; 1130 1131 if (!grow_buffers(bdev, block, size, gfp)) 1132 return NULL; 1133 1134 if (blocking) 1135 bh = __find_get_block_nonatomic(bdev, block, size); 1136 else 1137 bh = __find_get_block(bdev, block, size); 1138 if (bh) 1139 return bh; 1140 } 1141 } 1142 1143 /* 1144 * The relationship between dirty buffers and dirty pages: 1145 * 1146 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1147 * the page is tagged dirty in the page cache. 1148 * 1149 * At all times, the dirtiness of the buffers represents the dirtiness of 1150 * subsections of the page. If the page has buffers, the page dirty bit is 1151 * merely a hint about the true dirty state. 
1152 * 1153 * When a page is set dirty in its entirety, all its buffers are marked dirty 1154 * (if the page has buffers). 1155 * 1156 * When a buffer is marked dirty, its page is dirtied, but the page's other 1157 * buffers are not. 1158 * 1159 * Also. When blockdev buffers are explicitly read with bread(), they 1160 * individually become uptodate. But their backing page remains not 1161 * uptodate - even if all of its buffers are uptodate. A subsequent 1162 * block_read_full_folio() against that folio will discover all the uptodate 1163 * buffers, will set the folio uptodate and will perform no I/O. 1164 */ 1165 1166 /** 1167 * mark_buffer_dirty - mark a buffer_head as needing writeout 1168 * @bh: the buffer_head to mark dirty 1169 * 1170 * mark_buffer_dirty() will set the dirty bit against the buffer, then set 1171 * its backing page dirty, then tag the page as dirty in the page cache 1172 * and then attach the address_space's inode to its superblock's dirty 1173 * inode list. 1174 * 1175 * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, 1176 * i_pages lock and mapping->host->i_lock. 1177 */ 1178 void mark_buffer_dirty(struct buffer_head *bh) 1179 { 1180 WARN_ON_ONCE(!buffer_uptodate(bh)); 1181 1182 trace_block_dirty_buffer(bh); 1183 1184 /* 1185 * Very *carefully* optimize the it-is-already-dirty case. 1186 * 1187 * Don't let the final "is it dirty" escape to before we 1188 * perhaps modified the buffer. 1189 */ 1190 if (buffer_dirty(bh)) { 1191 smp_mb(); 1192 if (buffer_dirty(bh)) 1193 return; 1194 } 1195 1196 if (!test_set_buffer_dirty(bh)) { 1197 struct folio *folio = bh->b_folio; 1198 struct address_space *mapping = NULL; 1199 1200 if (!folio_test_set_dirty(folio)) { 1201 mapping = folio->mapping; 1202 if (mapping) 1203 __folio_mark_dirty(folio, mapping, 0); 1204 } 1205 if (mapping) 1206 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1207 } 1208 } 1209 EXPORT_SYMBOL(mark_buffer_dirty); 1210 1211 void mark_buffer_write_io_error(struct buffer_head *bh) 1212 { 1213 set_buffer_write_io_error(bh); 1214 /* FIXME: do we need to set this in both places? */ 1215 if (bh->b_folio && bh->b_folio->mapping) 1216 mapping_set_error(bh->b_folio->mapping, -EIO); 1217 if (bh->b_assoc_map) 1218 mapping_set_error(bh->b_assoc_map, -EIO); 1219 } 1220 EXPORT_SYMBOL(mark_buffer_write_io_error); 1221 1222 /** 1223 * __brelse - Release a buffer. 1224 * @bh: The buffer to release. 1225 * 1226 * This variant of brelse() can be called if @bh is guaranteed to not be NULL. 1227 */ 1228 void __brelse(struct buffer_head *bh) 1229 { 1230 if (atomic_read(&bh->b_count)) { 1231 put_bh(bh); 1232 return; 1233 } 1234 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1235 } 1236 EXPORT_SYMBOL(__brelse); 1237 1238 /** 1239 * __bforget - Discard any dirty data in a buffer. 1240 * @bh: The buffer to forget. 1241 * 1242 * This variant of bforget() can be called if @bh is guaranteed to not 1243 * be NULL. 
1244 */ 1245 void __bforget(struct buffer_head *bh) 1246 { 1247 clear_buffer_dirty(bh); 1248 if (bh->b_assoc_map) { 1249 struct address_space *buffer_mapping = bh->b_folio->mapping; 1250 1251 spin_lock(&buffer_mapping->i_private_lock); 1252 list_del_init(&bh->b_assoc_buffers); 1253 bh->b_assoc_map = NULL; 1254 spin_unlock(&buffer_mapping->i_private_lock); 1255 } 1256 __brelse(bh); 1257 } 1258 EXPORT_SYMBOL(__bforget); 1259 1260 static struct buffer_head *__bread_slow(struct buffer_head *bh) 1261 { 1262 lock_buffer(bh); 1263 if (buffer_uptodate(bh)) { 1264 unlock_buffer(bh); 1265 return bh; 1266 } else { 1267 get_bh(bh); 1268 bh->b_end_io = end_buffer_read_sync; 1269 submit_bh(REQ_OP_READ, bh); 1270 wait_on_buffer(bh); 1271 if (buffer_uptodate(bh)) 1272 return bh; 1273 } 1274 brelse(bh); 1275 return NULL; 1276 } 1277 1278 /* 1279 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1280 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1281 * refcount elevated by one when they're in an LRU. A buffer can only appear 1282 * once in a particular CPU's LRU. A single buffer can be present in multiple 1283 * CPU's LRUs at the same time. 1284 * 1285 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1286 * sb_find_get_block(). 1287 * 1288 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1289 * a local interrupt disable for that. 1290 */ 1291 1292 #define BH_LRU_SIZE 16 1293 1294 struct bh_lru { 1295 struct buffer_head *bhs[BH_LRU_SIZE]; 1296 }; 1297 1298 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1299 1300 #ifdef CONFIG_SMP 1301 #define bh_lru_lock() local_irq_disable() 1302 #define bh_lru_unlock() local_irq_enable() 1303 #else 1304 #define bh_lru_lock() preempt_disable() 1305 #define bh_lru_unlock() preempt_enable() 1306 #endif 1307 1308 static inline void check_irqs_on(void) 1309 { 1310 #ifdef irqs_disabled 1311 BUG_ON(irqs_disabled()); 1312 #endif 1313 } 1314 1315 /* 1316 * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is 1317 * inserted at the front, and the buffer_head at the back if any is evicted. 1318 * Or, if already in the LRU it is moved to the front. 1319 */ 1320 static void bh_lru_install(struct buffer_head *bh) 1321 { 1322 struct buffer_head *evictee = bh; 1323 struct bh_lru *b; 1324 int i; 1325 1326 check_irqs_on(); 1327 bh_lru_lock(); 1328 1329 /* 1330 * the refcount of buffer_head in bh_lru prevents dropping the 1331 * attached page(i.e., try_to_free_buffers) so it could cause 1332 * failing page migration. 1333 * Skip putting upcoming bh into bh_lru until migration is done. 1334 */ 1335 if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { 1336 bh_lru_unlock(); 1337 return; 1338 } 1339 1340 b = this_cpu_ptr(&bh_lrus); 1341 for (i = 0; i < BH_LRU_SIZE; i++) { 1342 swap(evictee, b->bhs[i]); 1343 if (evictee == bh) { 1344 bh_lru_unlock(); 1345 return; 1346 } 1347 } 1348 1349 get_bh(bh); 1350 bh_lru_unlock(); 1351 brelse(evictee); 1352 } 1353 1354 /* 1355 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 
1356 */ 1357 static struct buffer_head * 1358 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1359 { 1360 struct buffer_head *ret = NULL; 1361 unsigned int i; 1362 1363 check_irqs_on(); 1364 bh_lru_lock(); 1365 if (cpu_is_isolated(smp_processor_id())) { 1366 bh_lru_unlock(); 1367 return NULL; 1368 } 1369 for (i = 0; i < BH_LRU_SIZE; i++) { 1370 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); 1371 1372 if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && 1373 bh->b_size == size) { 1374 if (i) { 1375 while (i) { 1376 __this_cpu_write(bh_lrus.bhs[i], 1377 __this_cpu_read(bh_lrus.bhs[i - 1])); 1378 i--; 1379 } 1380 __this_cpu_write(bh_lrus.bhs[0], bh); 1381 } 1382 get_bh(bh); 1383 ret = bh; 1384 break; 1385 } 1386 } 1387 bh_lru_unlock(); 1388 return ret; 1389 } 1390 1391 /* 1392 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1393 * it in the LRU and mark it as accessed. If it is not present then return 1394 * NULL. Atomic context callers may also return NULL if the buffer is being 1395 * migrated; similarly the page is not marked accessed either. 1396 */ 1397 static struct buffer_head * 1398 find_get_block_common(struct block_device *bdev, sector_t block, 1399 unsigned size, bool atomic) 1400 { 1401 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1402 1403 if (bh == NULL) { 1404 /* __find_get_block_slow will mark the page accessed */ 1405 bh = __find_get_block_slow(bdev, block, atomic); 1406 if (bh) 1407 bh_lru_install(bh); 1408 } else 1409 touch_buffer(bh); 1410 1411 return bh; 1412 } 1413 1414 struct buffer_head * 1415 __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1416 { 1417 return find_get_block_common(bdev, block, size, true); 1418 } 1419 EXPORT_SYMBOL(__find_get_block); 1420 1421 /* same as __find_get_block() but allows sleeping contexts */ 1422 struct buffer_head * 1423 __find_get_block_nonatomic(struct block_device *bdev, sector_t block, 1424 unsigned size) 1425 { 1426 return find_get_block_common(bdev, block, size, false); 1427 } 1428 EXPORT_SYMBOL(__find_get_block_nonatomic); 1429 1430 /** 1431 * bdev_getblk - Get a buffer_head in a block device's buffer cache. 1432 * @bdev: The block device. 1433 * @block: The block number. 1434 * @size: The size of buffer_heads for this @bdev. 1435 * @gfp: The memory allocation flags to use. 1436 * 1437 * The returned buffer head has its reference count incremented, but is 1438 * not locked. The caller should call brelse() when it has finished 1439 * with the buffer. The buffer may not be uptodate. If needed, the 1440 * caller can bring it uptodate either by reading it or overwriting it. 1441 * 1442 * Return: The buffer head, or NULL if memory could not be allocated. 1443 */ 1444 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, 1445 unsigned size, gfp_t gfp) 1446 { 1447 struct buffer_head *bh; 1448 1449 if (gfpflags_allow_blocking(gfp)) 1450 bh = __find_get_block_nonatomic(bdev, block, size); 1451 else 1452 bh = __find_get_block(bdev, block, size); 1453 1454 might_alloc(gfp); 1455 if (bh) 1456 return bh; 1457 1458 return __getblk_slow(bdev, block, size, gfp); 1459 } 1460 EXPORT_SYMBOL(bdev_getblk); 1461 1462 /* 1463 * Do async read-ahead on a buffer.. 
1464 */ 1465 void __breadahead(struct block_device *bdev, sector_t block, unsigned size) 1466 { 1467 struct buffer_head *bh = bdev_getblk(bdev, block, size, 1468 GFP_NOWAIT | __GFP_MOVABLE); 1469 1470 if (likely(bh)) { 1471 bh_readahead(bh, REQ_RAHEAD); 1472 brelse(bh); 1473 } 1474 } 1475 EXPORT_SYMBOL(__breadahead); 1476 1477 /** 1478 * __bread_gfp() - Read a block. 1479 * @bdev: The block device to read from. 1480 * @block: Block number in units of block size. 1481 * @size: The block size of this device in bytes. 1482 * @gfp: Not page allocation flags; see below. 1483 * 1484 * You are not expected to call this function. You should use one of 1485 * sb_bread(), sb_bread_unmovable() or __bread(). 1486 * 1487 * Read a specified block, and return the buffer head that refers to it. 1488 * If @gfp is 0, the memory will be allocated using the block device's 1489 * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be 1490 * allocated from a movable area. Do not pass in a complete set of 1491 * GFP flags. 1492 * 1493 * The returned buffer head has its refcount increased. The caller should 1494 * call brelse() when it has finished with the buffer. 1495 * 1496 * Context: May sleep waiting for I/O. 1497 * Return: NULL if the block was unreadable. 1498 */ 1499 struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, 1500 unsigned size, gfp_t gfp) 1501 { 1502 struct buffer_head *bh; 1503 1504 gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); 1505 1506 /* 1507 * Prefer looping in the allocator rather than here, at least that 1508 * code knows what it's doing. 1509 */ 1510 gfp |= __GFP_NOFAIL; 1511 1512 bh = bdev_getblk(bdev, block, size, gfp); 1513 1514 if (likely(bh) && !buffer_uptodate(bh)) 1515 bh = __bread_slow(bh); 1516 return bh; 1517 } 1518 EXPORT_SYMBOL(__bread_gfp); 1519 1520 static void __invalidate_bh_lrus(struct bh_lru *b) 1521 { 1522 int i; 1523 1524 for (i = 0; i < BH_LRU_SIZE; i++) { 1525 brelse(b->bhs[i]); 1526 b->bhs[i] = NULL; 1527 } 1528 } 1529 /* 1530 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1531 * This doesn't race because it runs in each cpu either in irq 1532 * or with preempt disabled. 1533 */ 1534 static void invalidate_bh_lru(void *arg) 1535 { 1536 struct bh_lru *b = &get_cpu_var(bh_lrus); 1537 1538 __invalidate_bh_lrus(b); 1539 put_cpu_var(bh_lrus); 1540 } 1541 1542 bool has_bh_in_lru(int cpu, void *dummy) 1543 { 1544 struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); 1545 int i; 1546 1547 for (i = 0; i < BH_LRU_SIZE; i++) { 1548 if (b->bhs[i]) 1549 return true; 1550 } 1551 1552 return false; 1553 } 1554 1555 void invalidate_bh_lrus(void) 1556 { 1557 on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); 1558 } 1559 EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1560 1561 /* 1562 * It's called from workqueue context so we need a bh_lru_lock to close 1563 * the race with preemption/irq. 
1564 */ 1565 void invalidate_bh_lrus_cpu(void) 1566 { 1567 struct bh_lru *b; 1568 1569 bh_lru_lock(); 1570 b = this_cpu_ptr(&bh_lrus); 1571 __invalidate_bh_lrus(b); 1572 bh_lru_unlock(); 1573 } 1574 1575 void folio_set_bh(struct buffer_head *bh, struct folio *folio, 1576 unsigned long offset) 1577 { 1578 bh->b_folio = folio; 1579 BUG_ON(offset >= folio_size(folio)); 1580 if (folio_test_highmem(folio)) 1581 /* 1582 * This catches illegal uses and preserves the offset: 1583 */ 1584 bh->b_data = (char *)(0 + offset); 1585 else 1586 bh->b_data = folio_address(folio) + offset; 1587 } 1588 EXPORT_SYMBOL(folio_set_bh); 1589 1590 /* 1591 * Called when truncating a buffer on a page completely. 1592 */ 1593 1594 /* Bits that are cleared during an invalidate */ 1595 #define BUFFER_FLAGS_DISCARD \ 1596 (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ 1597 1 << BH_Delay | 1 << BH_Unwritten) 1598 1599 static void discard_buffer(struct buffer_head * bh) 1600 { 1601 unsigned long b_state; 1602 1603 lock_buffer(bh); 1604 clear_buffer_dirty(bh); 1605 bh->b_bdev = NULL; 1606 b_state = READ_ONCE(bh->b_state); 1607 do { 1608 } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state, 1609 b_state & ~BUFFER_FLAGS_DISCARD)); 1610 unlock_buffer(bh); 1611 } 1612 1613 /** 1614 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. 1615 * @folio: The folio which is affected. 1616 * @offset: start of the range to invalidate 1617 * @length: length of the range to invalidate 1618 * 1619 * block_invalidate_folio() is called when all or part of the folio has been 1620 * invalidated by a truncate operation. 1621 * 1622 * block_invalidate_folio() does not have to release all buffers, but it must 1623 * ensure that no dirty buffer is left outside @offset and that no I/O 1624 * is underway against any of the blocks which are outside the truncation 1625 * point. Because the caller is about to free (and possibly reuse) those 1626 * blocks on-disk. 1627 */ 1628 void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) 1629 { 1630 struct buffer_head *head, *bh, *next; 1631 size_t curr_off = 0; 1632 size_t stop = length + offset; 1633 1634 BUG_ON(!folio_test_locked(folio)); 1635 1636 /* 1637 * Check for overflow 1638 */ 1639 BUG_ON(stop > folio_size(folio) || stop < length); 1640 1641 head = folio_buffers(folio); 1642 if (!head) 1643 return; 1644 1645 bh = head; 1646 do { 1647 size_t next_off = curr_off + bh->b_size; 1648 next = bh->b_this_page; 1649 1650 /* 1651 * Are we still fully in range ? 1652 */ 1653 if (next_off > stop) 1654 goto out; 1655 1656 /* 1657 * is this block fully invalidated? 1658 */ 1659 if (offset <= curr_off) 1660 discard_buffer(bh); 1661 curr_off = next_off; 1662 bh = next; 1663 } while (bh != head); 1664 1665 /* 1666 * We release buffers only if the entire folio is being invalidated. 1667 * The get_block cached value has been unconditionally invalidated, 1668 * so real IO is not possible anymore. 1669 */ 1670 if (length == folio_size(folio)) 1671 filemap_release_folio(folio, 0); 1672 out: 1673 folio_clear_mappedtodisk(folio); 1674 } 1675 EXPORT_SYMBOL(block_invalidate_folio); 1676 1677 /* 1678 * We attach and possibly dirty the buffers atomically wrt 1679 * block_dirty_folio() via i_private_lock. try_to_free_buffers 1680 * is already excluded via the folio lock. 
1681 */ 1682 struct buffer_head *create_empty_buffers(struct folio *folio, 1683 unsigned long blocksize, unsigned long b_state) 1684 { 1685 struct buffer_head *bh, *head, *tail; 1686 gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; 1687 1688 head = folio_alloc_buffers(folio, blocksize, gfp); 1689 bh = head; 1690 do { 1691 bh->b_state |= b_state; 1692 tail = bh; 1693 bh = bh->b_this_page; 1694 } while (bh); 1695 tail->b_this_page = head; 1696 1697 spin_lock(&folio->mapping->i_private_lock); 1698 if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { 1699 bh = head; 1700 do { 1701 if (folio_test_dirty(folio)) 1702 set_buffer_dirty(bh); 1703 if (folio_test_uptodate(folio)) 1704 set_buffer_uptodate(bh); 1705 bh = bh->b_this_page; 1706 } while (bh != head); 1707 } 1708 folio_attach_private(folio, head); 1709 spin_unlock(&folio->mapping->i_private_lock); 1710 1711 return head; 1712 } 1713 EXPORT_SYMBOL(create_empty_buffers); 1714 1715 /** 1716 * clean_bdev_aliases: clean a range of buffers in block device 1717 * @bdev: Block device to clean buffers in 1718 * @block: Start of a range of blocks to clean 1719 * @len: Number of blocks to clean 1720 * 1721 * We are taking a range of blocks for data and we don't want writeback of any 1722 * buffer-cache aliases starting from return from this function and until the 1723 * moment when something will explicitly mark the buffer dirty (hopefully that 1724 * will not happen until we will free that block ;-) We don't even need to mark 1725 * it not-uptodate - nobody can expect anything from a newly allocated buffer 1726 * anyway. We used to use unmap_buffer() for such invalidation, but that was 1727 * wrong. We definitely don't want to mark the alias unmapped, for example - it 1728 * would confuse anyone who might pick it with bread() afterwards... 1729 * 1730 * Also.. Note that bforget() doesn't lock the buffer. So there can be 1731 * writeout I/O going on against recently-freed buffers. We don't wait on that 1732 * I/O in bforget() - it's more efficient to wait on the I/O only if we really 1733 * need to. That happens here. 1734 */ 1735 void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) 1736 { 1737 struct address_space *bd_mapping = bdev->bd_mapping; 1738 const int blkbits = bd_mapping->host->i_blkbits; 1739 struct folio_batch fbatch; 1740 pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE; 1741 pgoff_t end; 1742 int i, count; 1743 struct buffer_head *bh; 1744 struct buffer_head *head; 1745 1746 end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE; 1747 folio_batch_init(&fbatch); 1748 while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { 1749 count = folio_batch_count(&fbatch); 1750 for (i = 0; i < count; i++) { 1751 struct folio *folio = fbatch.folios[i]; 1752 1753 if (!folio_buffers(folio)) 1754 continue; 1755 /* 1756 * We use folio lock instead of bd_mapping->i_private_lock 1757 * to pin buffers here since we can afford to sleep and 1758 * it scales better than a global spinlock lock. 
1759 */ 1760 folio_lock(folio); 1761 /* Recheck when the folio is locked which pins bhs */ 1762 head = folio_buffers(folio); 1763 if (!head) 1764 goto unlock_page; 1765 bh = head; 1766 do { 1767 if (!buffer_mapped(bh) || (bh->b_blocknr < block)) 1768 goto next; 1769 if (bh->b_blocknr >= block + len) 1770 break; 1771 clear_buffer_dirty(bh); 1772 wait_on_buffer(bh); 1773 clear_buffer_req(bh); 1774 next: 1775 bh = bh->b_this_page; 1776 } while (bh != head); 1777 unlock_page: 1778 folio_unlock(folio); 1779 } 1780 folio_batch_release(&fbatch); 1781 cond_resched(); 1782 /* End of range already reached? */ 1783 if (index > end || !index) 1784 break; 1785 } 1786 } 1787 EXPORT_SYMBOL(clean_bdev_aliases); 1788 1789 static struct buffer_head *folio_create_buffers(struct folio *folio, 1790 struct inode *inode, 1791 unsigned int b_state) 1792 { 1793 struct buffer_head *bh; 1794 1795 BUG_ON(!folio_test_locked(folio)); 1796 1797 bh = folio_buffers(folio); 1798 if (!bh) 1799 bh = create_empty_buffers(folio, 1800 1 << READ_ONCE(inode->i_blkbits), b_state); 1801 return bh; 1802 } 1803 1804 /* 1805 * NOTE! All mapped/uptodate combinations are valid: 1806 * 1807 * Mapped Uptodate Meaning 1808 * 1809 * No No "unknown" - must do get_block() 1810 * No Yes "hole" - zero-filled 1811 * Yes No "allocated" - allocated on disk, not read in 1812 * Yes Yes "valid" - allocated and up-to-date in memory. 1813 * 1814 * "Dirty" is valid only with the last case (mapped+uptodate). 1815 */ 1816 1817 /* 1818 * While block_write_full_folio is writing back the dirty buffers under 1819 * the page lock, whoever dirtied the buffers may decide to clean them 1820 * again at any time. We handle that by only looking at the buffer 1821 * state inside lock_buffer(). 1822 * 1823 * If block_write_full_folio() is called for regular writeback 1824 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1825 * locked buffer. This only can happen if someone has written the buffer 1826 * directly, with submit_bh(). At the address_space level PageWriteback 1827 * prevents this contention from occurring. 1828 * 1829 * If block_write_full_folio() is called with wbc->sync_mode == 1830 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this 1831 * causes the writes to be flagged as synchronous writes. 1832 */ 1833 int __block_write_full_folio(struct inode *inode, struct folio *folio, 1834 get_block_t *get_block, struct writeback_control *wbc) 1835 { 1836 int err; 1837 sector_t block; 1838 sector_t last_block; 1839 struct buffer_head *bh, *head; 1840 size_t blocksize; 1841 int nr_underway = 0; 1842 blk_opf_t write_flags = wbc_to_write_flags(wbc); 1843 1844 head = folio_create_buffers(folio, inode, 1845 (1 << BH_Dirty) | (1 << BH_Uptodate)); 1846 1847 /* 1848 * Be very careful. We have no exclusion from block_dirty_folio 1849 * here, and the (potentially unmapped) buffers may become dirty at 1850 * any time. If a buffer becomes dirty here after we've inspected it 1851 * then we just miss that fact, and the folio stays dirty. 1852 * 1853 * Buffers outside i_size may be dirtied by block_dirty_folio; 1854 * handle that here by just cleaning them. 1855 */ 1856 1857 bh = head; 1858 blocksize = bh->b_size; 1859 1860 block = div_u64(folio_pos(folio), blocksize); 1861 last_block = div_u64(i_size_read(inode) - 1, blocksize); 1862 1863 /* 1864 * Get all the dirty buffers mapped to disk addresses and 1865 * handle any aliases from the underlying blockdev's mapping. 
1866 */ 1867 do { 1868 if (block > last_block) { 1869 /* 1870 * mapped buffers outside i_size will occur, because 1871 * this folio can be outside i_size when there is a 1872 * truncate in progress. 1873 */ 1874 /* 1875 * The buffer was zeroed by block_write_full_folio() 1876 */ 1877 clear_buffer_dirty(bh); 1878 set_buffer_uptodate(bh); 1879 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && 1880 buffer_dirty(bh)) { 1881 WARN_ON(bh->b_size != blocksize); 1882 err = get_block(inode, block, bh, 1); 1883 if (err) 1884 goto recover; 1885 clear_buffer_delay(bh); 1886 if (buffer_new(bh)) { 1887 /* blockdev mappings never come here */ 1888 clear_buffer_new(bh); 1889 clean_bdev_bh_alias(bh); 1890 } 1891 } 1892 bh = bh->b_this_page; 1893 block++; 1894 } while (bh != head); 1895 1896 do { 1897 if (!buffer_mapped(bh)) 1898 continue; 1899 /* 1900 * If it's a fully non-blocking write attempt and we cannot 1901 * lock the buffer then redirty the folio. Note that this can 1902 * potentially cause a busy-wait loop from writeback threads 1903 * and kswapd activity, but those code paths have their own 1904 * higher-level throttling. 1905 */ 1906 if (wbc->sync_mode != WB_SYNC_NONE) { 1907 lock_buffer(bh); 1908 } else if (!trylock_buffer(bh)) { 1909 folio_redirty_for_writepage(wbc, folio); 1910 continue; 1911 } 1912 if (test_clear_buffer_dirty(bh)) { 1913 mark_buffer_async_write_endio(bh, 1914 end_buffer_async_write); 1915 } else { 1916 unlock_buffer(bh); 1917 } 1918 } while ((bh = bh->b_this_page) != head); 1919 1920 /* 1921 * The folio and its buffers are protected by the writeback flag, 1922 * so we can drop the bh refcounts early. 1923 */ 1924 BUG_ON(folio_test_writeback(folio)); 1925 folio_start_writeback(folio); 1926 1927 do { 1928 struct buffer_head *next = bh->b_this_page; 1929 if (buffer_async_write(bh)) { 1930 submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, 1931 inode->i_write_hint, wbc); 1932 nr_underway++; 1933 } 1934 bh = next; 1935 } while (bh != head); 1936 folio_unlock(folio); 1937 1938 err = 0; 1939 done: 1940 if (nr_underway == 0) { 1941 /* 1942 * The folio was marked dirty, but the buffers were 1943 * clean. Someone wrote them back by hand with 1944 * write_dirty_buffer/submit_bh. A rare case. 1945 */ 1946 folio_end_writeback(folio); 1947 1948 /* 1949 * The folio and buffer_heads can be released at any time from 1950 * here on. 1951 */ 1952 } 1953 return err; 1954 1955 recover: 1956 /* 1957 * ENOSPC, or some other error. We may already have added some 1958 * blocks to the file, so we need to write these out to avoid 1959 * exposing stale data. 1960 * The folio is currently locked and not marked for writeback 1961 */ 1962 bh = head; 1963 /* Recovery: lock and submit the mapped buffers */ 1964 do { 1965 if (buffer_mapped(bh) && buffer_dirty(bh) && 1966 !buffer_delay(bh)) { 1967 lock_buffer(bh); 1968 mark_buffer_async_write_endio(bh, 1969 end_buffer_async_write); 1970 } else { 1971 /* 1972 * The buffer may have been set dirty during 1973 * attachment to a dirty folio. 
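			 * It cannot be written out here (it is unmapped,
			 * delalloc, or already clean), so just drop the dirty
			 * bit; the failure itself is recorded via
			 * mapping_set_error() below.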
1974 */ 1975 clear_buffer_dirty(bh); 1976 } 1977 } while ((bh = bh->b_this_page) != head); 1978 BUG_ON(folio_test_writeback(folio)); 1979 mapping_set_error(folio->mapping, err); 1980 folio_start_writeback(folio); 1981 do { 1982 struct buffer_head *next = bh->b_this_page; 1983 if (buffer_async_write(bh)) { 1984 clear_buffer_dirty(bh); 1985 submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, 1986 inode->i_write_hint, wbc); 1987 nr_underway++; 1988 } 1989 bh = next; 1990 } while (bh != head); 1991 folio_unlock(folio); 1992 goto done; 1993 } 1994 EXPORT_SYMBOL(__block_write_full_folio); 1995 1996 /* 1997 * If a folio has any new buffers, zero them out here, and mark them uptodate 1998 * and dirty so they'll be written out (in order to prevent uninitialised 1999 * block data from leaking). And clear the new bit. 2000 */ 2001 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) 2002 { 2003 size_t block_start, block_end; 2004 struct buffer_head *head, *bh; 2005 2006 BUG_ON(!folio_test_locked(folio)); 2007 head = folio_buffers(folio); 2008 if (!head) 2009 return; 2010 2011 bh = head; 2012 block_start = 0; 2013 do { 2014 block_end = block_start + bh->b_size; 2015 2016 if (buffer_new(bh)) { 2017 if (block_end > from && block_start < to) { 2018 if (!folio_test_uptodate(folio)) { 2019 size_t start, xend; 2020 2021 start = max(from, block_start); 2022 xend = min(to, block_end); 2023 2024 folio_zero_segment(folio, start, xend); 2025 set_buffer_uptodate(bh); 2026 } 2027 2028 clear_buffer_new(bh); 2029 mark_buffer_dirty(bh); 2030 } 2031 } 2032 2033 block_start = block_end; 2034 bh = bh->b_this_page; 2035 } while (bh != head); 2036 } 2037 EXPORT_SYMBOL(folio_zero_new_buffers); 2038 2039 static int 2040 iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, 2041 const struct iomap *iomap) 2042 { 2043 loff_t offset = (loff_t)block << inode->i_blkbits; 2044 2045 bh->b_bdev = iomap->bdev; 2046 2047 /* 2048 * Block points to offset in file we need to map, iomap contains 2049 * the offset at which the map starts. If the map ends before the 2050 * current block, then do not map the buffer and let the caller 2051 * handle it. 2052 */ 2053 if (offset >= iomap->offset + iomap->length) 2054 return -EIO; 2055 2056 switch (iomap->type) { 2057 case IOMAP_HOLE: 2058 /* 2059 * If the buffer is not up to date or beyond the current EOF, 2060 * we need to mark it as new to ensure sub-block zeroing is 2061 * executed if necessary. 2062 */ 2063 if (!buffer_uptodate(bh) || 2064 (offset >= i_size_read(inode))) 2065 set_buffer_new(bh); 2066 return 0; 2067 case IOMAP_DELALLOC: 2068 if (!buffer_uptodate(bh) || 2069 (offset >= i_size_read(inode))) 2070 set_buffer_new(bh); 2071 set_buffer_uptodate(bh); 2072 set_buffer_mapped(bh); 2073 set_buffer_delay(bh); 2074 return 0; 2075 case IOMAP_UNWRITTEN: 2076 /* 2077 * For unwritten regions, we always need to ensure that regions 2078 * in the block we are not writing to are zeroed. Mark the 2079 * buffer as new to ensure this. 2080 */ 2081 set_buffer_new(bh); 2082 set_buffer_unwritten(bh); 2083 fallthrough; 2084 case IOMAP_MAPPED: 2085 if ((iomap->flags & IOMAP_F_NEW) || 2086 offset >= i_size_read(inode)) { 2087 /* 2088 * This can happen if truncating the block device races 2089 * with the check in the caller as i_size updates on 2090 * block devices aren't synchronized by i_rwsem for 2091 * block devices. 
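		 * In that case the write has raced with the device being
		 * truncated, so fail it with -EIO rather than treating the
		 * block as newly allocated.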
2092 */ 2093 if (S_ISBLK(inode->i_mode)) 2094 return -EIO; 2095 set_buffer_new(bh); 2096 } 2097 bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> 2098 inode->i_blkbits; 2099 set_buffer_mapped(bh); 2100 return 0; 2101 default: 2102 WARN_ON_ONCE(1); 2103 return -EIO; 2104 } 2105 } 2106 2107 int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, 2108 get_block_t *get_block, const struct iomap *iomap) 2109 { 2110 size_t from = offset_in_folio(folio, pos); 2111 size_t to = from + len; 2112 struct inode *inode = folio->mapping->host; 2113 size_t block_start, block_end; 2114 sector_t block; 2115 int err = 0; 2116 size_t blocksize; 2117 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 2118 2119 BUG_ON(!folio_test_locked(folio)); 2120 BUG_ON(to > folio_size(folio)); 2121 BUG_ON(from > to); 2122 2123 head = folio_create_buffers(folio, inode, 0); 2124 blocksize = head->b_size; 2125 block = div_u64(folio_pos(folio), blocksize); 2126 2127 for (bh = head, block_start = 0; bh != head || !block_start; 2128 block++, block_start=block_end, bh = bh->b_this_page) { 2129 block_end = block_start + blocksize; 2130 if (block_end <= from || block_start >= to) { 2131 if (folio_test_uptodate(folio)) { 2132 if (!buffer_uptodate(bh)) 2133 set_buffer_uptodate(bh); 2134 } 2135 continue; 2136 } 2137 if (buffer_new(bh)) 2138 clear_buffer_new(bh); 2139 if (!buffer_mapped(bh)) { 2140 WARN_ON(bh->b_size != blocksize); 2141 if (get_block) 2142 err = get_block(inode, block, bh, 1); 2143 else 2144 err = iomap_to_bh(inode, block, bh, iomap); 2145 if (err) 2146 break; 2147 2148 if (buffer_new(bh)) { 2149 clean_bdev_bh_alias(bh); 2150 if (folio_test_uptodate(folio)) { 2151 clear_buffer_new(bh); 2152 set_buffer_uptodate(bh); 2153 mark_buffer_dirty(bh); 2154 continue; 2155 } 2156 if (block_end > to || block_start < from) 2157 folio_zero_segments(folio, 2158 to, block_end, 2159 block_start, from); 2160 continue; 2161 } 2162 } 2163 if (folio_test_uptodate(folio)) { 2164 if (!buffer_uptodate(bh)) 2165 set_buffer_uptodate(bh); 2166 continue; 2167 } 2168 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 2169 !buffer_unwritten(bh) && 2170 (block_start < from || block_end > to)) { 2171 bh_read_nowait(bh, 0); 2172 *wait_bh++=bh; 2173 } 2174 } 2175 /* 2176 * If we issued read requests - let them complete. 
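	 * At most two buffers can be waiting here: the ones straddling the
	 * start and the end of the written range, which had to be read in so
	 * the untouched parts of those blocks remain valid.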
2177 */ 2178 while(wait_bh > wait) { 2179 wait_on_buffer(*--wait_bh); 2180 if (!buffer_uptodate(*wait_bh)) 2181 err = -EIO; 2182 } 2183 if (unlikely(err)) 2184 folio_zero_new_buffers(folio, from, to); 2185 return err; 2186 } 2187 2188 int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, 2189 get_block_t *get_block) 2190 { 2191 return __block_write_begin_int(folio, pos, len, get_block, NULL); 2192 } 2193 EXPORT_SYMBOL(__block_write_begin); 2194 2195 void block_commit_write(struct folio *folio, size_t from, size_t to) 2196 { 2197 size_t block_start, block_end; 2198 bool partial = false; 2199 unsigned blocksize; 2200 struct buffer_head *bh, *head; 2201 2202 bh = head = folio_buffers(folio); 2203 if (!bh) 2204 return; 2205 blocksize = bh->b_size; 2206 2207 block_start = 0; 2208 do { 2209 block_end = block_start + blocksize; 2210 if (block_end <= from || block_start >= to) { 2211 if (!buffer_uptodate(bh)) 2212 partial = true; 2213 } else { 2214 set_buffer_uptodate(bh); 2215 mark_buffer_dirty(bh); 2216 } 2217 if (buffer_new(bh)) 2218 clear_buffer_new(bh); 2219 2220 block_start = block_end; 2221 bh = bh->b_this_page; 2222 } while (bh != head); 2223 2224 /* 2225 * If this is a partial write which happened to make all buffers 2226 * uptodate then we can optimize away a bogus read_folio() for 2227 * the next read(). Here we 'discover' whether the folio went 2228 * uptodate as a result of this (potentially partial) write. 2229 */ 2230 if (!partial) 2231 folio_mark_uptodate(folio); 2232 } 2233 EXPORT_SYMBOL(block_commit_write); 2234 2235 /* 2236 * block_write_begin takes care of the basic task of block allocation and 2237 * bringing partial write blocks uptodate first. 2238 * 2239 * The filesystem needs to handle block truncation upon failure. 2240 */ 2241 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, 2242 struct folio **foliop, get_block_t *get_block) 2243 { 2244 pgoff_t index = pos >> PAGE_SHIFT; 2245 struct folio *folio; 2246 int status; 2247 2248 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, 2249 mapping_gfp_mask(mapping)); 2250 if (IS_ERR(folio)) 2251 return PTR_ERR(folio); 2252 2253 status = __block_write_begin_int(folio, pos, len, get_block, NULL); 2254 if (unlikely(status)) { 2255 folio_unlock(folio); 2256 folio_put(folio); 2257 folio = NULL; 2258 } 2259 2260 *foliop = folio; 2261 return status; 2262 } 2263 EXPORT_SYMBOL(block_write_begin); 2264 2265 int block_write_end(loff_t pos, unsigned len, unsigned copied, 2266 struct folio *folio) 2267 { 2268 size_t start = pos - folio_pos(folio); 2269 2270 if (unlikely(copied < len)) { 2271 /* 2272 * The buffers that were written will now be uptodate, so 2273 * we don't have to worry about a read_folio reading them 2274 * and overwriting a partial write. However if we have 2275 * encountered a short write and only partially written 2276 * into a buffer, it will not be marked uptodate, so a 2277 * read_folio might come in and destroy our partial write. 2278 * 2279 * Do the simplest thing, and just treat any short write to a 2280 * non uptodate folio as a zero-length write, and force the 2281 * caller to redo the whole thing. 
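		 * (The caller's write loop, generic_perform_write() for
		 * instance, sees the zero return, faults the source pages
		 * back in and retries the copy, so nothing is lost except
		 * progress.)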
2282 */ 2283 if (!folio_test_uptodate(folio)) 2284 copied = 0; 2285 2286 folio_zero_new_buffers(folio, start+copied, start+len); 2287 } 2288 flush_dcache_folio(folio); 2289 2290 /* This could be a short (even 0-length) commit */ 2291 block_commit_write(folio, start, start + copied); 2292 2293 return copied; 2294 } 2295 EXPORT_SYMBOL(block_write_end); 2296 2297 int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, 2298 loff_t pos, unsigned len, unsigned copied, 2299 struct folio *folio, void *fsdata) 2300 { 2301 struct inode *inode = mapping->host; 2302 loff_t old_size = inode->i_size; 2303 bool i_size_changed = false; 2304 2305 copied = block_write_end(pos, len, copied, folio); 2306 2307 /* 2308 * No need to use i_size_read() here, the i_size cannot change under us 2309 * because we hold i_rwsem. 2310 * 2311 * But it's important to update i_size while still holding folio lock: 2312 * page writeout could otherwise come in and zero beyond i_size. 2313 */ 2314 if (pos + copied > inode->i_size) { 2315 i_size_write(inode, pos + copied); 2316 i_size_changed = true; 2317 } 2318 2319 folio_unlock(folio); 2320 folio_put(folio); 2321 2322 if (old_size < pos) 2323 pagecache_isize_extended(inode, old_size, pos); 2324 /* 2325 * Don't mark the inode dirty under page lock. First, it unnecessarily 2326 * makes the holding time of page lock longer. Second, it forces lock 2327 * ordering of page lock and transaction start for journaling 2328 * filesystems. 2329 */ 2330 if (i_size_changed) 2331 mark_inode_dirty(inode); 2332 return copied; 2333 } 2334 EXPORT_SYMBOL(generic_write_end); 2335 2336 /* 2337 * block_is_partially_uptodate checks whether buffers within a folio are 2338 * uptodate or not. 2339 * 2340 * Returns true if all buffers which correspond to the specified part 2341 * of the folio are uptodate. 2342 */ 2343 bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) 2344 { 2345 unsigned block_start, block_end, blocksize; 2346 unsigned to; 2347 struct buffer_head *bh, *head; 2348 bool ret = true; 2349 2350 head = folio_buffers(folio); 2351 if (!head) 2352 return false; 2353 blocksize = head->b_size; 2354 to = min(folio_size(folio) - from, count); 2355 to = from + to; 2356 if (from < blocksize && to > folio_size(folio) - blocksize) 2357 return false; 2358 2359 bh = head; 2360 block_start = 0; 2361 do { 2362 block_end = block_start + blocksize; 2363 if (block_end > from && block_start < to) { 2364 if (!buffer_uptodate(bh)) { 2365 ret = false; 2366 break; 2367 } 2368 if (block_end >= to) 2369 break; 2370 } 2371 block_start = block_end; 2372 bh = bh->b_this_page; 2373 } while (bh != head); 2374 2375 return ret; 2376 } 2377 EXPORT_SYMBOL(block_is_partially_uptodate); 2378 2379 /* 2380 * Generic "read_folio" function for block devices that have the normal 2381 * get_block functionality. This is most of the block device filesystems. 2382 * Reads the folio asynchronously --- the unlock_buffer() and 2383 * set/clear_buffer_uptodate() functions propagate buffer state into the 2384 * folio once IO has completed. 2385 */ 2386 int block_read_full_folio(struct folio *folio, get_block_t *get_block) 2387 { 2388 struct inode *inode = folio->mapping->host; 2389 sector_t iblock, lblock; 2390 struct buffer_head *bh, *head, *prev = NULL; 2391 size_t blocksize; 2392 int fully_mapped = 1; 2393 bool page_error = false; 2394 loff_t limit = i_size_read(inode); 2395 2396 /* This is needed for ext4. 
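	 * fs-verity files keep their Merkle tree blocks in the page cache
	 * past i_size (ext4 stores them there), so reads beyond EOF must
	 * still be mapped and submitted rather than short-circuited to
	 * zeroes.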
*/ 2397 if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) 2398 limit = inode->i_sb->s_maxbytes; 2399 2400 head = folio_create_buffers(folio, inode, 0); 2401 blocksize = head->b_size; 2402 2403 iblock = div_u64(folio_pos(folio), blocksize); 2404 lblock = div_u64(limit + blocksize - 1, blocksize); 2405 bh = head; 2406 2407 do { 2408 if (buffer_uptodate(bh)) 2409 continue; 2410 2411 if (!buffer_mapped(bh)) { 2412 int err = 0; 2413 2414 fully_mapped = 0; 2415 if (iblock < lblock) { 2416 WARN_ON(bh->b_size != blocksize); 2417 err = get_block(inode, iblock, bh, 0); 2418 if (err) 2419 page_error = true; 2420 } 2421 if (!buffer_mapped(bh)) { 2422 folio_zero_range(folio, bh_offset(bh), 2423 blocksize); 2424 if (!err) 2425 set_buffer_uptodate(bh); 2426 continue; 2427 } 2428 /* 2429 * get_block() might have updated the buffer 2430 * synchronously 2431 */ 2432 if (buffer_uptodate(bh)) 2433 continue; 2434 } 2435 2436 lock_buffer(bh); 2437 if (buffer_uptodate(bh)) { 2438 unlock_buffer(bh); 2439 continue; 2440 } 2441 2442 mark_buffer_async_read(bh); 2443 if (prev) 2444 submit_bh(REQ_OP_READ, prev); 2445 prev = bh; 2446 } while (iblock++, (bh = bh->b_this_page) != head); 2447 2448 if (fully_mapped) 2449 folio_set_mappedtodisk(folio); 2450 2451 /* 2452 * All buffers are uptodate or get_block() returned an error 2453 * when trying to map them - we must finish the read because 2454 * end_buffer_async_read() will never be called on any buffer 2455 * in this folio. 2456 */ 2457 if (prev) 2458 submit_bh(REQ_OP_READ, prev); 2459 else 2460 folio_end_read(folio, !page_error); 2461 2462 return 0; 2463 } 2464 EXPORT_SYMBOL(block_read_full_folio); 2465 2466 /* utility function for filesystems that need to do work on expanding 2467 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2468 * deal with the hole. 
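 *
 * A minimal, illustrative caller (identifiers are hypothetical): a
 * filesystem's ->setattr() that has to grow the file might do
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		error = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (error)
 *			return error;
 *	}
 *
 * i.e. a zero-length pagecache write at the new size, which lets the
 * filesystem's write_begin/write_end path allocate and zero the tail and
 * extend i_size.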
2469 */ 2470 int generic_cont_expand_simple(struct inode *inode, loff_t size) 2471 { 2472 struct address_space *mapping = inode->i_mapping; 2473 const struct address_space_operations *aops = mapping->a_ops; 2474 struct folio *folio; 2475 void *fsdata = NULL; 2476 int err; 2477 2478 err = inode_newsize_ok(inode, size); 2479 if (err) 2480 goto out; 2481 2482 err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); 2483 if (err) 2484 goto out; 2485 2486 err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); 2487 BUG_ON(err > 0); 2488 2489 out: 2490 return err; 2491 } 2492 EXPORT_SYMBOL(generic_cont_expand_simple); 2493 2494 static int cont_expand_zero(const struct kiocb *iocb, 2495 struct address_space *mapping, 2496 loff_t pos, loff_t *bytes) 2497 { 2498 struct inode *inode = mapping->host; 2499 const struct address_space_operations *aops = mapping->a_ops; 2500 unsigned int blocksize = i_blocksize(inode); 2501 struct folio *folio; 2502 void *fsdata = NULL; 2503 pgoff_t index, curidx; 2504 loff_t curpos; 2505 unsigned zerofrom, offset, len; 2506 int err = 0; 2507 2508 index = pos >> PAGE_SHIFT; 2509 offset = pos & ~PAGE_MASK; 2510 2511 while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { 2512 zerofrom = curpos & ~PAGE_MASK; 2513 if (zerofrom & (blocksize-1)) { 2514 *bytes |= (blocksize-1); 2515 (*bytes)++; 2516 } 2517 len = PAGE_SIZE - zerofrom; 2518 2519 err = aops->write_begin(iocb, mapping, curpos, len, 2520 &folio, &fsdata); 2521 if (err) 2522 goto out; 2523 folio_zero_range(folio, offset_in_folio(folio, curpos), len); 2524 err = aops->write_end(iocb, mapping, curpos, len, len, 2525 folio, fsdata); 2526 if (err < 0) 2527 goto out; 2528 BUG_ON(err != len); 2529 err = 0; 2530 2531 balance_dirty_pages_ratelimited(mapping); 2532 2533 if (fatal_signal_pending(current)) { 2534 err = -EINTR; 2535 goto out; 2536 } 2537 } 2538 2539 /* page covers the boundary, find the boundary offset */ 2540 if (index == curidx) { 2541 zerofrom = curpos & ~PAGE_MASK; 2542 /* if we will expand the thing last block will be filled */ 2543 if (offset <= zerofrom) { 2544 goto out; 2545 } 2546 if (zerofrom & (blocksize-1)) { 2547 *bytes |= (blocksize-1); 2548 (*bytes)++; 2549 } 2550 len = offset - zerofrom; 2551 2552 err = aops->write_begin(iocb, mapping, curpos, len, 2553 &folio, &fsdata); 2554 if (err) 2555 goto out; 2556 folio_zero_range(folio, offset_in_folio(folio, curpos), len); 2557 err = aops->write_end(iocb, mapping, curpos, len, len, 2558 folio, fsdata); 2559 if (err < 0) 2560 goto out; 2561 BUG_ON(err != len); 2562 err = 0; 2563 } 2564 out: 2565 return err; 2566 } 2567 2568 /* 2569 * For moronic filesystems that do not allow holes in file. 2570 * We may have to extend the file. 
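 *
 * *bytes points at the filesystem's record of how far the file is
 * initialised on disk (fat's mmu_private, for example); everything between
 * that point and pos is zeroed through the pagecache before the real write
 * begins.  A hedged sketch of how such a filesystem typically wires this up
 * (identifiers are illustrative):
 *
 *	static int myfs_write_begin(const struct kiocb *iocb,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, struct folio **foliop, void **fsdata)
 *	{
 *		return cont_write_begin(iocb, mapping, pos, len, foliop,
 *					fsdata, myfs_get_block,
 *					&MYFS_I(mapping->host)->mmu_private);
 *	}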
2571 */ 2572 int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, 2573 loff_t pos, unsigned len, struct folio **foliop, 2574 void **fsdata, get_block_t *get_block, loff_t *bytes) 2575 { 2576 struct inode *inode = mapping->host; 2577 unsigned int blocksize = i_blocksize(inode); 2578 unsigned int zerofrom; 2579 int err; 2580 2581 err = cont_expand_zero(iocb, mapping, pos, bytes); 2582 if (err) 2583 return err; 2584 2585 zerofrom = *bytes & ~PAGE_MASK; 2586 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2587 *bytes |= (blocksize-1); 2588 (*bytes)++; 2589 } 2590 2591 return block_write_begin(mapping, pos, len, foliop, get_block); 2592 } 2593 EXPORT_SYMBOL(cont_write_begin); 2594 2595 /* 2596 * block_page_mkwrite() is not allowed to change the file size as it gets 2597 * called from a page fault handler when a page is first dirtied. Hence we must 2598 * be careful to check for EOF conditions here. We set the page up correctly 2599 * for a written page which means we get ENOSPC checking when writing into 2600 * holes and correct delalloc and unwritten extent mapping on filesystems that 2601 * support these features. 2602 * 2603 * We are not allowed to take the i_rwsem here so we have to play games to 2604 * protect against truncate races as the page could now be beyond EOF. Because 2605 * truncate writes the inode size before removing pages, once we have the 2606 * page lock we can determine safely if the page is beyond EOF. If it is not 2607 * beyond EOF, then the page is guaranteed safe against truncation until we 2608 * unlock the page. 2609 * 2610 * Direct callers of this function should protect against filesystem freezing 2611 * using sb_start_pagefault() - sb_end_pagefault() functions. 2612 */ 2613 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 2614 get_block_t get_block) 2615 { 2616 struct folio *folio = page_folio(vmf->page); 2617 struct inode *inode = file_inode(vma->vm_file); 2618 unsigned long end; 2619 loff_t size; 2620 int ret; 2621 2622 folio_lock(folio); 2623 size = i_size_read(inode); 2624 if ((folio->mapping != inode->i_mapping) || 2625 (folio_pos(folio) >= size)) { 2626 /* We overload EFAULT to mean page got truncated */ 2627 ret = -EFAULT; 2628 goto out_unlock; 2629 } 2630 2631 end = folio_size(folio); 2632 /* folio is wholly or partially inside EOF */ 2633 if (folio_pos(folio) + end > size) 2634 end = size - folio_pos(folio); 2635 2636 ret = __block_write_begin_int(folio, 0, end, get_block, NULL); 2637 if (unlikely(ret)) 2638 goto out_unlock; 2639 2640 block_commit_write(folio, 0, end); 2641 2642 folio_mark_dirty(folio); 2643 folio_wait_stable(folio); 2644 return 0; 2645 out_unlock: 2646 folio_unlock(folio); 2647 return ret; 2648 } 2649 EXPORT_SYMBOL(block_page_mkwrite); 2650 2651 int block_truncate_page(struct address_space *mapping, 2652 loff_t from, get_block_t *get_block) 2653 { 2654 pgoff_t index = from >> PAGE_SHIFT; 2655 unsigned blocksize; 2656 sector_t iblock; 2657 size_t offset, length, pos; 2658 struct inode *inode = mapping->host; 2659 struct folio *folio; 2660 struct buffer_head *bh; 2661 int err = 0; 2662 2663 blocksize = i_blocksize(inode); 2664 length = from & (blocksize - 1); 2665 2666 /* Block boundary? 
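	 * (i.e. the offset we are truncating from is already block-aligned,
	 * so there is no partial block whose tail needs zeroing.)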
Nothing to do */ 2667 if (!length) 2668 return 0; 2669 2670 length = blocksize - length; 2671 iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; 2672 2673 folio = filemap_grab_folio(mapping, index); 2674 if (IS_ERR(folio)) 2675 return PTR_ERR(folio); 2676 2677 bh = folio_buffers(folio); 2678 if (!bh) 2679 bh = create_empty_buffers(folio, blocksize, 0); 2680 2681 /* Find the buffer that contains "offset" */ 2682 offset = offset_in_folio(folio, from); 2683 pos = blocksize; 2684 while (offset >= pos) { 2685 bh = bh->b_this_page; 2686 iblock++; 2687 pos += blocksize; 2688 } 2689 2690 if (!buffer_mapped(bh)) { 2691 WARN_ON(bh->b_size != blocksize); 2692 err = get_block(inode, iblock, bh, 0); 2693 if (err) 2694 goto unlock; 2695 /* unmapped? It's a hole - nothing to do */ 2696 if (!buffer_mapped(bh)) 2697 goto unlock; 2698 } 2699 2700 /* Ok, it's mapped. Make sure it's up-to-date */ 2701 if (folio_test_uptodate(folio)) 2702 set_buffer_uptodate(bh); 2703 2704 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { 2705 err = bh_read(bh, 0); 2706 /* Uhhuh. Read error. Complain and punt. */ 2707 if (err < 0) 2708 goto unlock; 2709 } 2710 2711 folio_zero_range(folio, offset, length); 2712 mark_buffer_dirty(bh); 2713 2714 unlock: 2715 folio_unlock(folio); 2716 folio_put(folio); 2717 2718 return err; 2719 } 2720 EXPORT_SYMBOL(block_truncate_page); 2721 2722 /* 2723 * The generic write folio function for buffer-backed address_spaces 2724 */ 2725 int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, 2726 void *get_block) 2727 { 2728 struct inode * const inode = folio->mapping->host; 2729 loff_t i_size = i_size_read(inode); 2730 2731 /* Is the folio fully inside i_size? */ 2732 if (folio_next_pos(folio) <= i_size) 2733 return __block_write_full_folio(inode, folio, get_block, wbc); 2734 2735 /* Is the folio fully outside i_size? (truncate in progress) */ 2736 if (folio_pos(folio) >= i_size) { 2737 folio_unlock(folio); 2738 return 0; /* don't care */ 2739 } 2740 2741 /* 2742 * The folio straddles i_size. It must be zeroed out on each and every 2743 * writeback invocation because it may be mmapped. "A file is mapped 2744 * in multiples of the page size. For a file that is not a multiple of 2745 * the page size, the remaining memory is zeroed when mapped, and 2746 * writes to that region are not written out to the file." 
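 * (That is the mmap(2) contract.  A writable mapping may have dirtied the
 * in-memory area beyond EOF, and zeroing it on every writeback keeps
 * whatever was written there from ever reaching disk.)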
2747 */ 2748 folio_zero_segment(folio, offset_in_folio(folio, i_size), 2749 folio_size(folio)); 2750 return __block_write_full_folio(inode, folio, get_block, wbc); 2751 } 2752 2753 sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2754 get_block_t *get_block) 2755 { 2756 struct inode *inode = mapping->host; 2757 struct buffer_head tmp = { 2758 .b_size = i_blocksize(inode), 2759 }; 2760 2761 get_block(inode, block, &tmp, 0); 2762 return tmp.b_blocknr; 2763 } 2764 EXPORT_SYMBOL(generic_block_bmap); 2765 2766 static void end_bio_bh_io_sync(struct bio *bio) 2767 { 2768 struct buffer_head *bh = bio->bi_private; 2769 2770 if (unlikely(bio_flagged(bio, BIO_QUIET))) 2771 set_bit(BH_Quiet, &bh->b_state); 2772 2773 bh->b_end_io(bh, !bio->bi_status); 2774 bio_put(bio); 2775 } 2776 2777 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, 2778 enum rw_hint write_hint, 2779 struct writeback_control *wbc) 2780 { 2781 const enum req_op op = opf & REQ_OP_MASK; 2782 struct bio *bio; 2783 2784 BUG_ON(!buffer_locked(bh)); 2785 BUG_ON(!buffer_mapped(bh)); 2786 BUG_ON(!bh->b_end_io); 2787 BUG_ON(buffer_delay(bh)); 2788 BUG_ON(buffer_unwritten(bh)); 2789 2790 /* 2791 * Only clear out a write error when rewriting 2792 */ 2793 if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) 2794 clear_buffer_write_io_error(bh); 2795 2796 if (buffer_meta(bh)) 2797 opf |= REQ_META; 2798 if (buffer_prio(bh)) 2799 opf |= REQ_PRIO; 2800 2801 bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); 2802 2803 fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); 2804 2805 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 2806 bio->bi_write_hint = write_hint; 2807 2808 bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); 2809 2810 bio->bi_end_io = end_bio_bh_io_sync; 2811 bio->bi_private = bh; 2812 2813 /* Take care of bh's that straddle the end of the device */ 2814 guard_bio_eod(bio); 2815 2816 if (wbc) { 2817 wbc_init_bio(wbc, bio); 2818 wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); 2819 } 2820 2821 blk_crypto_submit_bio(bio); 2822 } 2823 2824 void submit_bh(blk_opf_t opf, struct buffer_head *bh) 2825 { 2826 submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); 2827 } 2828 EXPORT_SYMBOL(submit_bh); 2829 2830 void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) 2831 { 2832 lock_buffer(bh); 2833 if (!test_clear_buffer_dirty(bh)) { 2834 unlock_buffer(bh); 2835 return; 2836 } 2837 bh->b_end_io = end_buffer_write_sync; 2838 get_bh(bh); 2839 submit_bh(REQ_OP_WRITE | op_flags, bh); 2840 } 2841 EXPORT_SYMBOL(write_dirty_buffer); 2842 2843 /* 2844 * For a data-integrity writeout, we need to wait upon any in-progress I/O 2845 * and then start new I/O and then wait upon it. The caller must have a ref on 2846 * the buffer_head. 2847 */ 2848 int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) 2849 { 2850 WARN_ON(atomic_read(&bh->b_count) < 1); 2851 lock_buffer(bh); 2852 if (test_clear_buffer_dirty(bh)) { 2853 /* 2854 * The bh should be mapped, but it might not be if the 2855 * device was hot-removed. Not much we can do but fail the I/O. 
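		 * Returning -EIO here also keeps us clear of the
		 * BUG_ON(!buffer_mapped(bh)) check in submit_bh_wbc().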
2856 */ 2857 if (!buffer_mapped(bh)) { 2858 unlock_buffer(bh); 2859 return -EIO; 2860 } 2861 2862 get_bh(bh); 2863 bh->b_end_io = end_buffer_write_sync; 2864 submit_bh(REQ_OP_WRITE | op_flags, bh); 2865 wait_on_buffer(bh); 2866 if (!buffer_uptodate(bh)) 2867 return -EIO; 2868 } else { 2869 unlock_buffer(bh); 2870 } 2871 return 0; 2872 } 2873 EXPORT_SYMBOL(__sync_dirty_buffer); 2874 2875 int sync_dirty_buffer(struct buffer_head *bh) 2876 { 2877 return __sync_dirty_buffer(bh, REQ_SYNC); 2878 } 2879 EXPORT_SYMBOL(sync_dirty_buffer); 2880 2881 static inline int buffer_busy(struct buffer_head *bh) 2882 { 2883 return atomic_read(&bh->b_count) | 2884 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 2885 } 2886 2887 static bool 2888 drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) 2889 { 2890 struct buffer_head *head = folio_buffers(folio); 2891 struct buffer_head *bh; 2892 2893 bh = head; 2894 do { 2895 if (buffer_busy(bh)) 2896 goto failed; 2897 bh = bh->b_this_page; 2898 } while (bh != head); 2899 2900 do { 2901 struct buffer_head *next = bh->b_this_page; 2902 2903 if (bh->b_assoc_map) 2904 __remove_assoc_queue(bh); 2905 bh = next; 2906 } while (bh != head); 2907 *buffers_to_free = head; 2908 folio_detach_private(folio); 2909 return true; 2910 failed: 2911 return false; 2912 } 2913 2914 /** 2915 * try_to_free_buffers - Release buffers attached to this folio. 2916 * @folio: The folio. 2917 * 2918 * If any buffers are in use (dirty, under writeback, elevated refcount), 2919 * no buffers will be freed. 2920 * 2921 * If the folio is dirty but all the buffers are clean then we need to 2922 * be sure to mark the folio clean as well. This is because the folio 2923 * may be against a block device, and a later reattachment of buffers 2924 * to a dirty folio will set *all* buffers dirty. Which would corrupt 2925 * filesystem data on the same device. 2926 * 2927 * The same applies to regular filesystem folios: if all the buffers are 2928 * clean then we set the folio clean and proceed. To do that, we require 2929 * total exclusion from block_dirty_folio(). That is obtained with 2930 * i_private_lock. 2931 * 2932 * Exclusion against try_to_free_buffers may be obtained by either 2933 * locking the folio or by holding its mapping's i_private_lock. 2934 * 2935 * Context: Process context. @folio must be locked. Will not sleep. 2936 * Return: true if all buffers attached to this folio were freed. 2937 */ 2938 bool try_to_free_buffers(struct folio *folio) 2939 { 2940 struct address_space * const mapping = folio->mapping; 2941 struct buffer_head *buffers_to_free = NULL; 2942 bool ret = 0; 2943 2944 BUG_ON(!folio_test_locked(folio)); 2945 if (folio_test_writeback(folio)) 2946 return false; 2947 2948 /* Misconfigured folio check */ 2949 if (WARN_ON_ONCE(!folio_buffers(folio))) 2950 return true; 2951 2952 if (mapping == NULL) { /* can this still happen? */ 2953 ret = drop_buffers(folio, &buffers_to_free); 2954 goto out; 2955 } 2956 2957 spin_lock(&mapping->i_private_lock); 2958 ret = drop_buffers(folio, &buffers_to_free); 2959 2960 /* 2961 * If the filesystem writes its buffers by hand (eg ext3) 2962 * then we can have clean buffers against a dirty folio. We 2963 * clean the folio here; otherwise the VM will never notice 2964 * that the filesystem did any IO at all. 2965 * 2966 * Also, during truncate, discard_buffer will have marked all 2967 * the folio's buffers clean. We discover that here and clean 2968 * the folio also. 
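 * (folio_cancel_dirty() below is what does that: it clears the folio's
 * dirty flag and undoes the dirty accounting that block_dirty_folio()
 * charged.)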
2969 * 2970 * i_private_lock must be held over this entire operation in order 2971 * to synchronise against block_dirty_folio and prevent the 2972 * dirty bit from being lost. 2973 */ 2974 if (ret) 2975 folio_cancel_dirty(folio); 2976 spin_unlock(&mapping->i_private_lock); 2977 out: 2978 if (buffers_to_free) { 2979 struct buffer_head *bh = buffers_to_free; 2980 2981 do { 2982 struct buffer_head *next = bh->b_this_page; 2983 free_buffer_head(bh); 2984 bh = next; 2985 } while (bh != buffers_to_free); 2986 } 2987 return ret; 2988 } 2989 EXPORT_SYMBOL(try_to_free_buffers); 2990 2991 /* 2992 * Buffer-head allocation 2993 */ 2994 static struct kmem_cache *bh_cachep __ro_after_init; 2995 2996 /* 2997 * Once the number of bh's in the machine exceeds this level, we start 2998 * stripping them in writeback. 2999 */ 3000 static unsigned long max_buffer_heads __ro_after_init; 3001 3002 int buffer_heads_over_limit; 3003 3004 struct bh_accounting { 3005 int nr; /* Number of live bh's */ 3006 int ratelimit; /* Limit cacheline bouncing */ 3007 }; 3008 3009 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 3010 3011 static void recalc_bh_state(void) 3012 { 3013 int i; 3014 int tot = 0; 3015 3016 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) 3017 return; 3018 __this_cpu_write(bh_accounting.ratelimit, 0); 3019 for_each_online_cpu(i) 3020 tot += per_cpu(bh_accounting, i).nr; 3021 buffer_heads_over_limit = (tot > max_buffer_heads); 3022 } 3023 3024 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3025 { 3026 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3027 if (ret) { 3028 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3029 spin_lock_init(&ret->b_uptodate_lock); 3030 preempt_disable(); 3031 __this_cpu_inc(bh_accounting.nr); 3032 recalc_bh_state(); 3033 preempt_enable(); 3034 } 3035 return ret; 3036 } 3037 EXPORT_SYMBOL(alloc_buffer_head); 3038 3039 void free_buffer_head(struct buffer_head *bh) 3040 { 3041 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3042 kmem_cache_free(bh_cachep, bh); 3043 preempt_disable(); 3044 __this_cpu_dec(bh_accounting.nr); 3045 recalc_bh_state(); 3046 preempt_enable(); 3047 } 3048 EXPORT_SYMBOL(free_buffer_head); 3049 3050 static int buffer_exit_cpu_dead(unsigned int cpu) 3051 { 3052 int i; 3053 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 3054 3055 for (i = 0; i < BH_LRU_SIZE; i++) { 3056 brelse(b->bhs[i]); 3057 b->bhs[i] = NULL; 3058 } 3059 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); 3060 per_cpu(bh_accounting, cpu).nr = 0; 3061 return 0; 3062 } 3063 3064 /** 3065 * bh_uptodate_or_lock - Test whether the buffer is uptodate 3066 * @bh: struct buffer_head 3067 * 3068 * Return true if the buffer is up-to-date and false, 3069 * with the buffer locked, if not. 3070 */ 3071 int bh_uptodate_or_lock(struct buffer_head *bh) 3072 { 3073 if (!buffer_uptodate(bh)) { 3074 lock_buffer(bh); 3075 if (!buffer_uptodate(bh)) 3076 return 0; 3077 unlock_buffer(bh); 3078 } 3079 return 1; 3080 } 3081 EXPORT_SYMBOL(bh_uptodate_or_lock); 3082 3083 /** 3084 * __bh_read - Submit read for a locked buffer 3085 * @bh: struct buffer_head 3086 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ 3087 * @wait: wait until reading finish 3088 * 3089 * Returns zero on success or don't wait, and -EIO on error. 
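 *
 * An illustrative use is the bh_read() pattern from buffer_head.h: take the
 * buffer lock only if the data is not already valid, then read and wait,
 * roughly:
 *
 *	if (!bh_uptodate_or_lock(bh))
 *		err = __bh_read(bh, 0, true);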
3090  */
3091 int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3092 {
3093 	int ret = 0;
3094 
3095 	BUG_ON(!buffer_locked(bh));
3096 
3097 	get_bh(bh);
3098 	bh->b_end_io = end_buffer_read_sync;
3099 	submit_bh(REQ_OP_READ | op_flags, bh);
3100 	if (wait) {
3101 		wait_on_buffer(bh);
3102 		if (!buffer_uptodate(bh))
3103 			ret = -EIO;
3104 	}
3105 	return ret;
3106 }
3107 EXPORT_SYMBOL(__bh_read);
3108 
3109 /**
3110  * __bh_read_batch - Submit read for a batch of unlocked buffers
3111  * @nr: number of buffers in the batch
3112  * @bhs: a batch of struct buffer_head
3113  * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3114  * @force_lock: if set, wait for each buffer's lock; otherwise skip any
3115  *		buffer that cannot be locked immediately.
3116  *
3117  * The reads are submitted asynchronously; this function does not wait and
3118  * has no return value.
 */
3119 void __bh_read_batch(int nr, struct buffer_head *bhs[],
3120 		blk_opf_t op_flags, bool force_lock)
3121 {
3122 	int i;
3123 
3124 	for (i = 0; i < nr; i++) {
3125 		struct buffer_head *bh = bhs[i];
3126 
3127 		if (buffer_uptodate(bh))
3128 			continue;
3129 
3130 		if (force_lock)
3131 			lock_buffer(bh);
3132 		else
3133 			if (!trylock_buffer(bh))
3134 				continue;
3135 
3136 		if (buffer_uptodate(bh)) {
3137 			unlock_buffer(bh);
3138 			continue;
3139 		}
3140 
3141 		bh->b_end_io = end_buffer_read_sync;
3142 		get_bh(bh);
3143 		submit_bh(REQ_OP_READ | op_flags, bh);
3144 	}
3145 }
3146 EXPORT_SYMBOL(__bh_read_batch);
3147 
3148 void __init buffer_init(void)
3149 {
3150 	unsigned long nrpages;
3151 	int ret;
3152 
3153 	bh_cachep = KMEM_CACHE(buffer_head,
3154 				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
3155 	/*
3156 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3157 	 */
3158 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3159 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3160 	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3161 					NULL, buffer_exit_cpu_dead);
3162 	WARN_ON(ret < 0);
3163 }
3164