// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem			(truncate_pagecache)
 *    ->private_lock			(__free_pte->block_dirty_folio)
 *      ->swap_lock			(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock			(acquired by fs in truncate path)
 *      ->i_mmap_rwsem			(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock		(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock			(filemap_fault)
 *      ->lock_page			(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem				(generic_perform_write)
 *    ->mmap_lock			(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock				(fs/fs-writeback.c)
 *    ->i_pages lock			(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock			(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock			(try_to_unmap_one)
 *    ->private_lock			(try_to_unmap_one)
 *    ->i_pages lock			(try_to_unmap_one)
 *    ->lruvec->lru_lock		(follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock			(page_remove_rmap->set_page_dirty)
 *    ->i_pages lock			(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock			(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock			(page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock		(page_remove_rmap->lock_page_memcg)
 *    bdi.wb->list_lock			(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock			(zap_pte_range->set_page_dirty)
 *    ->private_lock			(zap_pte_range->block_dirty_folio)
 *
 *  ->i_mmap_rwsem
 *    ->tasklist_lock			(memory_failure, collect_procs_ao)
 */

static void page_cache_delete(struct address_space *mapping,
				struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	/* hugetlb pages are represented by a single entry in the xarray */
	if (!folio_test_hugetlb(folio)) {
		xas_set_order(&xas, folio->index, folio_order(folio));
		nr = folio_nr_pages(folio);
	}

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = page_mapcount(&folio->page);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
172 */ 173 page_mapcount_reset(&folio->page); 174 folio_ref_sub(folio, mapcount); 175 } 176 } 177 } 178 179 /* hugetlb folios do not participate in page cache accounting. */ 180 if (folio_test_hugetlb(folio)) 181 return; 182 183 nr = folio_nr_pages(folio); 184 185 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); 186 if (folio_test_swapbacked(folio)) { 187 __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); 188 if (folio_test_pmd_mappable(folio)) 189 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); 190 } else if (folio_test_pmd_mappable(folio)) { 191 __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); 192 filemap_nr_thps_dec(mapping); 193 } 194 195 /* 196 * At this point folio must be either written or cleaned by 197 * truncate. Dirty folio here signals a bug and loss of 198 * unwritten data - on ordinary filesystems. 199 * 200 * But it's harmless on in-memory filesystems like tmpfs; and can 201 * occur when a driver which did get_user_pages() sets page dirty 202 * before putting it, while the inode is being finally evicted. 203 * 204 * Below fixes dirty accounting after removing the folio entirely 205 * but leaves the dirty flag set: it has no effect for truncated 206 * folio and anyway will be cleared before returning folio to 207 * buddy allocator. 208 */ 209 if (WARN_ON_ONCE(folio_test_dirty(folio) && 210 mapping_can_writeback(mapping))) 211 folio_account_cleaned(folio, inode_to_wb(mapping->host)); 212 } 213 214 /* 215 * Delete a page from the page cache and free it. Caller has to make 216 * sure the page is locked and that nobody else uses it - or that usage 217 * is safe. The caller must hold the i_pages lock. 218 */ 219 void __filemap_remove_folio(struct folio *folio, void *shadow) 220 { 221 struct address_space *mapping = folio->mapping; 222 223 trace_mm_filemap_delete_from_page_cache(folio); 224 filemap_unaccount_folio(mapping, folio); 225 page_cache_delete(mapping, folio, shadow); 226 } 227 228 void filemap_free_folio(struct address_space *mapping, struct folio *folio) 229 { 230 void (*free_folio)(struct folio *); 231 int refs = 1; 232 233 free_folio = mapping->a_ops->free_folio; 234 if (free_folio) 235 free_folio(folio); 236 237 if (folio_test_large(folio) && !folio_test_hugetlb(folio)) 238 refs = folio_nr_pages(folio); 239 folio_put_refs(folio, refs); 240 } 241 242 /** 243 * filemap_remove_folio - Remove folio from page cache. 244 * @folio: The folio. 245 * 246 * This must be called only on folios that are locked and have been 247 * verified to be in the page cache. It will never put the folio into 248 * the free list because the caller has a reference on the page. 249 */ 250 void filemap_remove_folio(struct folio *folio) 251 { 252 struct address_space *mapping = folio->mapping; 253 254 BUG_ON(!folio_test_locked(folio)); 255 spin_lock(&mapping->host->i_lock); 256 xa_lock_irq(&mapping->i_pages); 257 __filemap_remove_folio(folio, NULL); 258 xa_unlock_irq(&mapping->i_pages); 259 if (mapping_shrinkable(mapping)) 260 inode_add_lru(mapping->host); 261 spin_unlock(&mapping->host->i_lock); 262 263 filemap_free_folio(mapping, folio); 264 } 265 266 /* 267 * page_cache_delete_batch - delete several folios from page cache 268 * @mapping: the mapping to which folios belong 269 * @fbatch: batch of folios to delete 270 * 271 * The function walks over mapping->i_pages and removes folios passed in 272 * @fbatch from the mapping. The function expects @fbatch to be sorted 273 * by page index and is optimised for it to be dense. 
274 * It tolerates holes in @fbatch (mapping entries at those indices are not 275 * modified). 276 * 277 * The function expects the i_pages lock to be held. 278 */ 279 static void page_cache_delete_batch(struct address_space *mapping, 280 struct folio_batch *fbatch) 281 { 282 XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); 283 long total_pages = 0; 284 int i = 0; 285 struct folio *folio; 286 287 mapping_set_update(&xas, mapping); 288 xas_for_each(&xas, folio, ULONG_MAX) { 289 if (i >= folio_batch_count(fbatch)) 290 break; 291 292 /* A swap/dax/shadow entry got inserted? Skip it. */ 293 if (xa_is_value(folio)) 294 continue; 295 /* 296 * A page got inserted in our range? Skip it. We have our 297 * pages locked so they are protected from being removed. 298 * If we see a page whose index is higher than ours, it 299 * means our page has been removed, which shouldn't be 300 * possible because we're holding the PageLock. 301 */ 302 if (folio != fbatch->folios[i]) { 303 VM_BUG_ON_FOLIO(folio->index > 304 fbatch->folios[i]->index, folio); 305 continue; 306 } 307 308 WARN_ON_ONCE(!folio_test_locked(folio)); 309 310 folio->mapping = NULL; 311 /* Leave folio->index set: truncation lookup relies on it */ 312 313 i++; 314 xas_store(&xas, NULL); 315 total_pages += folio_nr_pages(folio); 316 } 317 mapping->nrpages -= total_pages; 318 } 319 320 void delete_from_page_cache_batch(struct address_space *mapping, 321 struct folio_batch *fbatch) 322 { 323 int i; 324 325 if (!folio_batch_count(fbatch)) 326 return; 327 328 spin_lock(&mapping->host->i_lock); 329 xa_lock_irq(&mapping->i_pages); 330 for (i = 0; i < folio_batch_count(fbatch); i++) { 331 struct folio *folio = fbatch->folios[i]; 332 333 trace_mm_filemap_delete_from_page_cache(folio); 334 filemap_unaccount_folio(mapping, folio); 335 } 336 page_cache_delete_batch(mapping, fbatch); 337 xa_unlock_irq(&mapping->i_pages); 338 if (mapping_shrinkable(mapping)) 339 inode_add_lru(mapping->host); 340 spin_unlock(&mapping->host->i_lock); 341 342 for (i = 0; i < folio_batch_count(fbatch); i++) 343 filemap_free_folio(mapping, fbatch->folios[i]); 344 } 345 346 int filemap_check_errors(struct address_space *mapping) 347 { 348 int ret = 0; 349 /* Check for outstanding write errors */ 350 if (test_bit(AS_ENOSPC, &mapping->flags) && 351 test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 352 ret = -ENOSPC; 353 if (test_bit(AS_EIO, &mapping->flags) && 354 test_and_clear_bit(AS_EIO, &mapping->flags)) 355 ret = -EIO; 356 return ret; 357 } 358 EXPORT_SYMBOL(filemap_check_errors); 359 360 static int filemap_check_and_keep_errors(struct address_space *mapping) 361 { 362 /* Check for outstanding write errors */ 363 if (test_bit(AS_EIO, &mapping->flags)) 364 return -EIO; 365 if (test_bit(AS_ENOSPC, &mapping->flags)) 366 return -ENOSPC; 367 return 0; 368 } 369 370 /** 371 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range 372 * @mapping: address space structure to write 373 * @wbc: the writeback_control controlling the writeout 374 * 375 * Call writepages on the mapping using the provided wbc to control the 376 * writeout. 377 * 378 * Return: %0 on success, negative error code otherwise. 
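 *
 * A minimal usage sketch (illustrative only; the field values here are
 * assumptions, not taken from a particular caller, and mirror what
 * __filemap_fdatawrite_range() below does):
 *
 *	struct writeback_control wbc = {
 *		.sync_mode	= WB_SYNC_ALL,
 *		.nr_to_write	= LONG_MAX,
 *		.range_start	= 0,
 *		.range_end	= LLONG_MAX,
 *	};
 *
 *	err = filemap_fdatawrite_wbc(mapping, &wbc);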
379 */ 380 int filemap_fdatawrite_wbc(struct address_space *mapping, 381 struct writeback_control *wbc) 382 { 383 int ret; 384 385 if (!mapping_can_writeback(mapping) || 386 !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 387 return 0; 388 389 wbc_attach_fdatawrite_inode(wbc, mapping->host); 390 ret = do_writepages(mapping, wbc); 391 wbc_detach_inode(wbc); 392 return ret; 393 } 394 EXPORT_SYMBOL(filemap_fdatawrite_wbc); 395 396 /** 397 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 398 * @mapping: address space structure to write 399 * @start: offset in bytes where the range starts 400 * @end: offset in bytes where the range ends (inclusive) 401 * @sync_mode: enable synchronous operation 402 * 403 * Start writeback against all of a mapping's dirty pages that lie 404 * within the byte offsets <start, end> inclusive. 405 * 406 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 407 * opposed to a regular memory cleansing writeback. The difference between 408 * these two operations is that if a dirty page/buffer is encountered, it must 409 * be waited upon, and not just skipped over. 410 * 411 * Return: %0 on success, negative error code otherwise. 412 */ 413 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 414 loff_t end, int sync_mode) 415 { 416 struct writeback_control wbc = { 417 .sync_mode = sync_mode, 418 .nr_to_write = LONG_MAX, 419 .range_start = start, 420 .range_end = end, 421 }; 422 423 return filemap_fdatawrite_wbc(mapping, &wbc); 424 } 425 426 static inline int __filemap_fdatawrite(struct address_space *mapping, 427 int sync_mode) 428 { 429 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 430 } 431 432 int filemap_fdatawrite(struct address_space *mapping) 433 { 434 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 435 } 436 EXPORT_SYMBOL(filemap_fdatawrite); 437 438 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 439 loff_t end) 440 { 441 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 442 } 443 EXPORT_SYMBOL(filemap_fdatawrite_range); 444 445 /** 446 * filemap_flush - mostly a non-blocking flush 447 * @mapping: target address_space 448 * 449 * This is a mostly non-blocking flush. Not suitable for data-integrity 450 * purposes - I/O may not be started against all dirty pages. 451 * 452 * Return: %0 on success, negative error code otherwise. 453 */ 454 int filemap_flush(struct address_space *mapping) 455 { 456 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 457 } 458 EXPORT_SYMBOL(filemap_flush); 459 460 /** 461 * filemap_range_has_page - check if a page exists in range. 462 * @mapping: address space within which to check 463 * @start_byte: offset in bytes where the range starts 464 * @end_byte: offset in bytes where the range ends (inclusive) 465 * 466 * Find at least one page in the range supplied, usually used to check if 467 * direct writing in this range will trigger a writeback. 468 * 469 * Return: %true if at least one page exists in the specified range, 470 * %false otherwise. 
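 *
 * Illustrative sketch (hypothetical caller, not taken from this file): a
 * direct I/O write path might use this to decide whether it must flush the
 * page cache before issuing the I/O:
 *
 *	if (filemap_range_has_page(mapping, pos, pos + count - 1))
 *		err = filemap_write_and_wait_range(mapping, pos,
 *						   pos + count - 1);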
471 */ 472 bool filemap_range_has_page(struct address_space *mapping, 473 loff_t start_byte, loff_t end_byte) 474 { 475 struct page *page; 476 XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); 477 pgoff_t max = end_byte >> PAGE_SHIFT; 478 479 if (end_byte < start_byte) 480 return false; 481 482 rcu_read_lock(); 483 for (;;) { 484 page = xas_find(&xas, max); 485 if (xas_retry(&xas, page)) 486 continue; 487 /* Shadow entries don't count */ 488 if (xa_is_value(page)) 489 continue; 490 /* 491 * We don't need to try to pin this page; we're about to 492 * release the RCU lock anyway. It is enough to know that 493 * there was a page here recently. 494 */ 495 break; 496 } 497 rcu_read_unlock(); 498 499 return page != NULL; 500 } 501 EXPORT_SYMBOL(filemap_range_has_page); 502 503 static void __filemap_fdatawait_range(struct address_space *mapping, 504 loff_t start_byte, loff_t end_byte) 505 { 506 pgoff_t index = start_byte >> PAGE_SHIFT; 507 pgoff_t end = end_byte >> PAGE_SHIFT; 508 struct pagevec pvec; 509 int nr_pages; 510 511 pagevec_init(&pvec); 512 while (index <= end) { 513 unsigned i; 514 515 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, 516 end, PAGECACHE_TAG_WRITEBACK); 517 if (!nr_pages) 518 break; 519 520 for (i = 0; i < nr_pages; i++) { 521 struct page *page = pvec.pages[i]; 522 523 wait_on_page_writeback(page); 524 ClearPageError(page); 525 } 526 pagevec_release(&pvec); 527 cond_resched(); 528 } 529 } 530 531 /** 532 * filemap_fdatawait_range - wait for writeback to complete 533 * @mapping: address space structure to wait for 534 * @start_byte: offset in bytes where the range starts 535 * @end_byte: offset in bytes where the range ends (inclusive) 536 * 537 * Walk the list of under-writeback pages of the given address space 538 * in the given range and wait for all of them. Check error status of 539 * the address space and return it. 540 * 541 * Since the error status of the address space is cleared by this function, 542 * callers are responsible for checking the return value and handling and/or 543 * reporting the error. 544 * 545 * Return: error status of the address space. 546 */ 547 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 548 loff_t end_byte) 549 { 550 __filemap_fdatawait_range(mapping, start_byte, end_byte); 551 return filemap_check_errors(mapping); 552 } 553 EXPORT_SYMBOL(filemap_fdatawait_range); 554 555 /** 556 * filemap_fdatawait_range_keep_errors - wait for writeback to complete 557 * @mapping: address space structure to wait for 558 * @start_byte: offset in bytes where the range starts 559 * @end_byte: offset in bytes where the range ends (inclusive) 560 * 561 * Walk the list of under-writeback pages of the given address space in the 562 * given range and wait for all of them. Unlike filemap_fdatawait_range(), 563 * this function does not clear error status of the address space. 564 * 565 * Use this function if callers don't handle errors themselves. Expected 566 * call sites are system-wide / filesystem-wide data flushers: e.g. 
sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file: file pointing to address space structure to wait for
 * @start_byte: offset in bytes where the range starts
 * @end_byte: offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them. Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them. Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves. Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping: the address_space for the pages
 * @lstart: offset in bytes where the range starts
 * @lend: offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
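 *
 * For example (illustrative only; pos and count are hypothetical names), a
 * caller syncing count bytes at byte offset pos would pass an inclusive end:
 *
 *	err = filemap_write_and_wait_range(mapping, pos, pos + count - 1);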
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 *				   reported and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
747 */ 748 clear_bit(AS_EIO, &mapping->flags); 749 clear_bit(AS_ENOSPC, &mapping->flags); 750 return err; 751 } 752 EXPORT_SYMBOL(file_check_and_advance_wb_err); 753 754 /** 755 * file_write_and_wait_range - write out & wait on a file range 756 * @file: file pointing to address_space with pages 757 * @lstart: offset in bytes where the range starts 758 * @lend: offset in bytes where the range ends (inclusive) 759 * 760 * Write out and wait upon file offsets lstart->lend, inclusive. 761 * 762 * Note that @lend is inclusive (describes the last byte to be written) so 763 * that this function can be used to write to the very end-of-file (end = -1). 764 * 765 * After writing out and waiting on the data, we check and advance the 766 * f_wb_err cursor to the latest value, and return any errors detected there. 767 * 768 * Return: %0 on success, negative error code otherwise. 769 */ 770 int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) 771 { 772 int err = 0, err2; 773 struct address_space *mapping = file->f_mapping; 774 775 if (lend < lstart) 776 return 0; 777 778 if (mapping_needs_writeback(mapping)) { 779 err = __filemap_fdatawrite_range(mapping, lstart, lend, 780 WB_SYNC_ALL); 781 /* See comment of filemap_write_and_wait() */ 782 if (err != -EIO) 783 __filemap_fdatawait_range(mapping, lstart, lend); 784 } 785 err2 = file_check_and_advance_wb_err(file); 786 if (!err) 787 err = err2; 788 return err; 789 } 790 EXPORT_SYMBOL(file_write_and_wait_range); 791 792 /** 793 * replace_page_cache_folio - replace a pagecache folio with a new one 794 * @old: folio to be replaced 795 * @new: folio to replace with 796 * 797 * This function replaces a folio in the pagecache with a new one. On 798 * success it acquires the pagecache reference for the new folio and 799 * drops it for the old folio. Both the old and new folios must be 800 * locked. This function does not add the new folio to the LRU, the 801 * caller must do that. 802 * 803 * The remove + add is atomic. This function cannot fail. 804 */ 805 void replace_page_cache_folio(struct folio *old, struct folio *new) 806 { 807 struct address_space *mapping = old->mapping; 808 void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; 809 pgoff_t offset = old->index; 810 XA_STATE(xas, &mapping->i_pages, offset); 811 812 VM_BUG_ON_FOLIO(!folio_test_locked(old), old); 813 VM_BUG_ON_FOLIO(!folio_test_locked(new), new); 814 VM_BUG_ON_FOLIO(new->mapping, new); 815 816 folio_get(new); 817 new->mapping = mapping; 818 new->index = offset; 819 820 mem_cgroup_migrate(old, new); 821 822 xas_lock_irq(&xas); 823 xas_store(&xas, new); 824 825 old->mapping = NULL; 826 /* hugetlb pages do not participate in page cache accounting. 
*/ 827 if (!folio_test_hugetlb(old)) 828 __lruvec_stat_sub_folio(old, NR_FILE_PAGES); 829 if (!folio_test_hugetlb(new)) 830 __lruvec_stat_add_folio(new, NR_FILE_PAGES); 831 if (folio_test_swapbacked(old)) 832 __lruvec_stat_sub_folio(old, NR_SHMEM); 833 if (folio_test_swapbacked(new)) 834 __lruvec_stat_add_folio(new, NR_SHMEM); 835 xas_unlock_irq(&xas); 836 if (free_folio) 837 free_folio(old); 838 folio_put(old); 839 } 840 EXPORT_SYMBOL_GPL(replace_page_cache_folio); 841 842 noinline int __filemap_add_folio(struct address_space *mapping, 843 struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) 844 { 845 XA_STATE(xas, &mapping->i_pages, index); 846 int huge = folio_test_hugetlb(folio); 847 bool charged = false; 848 long nr = 1; 849 850 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 851 VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); 852 mapping_set_update(&xas, mapping); 853 854 if (!huge) { 855 int error = mem_cgroup_charge(folio, NULL, gfp); 856 VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); 857 if (error) 858 return error; 859 charged = true; 860 xas_set_order(&xas, index, folio_order(folio)); 861 nr = folio_nr_pages(folio); 862 } 863 864 gfp &= GFP_RECLAIM_MASK; 865 folio_ref_add(folio, nr); 866 folio->mapping = mapping; 867 folio->index = xas.xa_index; 868 869 do { 870 unsigned int order = xa_get_order(xas.xa, xas.xa_index); 871 void *entry, *old = NULL; 872 873 if (order > folio_order(folio)) 874 xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), 875 order, gfp); 876 xas_lock_irq(&xas); 877 xas_for_each_conflict(&xas, entry) { 878 old = entry; 879 if (!xa_is_value(entry)) { 880 xas_set_err(&xas, -EEXIST); 881 goto unlock; 882 } 883 } 884 885 if (old) { 886 if (shadowp) 887 *shadowp = old; 888 /* entry may have been split before we acquired lock */ 889 order = xa_get_order(xas.xa, xas.xa_index); 890 if (order > folio_order(folio)) { 891 /* How to handle large swap entries? */ 892 BUG_ON(shmem_mapping(mapping)); 893 xas_split(&xas, old, order); 894 xas_reset(&xas); 895 } 896 } 897 898 xas_store(&xas, folio); 899 if (xas_error(&xas)) 900 goto unlock; 901 902 mapping->nrpages += nr; 903 904 /* hugetlb pages do not participate in page cache accounting */ 905 if (!huge) { 906 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); 907 if (folio_test_pmd_mappable(folio)) 908 __lruvec_stat_mod_folio(folio, 909 NR_FILE_THPS, nr); 910 } 911 unlock: 912 xas_unlock_irq(&xas); 913 } while (xas_nomem(&xas, gfp)); 914 915 if (xas_error(&xas)) 916 goto error; 917 918 trace_mm_filemap_add_to_page_cache(folio); 919 return 0; 920 error: 921 if (charged) 922 mem_cgroup_uncharge(folio); 923 folio->mapping = NULL; 924 /* Leave page->index set: truncation relies upon it */ 925 folio_put_refs(folio, nr); 926 return xas_error(&xas); 927 } 928 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); 929 930 int filemap_add_folio(struct address_space *mapping, struct folio *folio, 931 pgoff_t index, gfp_t gfp) 932 { 933 void *shadow = NULL; 934 int ret; 935 936 __folio_set_locked(folio); 937 ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); 938 if (unlikely(ret)) 939 __folio_clear_locked(folio); 940 else { 941 /* 942 * The folio might have been evicted from cache only 943 * recently, in which case it should be activated like 944 * any other repeatedly accessed folio. 945 * The exception is folios getting rewritten; evicting other 946 * data from the working set, only to cache data that will 947 * get overwritten with something else, is a waste of memory. 
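		 * That is why the workingset_refault() call below is skipped
		 * when the caller passed __GFP_WRITE.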
948 */ 949 WARN_ON_ONCE(folio_test_active(folio)); 950 if (!(gfp & __GFP_WRITE) && shadow) 951 workingset_refault(folio, shadow); 952 folio_add_lru(folio); 953 } 954 return ret; 955 } 956 EXPORT_SYMBOL_GPL(filemap_add_folio); 957 958 #ifdef CONFIG_NUMA 959 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) 960 { 961 int n; 962 struct folio *folio; 963 964 if (cpuset_do_page_mem_spread()) { 965 unsigned int cpuset_mems_cookie; 966 do { 967 cpuset_mems_cookie = read_mems_allowed_begin(); 968 n = cpuset_mem_spread_node(); 969 folio = __folio_alloc_node(gfp, order, n); 970 } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); 971 972 return folio; 973 } 974 return folio_alloc(gfp, order); 975 } 976 EXPORT_SYMBOL(filemap_alloc_folio); 977 #endif 978 979 /* 980 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings 981 * 982 * Lock exclusively invalidate_lock of any passed mapping that is not NULL. 983 * 984 * @mapping1: the first mapping to lock 985 * @mapping2: the second mapping to lock 986 */ 987 void filemap_invalidate_lock_two(struct address_space *mapping1, 988 struct address_space *mapping2) 989 { 990 if (mapping1 > mapping2) 991 swap(mapping1, mapping2); 992 if (mapping1) 993 down_write(&mapping1->invalidate_lock); 994 if (mapping2 && mapping1 != mapping2) 995 down_write_nested(&mapping2->invalidate_lock, 1); 996 } 997 EXPORT_SYMBOL(filemap_invalidate_lock_two); 998 999 /* 1000 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings 1001 * 1002 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. 1003 * 1004 * @mapping1: the first mapping to unlock 1005 * @mapping2: the second mapping to unlock 1006 */ 1007 void filemap_invalidate_unlock_two(struct address_space *mapping1, 1008 struct address_space *mapping2) 1009 { 1010 if (mapping1) 1011 up_write(&mapping1->invalidate_lock); 1012 if (mapping2 && mapping1 != mapping2) 1013 up_write(&mapping2->invalidate_lock); 1014 } 1015 EXPORT_SYMBOL(filemap_invalidate_unlock_two); 1016 1017 /* 1018 * In order to wait for pages to become available there must be 1019 * waitqueues associated with pages. By using a hash table of 1020 * waitqueues where the bucket discipline is to maintain all 1021 * waiters on the same queue and wake all when any of the pages 1022 * become available, and for the woken contexts to check to be 1023 * sure the appropriate page became available, this saves space 1024 * at a cost of "thundering herd" phenomena during rare hash 1025 * collisions. 1026 */ 1027 #define PAGE_WAIT_TABLE_BITS 8 1028 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) 1029 static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; 1030 1031 static wait_queue_head_t *folio_waitqueue(struct folio *folio) 1032 { 1033 return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; 1034 } 1035 1036 void __init pagecache_init(void) 1037 { 1038 int i; 1039 1040 for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) 1041 init_waitqueue_head(&folio_wait_table[i]); 1042 1043 page_writeback_init(); 1044 } 1045 1046 /* 1047 * The page wait code treats the "wait->flags" somewhat unusually, because 1048 * we have multiple different kinds of waits, not just the usual "exclusive" 1049 * one. 1050 * 1051 * We have: 1052 * 1053 * (a) no special bits set: 1054 * 1055 * We're just waiting for the bit to be released, and when a waker 1056 * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, 1057 * and remove it from the wait queue. 
1058 * 1059 * Simple and straightforward. 1060 * 1061 * (b) WQ_FLAG_EXCLUSIVE: 1062 * 1063 * The waiter is waiting to get the lock, and only one waiter should 1064 * be woken up to avoid any thundering herd behavior. We'll set the 1065 * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. 1066 * 1067 * This is the traditional exclusive wait. 1068 * 1069 * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: 1070 * 1071 * The waiter is waiting to get the bit, and additionally wants the 1072 * lock to be transferred to it for fair lock behavior. If the lock 1073 * cannot be taken, we stop walking the wait queue without waking 1074 * the waiter. 1075 * 1076 * This is the "fair lock handoff" case, and in addition to setting 1077 * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see 1078 * that it now has the lock. 1079 */ 1080 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) 1081 { 1082 unsigned int flags; 1083 struct wait_page_key *key = arg; 1084 struct wait_page_queue *wait_page 1085 = container_of(wait, struct wait_page_queue, wait); 1086 1087 if (!wake_page_match(wait_page, key)) 1088 return 0; 1089 1090 /* 1091 * If it's a lock handoff wait, we get the bit for it, and 1092 * stop walking (and do not wake it up) if we can't. 1093 */ 1094 flags = wait->flags; 1095 if (flags & WQ_FLAG_EXCLUSIVE) { 1096 if (test_bit(key->bit_nr, &key->folio->flags)) 1097 return -1; 1098 if (flags & WQ_FLAG_CUSTOM) { 1099 if (test_and_set_bit(key->bit_nr, &key->folio->flags)) 1100 return -1; 1101 flags |= WQ_FLAG_DONE; 1102 } 1103 } 1104 1105 /* 1106 * We are holding the wait-queue lock, but the waiter that 1107 * is waiting for this will be checking the flags without 1108 * any locking. 1109 * 1110 * So update the flags atomically, and wake up the waiter 1111 * afterwards to avoid any races. This store-release pairs 1112 * with the load-acquire in folio_wait_bit_common(). 1113 */ 1114 smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); 1115 wake_up_state(wait->private, mode); 1116 1117 /* 1118 * Ok, we have successfully done what we're waiting for, 1119 * and we can unconditionally remove the wait entry. 1120 * 1121 * Note that this pairs with the "finish_wait()" in the 1122 * waiter, and has to be the absolute last thing we do. 1123 * After this list_del_init(&wait->entry) the wait entry 1124 * might be de-allocated and the process might even have 1125 * exited. 
1126 */ 1127 list_del_init_careful(&wait->entry); 1128 return (flags & WQ_FLAG_EXCLUSIVE) != 0; 1129 } 1130 1131 static void folio_wake_bit(struct folio *folio, int bit_nr) 1132 { 1133 wait_queue_head_t *q = folio_waitqueue(folio); 1134 struct wait_page_key key; 1135 unsigned long flags; 1136 wait_queue_entry_t bookmark; 1137 1138 key.folio = folio; 1139 key.bit_nr = bit_nr; 1140 key.page_match = 0; 1141 1142 bookmark.flags = 0; 1143 bookmark.private = NULL; 1144 bookmark.func = NULL; 1145 INIT_LIST_HEAD(&bookmark.entry); 1146 1147 spin_lock_irqsave(&q->lock, flags); 1148 __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); 1149 1150 while (bookmark.flags & WQ_FLAG_BOOKMARK) { 1151 /* 1152 * Take a breather from holding the lock, 1153 * allow pages that finish wake up asynchronously 1154 * to acquire the lock and remove themselves 1155 * from wait queue 1156 */ 1157 spin_unlock_irqrestore(&q->lock, flags); 1158 cpu_relax(); 1159 spin_lock_irqsave(&q->lock, flags); 1160 __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); 1161 } 1162 1163 /* 1164 * It's possible to miss clearing waiters here, when we woke our page 1165 * waiters, but the hashed waitqueue has waiters for other pages on it. 1166 * That's okay, it's a rare case. The next waker will clear it. 1167 * 1168 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, 1169 * other), the flag may be cleared in the course of freeing the page; 1170 * but that is not required for correctness. 1171 */ 1172 if (!waitqueue_active(q) || !key.page_match) 1173 folio_clear_waiters(folio); 1174 1175 spin_unlock_irqrestore(&q->lock, flags); 1176 } 1177 1178 static void folio_wake(struct folio *folio, int bit) 1179 { 1180 if (!folio_test_waiters(folio)) 1181 return; 1182 folio_wake_bit(folio, bit); 1183 } 1184 1185 /* 1186 * A choice of three behaviors for folio_wait_bit_common(): 1187 */ 1188 enum behavior { 1189 EXCLUSIVE, /* Hold ref to page and take the bit when woken, like 1190 * __folio_lock() waiting on then setting PG_locked. 1191 */ 1192 SHARED, /* Hold ref to page and check the bit when woken, like 1193 * folio_wait_writeback() waiting on PG_writeback. 1194 */ 1195 DROP, /* Drop ref to page before wait, no check when woken, 1196 * like folio_put_wait_locked() on PG_locked. 1197 */ 1198 }; 1199 1200 /* 1201 * Attempt to check (or get) the folio flag, and mark us done 1202 * if successful. 1203 */ 1204 static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, 1205 struct wait_queue_entry *wait) 1206 { 1207 if (wait->flags & WQ_FLAG_EXCLUSIVE) { 1208 if (test_and_set_bit(bit_nr, &folio->flags)) 1209 return false; 1210 } else if (test_bit(bit_nr, &folio->flags)) 1211 return false; 1212 1213 wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; 1214 return true; 1215 } 1216 1217 /* How many times do we accept lock stealing from under a waiter? 
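 * (This limit is exposed at runtime as the vm.page_lock_unfairness sysctl.)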
*/ 1218 int sysctl_page_lock_unfairness = 5; 1219 1220 static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, 1221 int state, enum behavior behavior) 1222 { 1223 wait_queue_head_t *q = folio_waitqueue(folio); 1224 int unfairness = sysctl_page_lock_unfairness; 1225 struct wait_page_queue wait_page; 1226 wait_queue_entry_t *wait = &wait_page.wait; 1227 bool thrashing = false; 1228 unsigned long pflags; 1229 bool in_thrashing; 1230 1231 if (bit_nr == PG_locked && 1232 !folio_test_uptodate(folio) && folio_test_workingset(folio)) { 1233 delayacct_thrashing_start(&in_thrashing); 1234 psi_memstall_enter(&pflags); 1235 thrashing = true; 1236 } 1237 1238 init_wait(wait); 1239 wait->func = wake_page_function; 1240 wait_page.folio = folio; 1241 wait_page.bit_nr = bit_nr; 1242 1243 repeat: 1244 wait->flags = 0; 1245 if (behavior == EXCLUSIVE) { 1246 wait->flags = WQ_FLAG_EXCLUSIVE; 1247 if (--unfairness < 0) 1248 wait->flags |= WQ_FLAG_CUSTOM; 1249 } 1250 1251 /* 1252 * Do one last check whether we can get the 1253 * page bit synchronously. 1254 * 1255 * Do the folio_set_waiters() marking before that 1256 * to let any waker we _just_ missed know they 1257 * need to wake us up (otherwise they'll never 1258 * even go to the slow case that looks at the 1259 * page queue), and add ourselves to the wait 1260 * queue if we need to sleep. 1261 * 1262 * This part needs to be done under the queue 1263 * lock to avoid races. 1264 */ 1265 spin_lock_irq(&q->lock); 1266 folio_set_waiters(folio); 1267 if (!folio_trylock_flag(folio, bit_nr, wait)) 1268 __add_wait_queue_entry_tail(q, wait); 1269 spin_unlock_irq(&q->lock); 1270 1271 /* 1272 * From now on, all the logic will be based on 1273 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to 1274 * see whether the page bit testing has already 1275 * been done by the wake function. 1276 * 1277 * We can drop our reference to the folio. 1278 */ 1279 if (behavior == DROP) 1280 folio_put(folio); 1281 1282 /* 1283 * Note that until the "finish_wait()", or until 1284 * we see the WQ_FLAG_WOKEN flag, we need to 1285 * be very careful with the 'wait->flags', because 1286 * we may race with a waker that sets them. 1287 */ 1288 for (;;) { 1289 unsigned int flags; 1290 1291 set_current_state(state); 1292 1293 /* Loop until we've been woken or interrupted */ 1294 flags = smp_load_acquire(&wait->flags); 1295 if (!(flags & WQ_FLAG_WOKEN)) { 1296 if (signal_pending_state(state, current)) 1297 break; 1298 1299 io_schedule(); 1300 continue; 1301 } 1302 1303 /* If we were non-exclusive, we're done */ 1304 if (behavior != EXCLUSIVE) 1305 break; 1306 1307 /* If the waker got the lock for us, we're done */ 1308 if (flags & WQ_FLAG_DONE) 1309 break; 1310 1311 /* 1312 * Otherwise, if we're getting the lock, we need to 1313 * try to get it ourselves. 1314 * 1315 * And if that fails, we'll have to retry this all. 1316 */ 1317 if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) 1318 goto repeat; 1319 1320 wait->flags |= WQ_FLAG_DONE; 1321 break; 1322 } 1323 1324 /* 1325 * If a signal happened, this 'finish_wait()' may remove the last 1326 * waiter from the wait-queues, but the folio waiters bit will remain 1327 * set. That's ok. The next wakeup will take care of it, and trying 1328 * to do it here would be difficult and prone to races. 1329 */ 1330 finish_wait(q, wait); 1331 1332 if (thrashing) { 1333 delayacct_thrashing_end(&in_thrashing); 1334 psi_memstall_leave(&pflags); 1335 } 1336 1337 /* 1338 * NOTE! 
The wait->flags weren't stable until we've done the 1339 * 'finish_wait()', and we could have exited the loop above due 1340 * to a signal, and had a wakeup event happen after the signal 1341 * test but before the 'finish_wait()'. 1342 * 1343 * So only after the finish_wait() can we reliably determine 1344 * if we got woken up or not, so we can now figure out the final 1345 * return value based on that state without races. 1346 * 1347 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive 1348 * waiter, but an exclusive one requires WQ_FLAG_DONE. 1349 */ 1350 if (behavior == EXCLUSIVE) 1351 return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; 1352 1353 return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; 1354 } 1355 1356 #ifdef CONFIG_MIGRATION 1357 /** 1358 * migration_entry_wait_on_locked - Wait for a migration entry to be removed 1359 * @entry: migration swap entry. 1360 * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required 1361 * for pte entries, pass NULL for pmd entries. 1362 * @ptl: already locked ptl. This function will drop the lock. 1363 * 1364 * Wait for a migration entry referencing the given page to be removed. This is 1365 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except 1366 * this can be called without taking a reference on the page. Instead this 1367 * should be called while holding the ptl for the migration entry referencing 1368 * the page. 1369 * 1370 * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). 1371 * 1372 * This follows the same logic as folio_wait_bit_common() so see the comments 1373 * there. 1374 */ 1375 void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, 1376 spinlock_t *ptl) 1377 { 1378 struct wait_page_queue wait_page; 1379 wait_queue_entry_t *wait = &wait_page.wait; 1380 bool thrashing = false; 1381 unsigned long pflags; 1382 bool in_thrashing; 1383 wait_queue_head_t *q; 1384 struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); 1385 1386 q = folio_waitqueue(folio); 1387 if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { 1388 delayacct_thrashing_start(&in_thrashing); 1389 psi_memstall_enter(&pflags); 1390 thrashing = true; 1391 } 1392 1393 init_wait(wait); 1394 wait->func = wake_page_function; 1395 wait_page.folio = folio; 1396 wait_page.bit_nr = PG_locked; 1397 wait->flags = 0; 1398 1399 spin_lock_irq(&q->lock); 1400 folio_set_waiters(folio); 1401 if (!folio_trylock_flag(folio, PG_locked, wait)) 1402 __add_wait_queue_entry_tail(q, wait); 1403 spin_unlock_irq(&q->lock); 1404 1405 /* 1406 * If a migration entry exists for the page the migration path must hold 1407 * a valid reference to the page, and it must take the ptl to remove the 1408 * migration entry. So the page is valid until the ptl is dropped. 
1409 */ 1410 if (ptep) 1411 pte_unmap_unlock(ptep, ptl); 1412 else 1413 spin_unlock(ptl); 1414 1415 for (;;) { 1416 unsigned int flags; 1417 1418 set_current_state(TASK_UNINTERRUPTIBLE); 1419 1420 /* Loop until we've been woken or interrupted */ 1421 flags = smp_load_acquire(&wait->flags); 1422 if (!(flags & WQ_FLAG_WOKEN)) { 1423 if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) 1424 break; 1425 1426 io_schedule(); 1427 continue; 1428 } 1429 break; 1430 } 1431 1432 finish_wait(q, wait); 1433 1434 if (thrashing) { 1435 delayacct_thrashing_end(&in_thrashing); 1436 psi_memstall_leave(&pflags); 1437 } 1438 } 1439 #endif 1440 1441 void folio_wait_bit(struct folio *folio, int bit_nr) 1442 { 1443 folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); 1444 } 1445 EXPORT_SYMBOL(folio_wait_bit); 1446 1447 int folio_wait_bit_killable(struct folio *folio, int bit_nr) 1448 { 1449 return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); 1450 } 1451 EXPORT_SYMBOL(folio_wait_bit_killable); 1452 1453 /** 1454 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked 1455 * @folio: The folio to wait for. 1456 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). 1457 * 1458 * The caller should hold a reference on @folio. They expect the page to 1459 * become unlocked relatively soon, but do not wish to hold up migration 1460 * (for example) by holding the reference while waiting for the folio to 1461 * come unlocked. After this function returns, the caller should not 1462 * dereference @folio. 1463 * 1464 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. 1465 */ 1466 static int folio_put_wait_locked(struct folio *folio, int state) 1467 { 1468 return folio_wait_bit_common(folio, PG_locked, state, DROP); 1469 } 1470 1471 /** 1472 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue 1473 * @folio: Folio defining the wait queue of interest 1474 * @waiter: Waiter to add to the queue 1475 * 1476 * Add an arbitrary @waiter to the wait queue for the nominated @folio. 1477 */ 1478 void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) 1479 { 1480 wait_queue_head_t *q = folio_waitqueue(folio); 1481 unsigned long flags; 1482 1483 spin_lock_irqsave(&q->lock, flags); 1484 __add_wait_queue_entry_tail(q, waiter); 1485 folio_set_waiters(folio); 1486 spin_unlock_irqrestore(&q->lock, flags); 1487 } 1488 EXPORT_SYMBOL_GPL(folio_add_wait_queue); 1489 1490 #ifndef clear_bit_unlock_is_negative_byte 1491 1492 /* 1493 * PG_waiters is the high bit in the same byte as PG_lock. 1494 * 1495 * On x86 (and on many other architectures), we can clear PG_lock and 1496 * test the sign bit at the same time. But if the architecture does 1497 * not support that special operation, we just do this all by hand 1498 * instead. 1499 * 1500 * The read of PG_waiters has to be after (or concurrently with) PG_locked 1501 * being cleared, but a memory barrier should be unnecessary since it is 1502 * in the same byte as PG_locked. 1503 */ 1504 static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) 1505 { 1506 clear_bit_unlock(nr, mem); 1507 /* smp_mb__after_atomic(); */ 1508 return test_bit(PG_waiters, mem); 1509 } 1510 1511 #endif 1512 1513 /** 1514 * folio_unlock - Unlock a locked folio. 1515 * @folio: The folio. 1516 * 1517 * Unlocks the folio and wakes up any thread sleeping on the page lock. 1518 * 1519 * Context: May be called from interrupt or process context. 
May not be 1520 * called from NMI context. 1521 */ 1522 void folio_unlock(struct folio *folio) 1523 { 1524 /* Bit 7 allows x86 to check the byte's sign bit */ 1525 BUILD_BUG_ON(PG_waiters != 7); 1526 BUILD_BUG_ON(PG_locked > 7); 1527 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 1528 if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) 1529 folio_wake_bit(folio, PG_locked); 1530 } 1531 EXPORT_SYMBOL(folio_unlock); 1532 1533 /** 1534 * folio_end_private_2 - Clear PG_private_2 and wake any waiters. 1535 * @folio: The folio. 1536 * 1537 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for 1538 * it. The folio reference held for PG_private_2 being set is released. 1539 * 1540 * This is, for example, used when a netfs folio is being written to a local 1541 * disk cache, thereby allowing writes to the cache for the same folio to be 1542 * serialised. 1543 */ 1544 void folio_end_private_2(struct folio *folio) 1545 { 1546 VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); 1547 clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); 1548 folio_wake_bit(folio, PG_private_2); 1549 folio_put(folio); 1550 } 1551 EXPORT_SYMBOL(folio_end_private_2); 1552 1553 /** 1554 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. 1555 * @folio: The folio to wait on. 1556 * 1557 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. 1558 */ 1559 void folio_wait_private_2(struct folio *folio) 1560 { 1561 while (folio_test_private_2(folio)) 1562 folio_wait_bit(folio, PG_private_2); 1563 } 1564 EXPORT_SYMBOL(folio_wait_private_2); 1565 1566 /** 1567 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. 1568 * @folio: The folio to wait on. 1569 * 1570 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a 1571 * fatal signal is received by the calling task. 1572 * 1573 * Return: 1574 * - 0 if successful. 1575 * - -EINTR if a fatal signal was encountered. 1576 */ 1577 int folio_wait_private_2_killable(struct folio *folio) 1578 { 1579 int ret = 0; 1580 1581 while (folio_test_private_2(folio)) { 1582 ret = folio_wait_bit_killable(folio, PG_private_2); 1583 if (ret < 0) 1584 break; 1585 } 1586 1587 return ret; 1588 } 1589 EXPORT_SYMBOL(folio_wait_private_2_killable); 1590 1591 /** 1592 * folio_end_writeback - End writeback against a folio. 1593 * @folio: The folio. 1594 */ 1595 void folio_end_writeback(struct folio *folio) 1596 { 1597 /* 1598 * folio_test_clear_reclaim() could be used here but it is an 1599 * atomic operation and overkill in this particular case. Failing 1600 * to shuffle a folio marked for immediate reclaim is too mild 1601 * a gain to justify taking an atomic operation penalty at the 1602 * end of every folio writeback. 1603 */ 1604 if (folio_test_reclaim(folio)) { 1605 folio_clear_reclaim(folio); 1606 folio_rotate_reclaimable(folio); 1607 } 1608 1609 /* 1610 * Writeback does not hold a folio reference of its own, relying 1611 * on truncation to wait for the clearing of PG_writeback. 1612 * But here we must make sure that the folio is not freed and 1613 * reused before the folio_wake(). 
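	 * Hence the folio_get()/folio_put() pair around the wakeup below.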
1614 */ 1615 folio_get(folio); 1616 if (!__folio_end_writeback(folio)) 1617 BUG(); 1618 1619 smp_mb__after_atomic(); 1620 folio_wake(folio, PG_writeback); 1621 acct_reclaim_writeback(folio); 1622 folio_put(folio); 1623 } 1624 EXPORT_SYMBOL(folio_end_writeback); 1625 1626 /* 1627 * After completing I/O on a page, call this routine to update the page 1628 * flags appropriately 1629 */ 1630 void page_endio(struct page *page, bool is_write, int err) 1631 { 1632 struct folio *folio = page_folio(page); 1633 1634 if (!is_write) { 1635 if (!err) { 1636 folio_mark_uptodate(folio); 1637 } else { 1638 folio_clear_uptodate(folio); 1639 folio_set_error(folio); 1640 } 1641 folio_unlock(folio); 1642 } else { 1643 if (err) { 1644 struct address_space *mapping; 1645 1646 folio_set_error(folio); 1647 mapping = folio_mapping(folio); 1648 if (mapping) 1649 mapping_set_error(mapping, err); 1650 } 1651 folio_end_writeback(folio); 1652 } 1653 } 1654 EXPORT_SYMBOL_GPL(page_endio); 1655 1656 /** 1657 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. 1658 * @folio: The folio to lock 1659 */ 1660 void __folio_lock(struct folio *folio) 1661 { 1662 folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, 1663 EXCLUSIVE); 1664 } 1665 EXPORT_SYMBOL(__folio_lock); 1666 1667 int __folio_lock_killable(struct folio *folio) 1668 { 1669 return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, 1670 EXCLUSIVE); 1671 } 1672 EXPORT_SYMBOL_GPL(__folio_lock_killable); 1673 1674 static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) 1675 { 1676 struct wait_queue_head *q = folio_waitqueue(folio); 1677 int ret = 0; 1678 1679 wait->folio = folio; 1680 wait->bit_nr = PG_locked; 1681 1682 spin_lock_irq(&q->lock); 1683 __add_wait_queue_entry_tail(q, &wait->wait); 1684 folio_set_waiters(folio); 1685 ret = !folio_trylock(folio); 1686 /* 1687 * If we were successful now, we know we're still on the 1688 * waitqueue as we're still under the lock. This means it's 1689 * safe to remove and return success, we know the callback 1690 * isn't going to trigger. 1691 */ 1692 if (!ret) 1693 __remove_wait_queue(q, &wait->wait); 1694 else 1695 ret = -EIOCBQUEUED; 1696 spin_unlock_irq(&q->lock); 1697 return ret; 1698 } 1699 1700 /* 1701 * Return values: 1702 * true - folio is locked; mmap_lock is still held. 1703 * false - folio is not locked. 1704 * mmap_lock has been released (mmap_read_unlock(), unless flags had both 1705 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in 1706 * which case mmap_lock is still held. 1707 * 1708 * If neither ALLOW_RETRY nor KILLABLE are set, will always return true 1709 * with the folio locked and the mmap_lock unperturbed. 1710 */ 1711 bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, 1712 unsigned int flags) 1713 { 1714 if (fault_flag_allow_retry_first(flags)) { 1715 /* 1716 * CAUTION! In this case, mmap_lock is not released 1717 * even though return 0. 1718 */ 1719 if (flags & FAULT_FLAG_RETRY_NOWAIT) 1720 return false; 1721 1722 mmap_read_unlock(mm); 1723 if (flags & FAULT_FLAG_KILLABLE) 1724 folio_wait_locked_killable(folio); 1725 else 1726 folio_wait_locked(folio); 1727 return false; 1728 } 1729 if (flags & FAULT_FLAG_KILLABLE) { 1730 bool ret; 1731 1732 ret = __folio_lock_killable(folio); 1733 if (ret) { 1734 mmap_read_unlock(mm); 1735 return false; 1736 } 1737 } else { 1738 __folio_lock(folio); 1739 } 1740 1741 return true; 1742 } 1743 1744 /** 1745 * page_cache_next_miss() - Find the next gap in the page cache. 
1746 * @mapping: Mapping. 1747 * @index: Index. 1748 * @max_scan: Maximum range to search. 1749 * 1750 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the 1751 * gap with the lowest index. 1752 * 1753 * This function may be called under the rcu_read_lock. However, this will 1754 * not atomically search a snapshot of the cache at a single point in time. 1755 * For example, if a gap is created at index 5, then subsequently a gap is 1756 * created at index 10, page_cache_next_miss covering both indices may 1757 * return 10 if called under the rcu_read_lock. 1758 * 1759 * Return: The index of the gap if found, otherwise an index outside the 1760 * range specified (in which case 'return - index >= max_scan' will be true). 1761 * In the rare case of index wrap-around, 0 will be returned. 1762 */ 1763 pgoff_t page_cache_next_miss(struct address_space *mapping, 1764 pgoff_t index, unsigned long max_scan) 1765 { 1766 XA_STATE(xas, &mapping->i_pages, index); 1767 1768 while (max_scan--) { 1769 void *entry = xas_next(&xas); 1770 if (!entry || xa_is_value(entry)) 1771 break; 1772 if (xas.xa_index == 0) 1773 break; 1774 } 1775 1776 return xas.xa_index; 1777 } 1778 EXPORT_SYMBOL(page_cache_next_miss); 1779 1780 /** 1781 * page_cache_prev_miss() - Find the previous gap in the page cache. 1782 * @mapping: Mapping. 1783 * @index: Index. 1784 * @max_scan: Maximum range to search. 1785 * 1786 * Search the range [max(index - max_scan + 1, 0), index] for the 1787 * gap with the highest index. 1788 * 1789 * This function may be called under the rcu_read_lock. However, this will 1790 * not atomically search a snapshot of the cache at a single point in time. 1791 * For example, if a gap is created at index 10, then subsequently a gap is 1792 * created at index 5, page_cache_prev_miss() covering both indices may 1793 * return 5 if called under the rcu_read_lock. 1794 * 1795 * Return: The index of the gap if found, otherwise an index outside the 1796 * range specified (in which case 'index - return >= max_scan' will be true). 1797 * In the rare case of wrap-around, ULONG_MAX will be returned. 1798 */ 1799 pgoff_t page_cache_prev_miss(struct address_space *mapping, 1800 pgoff_t index, unsigned long max_scan) 1801 { 1802 XA_STATE(xas, &mapping->i_pages, index); 1803 1804 while (max_scan--) { 1805 void *entry = xas_prev(&xas); 1806 if (!entry || xa_is_value(entry)) 1807 break; 1808 if (xas.xa_index == ULONG_MAX) 1809 break; 1810 } 1811 1812 return xas.xa_index; 1813 } 1814 EXPORT_SYMBOL(page_cache_prev_miss); 1815 1816 /* 1817 * Lockless page cache protocol: 1818 * On the lookup side: 1819 * 1. Load the folio from i_pages 1820 * 2. Increment the refcount if it's not zero 1821 * 3. If the folio is not found by xas_reload(), put the refcount and retry 1822 * 1823 * On the removal side: 1824 * A. Freeze the page (by zeroing the refcount if nobody else has a reference) 1825 * B. Remove the page from i_pages 1826 * C. Return the page to the page allocator 1827 * 1828 * This means that any page may have its reference count temporarily 1829 * increased by a speculative page cache (or fast GUP) lookup as it can 1830 * be allocated by another user before the RCU grace period expires. 1831 * Because the refcount temporarily acquired here may end up being the 1832 * last refcount on the page, any page allocation must be freeable by 1833 * folio_put(). 1834 */ 1835 1836 /* 1837 * mapping_get_entry - Get a page cache entry. 1838 * @mapping: the address_space to search 1839 * @index: The page cache index. 
1840 * 1841 * Looks up the page cache entry at @mapping & @index. If it is a folio, 1842 * it is returned with an increased refcount. If it is a shadow entry 1843 * of a previously evicted folio, or a swap entry from shmem/tmpfs, 1844 * it is returned without further action. 1845 * 1846 * Return: The folio, swap or shadow entry, %NULL if nothing is found. 1847 */ 1848 static void *mapping_get_entry(struct address_space *mapping, pgoff_t index) 1849 { 1850 XA_STATE(xas, &mapping->i_pages, index); 1851 struct folio *folio; 1852 1853 rcu_read_lock(); 1854 repeat: 1855 xas_reset(&xas); 1856 folio = xas_load(&xas); 1857 if (xas_retry(&xas, folio)) 1858 goto repeat; 1859 /* 1860 * A shadow entry of a recently evicted page, or a swap entry from 1861 * shmem/tmpfs. Return it without attempting to raise page count. 1862 */ 1863 if (!folio || xa_is_value(folio)) 1864 goto out; 1865 1866 if (!folio_try_get_rcu(folio)) 1867 goto repeat; 1868 1869 if (unlikely(folio != xas_reload(&xas))) { 1870 folio_put(folio); 1871 goto repeat; 1872 } 1873 out: 1874 rcu_read_unlock(); 1875 1876 return folio; 1877 } 1878 1879 /** 1880 * __filemap_get_folio - Find and get a reference to a folio. 1881 * @mapping: The address_space to search. 1882 * @index: The page index. 1883 * @fgp_flags: %FGP flags modify how the folio is returned. 1884 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. 1885 * 1886 * Looks up the page cache entry at @mapping & @index. 1887 * 1888 * @fgp_flags can be zero or more of these flags: 1889 * 1890 * * %FGP_ACCESSED - The folio will be marked accessed. 1891 * * %FGP_LOCK - The folio is returned locked. 1892 * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it 1893 * instead of allocating a new folio to replace it. 1894 * * %FGP_CREAT - If no page is present then a new page is allocated using 1895 * @gfp and added to the page cache and the VM's LRU list. 1896 * The page is returned locked and with an increased refcount. 1897 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the 1898 * page is already in cache. If the page was allocated, unlock it before 1899 * returning so the caller can do the same dance. 1900 * * %FGP_WRITE - The page will be written to by the caller. 1901 * * %FGP_NOFS - __GFP_FS will get cleared in gfp. 1902 * * %FGP_NOWAIT - Don't get blocked by page lock. 1903 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) 1904 * 1905 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even 1906 * if the %GFP flags specified for %FGP_CREAT are atomic. 1907 * 1908 * If there is a page cache page, it is returned with an increased refcount. 1909 * 1910 * Return: The found folio or %NULL otherwise. 1911 */ 1912 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, 1913 int fgp_flags, gfp_t gfp) 1914 { 1915 struct folio *folio; 1916 1917 repeat: 1918 folio = mapping_get_entry(mapping, index); 1919 if (xa_is_value(folio)) { 1920 if (fgp_flags & FGP_ENTRY) 1921 return folio; 1922 folio = NULL; 1923 } 1924 if (!folio) 1925 goto no_page; 1926 1927 if (fgp_flags & FGP_LOCK) { 1928 if (fgp_flags & FGP_NOWAIT) { 1929 if (!folio_trylock(folio)) { 1930 folio_put(folio); 1931 return NULL; 1932 } 1933 } else { 1934 folio_lock(folio); 1935 } 1936 1937 /* Has the page been truncated? 
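		 * While we slept waiting for the folio lock the folio may have
		 * been removed from this mapping (truncated or reclaimed).
		 * If folio->mapping no longer matches, drop the lock and our
		 * reference and restart the lookup from scratch.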
*/ 1938 if (unlikely(folio->mapping != mapping)) { 1939 folio_unlock(folio); 1940 folio_put(folio); 1941 goto repeat; 1942 } 1943 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); 1944 } 1945 1946 if (fgp_flags & FGP_ACCESSED) 1947 folio_mark_accessed(folio); 1948 else if (fgp_flags & FGP_WRITE) { 1949 /* Clear idle flag for buffer write */ 1950 if (folio_test_idle(folio)) 1951 folio_clear_idle(folio); 1952 } 1953 1954 if (fgp_flags & FGP_STABLE) 1955 folio_wait_stable(folio); 1956 no_page: 1957 if (!folio && (fgp_flags & FGP_CREAT)) { 1958 int err; 1959 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) 1960 gfp |= __GFP_WRITE; 1961 if (fgp_flags & FGP_NOFS) 1962 gfp &= ~__GFP_FS; 1963 if (fgp_flags & FGP_NOWAIT) { 1964 gfp &= ~GFP_KERNEL; 1965 gfp |= GFP_NOWAIT | __GFP_NOWARN; 1966 } 1967 1968 folio = filemap_alloc_folio(gfp, 0); 1969 if (!folio) 1970 return NULL; 1971 1972 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) 1973 fgp_flags |= FGP_LOCK; 1974 1975 /* Init accessed so avoid atomic mark_page_accessed later */ 1976 if (fgp_flags & FGP_ACCESSED) 1977 __folio_set_referenced(folio); 1978 1979 err = filemap_add_folio(mapping, folio, index, gfp); 1980 if (unlikely(err)) { 1981 folio_put(folio); 1982 folio = NULL; 1983 if (err == -EEXIST) 1984 goto repeat; 1985 } 1986 1987 /* 1988 * filemap_add_folio locks the page, and for mmap 1989 * we expect an unlocked page. 1990 */ 1991 if (folio && (fgp_flags & FGP_FOR_MMAP)) 1992 folio_unlock(folio); 1993 } 1994 1995 return folio; 1996 } 1997 EXPORT_SYMBOL(__filemap_get_folio); 1998 1999 static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, 2000 xa_mark_t mark) 2001 { 2002 struct folio *folio; 2003 2004 retry: 2005 if (mark == XA_PRESENT) 2006 folio = xas_find(xas, max); 2007 else 2008 folio = xas_find_marked(xas, max, mark); 2009 2010 if (xas_retry(xas, folio)) 2011 goto retry; 2012 /* 2013 * A shadow entry of a recently evicted page, a swap 2014 * entry from shmem/tmpfs or a DAX entry. Return it 2015 * without attempting to raise page count. 2016 */ 2017 if (!folio || xa_is_value(folio)) 2018 return folio; 2019 2020 if (!folio_try_get_rcu(folio)) 2021 goto reset; 2022 2023 if (unlikely(folio != xas_reload(xas))) { 2024 folio_put(folio); 2025 goto reset; 2026 } 2027 2028 return folio; 2029 reset: 2030 xas_reset(xas); 2031 goto retry; 2032 } 2033 2034 /** 2035 * find_get_entries - gang pagecache lookup 2036 * @mapping: The address_space to search 2037 * @start: The starting page cache index 2038 * @end: The final page index (inclusive). 2039 * @fbatch: Where the resulting entries are placed. 2040 * @indices: The cache indices corresponding to the entries in @entries 2041 * 2042 * find_get_entries() will search for and return a batch of entries in 2043 * the mapping. The entries are placed in @fbatch. find_get_entries() 2044 * takes a reference on any actual folios it returns. 2045 * 2046 * The entries have ascending indexes. The indices may not be consecutive 2047 * due to not-present entries or large folios. 2048 * 2049 * Any shadow entries of evicted folios, or swap entries from 2050 * shmem/tmpfs, are included in the returned array. 2051 * 2052 * Return: The number of entries which were found. 
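 *
 * A minimal caller-side sketch (illustrative only; "first", "last" and
 * process_folio() are placeholders, not kernel symbols):
 *
 *	struct folio_batch fbatch;
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	pgoff_t index = first;
 *	unsigned int i;
 *
 *	folio_batch_init(&fbatch);
 *	while (find_get_entries(mapping, &index, last, &fbatch, indices)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 *			struct folio *folio = fbatch.folios[i];
 *
 *			if (xa_is_value(folio))
 *				continue;
 *			process_folio(folio, indices[i]);
 *			folio_put(folio);
 *		}
 *		folio_batch_init(&fbatch);
 *	}
 *
 * Value (shadow/swap) entries carry no reference, so only real folios
 * are put back.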
2053 */ 2054 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, 2055 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) 2056 { 2057 XA_STATE(xas, &mapping->i_pages, *start); 2058 struct folio *folio; 2059 2060 rcu_read_lock(); 2061 while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { 2062 indices[fbatch->nr] = xas.xa_index; 2063 if (!folio_batch_add(fbatch, folio)) 2064 break; 2065 } 2066 rcu_read_unlock(); 2067 2068 if (folio_batch_count(fbatch)) { 2069 unsigned long nr = 1; 2070 int idx = folio_batch_count(fbatch) - 1; 2071 2072 folio = fbatch->folios[idx]; 2073 if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) 2074 nr = folio_nr_pages(folio); 2075 *start = indices[idx] + nr; 2076 } 2077 return folio_batch_count(fbatch); 2078 } 2079 2080 /** 2081 * find_lock_entries - Find a batch of pagecache entries. 2082 * @mapping: The address_space to search. 2083 * @start: The starting page cache index. 2084 * @end: The final page index (inclusive). 2085 * @fbatch: Where the resulting entries are placed. 2086 * @indices: The cache indices of the entries in @fbatch. 2087 * 2088 * find_lock_entries() will return a batch of entries from @mapping. 2089 * Swap, shadow and DAX entries are included. Folios are returned 2090 * locked and with an incremented refcount. Folios which are locked 2091 * by somebody else or under writeback are skipped. Folios which are 2092 * partially outside the range are not returned. 2093 * 2094 * The entries have ascending indexes. The indices may not be consecutive 2095 * due to not-present entries, large folios, folios which could not be 2096 * locked or folios under writeback. 2097 * 2098 * Return: The number of entries which were found. 2099 */ 2100 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, 2101 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) 2102 { 2103 XA_STATE(xas, &mapping->i_pages, *start); 2104 struct folio *folio; 2105 2106 rcu_read_lock(); 2107 while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { 2108 if (!xa_is_value(folio)) { 2109 if (folio->index < *start) 2110 goto put; 2111 if (folio->index + folio_nr_pages(folio) - 1 > end) 2112 goto put; 2113 if (!folio_trylock(folio)) 2114 goto put; 2115 if (folio->mapping != mapping || 2116 folio_test_writeback(folio)) 2117 goto unlock; 2118 VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), 2119 folio); 2120 } 2121 indices[fbatch->nr] = xas.xa_index; 2122 if (!folio_batch_add(fbatch, folio)) 2123 break; 2124 continue; 2125 unlock: 2126 folio_unlock(folio); 2127 put: 2128 folio_put(folio); 2129 } 2130 rcu_read_unlock(); 2131 2132 if (folio_batch_count(fbatch)) { 2133 unsigned long nr = 1; 2134 int idx = folio_batch_count(fbatch) - 1; 2135 2136 folio = fbatch->folios[idx]; 2137 if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) 2138 nr = folio_nr_pages(folio); 2139 *start = indices[idx] + nr; 2140 } 2141 return folio_batch_count(fbatch); 2142 } 2143 2144 /** 2145 * filemap_get_folios - Get a batch of folios 2146 * @mapping: The address_space to search 2147 * @start: The starting page index 2148 * @end: The final page index (inclusive) 2149 * @fbatch: The batch to fill. 2150 * 2151 * Search for and return a batch of folios in the mapping starting at 2152 * index @start and up to index @end (inclusive). The folios are returned 2153 * in @fbatch with an elevated reference count. 2154 * 2155 * The first folio may start before @start; if it does, it will contain 2156 * @start. 
The final folio may extend beyond @end; if it does, it will 2157 * contain @end. The folios have ascending indices. There may be gaps 2158 * between the folios if there are indices which have no folio in the 2159 * page cache. If folios are added to or removed from the page cache 2160 * while this is running, they may or may not be found by this call. 2161 * 2162 * Return: The number of folios which were found. 2163 * We also update @start to index the next folio for the traversal. 2164 */ 2165 unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, 2166 pgoff_t end, struct folio_batch *fbatch) 2167 { 2168 XA_STATE(xas, &mapping->i_pages, *start); 2169 struct folio *folio; 2170 2171 rcu_read_lock(); 2172 while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { 2173 /* Skip over shadow, swap and DAX entries */ 2174 if (xa_is_value(folio)) 2175 continue; 2176 if (!folio_batch_add(fbatch, folio)) { 2177 unsigned long nr = folio_nr_pages(folio); 2178 2179 if (folio_test_hugetlb(folio)) 2180 nr = 1; 2181 *start = folio->index + nr; 2182 goto out; 2183 } 2184 } 2185 2186 /* 2187 * We come here when there is no page beyond @end. We take care to not 2188 * overflow the index @start as it confuses some of the callers. This 2189 * breaks the iteration when there is a page at index -1 but that is 2190 * already broken anyway. 2191 */ 2192 if (end == (pgoff_t)-1) 2193 *start = (pgoff_t)-1; 2194 else 2195 *start = end + 1; 2196 out: 2197 rcu_read_unlock(); 2198 2199 return folio_batch_count(fbatch); 2200 } 2201 EXPORT_SYMBOL(filemap_get_folios); 2202 2203 static inline 2204 bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) 2205 { 2206 if (!folio_test_large(folio) || folio_test_hugetlb(folio)) 2207 return false; 2208 if (index >= max) 2209 return false; 2210 return index < folio->index + folio_nr_pages(folio) - 1; 2211 } 2212 2213 /** 2214 * filemap_get_folios_contig - Get a batch of contiguous folios 2215 * @mapping: The address_space to search 2216 * @start: The starting page index 2217 * @end: The final page index (inclusive) 2218 * @fbatch: The batch to fill 2219 * 2220 * filemap_get_folios_contig() works exactly like filemap_get_folios(), 2221 * except the returned folios are guaranteed to be contiguous. This may 2222 * not return all contiguous folios if the batch gets filled up. 2223 * 2224 * Return: The number of folios found. 2225 * Also update @start to be positioned for traversal of the next folio. 2226 */ 2227 2228 unsigned filemap_get_folios_contig(struct address_space *mapping, 2229 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) 2230 { 2231 XA_STATE(xas, &mapping->i_pages, *start); 2232 unsigned long nr; 2233 struct folio *folio; 2234 2235 rcu_read_lock(); 2236 2237 for (folio = xas_load(&xas); folio && xas.xa_index <= end; 2238 folio = xas_next(&xas)) { 2239 if (xas_retry(&xas, folio)) 2240 continue; 2241 /* 2242 * If the entry has been swapped out, we can stop looking. 2243 * No current caller is looking for DAX entries. 
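		 * Stopping at the first value entry, or at the first gap
		 * (xas_next() returns NULL there), is what keeps the batch
		 * contiguous: the caller only ever sees an unbroken run of
		 * folios starting at *start.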
2244 */ 2245 if (xa_is_value(folio)) 2246 goto update_start; 2247 2248 if (!folio_try_get_rcu(folio)) 2249 goto retry; 2250 2251 if (unlikely(folio != xas_reload(&xas))) 2252 goto put_folio; 2253 2254 if (!folio_batch_add(fbatch, folio)) { 2255 nr = folio_nr_pages(folio); 2256 2257 if (folio_test_hugetlb(folio)) 2258 nr = 1; 2259 *start = folio->index + nr; 2260 goto out; 2261 } 2262 continue; 2263 put_folio: 2264 folio_put(folio); 2265 2266 retry: 2267 xas_reset(&xas); 2268 } 2269 2270 update_start: 2271 nr = folio_batch_count(fbatch); 2272 2273 if (nr) { 2274 folio = fbatch->folios[nr - 1]; 2275 if (folio_test_hugetlb(folio)) 2276 *start = folio->index + 1; 2277 else 2278 *start = folio->index + folio_nr_pages(folio); 2279 } 2280 out: 2281 rcu_read_unlock(); 2282 return folio_batch_count(fbatch); 2283 } 2284 EXPORT_SYMBOL(filemap_get_folios_contig); 2285 2286 /** 2287 * find_get_pages_range_tag - Find and return head pages matching @tag. 2288 * @mapping: the address_space to search 2289 * @index: the starting page index 2290 * @end: The final page index (inclusive) 2291 * @tag: the tag index 2292 * @nr_pages: the maximum number of pages 2293 * @pages: where the resulting pages are placed 2294 * 2295 * Like find_get_pages_range(), except we only return head pages which are 2296 * tagged with @tag. @index is updated to the index immediately after the 2297 * last page we return, ready for the next iteration. 2298 * 2299 * Return: the number of pages which were found. 2300 */ 2301 unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, 2302 pgoff_t end, xa_mark_t tag, unsigned int nr_pages, 2303 struct page **pages) 2304 { 2305 XA_STATE(xas, &mapping->i_pages, *index); 2306 struct folio *folio; 2307 unsigned ret = 0; 2308 2309 if (unlikely(!nr_pages)) 2310 return 0; 2311 2312 rcu_read_lock(); 2313 while ((folio = find_get_entry(&xas, end, tag))) { 2314 /* 2315 * Shadow entries should never be tagged, but this iteration 2316 * is lockless so there is a window for page reclaim to evict 2317 * a page we saw tagged. Skip over it. 2318 */ 2319 if (xa_is_value(folio)) 2320 continue; 2321 2322 pages[ret] = &folio->page; 2323 if (++ret == nr_pages) { 2324 *index = folio->index + folio_nr_pages(folio); 2325 goto out; 2326 } 2327 } 2328 2329 /* 2330 * We come here when we got to @end. We take care to not overflow the 2331 * index @index as it confuses some of the callers. This breaks the 2332 * iteration when there is a page at index -1 but that is already 2333 * broken anyway. 2334 */ 2335 if (end == (pgoff_t)-1) 2336 *index = (pgoff_t)-1; 2337 else 2338 *index = end + 1; 2339 out: 2340 rcu_read_unlock(); 2341 2342 return ret; 2343 } 2344 EXPORT_SYMBOL(find_get_pages_range_tag); 2345 2346 /* 2347 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 2348 * a _large_ part of the i/o request. Imagine the worst scenario: 2349 * 2350 * ---R__________________________________________B__________ 2351 * ^ reading here ^ bad block(assume 4k) 2352 * 2353 * read(R) => miss => readahead(R...B) => media error => frustrating retries 2354 * => failing the whole request => read(R) => read(R+1) => 2355 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 2356 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 2357 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 2358 * 2359 * It is going insane. Fix it by quickly scaling down the readahead size. 
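 *
 * The helper below quarters the readahead window on every such failure:
 * a default 128-page window drops to 32, 8, 2 and finally 0 pages after
 * four consecutive errors, at which point readahead is effectively
 * disabled for this file until ra_pages is re-initialised (e.g. by
 * posix_fadvise(POSIX_FADV_NORMAL)).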
2360 */ 2361 static void shrink_readahead_size_eio(struct file_ra_state *ra) 2362 { 2363 ra->ra_pages /= 4; 2364 } 2365 2366 /* 2367 * filemap_get_read_batch - Get a batch of folios for read 2368 * 2369 * Get a batch of folios which represent a contiguous range of bytes in 2370 * the file. No exceptional entries will be returned. If @index is in 2371 * the middle of a folio, the entire folio will be returned. The last 2372 * folio in the batch may have the readahead flag set or the uptodate flag 2373 * clear so that the caller can take the appropriate action. 2374 */ 2375 static void filemap_get_read_batch(struct address_space *mapping, 2376 pgoff_t index, pgoff_t max, struct folio_batch *fbatch) 2377 { 2378 XA_STATE(xas, &mapping->i_pages, index); 2379 struct folio *folio; 2380 2381 rcu_read_lock(); 2382 for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { 2383 if (xas_retry(&xas, folio)) 2384 continue; 2385 if (xas.xa_index > max || xa_is_value(folio)) 2386 break; 2387 if (xa_is_sibling(folio)) 2388 break; 2389 if (!folio_try_get_rcu(folio)) 2390 goto retry; 2391 2392 if (unlikely(folio != xas_reload(&xas))) 2393 goto put_folio; 2394 2395 if (!folio_batch_add(fbatch, folio)) 2396 break; 2397 if (!folio_test_uptodate(folio)) 2398 break; 2399 if (folio_test_readahead(folio)) 2400 break; 2401 xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); 2402 continue; 2403 put_folio: 2404 folio_put(folio); 2405 retry: 2406 xas_reset(&xas); 2407 } 2408 rcu_read_unlock(); 2409 } 2410 2411 static int filemap_read_folio(struct file *file, filler_t filler, 2412 struct folio *folio) 2413 { 2414 bool workingset = folio_test_workingset(folio); 2415 unsigned long pflags; 2416 int error; 2417 2418 /* 2419 * A previous I/O error may have been due to temporary failures, 2420 * eg. multipath errors. PG_error will be set again if read_folio 2421 * fails. 2422 */ 2423 folio_clear_error(folio); 2424 2425 /* Start the actual read. The read will unlock the page. 
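	 * If the folio is a workingset folio, the time spent waiting for
	 * this read is accounted as a memory stall (PSI): refaulting a
	 * recently evicted page is memory pressure, not ordinary file I/O.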
*/ 2426 if (unlikely(workingset)) 2427 psi_memstall_enter(&pflags); 2428 error = filler(file, folio); 2429 if (unlikely(workingset)) 2430 psi_memstall_leave(&pflags); 2431 if (error) 2432 return error; 2433 2434 error = folio_wait_locked_killable(folio); 2435 if (error) 2436 return error; 2437 if (folio_test_uptodate(folio)) 2438 return 0; 2439 if (file) 2440 shrink_readahead_size_eio(&file->f_ra); 2441 return -EIO; 2442 } 2443 2444 static bool filemap_range_uptodate(struct address_space *mapping, 2445 loff_t pos, size_t count, struct folio *folio, 2446 bool need_uptodate) 2447 { 2448 if (folio_test_uptodate(folio)) 2449 return true; 2450 /* pipes can't handle partially uptodate pages */ 2451 if (need_uptodate) 2452 return false; 2453 if (!mapping->a_ops->is_partially_uptodate) 2454 return false; 2455 if (mapping->host->i_blkbits >= folio_shift(folio)) 2456 return false; 2457 2458 if (folio_pos(folio) > pos) { 2459 count -= folio_pos(folio) - pos; 2460 pos = 0; 2461 } else { 2462 pos -= folio_pos(folio); 2463 } 2464 2465 return mapping->a_ops->is_partially_uptodate(folio, pos, count); 2466 } 2467 2468 static int filemap_update_page(struct kiocb *iocb, 2469 struct address_space *mapping, size_t count, 2470 struct folio *folio, bool need_uptodate) 2471 { 2472 int error; 2473 2474 if (iocb->ki_flags & IOCB_NOWAIT) { 2475 if (!filemap_invalidate_trylock_shared(mapping)) 2476 return -EAGAIN; 2477 } else { 2478 filemap_invalidate_lock_shared(mapping); 2479 } 2480 2481 if (!folio_trylock(folio)) { 2482 error = -EAGAIN; 2483 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) 2484 goto unlock_mapping; 2485 if (!(iocb->ki_flags & IOCB_WAITQ)) { 2486 filemap_invalidate_unlock_shared(mapping); 2487 /* 2488 * This is where we usually end up waiting for a 2489 * previously submitted readahead to finish. 2490 */ 2491 folio_put_wait_locked(folio, TASK_KILLABLE); 2492 return AOP_TRUNCATED_PAGE; 2493 } 2494 error = __folio_lock_async(folio, iocb->ki_waitq); 2495 if (error) 2496 goto unlock_mapping; 2497 } 2498 2499 error = AOP_TRUNCATED_PAGE; 2500 if (!folio->mapping) 2501 goto unlock; 2502 2503 error = 0; 2504 if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, 2505 need_uptodate)) 2506 goto unlock; 2507 2508 error = -EAGAIN; 2509 if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) 2510 goto unlock; 2511 2512 error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, 2513 folio); 2514 goto unlock_mapping; 2515 unlock: 2516 folio_unlock(folio); 2517 unlock_mapping: 2518 filemap_invalidate_unlock_shared(mapping); 2519 if (error == AOP_TRUNCATED_PAGE) 2520 folio_put(folio); 2521 return error; 2522 } 2523 2524 static int filemap_create_folio(struct file *file, 2525 struct address_space *mapping, pgoff_t index, 2526 struct folio_batch *fbatch) 2527 { 2528 struct folio *folio; 2529 int error; 2530 2531 folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); 2532 if (!folio) 2533 return -ENOMEM; 2534 2535 /* 2536 * Protect against truncate / hole punch. Grabbing invalidate_lock 2537 * here assures we cannot instantiate and bring uptodate new 2538 * pagecache folios after evicting page cache during truncate 2539 * and before actually freeing blocks. Note that we could 2540 * release invalidate_lock after inserting the folio into 2541 * the page cache as the locked folio would then be enough to 2542 * synchronize with hole punching. 
But there are code paths 2543 * such as filemap_update_page() filling in partially uptodate 2544 * pages or ->readahead() that need to hold invalidate_lock 2545 * while mapping blocks for IO so let's hold the lock here as 2546 * well to keep locking rules simple. 2547 */ 2548 filemap_invalidate_lock_shared(mapping); 2549 error = filemap_add_folio(mapping, folio, index, 2550 mapping_gfp_constraint(mapping, GFP_KERNEL)); 2551 if (error == -EEXIST) 2552 error = AOP_TRUNCATED_PAGE; 2553 if (error) 2554 goto error; 2555 2556 error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); 2557 if (error) 2558 goto error; 2559 2560 filemap_invalidate_unlock_shared(mapping); 2561 folio_batch_add(fbatch, folio); 2562 return 0; 2563 error: 2564 filemap_invalidate_unlock_shared(mapping); 2565 folio_put(folio); 2566 return error; 2567 } 2568 2569 static int filemap_readahead(struct kiocb *iocb, struct file *file, 2570 struct address_space *mapping, struct folio *folio, 2571 pgoff_t last_index) 2572 { 2573 DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); 2574 2575 if (iocb->ki_flags & IOCB_NOIO) 2576 return -EAGAIN; 2577 page_cache_async_ra(&ractl, folio, last_index - folio->index); 2578 return 0; 2579 } 2580 2581 static int filemap_get_pages(struct kiocb *iocb, size_t count, 2582 struct folio_batch *fbatch, bool need_uptodate) 2583 { 2584 struct file *filp = iocb->ki_filp; 2585 struct address_space *mapping = filp->f_mapping; 2586 struct file_ra_state *ra = &filp->f_ra; 2587 pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; 2588 pgoff_t last_index; 2589 struct folio *folio; 2590 int err = 0; 2591 2592 /* "last_index" is the index of the page beyond the end of the read */ 2593 last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); 2594 retry: 2595 if (fatal_signal_pending(current)) 2596 return -EINTR; 2597 2598 filemap_get_read_batch(mapping, index, last_index - 1, fbatch); 2599 if (!folio_batch_count(fbatch)) { 2600 if (iocb->ki_flags & IOCB_NOIO) 2601 return -EAGAIN; 2602 page_cache_sync_readahead(mapping, ra, filp, index, 2603 last_index - index); 2604 filemap_get_read_batch(mapping, index, last_index - 1, fbatch); 2605 } 2606 if (!folio_batch_count(fbatch)) { 2607 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) 2608 return -EAGAIN; 2609 err = filemap_create_folio(filp, mapping, 2610 iocb->ki_pos >> PAGE_SHIFT, fbatch); 2611 if (err == AOP_TRUNCATED_PAGE) 2612 goto retry; 2613 return err; 2614 } 2615 2616 folio = fbatch->folios[folio_batch_count(fbatch) - 1]; 2617 if (folio_test_readahead(folio)) { 2618 err = filemap_readahead(iocb, filp, mapping, folio, last_index); 2619 if (err) 2620 goto err; 2621 } 2622 if (!folio_test_uptodate(folio)) { 2623 if ((iocb->ki_flags & IOCB_WAITQ) && 2624 folio_batch_count(fbatch) > 1) 2625 iocb->ki_flags |= IOCB_NOWAIT; 2626 err = filemap_update_page(iocb, mapping, count, folio, 2627 need_uptodate); 2628 if (err) 2629 goto err; 2630 } 2631 2632 return 0; 2633 err: 2634 if (err < 0) 2635 folio_put(folio); 2636 if (likely(--fbatch->nr)) 2637 return 0; 2638 if (err == AOP_TRUNCATED_PAGE) 2639 goto retry; 2640 return err; 2641 } 2642 2643 static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) 2644 { 2645 unsigned int shift = folio_shift(folio); 2646 2647 return (pos1 >> shift == pos2 >> shift); 2648 } 2649 2650 /** 2651 * filemap_read - Read data from the page cache. 2652 * @iocb: The iocb to read. 2653 * @iter: Destination for the data. 2654 * @already_read: Number of bytes already read by the caller. 
2655 * 2656 * Copies data from the page cache. If the data is not currently present, 2657 * uses the readahead and read_folio address_space operations to fetch it. 2658 * 2659 * Return: Total number of bytes copied, including those already read by 2660 * the caller. If an error happens before any bytes are copied, returns 2661 * a negative error number. 2662 */ 2663 ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, 2664 ssize_t already_read) 2665 { 2666 struct file *filp = iocb->ki_filp; 2667 struct file_ra_state *ra = &filp->f_ra; 2668 struct address_space *mapping = filp->f_mapping; 2669 struct inode *inode = mapping->host; 2670 struct folio_batch fbatch; 2671 int i, error = 0; 2672 bool writably_mapped; 2673 loff_t isize, end_offset; 2674 2675 if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) 2676 return 0; 2677 if (unlikely(!iov_iter_count(iter))) 2678 return 0; 2679 2680 iov_iter_truncate(iter, inode->i_sb->s_maxbytes); 2681 folio_batch_init(&fbatch); 2682 2683 do { 2684 cond_resched(); 2685 2686 /* 2687 * If we've already successfully copied some data, then we 2688 * can no longer safely return -EIOCBQUEUED. Hence mark 2689 * an async read NOWAIT at that point. 2690 */ 2691 if ((iocb->ki_flags & IOCB_WAITQ) && already_read) 2692 iocb->ki_flags |= IOCB_NOWAIT; 2693 2694 if (unlikely(iocb->ki_pos >= i_size_read(inode))) 2695 break; 2696 2697 error = filemap_get_pages(iocb, iter->count, &fbatch, 2698 iov_iter_is_pipe(iter)); 2699 if (error < 0) 2700 break; 2701 2702 /* 2703 * i_size must be checked after we know the pages are Uptodate. 2704 * 2705 * Checking i_size after the check allows us to calculate 2706 * the correct value for "nr", which means the zero-filled 2707 * part of the page is not copied back to userspace (unless 2708 * another truncate extends the file - this is desired though). 2709 */ 2710 isize = i_size_read(inode); 2711 if (unlikely(iocb->ki_pos >= isize)) 2712 goto put_folios; 2713 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); 2714 2715 /* 2716 * Once we start copying data, we don't want to be touching any 2717 * cachelines that might be contended: 2718 */ 2719 writably_mapped = mapping_writably_mapped(mapping); 2720 2721 /* 2722 * When a read accesses the same folio several times, only 2723 * mark it as accessed the first time. 2724 */ 2725 if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1, 2726 fbatch.folios[0])) 2727 folio_mark_accessed(fbatch.folios[0]); 2728 2729 for (i = 0; i < folio_batch_count(&fbatch); i++) { 2730 struct folio *folio = fbatch.folios[i]; 2731 size_t fsize = folio_size(folio); 2732 size_t offset = iocb->ki_pos & (fsize - 1); 2733 size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, 2734 fsize - offset); 2735 size_t copied; 2736 2737 if (end_offset < folio_pos(folio)) 2738 break; 2739 if (i > 0) 2740 folio_mark_accessed(folio); 2741 /* 2742 * If users can be writing to this folio using arbitrary 2743 * virtual addresses, take care of potential aliasing 2744 * before reading the folio on the kernel side. 
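			 * On architectures whose caches cannot alias (e.g.
			 * x86) flush_dcache_folio() is a no-op, so this
			 * costs nothing where it is not needed.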
2745 */ 2746 if (writably_mapped) 2747 flush_dcache_folio(folio); 2748 2749 copied = copy_folio_to_iter(folio, offset, bytes, iter); 2750 2751 already_read += copied; 2752 iocb->ki_pos += copied; 2753 ra->prev_pos = iocb->ki_pos; 2754 2755 if (copied < bytes) { 2756 error = -EFAULT; 2757 break; 2758 } 2759 } 2760 put_folios: 2761 for (i = 0; i < folio_batch_count(&fbatch); i++) 2762 folio_put(fbatch.folios[i]); 2763 folio_batch_init(&fbatch); 2764 } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); 2765 2766 file_accessed(filp); 2767 2768 return already_read ? already_read : error; 2769 } 2770 EXPORT_SYMBOL_GPL(filemap_read); 2771 2772 /** 2773 * generic_file_read_iter - generic filesystem read routine 2774 * @iocb: kernel I/O control block 2775 * @iter: destination for the data read 2776 * 2777 * This is the "read_iter()" routine for all filesystems 2778 * that can use the page cache directly. 2779 * 2780 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall 2781 * be returned when no data can be read without waiting for I/O requests 2782 * to complete; it doesn't prevent readahead. 2783 * 2784 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O 2785 * requests shall be made for the read or for readahead. When no data 2786 * can be read, -EAGAIN shall be returned. When readahead would be 2787 * triggered, a partial, possibly empty read shall be returned. 2788 * 2789 * Return: 2790 * * number of bytes copied, even for partial reads 2791 * * negative error code (or 0 if IOCB_NOIO) if nothing was read 2792 */ 2793 ssize_t 2794 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 2795 { 2796 size_t count = iov_iter_count(iter); 2797 ssize_t retval = 0; 2798 2799 if (!count) 2800 return 0; /* skip atime */ 2801 2802 if (iocb->ki_flags & IOCB_DIRECT) { 2803 struct file *file = iocb->ki_filp; 2804 struct address_space *mapping = file->f_mapping; 2805 struct inode *inode = mapping->host; 2806 2807 if (iocb->ki_flags & IOCB_NOWAIT) { 2808 if (filemap_range_needs_writeback(mapping, iocb->ki_pos, 2809 iocb->ki_pos + count - 1)) 2810 return -EAGAIN; 2811 } else { 2812 retval = filemap_write_and_wait_range(mapping, 2813 iocb->ki_pos, 2814 iocb->ki_pos + count - 1); 2815 if (retval < 0) 2816 return retval; 2817 } 2818 2819 file_accessed(file); 2820 2821 retval = mapping->a_ops->direct_IO(iocb, iter); 2822 if (retval >= 0) { 2823 iocb->ki_pos += retval; 2824 count -= retval; 2825 } 2826 if (retval != -EIOCBQUEUED) 2827 iov_iter_revert(iter, count - iov_iter_count(iter)); 2828 2829 /* 2830 * Btrfs can have a short DIO read if we encounter 2831 * compressed extents, so if there was an error, or if 2832 * we've already read everything we wanted to, or if 2833 * there was a short read because we hit EOF, go ahead 2834 * and return. Otherwise fallthrough to buffered io for 2835 * the rest of the read. Buffered reads will not work for 2836 * DAX files, so don't bother trying. 2837 */ 2838 if (retval < 0 || !count || IS_DAX(inode)) 2839 return retval; 2840 if (iocb->ki_pos >= i_size_read(inode)) 2841 return retval; 2842 } 2843 2844 return filemap_read(iocb, iter, retval); 2845 } 2846 EXPORT_SYMBOL(generic_file_read_iter); 2847 2848 /* 2849 * Splice subpages from a folio into a pipe. 
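 *
 * Each page-sized chunk attached to the pipe takes its own reference on
 * the folio (the folio_get() below); the pipe_buffer release op drops it
 * once the data has been consumed.  The return value is the number of
 * bytes actually spliced, which may be short if the pipe fills up first.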
2850 */ 2851 size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, 2852 struct folio *folio, loff_t fpos, size_t size) 2853 { 2854 struct page *page; 2855 size_t spliced = 0, offset = offset_in_folio(folio, fpos); 2856 2857 page = folio_page(folio, offset / PAGE_SIZE); 2858 size = min(size, folio_size(folio) - offset); 2859 offset %= PAGE_SIZE; 2860 2861 while (spliced < size && 2862 !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 2863 struct pipe_buffer *buf = pipe_head_buf(pipe); 2864 size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); 2865 2866 *buf = (struct pipe_buffer) { 2867 .ops = &page_cache_pipe_buf_ops, 2868 .page = page, 2869 .offset = offset, 2870 .len = part, 2871 }; 2872 folio_get(folio); 2873 pipe->head++; 2874 page++; 2875 spliced += part; 2876 offset = 0; 2877 } 2878 2879 return spliced; 2880 } 2881 2882 /* 2883 * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into 2884 * a pipe. 2885 */ 2886 ssize_t filemap_splice_read(struct file *in, loff_t *ppos, 2887 struct pipe_inode_info *pipe, 2888 size_t len, unsigned int flags) 2889 { 2890 struct folio_batch fbatch; 2891 struct kiocb iocb; 2892 size_t total_spliced = 0, used, npages; 2893 loff_t isize, end_offset; 2894 bool writably_mapped; 2895 int i, error = 0; 2896 2897 init_sync_kiocb(&iocb, in); 2898 iocb.ki_pos = *ppos; 2899 2900 /* Work out how much data we can actually add into the pipe */ 2901 used = pipe_occupancy(pipe->head, pipe->tail); 2902 npages = max_t(ssize_t, pipe->max_usage - used, 0); 2903 len = min_t(size_t, len, npages * PAGE_SIZE); 2904 2905 folio_batch_init(&fbatch); 2906 2907 do { 2908 cond_resched(); 2909 2910 if (*ppos >= i_size_read(file_inode(in))) 2911 break; 2912 2913 iocb.ki_pos = *ppos; 2914 error = filemap_get_pages(&iocb, len, &fbatch, true); 2915 if (error < 0) 2916 break; 2917 2918 /* 2919 * i_size must be checked after we know the pages are Uptodate. 2920 * 2921 * Checking i_size after the check allows us to calculate 2922 * the correct value for "nr", which means the zero-filled 2923 * part of the page is not copied back to userspace (unless 2924 * another truncate extends the file - this is desired though). 2925 */ 2926 isize = i_size_read(file_inode(in)); 2927 if (unlikely(*ppos >= isize)) 2928 break; 2929 end_offset = min_t(loff_t, isize, *ppos + len); 2930 2931 /* 2932 * Once we start copying data, we don't want to be touching any 2933 * cachelines that might be contended: 2934 */ 2935 writably_mapped = mapping_writably_mapped(in->f_mapping); 2936 2937 for (i = 0; i < folio_batch_count(&fbatch); i++) { 2938 struct folio *folio = fbatch.folios[i]; 2939 size_t n; 2940 2941 if (folio_pos(folio) >= end_offset) 2942 goto out; 2943 folio_mark_accessed(folio); 2944 2945 /* 2946 * If users can be writing to this folio using arbitrary 2947 * virtual addresses, take care of potential aliasing 2948 * before reading the folio on the kernel side. 2949 */ 2950 if (writably_mapped) 2951 flush_dcache_folio(folio); 2952 2953 n = min_t(loff_t, len, isize - *ppos); 2954 n = splice_folio_into_pipe(pipe, folio, *ppos, n); 2955 if (!n) 2956 goto out; 2957 len -= n; 2958 total_spliced += n; 2959 *ppos += n; 2960 in->f_ra.prev_pos = *ppos; 2961 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 2962 goto out; 2963 } 2964 2965 folio_batch_release(&fbatch); 2966 } while (len); 2967 2968 out: 2969 folio_batch_release(&fbatch); 2970 file_accessed(in); 2971 2972 return total_spliced ? 
total_spliced : error; 2973 } 2974 EXPORT_SYMBOL(filemap_splice_read); 2975 2976 static inline loff_t folio_seek_hole_data(struct xa_state *xas, 2977 struct address_space *mapping, struct folio *folio, 2978 loff_t start, loff_t end, bool seek_data) 2979 { 2980 const struct address_space_operations *ops = mapping->a_ops; 2981 size_t offset, bsz = i_blocksize(mapping->host); 2982 2983 if (xa_is_value(folio) || folio_test_uptodate(folio)) 2984 return seek_data ? start : end; 2985 if (!ops->is_partially_uptodate) 2986 return seek_data ? end : start; 2987 2988 xas_pause(xas); 2989 rcu_read_unlock(); 2990 folio_lock(folio); 2991 if (unlikely(folio->mapping != mapping)) 2992 goto unlock; 2993 2994 offset = offset_in_folio(folio, start) & ~(bsz - 1); 2995 2996 do { 2997 if (ops->is_partially_uptodate(folio, offset, bsz) == 2998 seek_data) 2999 break; 3000 start = (start + bsz) & ~(bsz - 1); 3001 offset += bsz; 3002 } while (offset < folio_size(folio)); 3003 unlock: 3004 folio_unlock(folio); 3005 rcu_read_lock(); 3006 return start; 3007 } 3008 3009 static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) 3010 { 3011 if (xa_is_value(folio)) 3012 return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); 3013 return folio_size(folio); 3014 } 3015 3016 /** 3017 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. 3018 * @mapping: Address space to search. 3019 * @start: First byte to consider. 3020 * @end: Limit of search (exclusive). 3021 * @whence: Either SEEK_HOLE or SEEK_DATA. 3022 * 3023 * If the page cache knows which blocks contain holes and which blocks 3024 * contain data, your filesystem can use this function to implement 3025 * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are 3026 * entirely memory-based such as tmpfs, and filesystems which support 3027 * unwritten extents. 3028 * 3029 * Return: The requested offset on success, or -ENXIO if @whence specifies 3030 * SEEK_DATA and there is no data after @start. There is an implicit hole 3031 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start 3032 * and @end contain data. 3033 */ 3034 loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, 3035 loff_t end, int whence) 3036 { 3037 XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); 3038 pgoff_t max = (end - 1) >> PAGE_SHIFT; 3039 bool seek_data = (whence == SEEK_DATA); 3040 struct folio *folio; 3041 3042 if (end <= start) 3043 return -ENXIO; 3044 3045 rcu_read_lock(); 3046 while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { 3047 loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; 3048 size_t seek_size; 3049 3050 if (start < pos) { 3051 if (!seek_data) 3052 goto unlock; 3053 start = pos; 3054 } 3055 3056 seek_size = seek_folio_size(&xas, folio); 3057 pos = round_up((u64)pos + 1, seek_size); 3058 start = folio_seek_hole_data(&xas, mapping, folio, start, pos, 3059 seek_data); 3060 if (start < pos) 3061 goto unlock; 3062 if (start >= end) 3063 break; 3064 if (seek_size > PAGE_SIZE) 3065 xas_set(&xas, pos >> PAGE_SHIFT); 3066 if (!xa_is_value(folio)) 3067 folio_put(folio); 3068 } 3069 if (seek_data) 3070 start = -ENXIO; 3071 unlock: 3072 rcu_read_unlock(); 3073 if (folio && !xa_is_value(folio)) 3074 folio_put(folio); 3075 if (start > end) 3076 return end; 3077 return start; 3078 } 3079 3080 #ifdef CONFIG_MMU 3081 #define MMAP_LOTSAMISS (100) 3082 /* 3083 * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock 3084 * @vmf - the vm_fault for this fault. 
3085 * @folio - the folio to lock. 3086 * @fpin - the pointer to the file we may pin (or is already pinned). 3087 * 3088 * This works similar to lock_folio_or_retry in that it can drop the 3089 * mmap_lock. It differs in that it actually returns the folio locked 3090 * if it returns 1 and 0 if it couldn't lock the folio. If we did have 3091 * to drop the mmap_lock then fpin will point to the pinned file and 3092 * needs to be fput()'ed at a later point. 3093 */ 3094 static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, 3095 struct file **fpin) 3096 { 3097 if (folio_trylock(folio)) 3098 return 1; 3099 3100 /* 3101 * NOTE! This will make us return with VM_FAULT_RETRY, but with 3102 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT 3103 * is supposed to work. We have way too many special cases.. 3104 */ 3105 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) 3106 return 0; 3107 3108 *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); 3109 if (vmf->flags & FAULT_FLAG_KILLABLE) { 3110 if (__folio_lock_killable(folio)) { 3111 /* 3112 * We didn't have the right flags to drop the mmap_lock, 3113 * but all fault_handlers only check for fatal signals 3114 * if we return VM_FAULT_RETRY, so we need to drop the 3115 * mmap_lock here and return 0 if we don't have a fpin. 3116 */ 3117 if (*fpin == NULL) 3118 mmap_read_unlock(vmf->vma->vm_mm); 3119 return 0; 3120 } 3121 } else 3122 __folio_lock(folio); 3123 3124 return 1; 3125 } 3126 3127 /* 3128 * Synchronous readahead happens when we don't even find a page in the page 3129 * cache at all. We don't want to perform IO under the mmap sem, so if we have 3130 * to drop the mmap sem we return the file that was pinned in order for us to do 3131 * that. If we didn't pin a file then we return NULL. The file that is 3132 * returned needs to be fput()'ed when we're done with it. 3133 */ 3134 static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) 3135 { 3136 struct file *file = vmf->vma->vm_file; 3137 struct file_ra_state *ra = &file->f_ra; 3138 struct address_space *mapping = file->f_mapping; 3139 DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); 3140 struct file *fpin = NULL; 3141 unsigned long vm_flags = vmf->vma->vm_flags; 3142 unsigned int mmap_miss; 3143 3144 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3145 /* Use the readahead code, even if readahead is disabled */ 3146 if (vm_flags & VM_HUGEPAGE) { 3147 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3148 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); 3149 ra->size = HPAGE_PMD_NR; 3150 /* 3151 * Fetch two PMD folios, so we get the chance to actually 3152 * readahead, unless we've been told not to. 3153 */ 3154 if (!(vm_flags & VM_RAND_READ)) 3155 ra->size *= 2; 3156 ra->async_size = HPAGE_PMD_NR; 3157 page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); 3158 return fpin; 3159 } 3160 #endif 3161 3162 /* If we don't want any read-ahead, don't bother */ 3163 if (vm_flags & VM_RAND_READ) 3164 return fpin; 3165 if (!ra->ra_pages) 3166 return fpin; 3167 3168 if (vm_flags & VM_SEQ_READ) { 3169 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3170 page_cache_sync_ra(&ractl, ra->ra_pages); 3171 return fpin; 3172 } 3173 3174 /* Avoid banging the cache line if not needed */ 3175 mmap_miss = READ_ONCE(ra->mmap_miss); 3176 if (mmap_miss < MMAP_LOTSAMISS * 10) 3177 WRITE_ONCE(ra->mmap_miss, ++mmap_miss); 3178 3179 /* 3180 * Do we miss much more than hit in this file? If so, 3181 * stop bothering with read-ahead. It will only hurt. 
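	 * mmap_miss saturates at MMAP_LOTSAMISS * 10 above, and it is
	 * decremented again on cache hits (see do_async_mmap_readahead()
	 * and filemap_map_pages()), so a long enough run of hits brings it
	 * back under the threshold and re-enables read-around.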
3182 */ 3183 if (mmap_miss > MMAP_LOTSAMISS) 3184 return fpin; 3185 3186 /* 3187 * mmap read-around 3188 */ 3189 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3190 ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); 3191 ra->size = ra->ra_pages; 3192 ra->async_size = ra->ra_pages / 4; 3193 ractl._index = ra->start; 3194 page_cache_ra_order(&ractl, ra, 0); 3195 return fpin; 3196 } 3197 3198 /* 3199 * Asynchronous readahead happens when we find the page and PG_readahead, 3200 * so we want to possibly extend the readahead further. We return the file that 3201 * was pinned if we have to drop the mmap_lock in order to do IO. 3202 */ 3203 static struct file *do_async_mmap_readahead(struct vm_fault *vmf, 3204 struct folio *folio) 3205 { 3206 struct file *file = vmf->vma->vm_file; 3207 struct file_ra_state *ra = &file->f_ra; 3208 DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); 3209 struct file *fpin = NULL; 3210 unsigned int mmap_miss; 3211 3212 /* If we don't want any read-ahead, don't bother */ 3213 if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) 3214 return fpin; 3215 3216 mmap_miss = READ_ONCE(ra->mmap_miss); 3217 if (mmap_miss) 3218 WRITE_ONCE(ra->mmap_miss, --mmap_miss); 3219 3220 if (folio_test_readahead(folio)) { 3221 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3222 page_cache_async_ra(&ractl, folio, ra->ra_pages); 3223 } 3224 return fpin; 3225 } 3226 3227 /** 3228 * filemap_fault - read in file data for page fault handling 3229 * @vmf: struct vm_fault containing details of the fault 3230 * 3231 * filemap_fault() is invoked via the vma operations vector for a 3232 * mapped memory region to read in file data during a page fault. 3233 * 3234 * The goto's are kind of ugly, but this streamlines the normal case of having 3235 * it in the page cache, and handles the special cases reasonably without 3236 * having a lot of duplicated code. 3237 * 3238 * vma->vm_mm->mmap_lock must be held on entry. 3239 * 3240 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock 3241 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). 3242 * 3243 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock 3244 * has not been released. 3245 * 3246 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 3247 * 3248 * Return: bitwise-OR of %VM_FAULT_ codes. 3249 */ 3250 vm_fault_t filemap_fault(struct vm_fault *vmf) 3251 { 3252 int error; 3253 struct file *file = vmf->vma->vm_file; 3254 struct file *fpin = NULL; 3255 struct address_space *mapping = file->f_mapping; 3256 struct inode *inode = mapping->host; 3257 pgoff_t max_idx, index = vmf->pgoff; 3258 struct folio *folio; 3259 vm_fault_t ret = 0; 3260 bool mapping_locked = false; 3261 3262 max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 3263 if (unlikely(index >= max_idx)) 3264 return VM_FAULT_SIGBUS; 3265 3266 /* 3267 * Do we have something in the page cache already? 3268 */ 3269 folio = filemap_get_folio(mapping, index); 3270 if (likely(folio)) { 3271 /* 3272 * We found the page, so try async readahead before waiting for 3273 * the lock. 
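		 * Skip it if this fault has already been retried
		 * (FAULT_FLAG_TRIED): readahead was kicked off on the first
		 * attempt and repeating it now would only duplicate that
		 * work.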
3274 */ 3275 if (!(vmf->flags & FAULT_FLAG_TRIED)) 3276 fpin = do_async_mmap_readahead(vmf, folio); 3277 if (unlikely(!folio_test_uptodate(folio))) { 3278 filemap_invalidate_lock_shared(mapping); 3279 mapping_locked = true; 3280 } 3281 } else { 3282 /* No page in the page cache at all */ 3283 count_vm_event(PGMAJFAULT); 3284 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 3285 ret = VM_FAULT_MAJOR; 3286 fpin = do_sync_mmap_readahead(vmf); 3287 retry_find: 3288 /* 3289 * See comment in filemap_create_folio() why we need 3290 * invalidate_lock 3291 */ 3292 if (!mapping_locked) { 3293 filemap_invalidate_lock_shared(mapping); 3294 mapping_locked = true; 3295 } 3296 folio = __filemap_get_folio(mapping, index, 3297 FGP_CREAT|FGP_FOR_MMAP, 3298 vmf->gfp_mask); 3299 if (!folio) { 3300 if (fpin) 3301 goto out_retry; 3302 filemap_invalidate_unlock_shared(mapping); 3303 return VM_FAULT_OOM; 3304 } 3305 } 3306 3307 if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) 3308 goto out_retry; 3309 3310 /* Did it get truncated? */ 3311 if (unlikely(folio->mapping != mapping)) { 3312 folio_unlock(folio); 3313 folio_put(folio); 3314 goto retry_find; 3315 } 3316 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); 3317 3318 /* 3319 * We have a locked page in the page cache, now we need to check 3320 * that it's up-to-date. If not, it is going to be due to an error. 3321 */ 3322 if (unlikely(!folio_test_uptodate(folio))) { 3323 /* 3324 * The page was in cache and uptodate and now it is not. 3325 * Strange but possible since we didn't hold the page lock all 3326 * the time. Let's drop everything get the invalidate lock and 3327 * try again. 3328 */ 3329 if (!mapping_locked) { 3330 folio_unlock(folio); 3331 folio_put(folio); 3332 goto retry_find; 3333 } 3334 goto page_not_uptodate; 3335 } 3336 3337 /* 3338 * We've made it this far and we had to drop our mmap_lock, now is the 3339 * time to return to the upper layer and have it re-find the vma and 3340 * redo the fault. 3341 */ 3342 if (fpin) { 3343 folio_unlock(folio); 3344 goto out_retry; 3345 } 3346 if (mapping_locked) 3347 filemap_invalidate_unlock_shared(mapping); 3348 3349 /* 3350 * Found the page and have a reference on it. 3351 * We must recheck i_size under page lock. 3352 */ 3353 max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 3354 if (unlikely(index >= max_idx)) { 3355 folio_unlock(folio); 3356 folio_put(folio); 3357 return VM_FAULT_SIGBUS; 3358 } 3359 3360 vmf->page = folio_file_page(folio, index); 3361 return ret | VM_FAULT_LOCKED; 3362 3363 page_not_uptodate: 3364 /* 3365 * Umm, take care of errors if the page isn't up-to-date. 3366 * Try to re-read it _once_. We do this synchronously, 3367 * because there really aren't any performance issues here 3368 * and we need to check for errors. 3369 */ 3370 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3371 error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); 3372 if (fpin) 3373 goto out_retry; 3374 folio_put(folio); 3375 3376 if (!error || error == AOP_TRUNCATED_PAGE) 3377 goto retry_find; 3378 filemap_invalidate_unlock_shared(mapping); 3379 3380 return VM_FAULT_SIGBUS; 3381 3382 out_retry: 3383 /* 3384 * We dropped the mmap_lock, we need to return to the fault handler to 3385 * re-find the vma and come back and find our hopefully still populated 3386 * page. 
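	 * Drop whatever we are still holding (the folio reference, the
	 * shared invalidate_lock, the pinned struct file) so that nothing
	 * leaks across the retry.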
3387 */ 3388 if (folio) 3389 folio_put(folio); 3390 if (mapping_locked) 3391 filemap_invalidate_unlock_shared(mapping); 3392 if (fpin) 3393 fput(fpin); 3394 return ret | VM_FAULT_RETRY; 3395 } 3396 EXPORT_SYMBOL(filemap_fault); 3397 3398 static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) 3399 { 3400 struct mm_struct *mm = vmf->vma->vm_mm; 3401 3402 /* Huge page is mapped? No need to proceed. */ 3403 if (pmd_trans_huge(*vmf->pmd)) { 3404 unlock_page(page); 3405 put_page(page); 3406 return true; 3407 } 3408 3409 if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { 3410 vm_fault_t ret = do_set_pmd(vmf, page); 3411 if (!ret) { 3412 /* The page is mapped successfully, reference consumed. */ 3413 unlock_page(page); 3414 return true; 3415 } 3416 } 3417 3418 if (pmd_none(*vmf->pmd)) 3419 pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); 3420 3421 /* See comment in handle_pte_fault() */ 3422 if (pmd_devmap_trans_unstable(vmf->pmd)) { 3423 unlock_page(page); 3424 put_page(page); 3425 return true; 3426 } 3427 3428 return false; 3429 } 3430 3431 static struct folio *next_uptodate_page(struct folio *folio, 3432 struct address_space *mapping, 3433 struct xa_state *xas, pgoff_t end_pgoff) 3434 { 3435 unsigned long max_idx; 3436 3437 do { 3438 if (!folio) 3439 return NULL; 3440 if (xas_retry(xas, folio)) 3441 continue; 3442 if (xa_is_value(folio)) 3443 continue; 3444 if (folio_test_locked(folio)) 3445 continue; 3446 if (!folio_try_get_rcu(folio)) 3447 continue; 3448 /* Has the page moved or been split? */ 3449 if (unlikely(folio != xas_reload(xas))) 3450 goto skip; 3451 if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) 3452 goto skip; 3453 if (!folio_trylock(folio)) 3454 goto skip; 3455 if (folio->mapping != mapping) 3456 goto unlock; 3457 if (!folio_test_uptodate(folio)) 3458 goto unlock; 3459 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3460 if (xas->xa_index >= max_idx) 3461 goto unlock; 3462 return folio; 3463 unlock: 3464 folio_unlock(folio); 3465 skip: 3466 folio_put(folio); 3467 } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); 3468 3469 return NULL; 3470 } 3471 3472 static inline struct folio *first_map_page(struct address_space *mapping, 3473 struct xa_state *xas, 3474 pgoff_t end_pgoff) 3475 { 3476 return next_uptodate_page(xas_find(xas, end_pgoff), 3477 mapping, xas, end_pgoff); 3478 } 3479 3480 static inline struct folio *next_map_page(struct address_space *mapping, 3481 struct xa_state *xas, 3482 pgoff_t end_pgoff) 3483 { 3484 return next_uptodate_page(xas_next_entry(xas, end_pgoff), 3485 mapping, xas, end_pgoff); 3486 } 3487 3488 vm_fault_t filemap_map_pages(struct vm_fault *vmf, 3489 pgoff_t start_pgoff, pgoff_t end_pgoff) 3490 { 3491 struct vm_area_struct *vma = vmf->vma; 3492 struct file *file = vma->vm_file; 3493 struct address_space *mapping = file->f_mapping; 3494 pgoff_t last_pgoff = start_pgoff; 3495 unsigned long addr; 3496 XA_STATE(xas, &mapping->i_pages, start_pgoff); 3497 struct folio *folio; 3498 struct page *page; 3499 unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); 3500 vm_fault_t ret = 0; 3501 3502 rcu_read_lock(); 3503 folio = first_map_page(mapping, &xas, end_pgoff); 3504 if (!folio) 3505 goto out; 3506 3507 if (filemap_map_pmd(vmf, &folio->page)) { 3508 ret = VM_FAULT_NOPAGE; 3509 goto out; 3510 } 3511 3512 addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); 3513 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); 3514 do { 3515 again: 3516 page = folio_file_page(folio, 
xas.xa_index); 3517 if (PageHWPoison(page)) 3518 goto unlock; 3519 3520 if (mmap_miss > 0) 3521 mmap_miss--; 3522 3523 addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; 3524 vmf->pte += xas.xa_index - last_pgoff; 3525 last_pgoff = xas.xa_index; 3526 3527 /* 3528 * NOTE: If there're PTE markers, we'll leave them to be 3529 * handled in the specific fault path, and it'll prohibit the 3530 * fault-around logic. 3531 */ 3532 if (!pte_none(*vmf->pte)) 3533 goto unlock; 3534 3535 /* We're about to handle the fault */ 3536 if (vmf->address == addr) 3537 ret = VM_FAULT_NOPAGE; 3538 3539 do_set_pte(vmf, page, addr); 3540 /* no need to invalidate: a not-present page won't be cached */ 3541 update_mmu_cache(vma, addr, vmf->pte); 3542 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { 3543 xas.xa_index++; 3544 folio_ref_inc(folio); 3545 goto again; 3546 } 3547 folio_unlock(folio); 3548 continue; 3549 unlock: 3550 if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { 3551 xas.xa_index++; 3552 goto again; 3553 } 3554 folio_unlock(folio); 3555 folio_put(folio); 3556 } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); 3557 pte_unmap_unlock(vmf->pte, vmf->ptl); 3558 out: 3559 rcu_read_unlock(); 3560 WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); 3561 return ret; 3562 } 3563 EXPORT_SYMBOL(filemap_map_pages); 3564 3565 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) 3566 { 3567 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 3568 struct folio *folio = page_folio(vmf->page); 3569 vm_fault_t ret = VM_FAULT_LOCKED; 3570 3571 sb_start_pagefault(mapping->host->i_sb); 3572 file_update_time(vmf->vma->vm_file); 3573 folio_lock(folio); 3574 if (folio->mapping != mapping) { 3575 folio_unlock(folio); 3576 ret = VM_FAULT_NOPAGE; 3577 goto out; 3578 } 3579 /* 3580 * We mark the folio dirty already here so that when freeze is in 3581 * progress, we are guaranteed that writeback during freezing will 3582 * see the dirty folio and writeprotect it again. 3583 */ 3584 folio_mark_dirty(folio); 3585 folio_wait_stable(folio); 3586 out: 3587 sb_end_pagefault(mapping->host->i_sb); 3588 return ret; 3589 } 3590 3591 const struct vm_operations_struct generic_file_vm_ops = { 3592 .fault = filemap_fault, 3593 .map_pages = filemap_map_pages, 3594 .page_mkwrite = filemap_page_mkwrite, 3595 }; 3596 3597 /* This is used for a general mmap of a disk file */ 3598 3599 int generic_file_mmap(struct file *file, struct vm_area_struct *vma) 3600 { 3601 struct address_space *mapping = file->f_mapping; 3602 3603 if (!mapping->a_ops->read_folio) 3604 return -ENOEXEC; 3605 file_accessed(file); 3606 vma->vm_ops = &generic_file_vm_ops; 3607 return 0; 3608 } 3609 3610 /* 3611 * This is for filesystems which do not implement ->writepage. 
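 *
 * A hypothetical read-only filesystem would wire this up roughly as
 * follows (illustrative sketch only; example_ro_file_ops is not a real
 * symbol):
 *
 *	const struct file_operations example_ro_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_readonly_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 *
 * Shared mappings that could ever be written (VM_SHARED && VM_MAYWRITE)
 * are rejected with -EINVAL; MAP_PRIVATE mappings still work because any
 * store goes to an anonymous COW copy rather than back to the file.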
3612 */ 3613 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 3614 { 3615 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 3616 return -EINVAL; 3617 return generic_file_mmap(file, vma); 3618 } 3619 #else 3620 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) 3621 { 3622 return VM_FAULT_SIGBUS; 3623 } 3624 int generic_file_mmap(struct file *file, struct vm_area_struct *vma) 3625 { 3626 return -ENOSYS; 3627 } 3628 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 3629 { 3630 return -ENOSYS; 3631 } 3632 #endif /* CONFIG_MMU */ 3633 3634 EXPORT_SYMBOL(filemap_page_mkwrite); 3635 EXPORT_SYMBOL(generic_file_mmap); 3636 EXPORT_SYMBOL(generic_file_readonly_mmap); 3637 3638 static struct folio *do_read_cache_folio(struct address_space *mapping, 3639 pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) 3640 { 3641 struct folio *folio; 3642 int err; 3643 3644 if (!filler) 3645 filler = mapping->a_ops->read_folio; 3646 repeat: 3647 folio = filemap_get_folio(mapping, index); 3648 if (!folio) { 3649 folio = filemap_alloc_folio(gfp, 0); 3650 if (!folio) 3651 return ERR_PTR(-ENOMEM); 3652 err = filemap_add_folio(mapping, folio, index, gfp); 3653 if (unlikely(err)) { 3654 folio_put(folio); 3655 if (err == -EEXIST) 3656 goto repeat; 3657 /* Presumably ENOMEM for xarray node */ 3658 return ERR_PTR(err); 3659 } 3660 3661 goto filler; 3662 } 3663 if (folio_test_uptodate(folio)) 3664 goto out; 3665 3666 if (!folio_trylock(folio)) { 3667 folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); 3668 goto repeat; 3669 } 3670 3671 /* Folio was truncated from mapping */ 3672 if (!folio->mapping) { 3673 folio_unlock(folio); 3674 folio_put(folio); 3675 goto repeat; 3676 } 3677 3678 /* Someone else locked and filled the page in a very small window */ 3679 if (folio_test_uptodate(folio)) { 3680 folio_unlock(folio); 3681 goto out; 3682 } 3683 3684 filler: 3685 err = filemap_read_folio(file, filler, folio); 3686 if (err) { 3687 folio_put(folio); 3688 if (err == AOP_TRUNCATED_PAGE) 3689 goto repeat; 3690 return ERR_PTR(err); 3691 } 3692 3693 out: 3694 folio_mark_accessed(folio); 3695 return folio; 3696 } 3697 3698 /** 3699 * read_cache_folio - Read into page cache, fill it if needed. 3700 * @mapping: The address_space to read from. 3701 * @index: The index to read. 3702 * @filler: Function to perform the read, or NULL to use aops->read_folio(). 3703 * @file: Passed to filler function, may be NULL if not required. 3704 * 3705 * Read one page into the page cache. If it succeeds, the folio returned 3706 * will contain @index, but it may not be the first page of the folio. 3707 * 3708 * If the filler function returns an error, it will be returned to the 3709 * caller. 3710 * 3711 * Context: May sleep. Expects mapping->invalidate_lock to be held. 3712 * Return: An uptodate folio on success, ERR_PTR() on failure. 
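 *
 * A minimal usage sketch (illustrative only; error handling trimmed):
 *
 *	struct folio *folio;
 *
 *	folio = read_cache_folio(mapping, index, NULL, file);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	... the folio is uptodate and carries a reference, but is returned
 *	    unlocked; lock it if stable contents are needed ...
 *	folio_put(folio);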

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
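/*
 * Example (illustrative sketch only): read_cache_page_gfp() is useful
 * when the default mapping_gfp_mask() is too permissive for the caller's
 * context, e.g. when reading from a path that must not recurse into the
 * filesystem:
 *
 *	struct page *page;
 *
 *	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... use the page contents ...
 *	put_page(page);
 */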

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos;
	ssize_t written;
	size_t write_len;
	pgoff_t end;

	write_len = iov_iter_count(from);
	end = (pos + write_len - 1) >> PAGE_SHIFT;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* If there are pages to writeback, return */
		if (filemap_range_has_page(file->f_mapping, pos,
					   pos + write_len - 1))
			return -EAGAIN;
	} else {
		written = filemap_write_and_wait_range(mapping, pos,
						       pos + write_len - 1);
		if (written)
			goto out;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data. We invalidate clean cached pages from the region we're
	 * about to write. We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_SHIFT, end);
	/*
	 * If a page cannot be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	if (written) {
		if (written == -EBUSY)
			return 0;
		goto out;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing. Either one is a pretty crazy thing to do,
	 * so we don't support it 100%. If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * A noticeable example is blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0 && mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
		dio_warn_stale_pagecache(file);

	if (written > 0) {
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
out:
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
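/*
 * Worked example (illustration only) of the invalidation range used in
 * generic_file_direct_write() above: with PAGE_SIZE == 4096, a direct
 * write of write_len == 10000 bytes at pos == 5000 touches byte range
 * [5000, 14999], so pos >> PAGE_SHIFT == 1 and
 * end == (pos + write_len - 1) >> PAGE_SHIFT == 3, i.e. page cache
 * indices 1 through 3 are invalidated.
 */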

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely. Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (copied)
				bytes = copied;
			goto again;
		}
		pos += status;
		written += status;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
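/*
 * Worked example (illustration only) of the per-page split in
 * generic_perform_write() above: with PAGE_SIZE == 4096, a write
 * starting at pos == 5000 has offset == (5000 & 4095) == 904, so at
 * most bytes == 4096 - 904 == 3192 bytes are copied into the first
 * page; the next iteration starts at pos == 8192 with offset == 0 and
 * may copy a full page.
 */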

/**
 * __generic_file_write_iter - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t written = 0;
	ssize_t err;
	ssize_t status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes. Some filesystems do this for writes to
		 * holes, for example. For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;

		pos = iocb->ki_pos;
		status = generic_perform_write(iocb, from);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero. Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_perform_write(iocb, from);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of an O_SYNC write
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a page,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
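/*
 * Example (illustrative sketch only): a filesystem that attaches private
 * data to its folios typically provides ->release_folio in its
 * address_space_operations so that filemap_release_folio() can strip
 * that data before the folio is freed. The "examplefs" names below are
 * hypothetical:
 *
 *	static const struct address_space_operations examplefs_aops = {
 *		.read_folio	= examplefs_read_folio,
 *		.writepages	= examplefs_writepages,
 *		.release_folio	= examplefs_release_folio,
 *	};
 *
 * Filesystems whose folio private data is only buffer heads can leave
 * ->release_folio NULL and rely on the try_to_free_buffers() fallback
 * above.
 */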