// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->block_dirty_folio)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page_mask->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->folio_isolate_lru)
 *    ->private_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->block_dirty_folio)
 */

static void mapping_set_update(struct xa_state *xas,
		struct address_space *mapping)
{
	if (dax_mapping(mapping) || shmem_mapping(mapping))
		return;
	xas_set_update(xas, workingset_update_node);
	xas_set_lru(xas, &shadow_nodes);
}

static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

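/*
 * Illustrative sketch (not part of upstream filemap.c): the @shadow argument
 * passed to page_cache_delete() is an XArray "value" entry, i.e. a small
 * integer tagged into the slot instead of a folio pointer.  Lookup paths in
 * this file use xa_is_value() to tell the two apart.  A minimal round trip,
 * using helpers from <linux/xarray.h> (already pulled in via the includes
 * above), looks like this; the encoded number is arbitrary.
 */
static __maybe_unused void shadow_entry_demo(void)
{
	void *entry = xa_mk_value(42);		/* encode a small integer */

	WARN_ON(!xa_is_value(entry));		/* not a folio pointer */
	WARN_ON(xa_to_value(entry) != 42);	/* decode it again */
}
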
static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = folio_mapcount(folio);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				atomic_set(&folio->_mapcount, -1);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);
	int refs = 1;

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	if (folio_test_large(folio))
		refs = folio_nr_pages(folio);
	folio_put_refs(folio, refs);
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

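/*
 * Illustrative sketch (an assumption, not a caller that exists in this file):
 * filemap_remove_folio() expects the folio to be locked and still attached to
 * the mapping, so a truncate-style caller would look roughly like this.  The
 * function name is hypothetical.
 */
static __maybe_unused void example_drop_folio(struct folio *folio,
					      struct address_space *mapping)
{
	folio_lock(folio);
	if (folio->mapping == mapping)		/* not already truncated */
		filemap_remove_folio(folio);	/* takes i_lock + i_pages lock */
	folio_unlock(folio);
	folio_put(folio);			/* drop the caller's reference */
}
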
/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @wbc:	the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
		loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			     loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

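/*
 * Illustrative sketch (an assumption, not an upstream helper): the difference
 * between filemap_flush() and filemap_fdatawrite() is only the sync mode.  A
 * hypothetical caller that wants to clean memory opportunistically, but must
 * not skip any dirty page when integrity matters, might combine them so.
 */
static __maybe_unused int example_start_writeback(struct address_space *mapping,
						  bool for_integrity)
{
	if (!for_integrity)
		return filemap_flush(mapping);	/* WB_SYNC_NONE: best effort */

	/* WB_SYNC_ALL: every dirty page must be submitted, not skipped */
	return filemap_fdatawrite(mapping);
}
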
/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

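/*
 * Illustrative sketch (an assumption, not code from this file): a minimal
 * ->fsync() data path built on the helpers above.  Real filesystems also
 * flush metadata; this only shows the page-cache side.
 */
static __maybe_unused int example_fsync(struct file *file, loff_t start,
					loff_t end, int datasync)
{
	struct address_space *mapping = file->f_mapping;
	int err, err2;

	/* Push dirty data in the range and wait for it to reach storage. */
	err = filemap_write_and_wait_range(mapping, start, end);

	/* Report any writeback error not yet seen on this file descriptor. */
	err2 = file_check_and_advance_wb_err(file);

	return err ? err : err2;
}
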
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   reported and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

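/*
 * Illustrative sketch (an assumption, not code from this file): callers that
 * have no struct file, and therefore no f_wb_err cursor, can still detect
 * writeback errors across an interval by sampling mapping->wb_err before the
 * operation and checking it afterwards with the pagemap.h helpers.  The
 * function name here is hypothetical.
 */
static __maybe_unused int example_check_wb_error_window(struct address_space *mapping)
{
	errseq_t since = filemap_sample_wb_err(mapping);	/* remember current state */

	/* ... write data and wait for it here ... */

	/* Nonzero if a writeback error was recorded after the sample. */
	return filemap_check_wb_err(mapping, since);
}
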
/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *alloced_shadow = NULL;
	int alloced_order = 0;
	bool huge;
	long nr;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
			folio);
	mapping_set_update(&xas, mapping);

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	xas_set_order(&xas, index, folio_order(folio));
	huge = folio_test_hugetlb(folio);
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	for (;;) {
		int order = -1, split_order = 0;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}

			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		/* entry may have changed before we re-acquire the lock */
		if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
			xas_destroy(&xas);
			alloced_order = 0;
		}

		if (old) {
			if (order > 0 && order > folio_order(folio)) {
				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));
				if (!alloced_order) {
					split_order = order;
					goto unlock;
				}
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		/* split needed, alloc here and retry. */
		if (split_order) {
			xas_split_alloc(&xas, old, split_order, gfp);
			if (xas_error(&xas))
				goto error;
			alloced_shadow = old;
			alloced_order = split_order;
			xas_reset(&xas);
			continue;
		}

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	folio->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;

	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (ret)
		return ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
	} else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	int n;
	struct folio *folio;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node_noprof(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc_noprof(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

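/*
 * Illustrative sketch (an assumption, not code from this file): the usual way
 * a read path populates the cache is to allocate a folio and hand it to
 * filemap_add_folio(), which charges it, keeps it locked and inserts it at
 * @index.  On success the folio is returned locked, referenced and on the
 * LRU; the caller fills it and unlocks it.  The function name is hypothetical.
 */
static __maybe_unused struct folio *example_create_folio(struct address_space *mapping,
							 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = filemap_alloc_folio(gfp, 0);	/* order-0 folio */
	int err;

	if (!folio)
		return ERR_PTR(-ENOMEM);

	err = filemap_add_folio(mapping, folio, index, gfp);
	if (err) {
		folio_put(folio);	/* -EEXIST means someone else won the race */
		return ERR_PTR(err);
	}
	return folio;			/* locked and in the page cache */
}
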
/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags))
			return false;
	} else if (test_bit(bit_nr, &folio->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = pfn_swap_entry_folio(entry);

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
 * @folio: Folio defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @folio.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	folio_set_waiters(folio);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

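/*
 * Illustrative sketch (an assumption, not code from this file): the usual
 * pattern for callers of the folio lock is to take a reference, lock the
 * folio, re-check that it is still attached to the expected mapping (it may
 * have been truncated while we slept), and only then operate on it.  The
 * function name is hypothetical.
 */
static __maybe_unused bool example_lock_and_check(struct folio *folio,
						  struct address_space *mapping)
{
	folio_lock(folio);			/* sleeps in __folio_lock() if contended */
	if (folio->mapping != mapping) {	/* truncated or reclaimed meanwhile */
		folio_unlock(folio);
		return false;
	}
	/* ... safe to inspect uptodate/dirty state here ... */
	folio_unlock(folio);
	return true;
}
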
/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio.
 */
void folio_wait_private_2(struct folio *folio)
{
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
 * received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
	int ret = 0;

	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

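/*
 * Illustrative sketch (an assumption, not code from this file): a read-I/O
 * completion handler typically finishes with folio_end_read(), which marks
 * the folio uptodate on success and unlocks it in a single flag update,
 * waking any waiters.  The function and its @error parameter are hypothetical.
 */
static __maybe_unused void example_read_done(struct folio *folio, int error)
{
	/* Unlock the folio and, if the read succeeded, mark it uptodate. */
	folio_end_read(folio, error == 0);
}
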
/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);
	acct_reclaim_writeback(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			return xas.xa_index;
		if (xas.xa_index == 0)
			return 0;
	}

	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

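/*
 * Illustrative sketch (an assumption, not a helper in this file): per the
 * kernel-doc above, a return value at least @max_scan beyond @index means no
 * gap was found in the range, so a caller can use page_cache_next_miss() to
 * test whether the next @nr_pages indices are all populated, e.g. before
 * extending readahead.  The wrap-around-to-0 case is ignored here for brevity.
 */
static __maybe_unused bool example_range_fully_cached(struct address_space *mapping,
						      pgoff_t index, unsigned long nr_pages)
{
	/* A gap inside [index, index + nr_pages - 1] means the range is incomplete. */
	return page_cache_next_miss(mapping, index, nr_pages) >= index + nr_pages;
}
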
1832 */ 1833 void *filemap_get_entry(struct address_space *mapping, pgoff_t index) 1834 { 1835 XA_STATE(xas, &mapping->i_pages, index); 1836 struct folio *folio; 1837 1838 rcu_read_lock(); 1839 repeat: 1840 xas_reset(&xas); 1841 folio = xas_load(&xas); 1842 if (xas_retry(&xas, folio)) 1843 goto repeat; 1844 /* 1845 * A shadow entry of a recently evicted page, or a swap entry from 1846 * shmem/tmpfs. Return it without attempting to raise page count. 1847 */ 1848 if (!folio || xa_is_value(folio)) 1849 goto out; 1850 1851 if (!folio_try_get(folio)) 1852 goto repeat; 1853 1854 if (unlikely(folio != xas_reload(&xas))) { 1855 folio_put(folio); 1856 goto repeat; 1857 } 1858 out: 1859 rcu_read_unlock(); 1860 1861 return folio; 1862 } 1863 1864 /** 1865 * __filemap_get_folio - Find and get a reference to a folio. 1866 * @mapping: The address_space to search. 1867 * @index: The page index. 1868 * @fgp_flags: %FGP flags modify how the folio is returned. 1869 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. 1870 * 1871 * Looks up the page cache entry at @mapping & @index. 1872 * 1873 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even 1874 * if the %GFP flags specified for %FGP_CREAT are atomic. 1875 * 1876 * If this function returns a folio, it is returned with an increased refcount. 1877 * 1878 * Return: The found folio or an ERR_PTR() otherwise. 1879 */ 1880 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, 1881 fgf_t fgp_flags, gfp_t gfp) 1882 { 1883 struct folio *folio; 1884 1885 repeat: 1886 folio = filemap_get_entry(mapping, index); 1887 if (xa_is_value(folio)) 1888 folio = NULL; 1889 if (!folio) 1890 goto no_page; 1891 1892 if (fgp_flags & FGP_LOCK) { 1893 if (fgp_flags & FGP_NOWAIT) { 1894 if (!folio_trylock(folio)) { 1895 folio_put(folio); 1896 return ERR_PTR(-EAGAIN); 1897 } 1898 } else { 1899 folio_lock(folio); 1900 } 1901 1902 /* Has the page been truncated? 
*/ 1903 if (unlikely(folio->mapping != mapping)) { 1904 folio_unlock(folio); 1905 folio_put(folio); 1906 goto repeat; 1907 } 1908 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); 1909 } 1910 1911 if (fgp_flags & FGP_ACCESSED) 1912 folio_mark_accessed(folio); 1913 else if (fgp_flags & FGP_WRITE) { 1914 /* Clear idle flag for buffer write */ 1915 if (folio_test_idle(folio)) 1916 folio_clear_idle(folio); 1917 } 1918 1919 if (fgp_flags & FGP_STABLE) 1920 folio_wait_stable(folio); 1921 no_page: 1922 if (!folio && (fgp_flags & FGP_CREAT)) { 1923 unsigned int min_order = mapping_min_folio_order(mapping); 1924 unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); 1925 int err; 1926 index = mapping_align_index(mapping, index); 1927 1928 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) 1929 gfp |= __GFP_WRITE; 1930 if (fgp_flags & FGP_NOFS) 1931 gfp &= ~__GFP_FS; 1932 if (fgp_flags & FGP_NOWAIT) { 1933 gfp &= ~GFP_KERNEL; 1934 gfp |= GFP_NOWAIT | __GFP_NOWARN; 1935 } 1936 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) 1937 fgp_flags |= FGP_LOCK; 1938 1939 if (order > mapping_max_folio_order(mapping)) 1940 order = mapping_max_folio_order(mapping); 1941 /* If we're not aligned, allocate a smaller folio */ 1942 if (index & ((1UL << order) - 1)) 1943 order = __ffs(index); 1944 1945 do { 1946 gfp_t alloc_gfp = gfp; 1947 1948 err = -ENOMEM; 1949 if (order > min_order) 1950 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; 1951 folio = filemap_alloc_folio(alloc_gfp, order); 1952 if (!folio) 1953 continue; 1954 1955 /* Init accessed so avoid atomic mark_page_accessed later */ 1956 if (fgp_flags & FGP_ACCESSED) 1957 __folio_set_referenced(folio); 1958 1959 err = filemap_add_folio(mapping, folio, index, gfp); 1960 if (!err) 1961 break; 1962 folio_put(folio); 1963 folio = NULL; 1964 } while (order-- > min_order); 1965 1966 if (err == -EEXIST) 1967 goto repeat; 1968 if (err) 1969 return ERR_PTR(err); 1970 /* 1971 * filemap_add_folio locks the page, and for mmap 1972 * we expect an unlocked page. 1973 */ 1974 if (folio && (fgp_flags & FGP_FOR_MMAP)) 1975 folio_unlock(folio); 1976 } 1977 1978 if (!folio) 1979 return ERR_PTR(-ENOENT); 1980 return folio; 1981 } 1982 EXPORT_SYMBOL(__filemap_get_folio); 1983 1984 static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, 1985 xa_mark_t mark) 1986 { 1987 struct folio *folio; 1988 1989 retry: 1990 if (mark == XA_PRESENT) 1991 folio = xas_find(xas, max); 1992 else 1993 folio = xas_find_marked(xas, max, mark); 1994 1995 if (xas_retry(xas, folio)) 1996 goto retry; 1997 /* 1998 * A shadow entry of a recently evicted page, a swap 1999 * entry from shmem/tmpfs or a DAX entry. Return it 2000 * without attempting to raise page count. 2001 */ 2002 if (!folio || xa_is_value(folio)) 2003 return folio; 2004 2005 if (!folio_try_get(folio)) 2006 goto reset; 2007 2008 if (unlikely(folio != xas_reload(xas))) { 2009 folio_put(folio); 2010 goto reset; 2011 } 2012 2013 return folio; 2014 reset: 2015 xas_reset(xas); 2016 goto retry; 2017 } 2018 2019 /** 2020 * find_get_entries - gang pagecache lookup 2021 * @mapping: The address_space to search 2022 * @start: The starting page cache index 2023 * @end: The final page index (inclusive). 2024 * @fbatch: Where the resulting entries are placed. 2025 * @indices: The cache indices corresponding to the entries in @entries 2026 * 2027 * find_get_entries() will search for and return a batch of entries in 2028 * the mapping. The entries are placed in @fbatch. 
find_get_entries() 2029 * takes a reference on any actual folios it returns. 2030 * 2031 * The entries have ascending indexes. The indices may not be consecutive 2032 * due to not-present entries or large folios. 2033 * 2034 * Any shadow entries of evicted folios, or swap entries from 2035 * shmem/tmpfs, are included in the returned array. 2036 * 2037 * Return: The number of entries which were found. 2038 */ 2039 unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, 2040 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) 2041 { 2042 XA_STATE(xas, &mapping->i_pages, *start); 2043 struct folio *folio; 2044 2045 rcu_read_lock(); 2046 while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { 2047 indices[fbatch->nr] = xas.xa_index; 2048 if (!folio_batch_add(fbatch, folio)) 2049 break; 2050 } 2051 2052 if (folio_batch_count(fbatch)) { 2053 unsigned long nr; 2054 int idx = folio_batch_count(fbatch) - 1; 2055 2056 folio = fbatch->folios[idx]; 2057 if (!xa_is_value(folio)) 2058 nr = folio_nr_pages(folio); 2059 else 2060 nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); 2061 *start = round_down(indices[idx] + nr, nr); 2062 } 2063 rcu_read_unlock(); 2064 2065 return folio_batch_count(fbatch); 2066 } 2067 2068 /** 2069 * find_lock_entries - Find a batch of pagecache entries. 2070 * @mapping: The address_space to search. 2071 * @start: The starting page cache index. 2072 * @end: The final page index (inclusive). 2073 * @fbatch: Where the resulting entries are placed. 2074 * @indices: The cache indices of the entries in @fbatch. 2075 * 2076 * find_lock_entries() will return a batch of entries from @mapping. 2077 * Swap, shadow and DAX entries are included. Folios are returned 2078 * locked and with an incremented refcount. Folios which are locked 2079 * by somebody else or under writeback are skipped. Folios which are 2080 * partially outside the range are not returned. 2081 * 2082 * The entries have ascending indexes. The indices may not be consecutive 2083 * due to not-present entries, large folios, folios which could not be 2084 * locked or folios under writeback. 2085 * 2086 * Return: The number of entries which were found. 
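 *
 * A sketch of the usual batched iteration (how value entries are handled
 * and dropped from the batch is up to the caller and only hinted at here):
 *
 *	folio_batch_init(&fbatch);
 *	while (find_get_entries(mapping, &start, end, &fbatch, indices)) {
 *		... consume the entries, removing any value entries ...
 *		folio_batch_release(&fbatch);
 *	}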
2087 */ 2088 unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, 2089 pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) 2090 { 2091 XA_STATE(xas, &mapping->i_pages, *start); 2092 struct folio *folio; 2093 2094 rcu_read_lock(); 2095 while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { 2096 unsigned long base; 2097 unsigned long nr; 2098 2099 if (!xa_is_value(folio)) { 2100 nr = folio_nr_pages(folio); 2101 base = folio->index; 2102 /* Omit large folio which begins before the start */ 2103 if (base < *start) 2104 goto put; 2105 /* Omit large folio which extends beyond the end */ 2106 if (base + nr - 1 > end) 2107 goto put; 2108 if (!folio_trylock(folio)) 2109 goto put; 2110 if (folio->mapping != mapping || 2111 folio_test_writeback(folio)) 2112 goto unlock; 2113 VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), 2114 folio); 2115 } else { 2116 nr = 1 << xas_get_order(&xas); 2117 base = xas.xa_index & ~(nr - 1); 2118 /* Omit order>0 value which begins before the start */ 2119 if (base < *start) 2120 continue; 2121 /* Omit order>0 value which extends beyond the end */ 2122 if (base + nr - 1 > end) 2123 break; 2124 } 2125 2126 /* Update start now so that last update is correct on return */ 2127 *start = base + nr; 2128 indices[fbatch->nr] = xas.xa_index; 2129 if (!folio_batch_add(fbatch, folio)) 2130 break; 2131 continue; 2132 unlock: 2133 folio_unlock(folio); 2134 put: 2135 folio_put(folio); 2136 } 2137 rcu_read_unlock(); 2138 2139 return folio_batch_count(fbatch); 2140 } 2141 2142 /** 2143 * filemap_get_folios - Get a batch of folios 2144 * @mapping: The address_space to search 2145 * @start: The starting page index 2146 * @end: The final page index (inclusive) 2147 * @fbatch: The batch to fill. 2148 * 2149 * Search for and return a batch of folios in the mapping starting at 2150 * index @start and up to index @end (inclusive). The folios are returned 2151 * in @fbatch with an elevated reference count. 2152 * 2153 * Return: The number of folios which were found. 2154 * We also update @start to index the next folio for the traversal. 2155 */ 2156 unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, 2157 pgoff_t end, struct folio_batch *fbatch) 2158 { 2159 return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); 2160 } 2161 EXPORT_SYMBOL(filemap_get_folios); 2162 2163 /** 2164 * filemap_get_folios_contig - Get a batch of contiguous folios 2165 * @mapping: The address_space to search 2166 * @start: The starting page index 2167 * @end: The final page index (inclusive) 2168 * @fbatch: The batch to fill 2169 * 2170 * filemap_get_folios_contig() works exactly like filemap_get_folios(), 2171 * except the returned folios are guaranteed to be contiguous. This may 2172 * not return all contiguous folios if the batch gets filled up. 2173 * 2174 * Return: The number of folios found. 2175 * Also update @start to be positioned for traversal of the next folio. 2176 */ 2177 2178 unsigned filemap_get_folios_contig(struct address_space *mapping, 2179 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) 2180 { 2181 XA_STATE(xas, &mapping->i_pages, *start); 2182 unsigned long nr; 2183 struct folio *folio; 2184 2185 rcu_read_lock(); 2186 2187 for (folio = xas_load(&xas); folio && xas.xa_index <= end; 2188 folio = xas_next(&xas)) { 2189 if (xas_retry(&xas, folio)) 2190 continue; 2191 /* 2192 * If the entry has been swapped out, we can stop looking. 2193 * No current caller is looking for DAX entries. 
2194 */ 2195 if (xa_is_value(folio)) 2196 goto update_start; 2197 2198 /* If we landed in the middle of a THP, continue at its end. */ 2199 if (xa_is_sibling(folio)) 2200 goto update_start; 2201 2202 if (!folio_try_get(folio)) 2203 goto retry; 2204 2205 if (unlikely(folio != xas_reload(&xas))) 2206 goto put_folio; 2207 2208 if (!folio_batch_add(fbatch, folio)) { 2209 nr = folio_nr_pages(folio); 2210 *start = folio->index + nr; 2211 goto out; 2212 } 2213 continue; 2214 put_folio: 2215 folio_put(folio); 2216 2217 retry: 2218 xas_reset(&xas); 2219 } 2220 2221 update_start: 2222 nr = folio_batch_count(fbatch); 2223 2224 if (nr) { 2225 folio = fbatch->folios[nr - 1]; 2226 *start = folio_next_index(folio); 2227 } 2228 out: 2229 rcu_read_unlock(); 2230 return folio_batch_count(fbatch); 2231 } 2232 EXPORT_SYMBOL(filemap_get_folios_contig); 2233 2234 /** 2235 * filemap_get_folios_tag - Get a batch of folios matching @tag 2236 * @mapping: The address_space to search 2237 * @start: The starting page index 2238 * @end: The final page index (inclusive) 2239 * @tag: The tag index 2240 * @fbatch: The batch to fill 2241 * 2242 * The first folio may start before @start; if it does, it will contain 2243 * @start. The final folio may extend beyond @end; if it does, it will 2244 * contain @end. The folios have ascending indices. There may be gaps 2245 * between the folios if there are indices which have no folio in the 2246 * page cache. If folios are added to or removed from the page cache 2247 * while this is running, they may or may not be found by this call. 2248 * Only returns folios that are tagged with @tag. 2249 * 2250 * Return: The number of folios found. 2251 * Also update @start to index the next folio for traversal. 2252 */ 2253 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, 2254 pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) 2255 { 2256 XA_STATE(xas, &mapping->i_pages, *start); 2257 struct folio *folio; 2258 2259 rcu_read_lock(); 2260 while ((folio = find_get_entry(&xas, end, tag)) != NULL) { 2261 /* 2262 * Shadow entries should never be tagged, but this iteration 2263 * is lockless so there is a window for page reclaim to evict 2264 * a page we saw tagged. Skip over it. 2265 */ 2266 if (xa_is_value(folio)) 2267 continue; 2268 if (!folio_batch_add(fbatch, folio)) { 2269 unsigned long nr = folio_nr_pages(folio); 2270 *start = folio->index + nr; 2271 goto out; 2272 } 2273 } 2274 /* 2275 * We come here when there is no page beyond @end. We take care to not 2276 * overflow the index @start as it confuses some of the callers. This 2277 * breaks the iteration when there is a page at index -1 but that is 2278 * already broke anyway. 2279 */ 2280 if (end == (pgoff_t)-1) 2281 *start = (pgoff_t)-1; 2282 else 2283 *start = end + 1; 2284 out: 2285 rcu_read_unlock(); 2286 2287 return folio_batch_count(fbatch); 2288 } 2289 EXPORT_SYMBOL(filemap_get_folios_tag); 2290 2291 /* 2292 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 2293 * a _large_ part of the i/o request. 
Imagine the worst scenario: 2294 * 2295 * ---R__________________________________________B__________ 2296 * ^ reading here ^ bad block(assume 4k) 2297 * 2298 * read(R) => miss => readahead(R...B) => media error => frustrating retries 2299 * => failing the whole request => read(R) => read(R+1) => 2300 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 2301 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 2302 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 2303 * 2304 * It is going insane. Fix it by quickly scaling down the readahead size. 2305 */ 2306 static void shrink_readahead_size_eio(struct file_ra_state *ra) 2307 { 2308 ra->ra_pages /= 4; 2309 } 2310 2311 /* 2312 * filemap_get_read_batch - Get a batch of folios for read 2313 * 2314 * Get a batch of folios which represent a contiguous range of bytes in 2315 * the file. No exceptional entries will be returned. If @index is in 2316 * the middle of a folio, the entire folio will be returned. The last 2317 * folio in the batch may have the readahead flag set or the uptodate flag 2318 * clear so that the caller can take the appropriate action. 2319 */ 2320 static void filemap_get_read_batch(struct address_space *mapping, 2321 pgoff_t index, pgoff_t max, struct folio_batch *fbatch) 2322 { 2323 XA_STATE(xas, &mapping->i_pages, index); 2324 struct folio *folio; 2325 2326 rcu_read_lock(); 2327 for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { 2328 if (xas_retry(&xas, folio)) 2329 continue; 2330 if (xas.xa_index > max || xa_is_value(folio)) 2331 break; 2332 if (xa_is_sibling(folio)) 2333 break; 2334 if (!folio_try_get(folio)) 2335 goto retry; 2336 2337 if (unlikely(folio != xas_reload(&xas))) 2338 goto put_folio; 2339 2340 if (!folio_batch_add(fbatch, folio)) 2341 break; 2342 if (!folio_test_uptodate(folio)) 2343 break; 2344 if (folio_test_readahead(folio)) 2345 break; 2346 xas_advance(&xas, folio_next_index(folio) - 1); 2347 continue; 2348 put_folio: 2349 folio_put(folio); 2350 retry: 2351 xas_reset(&xas); 2352 } 2353 rcu_read_unlock(); 2354 } 2355 2356 static int filemap_read_folio(struct file *file, filler_t filler, 2357 struct folio *folio) 2358 { 2359 bool workingset = folio_test_workingset(folio); 2360 unsigned long pflags; 2361 int error; 2362 2363 /* Start the actual read. The read will unlock the page. 
*/ 2364 if (unlikely(workingset)) 2365 psi_memstall_enter(&pflags); 2366 error = filler(file, folio); 2367 if (unlikely(workingset)) 2368 psi_memstall_leave(&pflags); 2369 if (error) 2370 return error; 2371 2372 error = folio_wait_locked_killable(folio); 2373 if (error) 2374 return error; 2375 if (folio_test_uptodate(folio)) 2376 return 0; 2377 if (file) 2378 shrink_readahead_size_eio(&file->f_ra); 2379 return -EIO; 2380 } 2381 2382 static bool filemap_range_uptodate(struct address_space *mapping, 2383 loff_t pos, size_t count, struct folio *folio, 2384 bool need_uptodate) 2385 { 2386 if (folio_test_uptodate(folio)) 2387 return true; 2388 /* pipes can't handle partially uptodate pages */ 2389 if (need_uptodate) 2390 return false; 2391 if (!mapping->a_ops->is_partially_uptodate) 2392 return false; 2393 if (mapping->host->i_blkbits >= folio_shift(folio)) 2394 return false; 2395 2396 if (folio_pos(folio) > pos) { 2397 count -= folio_pos(folio) - pos; 2398 pos = 0; 2399 } else { 2400 pos -= folio_pos(folio); 2401 } 2402 2403 return mapping->a_ops->is_partially_uptodate(folio, pos, count); 2404 } 2405 2406 static int filemap_update_page(struct kiocb *iocb, 2407 struct address_space *mapping, size_t count, 2408 struct folio *folio, bool need_uptodate) 2409 { 2410 int error; 2411 2412 if (iocb->ki_flags & IOCB_NOWAIT) { 2413 if (!filemap_invalidate_trylock_shared(mapping)) 2414 return -EAGAIN; 2415 } else { 2416 filemap_invalidate_lock_shared(mapping); 2417 } 2418 2419 if (!folio_trylock(folio)) { 2420 error = -EAGAIN; 2421 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) 2422 goto unlock_mapping; 2423 if (!(iocb->ki_flags & IOCB_WAITQ)) { 2424 filemap_invalidate_unlock_shared(mapping); 2425 /* 2426 * This is where we usually end up waiting for a 2427 * previously submitted readahead to finish. 2428 */ 2429 folio_put_wait_locked(folio, TASK_KILLABLE); 2430 return AOP_TRUNCATED_PAGE; 2431 } 2432 error = __folio_lock_async(folio, iocb->ki_waitq); 2433 if (error) 2434 goto unlock_mapping; 2435 } 2436 2437 error = AOP_TRUNCATED_PAGE; 2438 if (!folio->mapping) 2439 goto unlock; 2440 2441 error = 0; 2442 if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, 2443 need_uptodate)) 2444 goto unlock; 2445 2446 error = -EAGAIN; 2447 if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) 2448 goto unlock; 2449 2450 error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, 2451 folio); 2452 goto unlock_mapping; 2453 unlock: 2454 folio_unlock(folio); 2455 unlock_mapping: 2456 filemap_invalidate_unlock_shared(mapping); 2457 if (error == AOP_TRUNCATED_PAGE) 2458 folio_put(folio); 2459 return error; 2460 } 2461 2462 static int filemap_create_folio(struct file *file, 2463 struct address_space *mapping, loff_t pos, 2464 struct folio_batch *fbatch) 2465 { 2466 struct folio *folio; 2467 int error; 2468 unsigned int min_order = mapping_min_folio_order(mapping); 2469 pgoff_t index; 2470 2471 folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); 2472 if (!folio) 2473 return -ENOMEM; 2474 2475 /* 2476 * Protect against truncate / hole punch. Grabbing invalidate_lock 2477 * here assures we cannot instantiate and bring uptodate new 2478 * pagecache folios after evicting page cache during truncate 2479 * and before actually freeing blocks. Note that we could 2480 * release invalidate_lock after inserting the folio into 2481 * the page cache as the locked folio would then be enough to 2482 * synchronize with hole punching. 
But there are code paths 2483 * such as filemap_update_page() filling in partially uptodate 2484 * pages or ->readahead() that need to hold invalidate_lock 2485 * while mapping blocks for IO so let's hold the lock here as 2486 * well to keep locking rules simple. 2487 */ 2488 filemap_invalidate_lock_shared(mapping); 2489 index = (pos >> (PAGE_SHIFT + min_order)) << min_order; 2490 error = filemap_add_folio(mapping, folio, index, 2491 mapping_gfp_constraint(mapping, GFP_KERNEL)); 2492 if (error == -EEXIST) 2493 error = AOP_TRUNCATED_PAGE; 2494 if (error) 2495 goto error; 2496 2497 error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); 2498 if (error) 2499 goto error; 2500 2501 filemap_invalidate_unlock_shared(mapping); 2502 folio_batch_add(fbatch, folio); 2503 return 0; 2504 error: 2505 filemap_invalidate_unlock_shared(mapping); 2506 folio_put(folio); 2507 return error; 2508 } 2509 2510 static int filemap_readahead(struct kiocb *iocb, struct file *file, 2511 struct address_space *mapping, struct folio *folio, 2512 pgoff_t last_index) 2513 { 2514 DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); 2515 2516 if (iocb->ki_flags & IOCB_NOIO) 2517 return -EAGAIN; 2518 page_cache_async_ra(&ractl, folio, last_index - folio->index); 2519 return 0; 2520 } 2521 2522 static int filemap_get_pages(struct kiocb *iocb, size_t count, 2523 struct folio_batch *fbatch, bool need_uptodate) 2524 { 2525 struct file *filp = iocb->ki_filp; 2526 struct address_space *mapping = filp->f_mapping; 2527 struct file_ra_state *ra = &filp->f_ra; 2528 pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; 2529 pgoff_t last_index; 2530 struct folio *folio; 2531 unsigned int flags; 2532 int err = 0; 2533 2534 /* "last_index" is the index of the page beyond the end of the read */ 2535 last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); 2536 retry: 2537 if (fatal_signal_pending(current)) 2538 return -EINTR; 2539 2540 filemap_get_read_batch(mapping, index, last_index - 1, fbatch); 2541 if (!folio_batch_count(fbatch)) { 2542 if (iocb->ki_flags & IOCB_NOIO) 2543 return -EAGAIN; 2544 if (iocb->ki_flags & IOCB_NOWAIT) 2545 flags = memalloc_noio_save(); 2546 page_cache_sync_readahead(mapping, ra, filp, index, 2547 last_index - index); 2548 if (iocb->ki_flags & IOCB_NOWAIT) 2549 memalloc_noio_restore(flags); 2550 filemap_get_read_batch(mapping, index, last_index - 1, fbatch); 2551 } 2552 if (!folio_batch_count(fbatch)) { 2553 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) 2554 return -EAGAIN; 2555 err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); 2556 if (err == AOP_TRUNCATED_PAGE) 2557 goto retry; 2558 return err; 2559 } 2560 2561 folio = fbatch->folios[folio_batch_count(fbatch) - 1]; 2562 if (folio_test_readahead(folio)) { 2563 err = filemap_readahead(iocb, filp, mapping, folio, last_index); 2564 if (err) 2565 goto err; 2566 } 2567 if (!folio_test_uptodate(folio)) { 2568 if ((iocb->ki_flags & IOCB_WAITQ) && 2569 folio_batch_count(fbatch) > 1) 2570 iocb->ki_flags |= IOCB_NOWAIT; 2571 err = filemap_update_page(iocb, mapping, count, folio, 2572 need_uptodate); 2573 if (err) 2574 goto err; 2575 } 2576 2577 trace_mm_filemap_get_pages(mapping, index, last_index - 1); 2578 return 0; 2579 err: 2580 if (err < 0) 2581 folio_put(folio); 2582 if (likely(--fbatch->nr)) 2583 return 0; 2584 if (err == AOP_TRUNCATED_PAGE) 2585 goto retry; 2586 return err; 2587 } 2588 2589 static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) 2590 { 2591 unsigned int shift = folio_shift(folio); 
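	/*
	 * Both positions fall within the same (naturally aligned) folio iff
	 * they agree in all bits from folio_shift() upwards; filemap_read()
	 * uses this to mark a folio accessed only once per visit.
	 */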
2592 2593 return (pos1 >> shift == pos2 >> shift); 2594 } 2595 2596 /** 2597 * filemap_read - Read data from the page cache. 2598 * @iocb: The iocb to read. 2599 * @iter: Destination for the data. 2600 * @already_read: Number of bytes already read by the caller. 2601 * 2602 * Copies data from the page cache. If the data is not currently present, 2603 * uses the readahead and read_folio address_space operations to fetch it. 2604 * 2605 * Return: Total number of bytes copied, including those already read by 2606 * the caller. If an error happens before any bytes are copied, returns 2607 * a negative error number. 2608 */ 2609 ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, 2610 ssize_t already_read) 2611 { 2612 struct file *filp = iocb->ki_filp; 2613 struct file_ra_state *ra = &filp->f_ra; 2614 struct address_space *mapping = filp->f_mapping; 2615 struct inode *inode = mapping->host; 2616 struct folio_batch fbatch; 2617 int i, error = 0; 2618 bool writably_mapped; 2619 loff_t isize, end_offset; 2620 loff_t last_pos = ra->prev_pos; 2621 2622 if (unlikely(iocb->ki_pos < 0)) 2623 return -EINVAL; 2624 if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) 2625 return 0; 2626 if (unlikely(!iov_iter_count(iter))) 2627 return 0; 2628 2629 iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); 2630 folio_batch_init(&fbatch); 2631 2632 do { 2633 cond_resched(); 2634 2635 /* 2636 * If we've already successfully copied some data, then we 2637 * can no longer safely return -EIOCBQUEUED. Hence mark 2638 * an async read NOWAIT at that point. 2639 */ 2640 if ((iocb->ki_flags & IOCB_WAITQ) && already_read) 2641 iocb->ki_flags |= IOCB_NOWAIT; 2642 2643 if (unlikely(iocb->ki_pos >= i_size_read(inode))) 2644 break; 2645 2646 error = filemap_get_pages(iocb, iter->count, &fbatch, false); 2647 if (error < 0) 2648 break; 2649 2650 /* 2651 * i_size must be checked after we know the pages are Uptodate. 2652 * 2653 * Checking i_size after the check allows us to calculate 2654 * the correct value for "nr", which means the zero-filled 2655 * part of the page is not copied back to userspace (unless 2656 * another truncate extends the file - this is desired though). 2657 */ 2658 isize = i_size_read(inode); 2659 if (unlikely(iocb->ki_pos >= isize)) 2660 goto put_folios; 2661 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); 2662 2663 /* 2664 * Once we start copying data, we don't want to be touching any 2665 * cachelines that might be contended: 2666 */ 2667 writably_mapped = mapping_writably_mapped(mapping); 2668 2669 /* 2670 * When a read accesses the same folio several times, only 2671 * mark it as accessed the first time. 2672 */ 2673 if (!pos_same_folio(iocb->ki_pos, last_pos - 1, 2674 fbatch.folios[0])) 2675 folio_mark_accessed(fbatch.folios[0]); 2676 2677 for (i = 0; i < folio_batch_count(&fbatch); i++) { 2678 struct folio *folio = fbatch.folios[i]; 2679 size_t fsize = folio_size(folio); 2680 size_t offset = iocb->ki_pos & (fsize - 1); 2681 size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, 2682 fsize - offset); 2683 size_t copied; 2684 2685 if (end_offset < folio_pos(folio)) 2686 break; 2687 if (i > 0) 2688 folio_mark_accessed(folio); 2689 /* 2690 * If users can be writing to this folio using arbitrary 2691 * virtual addresses, take care of potential aliasing 2692 * before reading the folio on the kernel side. 
2693 */ 2694 if (writably_mapped) 2695 flush_dcache_folio(folio); 2696 2697 copied = copy_folio_to_iter(folio, offset, bytes, iter); 2698 2699 already_read += copied; 2700 iocb->ki_pos += copied; 2701 last_pos = iocb->ki_pos; 2702 2703 if (copied < bytes) { 2704 error = -EFAULT; 2705 break; 2706 } 2707 } 2708 put_folios: 2709 for (i = 0; i < folio_batch_count(&fbatch); i++) 2710 folio_put(fbatch.folios[i]); 2711 folio_batch_init(&fbatch); 2712 } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); 2713 2714 file_accessed(filp); 2715 ra->prev_pos = last_pos; 2716 return already_read ? already_read : error; 2717 } 2718 EXPORT_SYMBOL_GPL(filemap_read); 2719 2720 int kiocb_write_and_wait(struct kiocb *iocb, size_t count) 2721 { 2722 struct address_space *mapping = iocb->ki_filp->f_mapping; 2723 loff_t pos = iocb->ki_pos; 2724 loff_t end = pos + count - 1; 2725 2726 if (iocb->ki_flags & IOCB_NOWAIT) { 2727 if (filemap_range_needs_writeback(mapping, pos, end)) 2728 return -EAGAIN; 2729 return 0; 2730 } 2731 2732 return filemap_write_and_wait_range(mapping, pos, end); 2733 } 2734 EXPORT_SYMBOL_GPL(kiocb_write_and_wait); 2735 2736 int filemap_invalidate_pages(struct address_space *mapping, 2737 loff_t pos, loff_t end, bool nowait) 2738 { 2739 int ret; 2740 2741 if (nowait) { 2742 /* we could block if there are any pages in the range */ 2743 if (filemap_range_has_page(mapping, pos, end)) 2744 return -EAGAIN; 2745 } else { 2746 ret = filemap_write_and_wait_range(mapping, pos, end); 2747 if (ret) 2748 return ret; 2749 } 2750 2751 /* 2752 * After a write we want buffered reads to be sure to go to disk to get 2753 * the new data. We invalidate clean cached page from the region we're 2754 * about to write. We do this *before* the write so that we can return 2755 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2756 */ 2757 return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, 2758 end >> PAGE_SHIFT); 2759 } 2760 2761 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) 2762 { 2763 struct address_space *mapping = iocb->ki_filp->f_mapping; 2764 2765 return filemap_invalidate_pages(mapping, iocb->ki_pos, 2766 iocb->ki_pos + count - 1, 2767 iocb->ki_flags & IOCB_NOWAIT); 2768 } 2769 EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); 2770 2771 /** 2772 * generic_file_read_iter - generic filesystem read routine 2773 * @iocb: kernel I/O control block 2774 * @iter: destination for the data read 2775 * 2776 * This is the "read_iter()" routine for all filesystems 2777 * that can use the page cache directly. 2778 * 2779 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall 2780 * be returned when no data can be read without waiting for I/O requests 2781 * to complete; it doesn't prevent readahead. 2782 * 2783 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O 2784 * requests shall be made for the read or for readahead. When no data 2785 * can be read, -EAGAIN shall be returned. When readahead would be 2786 * triggered, a partial, possibly empty read shall be returned. 
2787 * 2788 * Return: 2789 * * number of bytes copied, even for partial reads 2790 * * negative error code (or 0 if IOCB_NOIO) if nothing was read 2791 */ 2792 ssize_t 2793 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 2794 { 2795 size_t count = iov_iter_count(iter); 2796 ssize_t retval = 0; 2797 2798 if (!count) 2799 return 0; /* skip atime */ 2800 2801 if (iocb->ki_flags & IOCB_DIRECT) { 2802 struct file *file = iocb->ki_filp; 2803 struct address_space *mapping = file->f_mapping; 2804 struct inode *inode = mapping->host; 2805 2806 retval = kiocb_write_and_wait(iocb, count); 2807 if (retval < 0) 2808 return retval; 2809 file_accessed(file); 2810 2811 retval = mapping->a_ops->direct_IO(iocb, iter); 2812 if (retval >= 0) { 2813 iocb->ki_pos += retval; 2814 count -= retval; 2815 } 2816 if (retval != -EIOCBQUEUED) 2817 iov_iter_revert(iter, count - iov_iter_count(iter)); 2818 2819 /* 2820 * Btrfs can have a short DIO read if we encounter 2821 * compressed extents, so if there was an error, or if 2822 * we've already read everything we wanted to, or if 2823 * there was a short read because we hit EOF, go ahead 2824 * and return. Otherwise fallthrough to buffered io for 2825 * the rest of the read. Buffered reads will not work for 2826 * DAX files, so don't bother trying. 2827 */ 2828 if (retval < 0 || !count || IS_DAX(inode)) 2829 return retval; 2830 if (iocb->ki_pos >= i_size_read(inode)) 2831 return retval; 2832 } 2833 2834 return filemap_read(iocb, iter, retval); 2835 } 2836 EXPORT_SYMBOL(generic_file_read_iter); 2837 2838 /* 2839 * Splice subpages from a folio into a pipe. 2840 */ 2841 size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, 2842 struct folio *folio, loff_t fpos, size_t size) 2843 { 2844 struct page *page; 2845 size_t spliced = 0, offset = offset_in_folio(folio, fpos); 2846 2847 page = folio_page(folio, offset / PAGE_SIZE); 2848 size = min(size, folio_size(folio) - offset); 2849 offset %= PAGE_SIZE; 2850 2851 while (spliced < size && 2852 !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 2853 struct pipe_buffer *buf = pipe_head_buf(pipe); 2854 size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); 2855 2856 *buf = (struct pipe_buffer) { 2857 .ops = &page_cache_pipe_buf_ops, 2858 .page = page, 2859 .offset = offset, 2860 .len = part, 2861 }; 2862 folio_get(folio); 2863 pipe->head++; 2864 page++; 2865 spliced += part; 2866 offset = 0; 2867 } 2868 2869 return spliced; 2870 } 2871 2872 /** 2873 * filemap_splice_read - Splice data from a file's pagecache into a pipe 2874 * @in: The file to read from 2875 * @ppos: Pointer to the file position to read from 2876 * @pipe: The pipe to splice into 2877 * @len: The amount to splice 2878 * @flags: The SPLICE_F_* flags 2879 * 2880 * This function gets folios from a file's pagecache and splices them into the 2881 * pipe. Readahead will be called as necessary to fill more folios. This may 2882 * be used for blockdevs also. 2883 * 2884 * Return: On success, the number of bytes read will be returned and *@ppos 2885 * will be updated if appropriate; 0 will be returned if there is no more data 2886 * to be read; -EAGAIN will be returned if the pipe had no space, and some 2887 * other negative error code will be returned on error. A short read may occur 2888 * if the pipe has insufficient space, we reach the end of the data or we hit a 2889 * hole. 
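 *
 * Like generic_file_read_iter() above, this is normally used directly as a
 * file_operations method; many filesystems whose data lives in the page
 * cache point ->splice_read straight at filemap_splice_read().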
2890 */ 2891 ssize_t filemap_splice_read(struct file *in, loff_t *ppos, 2892 struct pipe_inode_info *pipe, 2893 size_t len, unsigned int flags) 2894 { 2895 struct folio_batch fbatch; 2896 struct kiocb iocb; 2897 size_t total_spliced = 0, used, npages; 2898 loff_t isize, end_offset; 2899 bool writably_mapped; 2900 int i, error = 0; 2901 2902 if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) 2903 return 0; 2904 2905 init_sync_kiocb(&iocb, in); 2906 iocb.ki_pos = *ppos; 2907 2908 /* Work out how much data we can actually add into the pipe */ 2909 used = pipe_occupancy(pipe->head, pipe->tail); 2910 npages = max_t(ssize_t, pipe->max_usage - used, 0); 2911 len = min_t(size_t, len, npages * PAGE_SIZE); 2912 2913 folio_batch_init(&fbatch); 2914 2915 do { 2916 cond_resched(); 2917 2918 if (*ppos >= i_size_read(in->f_mapping->host)) 2919 break; 2920 2921 iocb.ki_pos = *ppos; 2922 error = filemap_get_pages(&iocb, len, &fbatch, true); 2923 if (error < 0) 2924 break; 2925 2926 /* 2927 * i_size must be checked after we know the pages are Uptodate. 2928 * 2929 * Checking i_size after the check allows us to calculate 2930 * the correct value for "nr", which means the zero-filled 2931 * part of the page is not copied back to userspace (unless 2932 * another truncate extends the file - this is desired though). 2933 */ 2934 isize = i_size_read(in->f_mapping->host); 2935 if (unlikely(*ppos >= isize)) 2936 break; 2937 end_offset = min_t(loff_t, isize, *ppos + len); 2938 2939 /* 2940 * Once we start copying data, we don't want to be touching any 2941 * cachelines that might be contended: 2942 */ 2943 writably_mapped = mapping_writably_mapped(in->f_mapping); 2944 2945 for (i = 0; i < folio_batch_count(&fbatch); i++) { 2946 struct folio *folio = fbatch.folios[i]; 2947 size_t n; 2948 2949 if (folio_pos(folio) >= end_offset) 2950 goto out; 2951 folio_mark_accessed(folio); 2952 2953 /* 2954 * If users can be writing to this folio using arbitrary 2955 * virtual addresses, take care of potential aliasing 2956 * before reading the folio on the kernel side. 2957 */ 2958 if (writably_mapped) 2959 flush_dcache_folio(folio); 2960 2961 n = min_t(loff_t, len, isize - *ppos); 2962 n = splice_folio_into_pipe(pipe, folio, *ppos, n); 2963 if (!n) 2964 goto out; 2965 len -= n; 2966 total_spliced += n; 2967 *ppos += n; 2968 in->f_ra.prev_pos = *ppos; 2969 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 2970 goto out; 2971 } 2972 2973 folio_batch_release(&fbatch); 2974 } while (len); 2975 2976 out: 2977 folio_batch_release(&fbatch); 2978 file_accessed(in); 2979 2980 return total_spliced ? total_spliced : error; 2981 } 2982 EXPORT_SYMBOL(filemap_splice_read); 2983 2984 static inline loff_t folio_seek_hole_data(struct xa_state *xas, 2985 struct address_space *mapping, struct folio *folio, 2986 loff_t start, loff_t end, bool seek_data) 2987 { 2988 const struct address_space_operations *ops = mapping->a_ops; 2989 size_t offset, bsz = i_blocksize(mapping->host); 2990 2991 if (xa_is_value(folio) || folio_test_uptodate(folio)) 2992 return seek_data ? start : end; 2993 if (!ops->is_partially_uptodate) 2994 return seek_data ? 
end : start; 2995 2996 xas_pause(xas); 2997 rcu_read_unlock(); 2998 folio_lock(folio); 2999 if (unlikely(folio->mapping != mapping)) 3000 goto unlock; 3001 3002 offset = offset_in_folio(folio, start) & ~(bsz - 1); 3003 3004 do { 3005 if (ops->is_partially_uptodate(folio, offset, bsz) == 3006 seek_data) 3007 break; 3008 start = (start + bsz) & ~(bsz - 1); 3009 offset += bsz; 3010 } while (offset < folio_size(folio)); 3011 unlock: 3012 folio_unlock(folio); 3013 rcu_read_lock(); 3014 return start; 3015 } 3016 3017 static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) 3018 { 3019 if (xa_is_value(folio)) 3020 return PAGE_SIZE << xas_get_order(xas); 3021 return folio_size(folio); 3022 } 3023 3024 /** 3025 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. 3026 * @mapping: Address space to search. 3027 * @start: First byte to consider. 3028 * @end: Limit of search (exclusive). 3029 * @whence: Either SEEK_HOLE or SEEK_DATA. 3030 * 3031 * If the page cache knows which blocks contain holes and which blocks 3032 * contain data, your filesystem can use this function to implement 3033 * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are 3034 * entirely memory-based such as tmpfs, and filesystems which support 3035 * unwritten extents. 3036 * 3037 * Return: The requested offset on success, or -ENXIO if @whence specifies 3038 * SEEK_DATA and there is no data after @start. There is an implicit hole 3039 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start 3040 * and @end contain data. 3041 */ 3042 loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, 3043 loff_t end, int whence) 3044 { 3045 XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); 3046 pgoff_t max = (end - 1) >> PAGE_SHIFT; 3047 bool seek_data = (whence == SEEK_DATA); 3048 struct folio *folio; 3049 3050 if (end <= start) 3051 return -ENXIO; 3052 3053 rcu_read_lock(); 3054 while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { 3055 loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; 3056 size_t seek_size; 3057 3058 if (start < pos) { 3059 if (!seek_data) 3060 goto unlock; 3061 start = pos; 3062 } 3063 3064 seek_size = seek_folio_size(&xas, folio); 3065 pos = round_up((u64)pos + 1, seek_size); 3066 start = folio_seek_hole_data(&xas, mapping, folio, start, pos, 3067 seek_data); 3068 if (start < pos) 3069 goto unlock; 3070 if (start >= end) 3071 break; 3072 if (seek_size > PAGE_SIZE) 3073 xas_set(&xas, pos >> PAGE_SHIFT); 3074 if (!xa_is_value(folio)) 3075 folio_put(folio); 3076 } 3077 if (seek_data) 3078 start = -ENXIO; 3079 unlock: 3080 rcu_read_unlock(); 3081 if (folio && !xa_is_value(folio)) 3082 folio_put(folio); 3083 if (start > end) 3084 return end; 3085 return start; 3086 } 3087 3088 #ifdef CONFIG_MMU 3089 #define MMAP_LOTSAMISS (100) 3090 /* 3091 * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock 3092 * @vmf - the vm_fault for this fault. 3093 * @folio - the folio to lock. 3094 * @fpin - the pointer to the file we may pin (or is already pinned). 3095 * 3096 * This works similar to lock_folio_or_retry in that it can drop the 3097 * mmap_lock. It differs in that it actually returns the folio locked 3098 * if it returns 1 and 0 if it couldn't lock the folio. If we did have 3099 * to drop the mmap_lock then fpin will point to the pinned file and 3100 * needs to be fput()'ed at a later point. 
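 *
 * A sketch of the expected calling pattern (mirroring filemap_fault()
 * further down; the label belongs to that caller, not to this helper):
 *
 *	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
 *		goto out_retry;
 *	... the folio is locked here; fpin, if set, is fput() on the way out ...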
3101 */ 3102 static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, 3103 struct file **fpin) 3104 { 3105 if (folio_trylock(folio)) 3106 return 1; 3107 3108 /* 3109 * NOTE! This will make us return with VM_FAULT_RETRY, but with 3110 * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT 3111 * is supposed to work. We have way too many special cases.. 3112 */ 3113 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) 3114 return 0; 3115 3116 *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); 3117 if (vmf->flags & FAULT_FLAG_KILLABLE) { 3118 if (__folio_lock_killable(folio)) { 3119 /* 3120 * We didn't have the right flags to drop the 3121 * fault lock, but all fault_handlers only check 3122 * for fatal signals if we return VM_FAULT_RETRY, 3123 * so we need to drop the fault lock here and 3124 * return 0 if we don't have a fpin. 3125 */ 3126 if (*fpin == NULL) 3127 release_fault_lock(vmf); 3128 return 0; 3129 } 3130 } else 3131 __folio_lock(folio); 3132 3133 return 1; 3134 } 3135 3136 /* 3137 * Synchronous readahead happens when we don't even find a page in the page 3138 * cache at all. We don't want to perform IO under the mmap sem, so if we have 3139 * to drop the mmap sem we return the file that was pinned in order for us to do 3140 * that. If we didn't pin a file then we return NULL. The file that is 3141 * returned needs to be fput()'ed when we're done with it. 3142 */ 3143 static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) 3144 { 3145 struct file *file = vmf->vma->vm_file; 3146 struct file_ra_state *ra = &file->f_ra; 3147 struct address_space *mapping = file->f_mapping; 3148 DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); 3149 struct file *fpin = NULL; 3150 unsigned long vm_flags = vmf->vma->vm_flags; 3151 unsigned int mmap_miss; 3152 3153 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3154 /* Use the readahead code, even if readahead is disabled */ 3155 if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { 3156 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3157 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); 3158 ra->size = HPAGE_PMD_NR; 3159 /* 3160 * Fetch two PMD folios, so we get the chance to actually 3161 * readahead, unless we've been told not to. 3162 */ 3163 if (!(vm_flags & VM_RAND_READ)) 3164 ra->size *= 2; 3165 ra->async_size = HPAGE_PMD_NR; 3166 page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); 3167 return fpin; 3168 } 3169 #endif 3170 3171 /* If we don't want any read-ahead, don't bother */ 3172 if (vm_flags & VM_RAND_READ) 3173 return fpin; 3174 if (!ra->ra_pages) 3175 return fpin; 3176 3177 if (vm_flags & VM_SEQ_READ) { 3178 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3179 page_cache_sync_ra(&ractl, ra->ra_pages); 3180 return fpin; 3181 } 3182 3183 /* Avoid banging the cache line if not needed */ 3184 mmap_miss = READ_ONCE(ra->mmap_miss); 3185 if (mmap_miss < MMAP_LOTSAMISS * 10) 3186 WRITE_ONCE(ra->mmap_miss, ++mmap_miss); 3187 3188 /* 3189 * Do we miss much more than hit in this file? If so, 3190 * stop bothering with read-ahead. It will only hurt. 
3191 */ 3192 if (mmap_miss > MMAP_LOTSAMISS) 3193 return fpin; 3194 3195 /* 3196 * mmap read-around 3197 */ 3198 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3199 ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); 3200 ra->size = ra->ra_pages; 3201 ra->async_size = ra->ra_pages / 4; 3202 ractl._index = ra->start; 3203 page_cache_ra_order(&ractl, ra, 0); 3204 return fpin; 3205 } 3206 3207 /* 3208 * Asynchronous readahead happens when we find the page and PG_readahead, 3209 * so we want to possibly extend the readahead further. We return the file that 3210 * was pinned if we have to drop the mmap_lock in order to do IO. 3211 */ 3212 static struct file *do_async_mmap_readahead(struct vm_fault *vmf, 3213 struct folio *folio) 3214 { 3215 struct file *file = vmf->vma->vm_file; 3216 struct file_ra_state *ra = &file->f_ra; 3217 DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); 3218 struct file *fpin = NULL; 3219 unsigned int mmap_miss; 3220 3221 /* If we don't want any read-ahead, don't bother */ 3222 if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) 3223 return fpin; 3224 3225 mmap_miss = READ_ONCE(ra->mmap_miss); 3226 if (mmap_miss) 3227 WRITE_ONCE(ra->mmap_miss, --mmap_miss); 3228 3229 if (folio_test_readahead(folio)) { 3230 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3231 page_cache_async_ra(&ractl, folio, ra->ra_pages); 3232 } 3233 return fpin; 3234 } 3235 3236 static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) 3237 { 3238 struct vm_area_struct *vma = vmf->vma; 3239 vm_fault_t ret = 0; 3240 pte_t *ptep; 3241 3242 /* 3243 * We might have COW'ed a pagecache folio and might now have an mlocked 3244 * anon folio mapped. The original pagecache folio is not mlocked and 3245 * might have been evicted. During a read+clear/modify/write update of 3246 * the PTE, such as done in do_numa_page()/change_pte_range(), we 3247 * temporarily clear the PTE under PT lock and might detect it here as 3248 * "none" when not holding the PT lock. 3249 * 3250 * Not rechecking the PTE under PT lock could result in an unexpected 3251 * major fault in an mlock'ed region. Recheck only for this special 3252 * scenario while holding the PT lock, to not degrade non-mlocked 3253 * scenarios. Recheck the PTE without PT lock firstly, thereby reducing 3254 * the number of times we hold PT lock. 3255 */ 3256 if (!(vma->vm_flags & VM_LOCKED)) 3257 return 0; 3258 3259 if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) 3260 return 0; 3261 3262 ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, 3263 &vmf->ptl); 3264 if (unlikely(!ptep)) 3265 return VM_FAULT_NOPAGE; 3266 3267 if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { 3268 ret = VM_FAULT_NOPAGE; 3269 } else { 3270 spin_lock(vmf->ptl); 3271 if (unlikely(!pte_none(ptep_get(ptep)))) 3272 ret = VM_FAULT_NOPAGE; 3273 spin_unlock(vmf->ptl); 3274 } 3275 pte_unmap(ptep); 3276 return ret; 3277 } 3278 3279 /** 3280 * filemap_fault - read in file data for page fault handling 3281 * @vmf: struct vm_fault containing details of the fault 3282 * 3283 * filemap_fault() is invoked via the vma operations vector for a 3284 * mapped memory region to read in file data during a page fault. 3285 * 3286 * The goto's are kind of ugly, but this streamlines the normal case of having 3287 * it in the page cache, and handles the special cases reasonably without 3288 * having a lot of duplicated code. 3289 * 3290 * vma->vm_mm->mmap_lock must be held on entry. 
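 *
 * Filesystems usually reuse this handler unchanged; an illustrative set of
 * vm operations (the name is hypothetical, mirroring generic_file_vm_ops
 * below) would be:
 *
 *	static const struct vm_operations_struct example_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= filemap_page_mkwrite,
 *	};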
3291 * 3292 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock 3293 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). 3294 * 3295 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock 3296 * has not been released. 3297 * 3298 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 3299 * 3300 * Return: bitwise-OR of %VM_FAULT_ codes. 3301 */ 3302 vm_fault_t filemap_fault(struct vm_fault *vmf) 3303 { 3304 int error; 3305 struct file *file = vmf->vma->vm_file; 3306 struct file *fpin = NULL; 3307 struct address_space *mapping = file->f_mapping; 3308 struct inode *inode = mapping->host; 3309 pgoff_t max_idx, index = vmf->pgoff; 3310 struct folio *folio; 3311 vm_fault_t ret = 0; 3312 bool mapping_locked = false; 3313 3314 max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 3315 if (unlikely(index >= max_idx)) 3316 return VM_FAULT_SIGBUS; 3317 3318 trace_mm_filemap_fault(mapping, index); 3319 3320 /* 3321 * Do we have something in the page cache already? 3322 */ 3323 folio = filemap_get_folio(mapping, index); 3324 if (likely(!IS_ERR(folio))) { 3325 /* 3326 * We found the page, so try async readahead before waiting for 3327 * the lock. 3328 */ 3329 if (!(vmf->flags & FAULT_FLAG_TRIED)) 3330 fpin = do_async_mmap_readahead(vmf, folio); 3331 if (unlikely(!folio_test_uptodate(folio))) { 3332 filemap_invalidate_lock_shared(mapping); 3333 mapping_locked = true; 3334 } 3335 } else { 3336 ret = filemap_fault_recheck_pte_none(vmf); 3337 if (unlikely(ret)) 3338 return ret; 3339 3340 /* No page in the page cache at all */ 3341 count_vm_event(PGMAJFAULT); 3342 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 3343 ret = VM_FAULT_MAJOR; 3344 fpin = do_sync_mmap_readahead(vmf); 3345 retry_find: 3346 /* 3347 * See comment in filemap_create_folio() why we need 3348 * invalidate_lock 3349 */ 3350 if (!mapping_locked) { 3351 filemap_invalidate_lock_shared(mapping); 3352 mapping_locked = true; 3353 } 3354 folio = __filemap_get_folio(mapping, index, 3355 FGP_CREAT|FGP_FOR_MMAP, 3356 vmf->gfp_mask); 3357 if (IS_ERR(folio)) { 3358 if (fpin) 3359 goto out_retry; 3360 filemap_invalidate_unlock_shared(mapping); 3361 return VM_FAULT_OOM; 3362 } 3363 } 3364 3365 if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) 3366 goto out_retry; 3367 3368 /* Did it get truncated? */ 3369 if (unlikely(folio->mapping != mapping)) { 3370 folio_unlock(folio); 3371 folio_put(folio); 3372 goto retry_find; 3373 } 3374 VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); 3375 3376 /* 3377 * We have a locked folio in the page cache, now we need to check 3378 * that it's up-to-date. If not, it is going to be due to an error, 3379 * or because readahead was otherwise unable to retrieve it. 3380 */ 3381 if (unlikely(!folio_test_uptodate(folio))) { 3382 /* 3383 * If the invalidate lock is not held, the folio was in cache 3384 * and uptodate and now it is not. Strange but possible since we 3385 * didn't hold the page lock all the time. Let's drop 3386 * everything, get the invalidate lock and try again. 3387 */ 3388 if (!mapping_locked) { 3389 folio_unlock(folio); 3390 folio_put(folio); 3391 goto retry_find; 3392 } 3393 3394 /* 3395 * OK, the folio is really not uptodate. This can be because the 3396 * VMA has the VM_RAND_READ flag set, or because an error 3397 * arose. Let's read it in directly. 
3398 */ 3399 goto page_not_uptodate; 3400 } 3401 3402 /* 3403 * We've made it this far and we had to drop our mmap_lock, now is the 3404 * time to return to the upper layer and have it re-find the vma and 3405 * redo the fault. 3406 */ 3407 if (fpin) { 3408 folio_unlock(folio); 3409 goto out_retry; 3410 } 3411 if (mapping_locked) 3412 filemap_invalidate_unlock_shared(mapping); 3413 3414 /* 3415 * Found the page and have a reference on it. 3416 * We must recheck i_size under page lock. 3417 */ 3418 max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 3419 if (unlikely(index >= max_idx)) { 3420 folio_unlock(folio); 3421 folio_put(folio); 3422 return VM_FAULT_SIGBUS; 3423 } 3424 3425 vmf->page = folio_file_page(folio, index); 3426 return ret | VM_FAULT_LOCKED; 3427 3428 page_not_uptodate: 3429 /* 3430 * Umm, take care of errors if the page isn't up-to-date. 3431 * Try to re-read it _once_. We do this synchronously, 3432 * because there really aren't any performance issues here 3433 * and we need to check for errors. 3434 */ 3435 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3436 error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); 3437 if (fpin) 3438 goto out_retry; 3439 folio_put(folio); 3440 3441 if (!error || error == AOP_TRUNCATED_PAGE) 3442 goto retry_find; 3443 filemap_invalidate_unlock_shared(mapping); 3444 3445 return VM_FAULT_SIGBUS; 3446 3447 out_retry: 3448 /* 3449 * We dropped the mmap_lock, we need to return to the fault handler to 3450 * re-find the vma and come back and find our hopefully still populated 3451 * page. 3452 */ 3453 if (!IS_ERR(folio)) 3454 folio_put(folio); 3455 if (mapping_locked) 3456 filemap_invalidate_unlock_shared(mapping); 3457 if (fpin) 3458 fput(fpin); 3459 return ret | VM_FAULT_RETRY; 3460 } 3461 EXPORT_SYMBOL(filemap_fault); 3462 3463 static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, 3464 pgoff_t start) 3465 { 3466 struct mm_struct *mm = vmf->vma->vm_mm; 3467 3468 /* Huge page is mapped? No need to proceed. */ 3469 if (pmd_trans_huge(*vmf->pmd)) { 3470 folio_unlock(folio); 3471 folio_put(folio); 3472 return true; 3473 } 3474 3475 if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { 3476 struct page *page = folio_file_page(folio, start); 3477 vm_fault_t ret = do_set_pmd(vmf, page); 3478 if (!ret) { 3479 /* The page is mapped successfully, reference consumed. */ 3480 folio_unlock(folio); 3481 return true; 3482 } 3483 } 3484 3485 if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) 3486 pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); 3487 3488 return false; 3489 } 3490 3491 static struct folio *next_uptodate_folio(struct xa_state *xas, 3492 struct address_space *mapping, pgoff_t end_pgoff) 3493 { 3494 struct folio *folio = xas_next_entry(xas, end_pgoff); 3495 unsigned long max_idx; 3496 3497 do { 3498 if (!folio) 3499 return NULL; 3500 if (xas_retry(xas, folio)) 3501 continue; 3502 if (xa_is_value(folio)) 3503 continue; 3504 if (folio_test_locked(folio)) 3505 continue; 3506 if (!folio_try_get(folio)) 3507 continue; 3508 /* Has the page moved or been split? 
*/ 3509 if (unlikely(folio != xas_reload(xas))) 3510 goto skip; 3511 if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) 3512 goto skip; 3513 if (!folio_trylock(folio)) 3514 goto skip; 3515 if (folio->mapping != mapping) 3516 goto unlock; 3517 if (!folio_test_uptodate(folio)) 3518 goto unlock; 3519 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 3520 if (xas->xa_index >= max_idx) 3521 goto unlock; 3522 return folio; 3523 unlock: 3524 folio_unlock(folio); 3525 skip: 3526 folio_put(folio); 3527 } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); 3528 3529 return NULL; 3530 } 3531 3532 /* 3533 * Map page range [start_page, start_page + nr_pages) of folio. 3534 * start_page is gotten from start by folio_page(folio, start) 3535 */ 3536 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, 3537 struct folio *folio, unsigned long start, 3538 unsigned long addr, unsigned int nr_pages, 3539 unsigned long *rss, unsigned int *mmap_miss) 3540 { 3541 vm_fault_t ret = 0; 3542 struct page *page = folio_page(folio, start); 3543 unsigned int count = 0; 3544 pte_t *old_ptep = vmf->pte; 3545 3546 do { 3547 if (PageHWPoison(page + count)) 3548 goto skip; 3549 3550 /* 3551 * If there are too many folios that are recently evicted 3552 * in a file, they will probably continue to be evicted. 3553 * In such situation, read-ahead is only a waste of IO. 3554 * Don't decrease mmap_miss in this scenario to make sure 3555 * we can stop read-ahead. 3556 */ 3557 if (!folio_test_workingset(folio)) 3558 (*mmap_miss)++; 3559 3560 /* 3561 * NOTE: If there're PTE markers, we'll leave them to be 3562 * handled in the specific fault path, and it'll prohibit the 3563 * fault-around logic. 3564 */ 3565 if (!pte_none(ptep_get(&vmf->pte[count]))) 3566 goto skip; 3567 3568 count++; 3569 continue; 3570 skip: 3571 if (count) { 3572 set_pte_range(vmf, folio, page, count, addr); 3573 *rss += count; 3574 folio_ref_add(folio, count); 3575 if (in_range(vmf->address, addr, count * PAGE_SIZE)) 3576 ret = VM_FAULT_NOPAGE; 3577 } 3578 3579 count++; 3580 page += count; 3581 vmf->pte += count; 3582 addr += count * PAGE_SIZE; 3583 count = 0; 3584 } while (--nr_pages > 0); 3585 3586 if (count) { 3587 set_pte_range(vmf, folio, page, count, addr); 3588 *rss += count; 3589 folio_ref_add(folio, count); 3590 if (in_range(vmf->address, addr, count * PAGE_SIZE)) 3591 ret = VM_FAULT_NOPAGE; 3592 } 3593 3594 vmf->pte = old_ptep; 3595 3596 return ret; 3597 } 3598 3599 static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, 3600 struct folio *folio, unsigned long addr, 3601 unsigned long *rss, unsigned int *mmap_miss) 3602 { 3603 vm_fault_t ret = 0; 3604 struct page *page = &folio->page; 3605 3606 if (PageHWPoison(page)) 3607 return ret; 3608 3609 /* See comment of filemap_map_folio_range() */ 3610 if (!folio_test_workingset(folio)) 3611 (*mmap_miss)++; 3612 3613 /* 3614 * NOTE: If there're PTE markers, we'll leave them to be 3615 * handled in the specific fault path, and it'll prohibit 3616 * the fault-around logic. 
3617 */ 3618 if (!pte_none(ptep_get(vmf->pte))) 3619 return ret; 3620 3621 if (vmf->address == addr) 3622 ret = VM_FAULT_NOPAGE; 3623 3624 set_pte_range(vmf, folio, page, 1, addr); 3625 (*rss)++; 3626 folio_ref_inc(folio); 3627 3628 return ret; 3629 } 3630 3631 vm_fault_t filemap_map_pages(struct vm_fault *vmf, 3632 pgoff_t start_pgoff, pgoff_t end_pgoff) 3633 { 3634 struct vm_area_struct *vma = vmf->vma; 3635 struct file *file = vma->vm_file; 3636 struct address_space *mapping = file->f_mapping; 3637 pgoff_t file_end, last_pgoff = start_pgoff; 3638 unsigned long addr; 3639 XA_STATE(xas, &mapping->i_pages, start_pgoff); 3640 struct folio *folio; 3641 vm_fault_t ret = 0; 3642 unsigned long rss = 0; 3643 unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; 3644 3645 rcu_read_lock(); 3646 folio = next_uptodate_folio(&xas, mapping, end_pgoff); 3647 if (!folio) 3648 goto out; 3649 3650 if (filemap_map_pmd(vmf, folio, start_pgoff)) { 3651 ret = VM_FAULT_NOPAGE; 3652 goto out; 3653 } 3654 3655 addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); 3656 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); 3657 if (!vmf->pte) { 3658 folio_unlock(folio); 3659 folio_put(folio); 3660 goto out; 3661 } 3662 3663 file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; 3664 if (end_pgoff > file_end) 3665 end_pgoff = file_end; 3666 3667 folio_type = mm_counter_file(folio); 3668 do { 3669 unsigned long end; 3670 3671 addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; 3672 vmf->pte += xas.xa_index - last_pgoff; 3673 last_pgoff = xas.xa_index; 3674 end = folio_next_index(folio) - 1; 3675 nr_pages = min(end, end_pgoff) - xas.xa_index + 1; 3676 3677 if (!folio_test_large(folio)) 3678 ret |= filemap_map_order0_folio(vmf, 3679 folio, addr, &rss, &mmap_miss); 3680 else 3681 ret |= filemap_map_folio_range(vmf, folio, 3682 xas.xa_index - folio->index, addr, 3683 nr_pages, &rss, &mmap_miss); 3684 3685 folio_unlock(folio); 3686 folio_put(folio); 3687 } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); 3688 add_mm_counter(vma->vm_mm, folio_type, rss); 3689 pte_unmap_unlock(vmf->pte, vmf->ptl); 3690 trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); 3691 out: 3692 rcu_read_unlock(); 3693 3694 mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); 3695 if (mmap_miss >= mmap_miss_saved) 3696 WRITE_ONCE(file->f_ra.mmap_miss, 0); 3697 else 3698 WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); 3699 3700 return ret; 3701 } 3702 EXPORT_SYMBOL(filemap_map_pages); 3703 3704 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) 3705 { 3706 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 3707 struct folio *folio = page_folio(vmf->page); 3708 vm_fault_t ret = VM_FAULT_LOCKED; 3709 3710 sb_start_pagefault(mapping->host->i_sb); 3711 file_update_time(vmf->vma->vm_file); 3712 folio_lock(folio); 3713 if (folio->mapping != mapping) { 3714 folio_unlock(folio); 3715 ret = VM_FAULT_NOPAGE; 3716 goto out; 3717 } 3718 /* 3719 * We mark the folio dirty already here so that when freeze is in 3720 * progress, we are guaranteed that writeback during freezing will 3721 * see the dirty folio and writeprotect it again. 
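	 * The folio_wait_stable() call below additionally waits for any
	 * writeback already in flight when the backing device requires
	 * stable pages, so userspace cannot modify the folio while it is
	 * being written out.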
3722 */ 3723 folio_mark_dirty(folio); 3724 folio_wait_stable(folio); 3725 out: 3726 sb_end_pagefault(mapping->host->i_sb); 3727 return ret; 3728 } 3729 3730 const struct vm_operations_struct generic_file_vm_ops = { 3731 .fault = filemap_fault, 3732 .map_pages = filemap_map_pages, 3733 .page_mkwrite = filemap_page_mkwrite, 3734 }; 3735 3736 /* This is used for a general mmap of a disk file */ 3737 3738 int generic_file_mmap(struct file *file, struct vm_area_struct *vma) 3739 { 3740 struct address_space *mapping = file->f_mapping; 3741 3742 if (!mapping->a_ops->read_folio) 3743 return -ENOEXEC; 3744 file_accessed(file); 3745 vma->vm_ops = &generic_file_vm_ops; 3746 return 0; 3747 } 3748 3749 /* 3750 * This is for filesystems which do not implement ->writepage. 3751 */ 3752 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 3753 { 3754 if (vma_is_shared_maywrite(vma)) 3755 return -EINVAL; 3756 return generic_file_mmap(file, vma); 3757 } 3758 #else 3759 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) 3760 { 3761 return VM_FAULT_SIGBUS; 3762 } 3763 int generic_file_mmap(struct file *file, struct vm_area_struct *vma) 3764 { 3765 return -ENOSYS; 3766 } 3767 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 3768 { 3769 return -ENOSYS; 3770 } 3771 #endif /* CONFIG_MMU */ 3772 3773 EXPORT_SYMBOL(filemap_page_mkwrite); 3774 EXPORT_SYMBOL(generic_file_mmap); 3775 EXPORT_SYMBOL(generic_file_readonly_mmap); 3776 3777 static struct folio *do_read_cache_folio(struct address_space *mapping, 3778 pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) 3779 { 3780 struct folio *folio; 3781 int err; 3782 3783 if (!filler) 3784 filler = mapping->a_ops->read_folio; 3785 repeat: 3786 folio = filemap_get_folio(mapping, index); 3787 if (IS_ERR(folio)) { 3788 folio = filemap_alloc_folio(gfp, 3789 mapping_min_folio_order(mapping)); 3790 if (!folio) 3791 return ERR_PTR(-ENOMEM); 3792 index = mapping_align_index(mapping, index); 3793 err = filemap_add_folio(mapping, folio, index, gfp); 3794 if (unlikely(err)) { 3795 folio_put(folio); 3796 if (err == -EEXIST) 3797 goto repeat; 3798 /* Presumably ENOMEM for xarray node */ 3799 return ERR_PTR(err); 3800 } 3801 3802 goto filler; 3803 } 3804 if (folio_test_uptodate(folio)) 3805 goto out; 3806 3807 if (!folio_trylock(folio)) { 3808 folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); 3809 goto repeat; 3810 } 3811 3812 /* Folio was truncated from mapping */ 3813 if (!folio->mapping) { 3814 folio_unlock(folio); 3815 folio_put(folio); 3816 goto repeat; 3817 } 3818 3819 /* Someone else locked and filled the page in a very small window */ 3820 if (folio_test_uptodate(folio)) { 3821 folio_unlock(folio); 3822 goto out; 3823 } 3824 3825 filler: 3826 err = filemap_read_folio(file, filler, folio); 3827 if (err) { 3828 folio_put(folio); 3829 if (err == AOP_TRUNCATED_PAGE) 3830 goto repeat; 3831 return ERR_PTR(err); 3832 } 3833 3834 out: 3835 folio_mark_accessed(folio); 3836 return folio; 3837 } 3838 3839 /** 3840 * read_cache_folio - Read into page cache, fill it if needed. 3841 * @mapping: The address_space to read from. 3842 * @index: The index to read. 3843 * @filler: Function to perform the read, or NULL to use aops->read_folio(). 3844 * @file: Passed to filler function, may be NULL if not required. 3845 * 3846 * Read one page into the page cache. If it succeeds, the folio returned 3847 * will contain @index, but it may not be the first page of the folio. 
3848 * 3849 * If the filler function returns an error, it will be returned to the 3850 * caller. 3851 * 3852 * Context: May sleep. Expects mapping->invalidate_lock to be held. 3853 * Return: An uptodate folio on success, ERR_PTR() on failure. 3854 */ 3855 struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, 3856 filler_t filler, struct file *file) 3857 { 3858 return do_read_cache_folio(mapping, index, filler, file, 3859 mapping_gfp_mask(mapping)); 3860 } 3861 EXPORT_SYMBOL(read_cache_folio); 3862 3863 /** 3864 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. 3865 * @mapping: The address_space for the folio. 3866 * @index: The index that the allocated folio will contain. 3867 * @gfp: The page allocator flags to use if allocating. 3868 * 3869 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with 3870 * any new memory allocations done using the specified allocation flags. 3871 * 3872 * The most likely error from this function is EIO, but ENOMEM is 3873 * possible and so is EINTR. If ->read_folio returns another error, 3874 * that will be returned to the caller. 3875 * 3876 * The function expects mapping->invalidate_lock to be already held. 3877 * 3878 * Return: Uptodate folio on success, ERR_PTR() on failure. 3879 */ 3880 struct folio *mapping_read_folio_gfp(struct address_space *mapping, 3881 pgoff_t index, gfp_t gfp) 3882 { 3883 return do_read_cache_folio(mapping, index, NULL, NULL, gfp); 3884 } 3885 EXPORT_SYMBOL(mapping_read_folio_gfp); 3886 3887 static struct page *do_read_cache_page(struct address_space *mapping, 3888 pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) 3889 { 3890 struct folio *folio; 3891 3892 folio = do_read_cache_folio(mapping, index, filler, file, gfp); 3893 if (IS_ERR(folio)) 3894 return &folio->page; 3895 return folio_file_page(folio, index); 3896 } 3897 3898 struct page *read_cache_page(struct address_space *mapping, 3899 pgoff_t index, filler_t *filler, struct file *file) 3900 { 3901 return do_read_cache_page(mapping, index, filler, file, 3902 mapping_gfp_mask(mapping)); 3903 } 3904 EXPORT_SYMBOL(read_cache_page); 3905 3906 /** 3907 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 3908 * @mapping: the page's address_space 3909 * @index: the page index 3910 * @gfp: the page allocator flags to use if allocating 3911 * 3912 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 3913 * any new page allocations done using the specified allocation flags. 3914 * 3915 * If the page does not get brought uptodate, return -EIO. 3916 * 3917 * The function expects mapping->invalidate_lock to be already held. 3918 * 3919 * Return: up to date page on success, ERR_PTR() on failure. 3920 */ 3921 struct page *read_cache_page_gfp(struct address_space *mapping, 3922 pgoff_t index, 3923 gfp_t gfp) 3924 { 3925 return do_read_cache_page(mapping, index, NULL, NULL, gfp); 3926 } 3927 EXPORT_SYMBOL(read_cache_page_gfp); 3928 3929 /* 3930 * Warn about a page cache invalidation failure during a direct I/O write. 
3931 */ 3932 static void dio_warn_stale_pagecache(struct file *filp) 3933 { 3934 static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); 3935 char pathname[128]; 3936 char *path; 3937 3938 errseq_set(&filp->f_mapping->wb_err, -EIO); 3939 if (__ratelimit(&_rs)) { 3940 path = file_path(filp, pathname, sizeof(pathname)); 3941 if (IS_ERR(path)) 3942 path = "(unknown)"; 3943 pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); 3944 pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, 3945 current->comm); 3946 } 3947 } 3948 3949 void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) 3950 { 3951 struct address_space *mapping = iocb->ki_filp->f_mapping; 3952 3953 if (mapping->nrpages && 3954 invalidate_inode_pages2_range(mapping, 3955 iocb->ki_pos >> PAGE_SHIFT, 3956 (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) 3957 dio_warn_stale_pagecache(iocb->ki_filp); 3958 } 3959 3960 ssize_t 3961 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) 3962 { 3963 struct address_space *mapping = iocb->ki_filp->f_mapping; 3964 size_t write_len = iov_iter_count(from); 3965 ssize_t written; 3966 3967 /* 3968 * If a page can not be invalidated, return 0 to fall back 3969 * to buffered write. 3970 */ 3971 written = kiocb_invalidate_pages(iocb, write_len); 3972 if (written) { 3973 if (written == -EBUSY) 3974 return 0; 3975 return written; 3976 } 3977 3978 written = mapping->a_ops->direct_IO(iocb, from); 3979 3980 /* 3981 * Finally, try again to invalidate clean pages which might have been 3982 * cached by non-direct readahead, or faulted in by get_user_pages() 3983 * if the source of the write was an mmap'ed region of the file 3984 * we're writing. Either one is a pretty crazy thing to do, 3985 * so we don't support it 100%. If this invalidation 3986 * fails, tough, the write still worked... 3987 * 3988 * Most of the time we do not need this since dio_complete() will do 3989 * the invalidation for us. However there are some file systems that 3990 * do not end up with dio_complete() being called, so let's not break 3991 * them by removing it completely. 3992 * 3993 * Noticeable example is a blkdev_direct_IO(). 3994 * 3995 * Skip invalidation for async writes or if mapping has no pages. 
3996 */ 3997 if (written > 0) { 3998 struct inode *inode = mapping->host; 3999 loff_t pos = iocb->ki_pos; 4000 4001 kiocb_invalidate_post_direct_write(iocb, written); 4002 pos += written; 4003 write_len -= written; 4004 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 4005 i_size_write(inode, pos); 4006 mark_inode_dirty(inode); 4007 } 4008 iocb->ki_pos = pos; 4009 } 4010 if (written != -EIOCBQUEUED) 4011 iov_iter_revert(from, write_len - iov_iter_count(from)); 4012 return written; 4013 } 4014 EXPORT_SYMBOL(generic_file_direct_write); 4015 4016 ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) 4017 { 4018 struct file *file = iocb->ki_filp; 4019 loff_t pos = iocb->ki_pos; 4020 struct address_space *mapping = file->f_mapping; 4021 const struct address_space_operations *a_ops = mapping->a_ops; 4022 size_t chunk = mapping_max_folio_size(mapping); 4023 long status = 0; 4024 ssize_t written = 0; 4025 4026 do { 4027 struct folio *folio; 4028 size_t offset; /* Offset into folio */ 4029 size_t bytes; /* Bytes to write to folio */ 4030 size_t copied; /* Bytes copied from user */ 4031 void *fsdata = NULL; 4032 4033 bytes = iov_iter_count(i); 4034 retry: 4035 offset = pos & (chunk - 1); 4036 bytes = min(chunk - offset, bytes); 4037 balance_dirty_pages_ratelimited(mapping); 4038 4039 /* 4040 * Bring in the user page that we will copy from _first_. 4041 * Otherwise there's a nasty deadlock on copying from the 4042 * same page as we're writing to, without it being marked 4043 * up-to-date. 4044 */ 4045 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { 4046 status = -EFAULT; 4047 break; 4048 } 4049 4050 if (fatal_signal_pending(current)) { 4051 status = -EINTR; 4052 break; 4053 } 4054 4055 status = a_ops->write_begin(file, mapping, pos, bytes, 4056 &folio, &fsdata); 4057 if (unlikely(status < 0)) 4058 break; 4059 4060 offset = offset_in_folio(folio, pos); 4061 if (bytes > folio_size(folio) - offset) 4062 bytes = folio_size(folio) - offset; 4063 4064 if (mapping_writably_mapped(mapping)) 4065 flush_dcache_folio(folio); 4066 4067 copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); 4068 flush_dcache_folio(folio); 4069 4070 status = a_ops->write_end(file, mapping, pos, bytes, copied, 4071 folio, fsdata); 4072 if (unlikely(status != copied)) { 4073 iov_iter_revert(i, copied - max(status, 0L)); 4074 if (unlikely(status < 0)) 4075 break; 4076 } 4077 cond_resched(); 4078 4079 if (unlikely(status == 0)) { 4080 /* 4081 * A short copy made ->write_end() reject the 4082 * thing entirely. Might be memory poisoning 4083 * halfway through, might be a race with munmap, 4084 * might be severe memory pressure. 4085 */ 4086 if (chunk > PAGE_SIZE) 4087 chunk /= 2; 4088 if (copied) { 4089 bytes = copied; 4090 goto retry; 4091 } 4092 } else { 4093 pos += status; 4094 written += status; 4095 } 4096 } while (iov_iter_count(i)); 4097 4098 if (!written) 4099 return status; 4100 iocb->ki_pos += written; 4101 return written; 4102 } 4103 EXPORT_SYMBOL(generic_perform_write); 4104 4105 /** 4106 * __generic_file_write_iter - write data to a file 4107 * @iocb: IO state structure (file, offset, etc.) 4108 * @from: iov_iter with data to write 4109 * 4110 * This function does all the work needed for actually writing data to a 4111 * file. It does all basic checks, removes SUID from the file, updates 4112 * modification times and calls proper subroutines depending on whether we 4113 * do direct IO or a standard buffered write. 
4114 * 4115 * It expects i_rwsem to be grabbed unless we work on a block device or similar 4116 * object which does not need locking at all. 4117 * 4118 * This function does *not* take care of syncing data in case of O_SYNC write. 4119 * A caller has to handle it. This is mainly due to the fact that we want to 4120 * avoid syncing under i_rwsem. 4121 * 4122 * Return: 4123 * * number of bytes written, even for truncated writes 4124 * * negative error code if no data has been written at all 4125 */ 4126 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 4127 { 4128 struct file *file = iocb->ki_filp; 4129 struct address_space *mapping = file->f_mapping; 4130 struct inode *inode = mapping->host; 4131 ssize_t ret; 4132 4133 ret = file_remove_privs(file); 4134 if (ret) 4135 return ret; 4136 4137 ret = file_update_time(file); 4138 if (ret) 4139 return ret; 4140 4141 if (iocb->ki_flags & IOCB_DIRECT) { 4142 ret = generic_file_direct_write(iocb, from); 4143 /* 4144 * If the write stopped short of completing, fall back to 4145 * buffered writes. Some filesystems do this for writes to 4146 * holes, for example. For DAX files, a buffered write will 4147 * not succeed (even if it did, DAX does not handle dirty 4148 * page-cache pages correctly). 4149 */ 4150 if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) 4151 return ret; 4152 return direct_write_fallback(iocb, from, ret, 4153 generic_perform_write(iocb, from)); 4154 } 4155 4156 return generic_perform_write(iocb, from); 4157 } 4158 EXPORT_SYMBOL(__generic_file_write_iter); 4159 4160 /** 4161 * generic_file_write_iter - write data to a file 4162 * @iocb: IO state structure 4163 * @from: iov_iter with data to write 4164 * 4165 * This is a wrapper around __generic_file_write_iter() to be used by most 4166 * filesystems. It takes care of syncing the file in case of O_SYNC file 4167 * and acquires i_rwsem as needed. 4168 * Return: 4169 * * negative error code if no data has been written at all of 4170 * vfs_fsync_range() failed for a synchronous write 4171 * * number of bytes written, even for truncated writes 4172 */ 4173 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 4174 { 4175 struct file *file = iocb->ki_filp; 4176 struct inode *inode = file->f_mapping->host; 4177 ssize_t ret; 4178 4179 inode_lock(inode); 4180 ret = generic_write_checks(iocb, from); 4181 if (ret > 0) 4182 ret = __generic_file_write_iter(iocb, from); 4183 inode_unlock(inode); 4184 4185 if (ret > 0) 4186 ret = generic_write_sync(iocb, ret); 4187 return ret; 4188 } 4189 EXPORT_SYMBOL(generic_file_write_iter); 4190 4191 /** 4192 * filemap_release_folio() - Release fs-specific metadata on a folio. 4193 * @folio: The folio which the kernel is trying to free. 4194 * @gfp: Memory allocation flags (and I/O mode). 4195 * 4196 * The address_space is trying to release any data attached to a folio 4197 * (presumably at folio->private). 4198 * 4199 * This will also be called if the private_2 flag is set on a page, 4200 * indicating that the folio has other metadata associated with it. 4201 * 4202 * The @gfp argument specifies whether I/O may be performed to release 4203 * this page (__GFP_IO), and whether the call may block 4204 * (__GFP_RECLAIM & __GFP_FS). 4205 * 4206 * Return: %true if the release was successful, otherwise %false. 
4207 */ 4208 bool filemap_release_folio(struct folio *folio, gfp_t gfp) 4209 { 4210 struct address_space * const mapping = folio->mapping; 4211 4212 BUG_ON(!folio_test_locked(folio)); 4213 if (!folio_needs_release(folio)) 4214 return true; 4215 if (folio_test_writeback(folio)) 4216 return false; 4217 4218 if (mapping && mapping->a_ops->release_folio) 4219 return mapping->a_ops->release_folio(folio, gfp); 4220 return try_to_free_buffers(folio); 4221 } 4222 EXPORT_SYMBOL(filemap_release_folio); 4223 4224 /** 4225 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache 4226 * @inode: The inode to flush 4227 * @flush: Set to write back rather than simply invalidate. 4228 * @start: First byte to in range. 4229 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start 4230 * onwards. 4231 * 4232 * Invalidate all the folios on an inode that contribute to the specified 4233 * range, possibly writing them back first. Whilst the operation is 4234 * undertaken, the invalidate lock is held to prevent new folios from being 4235 * installed. 4236 */ 4237 int filemap_invalidate_inode(struct inode *inode, bool flush, 4238 loff_t start, loff_t end) 4239 { 4240 struct address_space *mapping = inode->i_mapping; 4241 pgoff_t first = start >> PAGE_SHIFT; 4242 pgoff_t last = end >> PAGE_SHIFT; 4243 pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; 4244 4245 if (!mapping || !mapping->nrpages || end < start) 4246 goto out; 4247 4248 /* Prevent new folios from being added to the inode. */ 4249 filemap_invalidate_lock(mapping); 4250 4251 if (!mapping->nrpages) 4252 goto unlock; 4253 4254 unmap_mapping_pages(mapping, first, nr, false); 4255 4256 /* Write back the data if we're asked to. */ 4257 if (flush) { 4258 struct writeback_control wbc = { 4259 .sync_mode = WB_SYNC_ALL, 4260 .nr_to_write = LONG_MAX, 4261 .range_start = start, 4262 .range_end = end, 4263 }; 4264 4265 filemap_fdatawrite_wbc(mapping, &wbc); 4266 } 4267 4268 /* Wait for writeback to complete on all folios and discard. */ 4269 invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); 4270 4271 unlock: 4272 filemap_invalidate_unlock(mapping); 4273 out: 4274 return filemap_check_errors(mapping); 4275 } 4276 EXPORT_SYMBOL_GPL(filemap_invalidate_inode); 4277 4278 #ifdef CONFIG_CACHESTAT_SYSCALL 4279 /** 4280 * filemap_cachestat() - compute the page cache statistics of a mapping 4281 * @mapping: The mapping to compute the statistics for. 4282 * @first_index: The starting page cache index. 4283 * @last_index: The final page index (inclusive). 4284 * @cs: the cachestat struct to write the result to. 4285 * 4286 * This will query the page cache statistics of a mapping in the 4287 * page range of [first_index, last_index] (inclusive). The statistics 4288 * queried include: number of dirty pages, number of pages marked for 4289 * writeback, and the number of (recently) evicted pages. 4290 */ 4291 static void filemap_cachestat(struct address_space *mapping, 4292 pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) 4293 { 4294 XA_STATE(xas, &mapping->i_pages, first_index); 4295 struct folio *folio; 4296 4297 /* Flush stats (and potentially sleep) outside the RCU read section. */ 4298 mem_cgroup_flush_stats_ratelimited(NULL); 4299 4300 rcu_read_lock(); 4301 xas_for_each(&xas, folio, last_index) { 4302 int order; 4303 unsigned long nr_pages; 4304 pgoff_t folio_first_index, folio_last_index; 4305 4306 /* 4307 * Don't deref the folio. 
		 * It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */

		if (xas_retry(&xas, folio))
			continue;

		order = xas_get_order(&xas);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (non_swap_entry(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = get_shadow_from_swap_cache(swp);
				if (!shadow)
					goto resched;
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. User should pass 0 (i.e. no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
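 *
 * For example, with 4KiB pages, `off` = 3000 and `len` = 10000 query page
 * indices 0 through 3: first_index = 3000 >> PAGE_SHIFT = 0 and
 * last_index = (3000 + 10000 - 1) >> PAGE_SHIFT = 3.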
4413 * 4414 * return values: 4415 * zero - success 4416 * -EFAULT - cstat or cstat_range points to an illegal address 4417 * -EINVAL - invalid flags 4418 * -EBADF - invalid file descriptor 4419 * -EOPNOTSUPP - file descriptor is of a hugetlbfs file 4420 */ 4421 SYSCALL_DEFINE4(cachestat, unsigned int, fd, 4422 struct cachestat_range __user *, cstat_range, 4423 struct cachestat __user *, cstat, unsigned int, flags) 4424 { 4425 CLASS(fd, f)(fd); 4426 struct address_space *mapping; 4427 struct cachestat_range csr; 4428 struct cachestat cs; 4429 pgoff_t first_index, last_index; 4430 4431 if (fd_empty(f)) 4432 return -EBADF; 4433 4434 if (copy_from_user(&csr, cstat_range, 4435 sizeof(struct cachestat_range))) 4436 return -EFAULT; 4437 4438 /* hugetlbfs is not supported */ 4439 if (is_file_hugepages(fd_file(f))) 4440 return -EOPNOTSUPP; 4441 4442 if (flags != 0) 4443 return -EINVAL; 4444 4445 first_index = csr.off >> PAGE_SHIFT; 4446 last_index = 4447 csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; 4448 memset(&cs, 0, sizeof(struct cachestat)); 4449 mapping = fd_file(f)->f_mapping; 4450 filemap_cachestat(mapping, first_index, last_index, &cs); 4451 4452 if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) 4453 return -EFAULT; 4454 4455 return 0; 4456 } 4457 #endif /* CONFIG_CACHESTAT_SYSCALL */ 4458