1 /* 2 * linux/mm/filemap.c 3 * 4 * Copyright (C) 1994-1999 Linus Torvalds 5 */ 6 7 /* 8 * This file handles the generic file mmap semantics used by 9 * most "normal" filesystems (but you don't /have/ to use this: 10 * the NFS filesystem used to do this differently, for example) 11 */ 12 #include <linux/export.h> 13 #include <linux/compiler.h> 14 #include <linux/dax.h> 15 #include <linux/fs.h> 16 #include <linux/uaccess.h> 17 #include <linux/capability.h> 18 #include <linux/kernel_stat.h> 19 #include <linux/gfp.h> 20 #include <linux/mm.h> 21 #include <linux/swap.h> 22 #include <linux/mman.h> 23 #include <linux/pagemap.h> 24 #include <linux/file.h> 25 #include <linux/uio.h> 26 #include <linux/hash.h> 27 #include <linux/writeback.h> 28 #include <linux/backing-dev.h> 29 #include <linux/pagevec.h> 30 #include <linux/blkdev.h> 31 #include <linux/security.h> 32 #include <linux/cpuset.h> 33 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34 #include <linux/hugetlb.h> 35 #include <linux/memcontrol.h> 36 #include <linux/cleancache.h> 37 #include <linux/rmap.h> 38 #include "internal.h" 39 40 #define CREATE_TRACE_POINTS 41 #include <trace/events/filemap.h> 42 43 /* 44 * FIXME: remove all knowledge of the buffer layer from the core VM 45 */ 46 #include <linux/buffer_head.h> /* for try_to_free_buffers */ 47 48 #include <asm/mman.h> 49 50 /* 51 * Shared mappings implemented 30.11.1994. It's not fully working yet, 52 * though. 53 * 54 * Shared mappings now work. 15.8.1995 Bruno. 55 * 56 * finished 'unifying' the page and buffer cache and SMP-threaded the 57 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> 58 * 59 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> 60 */ 61 62 /* 63 * Lock ordering: 64 * 65 * ->i_mmap_rwsem (truncate_pagecache) 66 * ->private_lock (__free_pte->__set_page_dirty_buffers) 67 * ->swap_lock (exclusive_swap_page, others) 68 * ->mapping->tree_lock 69 * 70 * ->i_mutex 71 * ->i_mmap_rwsem (truncate->unmap_mapping_range) 72 * 73 * ->mmap_sem 74 * ->i_mmap_rwsem 75 * ->page_table_lock or pte_lock (various, mainly in memory.c) 76 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 77 * 78 * ->mmap_sem 79 * ->lock_page (access_process_vm) 80 * 81 * ->i_mutex (generic_perform_write) 82 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 83 * 84 * bdi->wb.list_lock 85 * sb_lock (fs/fs-writeback.c) 86 * ->mapping->tree_lock (__sync_single_inode) 87 * 88 * ->i_mmap_rwsem 89 * ->anon_vma.lock (vma_adjust) 90 * 91 * ->anon_vma.lock 92 * ->page_table_lock or pte_lock (anon_vma_prepare and various) 93 * 94 * ->page_table_lock or pte_lock 95 * ->swap_lock (try_to_unmap_one) 96 * ->private_lock (try_to_unmap_one) 97 * ->tree_lock (try_to_unmap_one) 98 * ->zone.lru_lock (follow_page->mark_page_accessed) 99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 100 * ->private_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty) 102 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 104 * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) 105 * bdi.wb->list_lock (zap_pte_range->set_page_dirty) 106 * ->inode->i_lock (zap_pte_range->set_page_dirty) 107 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 108 * 109 * ->i_mmap_rwsem 110 * ->tasklist_lock (memory_failure, collect_procs_ao) 111 */ 112 113 static void page_cache_tree_delete(struct address_space *mapping, 114 struct page *page, void *shadow) 115 { 116 struct radix_tree_node 
*node; 117 118 VM_BUG_ON(!PageLocked(page)); 119 120 node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, 121 shadow); 122 123 if (shadow) { 124 mapping->nrexceptional++; 125 /* 126 * Make sure the nrexceptional update is committed before 127 * the nrpages update so that final truncate racing 128 * with reclaim does not see both counters 0 at the 129 * same time and miss a shadow entry. 130 */ 131 smp_wmb(); 132 } 133 mapping->nrpages--; 134 135 if (!node) 136 return; 137 138 workingset_node_pages_dec(node); 139 if (shadow) 140 workingset_node_shadows_inc(node); 141 else 142 if (__radix_tree_delete_node(&mapping->page_tree, node)) 143 return; 144 145 /* 146 * Track node that only contains shadow entries. 147 * 148 * Avoid acquiring the list_lru lock if already tracked. The 149 * list_empty() test is safe as node->private_list is 150 * protected by mapping->tree_lock. 151 */ 152 if (!workingset_node_pages(node) && 153 list_empty(&node->private_list)) { 154 node->private_data = mapping; 155 list_lru_add(&workingset_shadow_nodes, &node->private_list); 156 } 157 } 158 159 /* 160 * Delete a page from the page cache and free it. Caller has to make 161 * sure the page is locked and that nobody else uses it - or that usage 162 * is safe. The caller must hold the mapping's tree_lock. 163 */ 164 void __delete_from_page_cache(struct page *page, void *shadow) 165 { 166 struct address_space *mapping = page->mapping; 167 168 trace_mm_filemap_delete_from_page_cache(page); 169 /* 170 * if we're uptodate, flush out into the cleancache, otherwise 171 * invalidate any existing cleancache entries. We can't leave 172 * stale data around in the cleancache once our page is gone 173 */ 174 if (PageUptodate(page) && PageMappedToDisk(page)) 175 cleancache_put_page(page); 176 else 177 cleancache_invalidate_page(mapping, page); 178 179 VM_BUG_ON_PAGE(page_mapped(page), page); 180 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { 181 int mapcount; 182 183 pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", 184 current->comm, page_to_pfn(page)); 185 dump_page(page, "still mapped when deleted"); 186 dump_stack(); 187 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 188 189 mapcount = page_mapcount(page); 190 if (mapping_exiting(mapping) && 191 page_count(page) >= mapcount + 2) { 192 /* 193 * All vmas have already been torn down, so it's 194 * a good bet that actually the page is unmapped, 195 * and we'd prefer not to leak it: if we're wrong, 196 * some other bad page check should catch it later. 197 */ 198 page_mapcount_reset(page); 199 page_ref_sub(page, mapcount); 200 } 201 } 202 203 page_cache_tree_delete(mapping, page, shadow); 204 205 page->mapping = NULL; 206 /* Leave page->index set: truncation lookup relies upon it */ 207 208 /* hugetlb pages do not participate in page cache accounting. */ 209 if (!PageHuge(page)) 210 __dec_zone_page_state(page, NR_FILE_PAGES); 211 if (PageSwapBacked(page)) 212 __dec_zone_page_state(page, NR_SHMEM); 213 214 /* 215 * At this point page must be either written or cleaned by truncate. 216 * Dirty page here signals a bug and loss of unwritten data. 217 * 218 * This fixes dirty accounting after removing the page entirely but 219 * leaves PageDirty set: it has no effect for truncated page and 220 * anyway will be cleared before returning page into buddy allocator. 
221 */ 222 if (WARN_ON_ONCE(PageDirty(page))) 223 account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); 224 } 225 226 /** 227 * delete_from_page_cache - delete page from page cache 228 * @page: the page which the kernel is trying to remove from page cache 229 * 230 * This must be called only on pages that have been verified to be in the page 231 * cache and locked. It will never put the page into the free list, the caller 232 * has a reference on the page. 233 */ 234 void delete_from_page_cache(struct page *page) 235 { 236 struct address_space *mapping = page->mapping; 237 unsigned long flags; 238 239 void (*freepage)(struct page *); 240 241 BUG_ON(!PageLocked(page)); 242 243 freepage = mapping->a_ops->freepage; 244 245 spin_lock_irqsave(&mapping->tree_lock, flags); 246 __delete_from_page_cache(page, NULL); 247 spin_unlock_irqrestore(&mapping->tree_lock, flags); 248 249 if (freepage) 250 freepage(page); 251 put_page(page); 252 } 253 EXPORT_SYMBOL(delete_from_page_cache); 254 255 static int filemap_check_errors(struct address_space *mapping) 256 { 257 int ret = 0; 258 /* Check for outstanding write errors */ 259 if (test_bit(AS_ENOSPC, &mapping->flags) && 260 test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 261 ret = -ENOSPC; 262 if (test_bit(AS_EIO, &mapping->flags) && 263 test_and_clear_bit(AS_EIO, &mapping->flags)) 264 ret = -EIO; 265 return ret; 266 } 267 268 /** 269 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 270 * @mapping: address space structure to write 271 * @start: offset in bytes where the range starts 272 * @end: offset in bytes where the range ends (inclusive) 273 * @sync_mode: enable synchronous operation 274 * 275 * Start writeback against all of a mapping's dirty pages that lie 276 * within the byte offsets <start, end> inclusive. 277 * 278 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 279 * opposed to a regular memory cleansing writeback. The difference between 280 * these two operations is that if a dirty page/buffer is encountered, it must 281 * be waited upon, and not just skipped over. 282 */ 283 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 284 loff_t end, int sync_mode) 285 { 286 int ret; 287 struct writeback_control wbc = { 288 .sync_mode = sync_mode, 289 .nr_to_write = LONG_MAX, 290 .range_start = start, 291 .range_end = end, 292 }; 293 294 if (!mapping_cap_writeback_dirty(mapping)) 295 return 0; 296 297 wbc_attach_fdatawrite_inode(&wbc, mapping->host); 298 ret = do_writepages(mapping, &wbc); 299 wbc_detach_inode(&wbc); 300 return ret; 301 } 302 303 static inline int __filemap_fdatawrite(struct address_space *mapping, 304 int sync_mode) 305 { 306 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); 307 } 308 309 int filemap_fdatawrite(struct address_space *mapping) 310 { 311 return __filemap_fdatawrite(mapping, WB_SYNC_ALL); 312 } 313 EXPORT_SYMBOL(filemap_fdatawrite); 314 315 int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 316 loff_t end) 317 { 318 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 319 } 320 EXPORT_SYMBOL(filemap_fdatawrite_range); 321 322 /** 323 * filemap_flush - mostly a non-blocking flush 324 * @mapping: target address_space 325 * 326 * This is a mostly non-blocking flush. Not suitable for data-integrity 327 * purposes - I/O may not be started against all dirty pages. 
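 *
 * A minimal usage sketch (hypothetical caller, assuming the mapping is an
 * inode's i_mapping):
 *
 *	err = filemap_flush(inode->i_mapping);
 *
 * Callers that need a data-integrity guarantee should use
 * filemap_write_and_wait() or filemap_write_and_wait_range() instead.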
328 */ 329 int filemap_flush(struct address_space *mapping) 330 { 331 return __filemap_fdatawrite(mapping, WB_SYNC_NONE); 332 } 333 EXPORT_SYMBOL(filemap_flush); 334 335 static int __filemap_fdatawait_range(struct address_space *mapping, 336 loff_t start_byte, loff_t end_byte) 337 { 338 pgoff_t index = start_byte >> PAGE_SHIFT; 339 pgoff_t end = end_byte >> PAGE_SHIFT; 340 struct pagevec pvec; 341 int nr_pages; 342 int ret = 0; 343 344 if (end_byte < start_byte) 345 goto out; 346 347 pagevec_init(&pvec, 0); 348 while ((index <= end) && 349 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 350 PAGECACHE_TAG_WRITEBACK, 351 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 352 unsigned i; 353 354 for (i = 0; i < nr_pages; i++) { 355 struct page *page = pvec.pages[i]; 356 357 /* until radix tree lookup accepts end_index */ 358 if (page->index > end) 359 continue; 360 361 wait_on_page_writeback(page); 362 if (TestClearPageError(page)) 363 ret = -EIO; 364 } 365 pagevec_release(&pvec); 366 cond_resched(); 367 } 368 out: 369 return ret; 370 } 371 372 /** 373 * filemap_fdatawait_range - wait for writeback to complete 374 * @mapping: address space structure to wait for 375 * @start_byte: offset in bytes where the range starts 376 * @end_byte: offset in bytes where the range ends (inclusive) 377 * 378 * Walk the list of under-writeback pages of the given address space 379 * in the given range and wait for all of them. Check error status of 380 * the address space and return it. 381 * 382 * Since the error status of the address space is cleared by this function, 383 * callers are responsible for checking the return value and handling and/or 384 * reporting the error. 385 */ 386 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 387 loff_t end_byte) 388 { 389 int ret, ret2; 390 391 ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); 392 ret2 = filemap_check_errors(mapping); 393 if (!ret) 394 ret = ret2; 395 396 return ret; 397 } 398 EXPORT_SYMBOL(filemap_fdatawait_range); 399 400 /** 401 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors 402 * @mapping: address space structure to wait for 403 * 404 * Walk the list of under-writeback pages of the given address space 405 * and wait for all of them. Unlike filemap_fdatawait(), this function 406 * does not clear error status of the address space. 407 * 408 * Use this function if callers don't handle errors themselves. Expected 409 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 410 * fsfreeze(8) 411 */ 412 void filemap_fdatawait_keep_errors(struct address_space *mapping) 413 { 414 loff_t i_size = i_size_read(mapping->host); 415 416 if (i_size == 0) 417 return; 418 419 __filemap_fdatawait_range(mapping, 0, i_size - 1); 420 } 421 422 /** 423 * filemap_fdatawait - wait for all under-writeback pages to complete 424 * @mapping: address space structure to wait for 425 * 426 * Walk the list of under-writeback pages of the given address space 427 * and wait for all of them. Check error status of the address space 428 * and return it. 429 * 430 * Since the error status of the address space is cleared by this function, 431 * callers are responsible for checking the return value and handling and/or 432 * reporting the error. 
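 *
 * A typical write-then-wait sequence, sketched here for illustration and
 * mirroring what filemap_write_and_wait() below does (the -EIO check skips
 * the wait when writeback already failed hard):
 *
 *	err = filemap_fdatawrite(mapping);
 *	if (err != -EIO) {
 *		int err2 = filemap_fdatawait(mapping);
 *		if (!err)
 *			err = err2;
 *	}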
433 */ 434 int filemap_fdatawait(struct address_space *mapping) 435 { 436 loff_t i_size = i_size_read(mapping->host); 437 438 if (i_size == 0) 439 return 0; 440 441 return filemap_fdatawait_range(mapping, 0, i_size - 1); 442 } 443 EXPORT_SYMBOL(filemap_fdatawait); 444 445 int filemap_write_and_wait(struct address_space *mapping) 446 { 447 int err = 0; 448 449 if ((!dax_mapping(mapping) && mapping->nrpages) || 450 (dax_mapping(mapping) && mapping->nrexceptional)) { 451 err = filemap_fdatawrite(mapping); 452 /* 453 * Even if the above returned error, the pages may be 454 * written partially (e.g. -ENOSPC), so we wait for it. 455 * But the -EIO is special case, it may indicate the worst 456 * thing (e.g. bug) happened, so we avoid waiting for it. 457 */ 458 if (err != -EIO) { 459 int err2 = filemap_fdatawait(mapping); 460 if (!err) 461 err = err2; 462 } 463 } else { 464 err = filemap_check_errors(mapping); 465 } 466 return err; 467 } 468 EXPORT_SYMBOL(filemap_write_and_wait); 469 470 /** 471 * filemap_write_and_wait_range - write out & wait on a file range 472 * @mapping: the address_space for the pages 473 * @lstart: offset in bytes where the range starts 474 * @lend: offset in bytes where the range ends (inclusive) 475 * 476 * Write out and wait upon file offsets lstart->lend, inclusive. 477 * 478 * Note that `lend' is inclusive (describes the last byte to be written) so 479 * that this function can be used to write to the very end-of-file (end = -1). 480 */ 481 int filemap_write_and_wait_range(struct address_space *mapping, 482 loff_t lstart, loff_t lend) 483 { 484 int err = 0; 485 486 if ((!dax_mapping(mapping) && mapping->nrpages) || 487 (dax_mapping(mapping) && mapping->nrexceptional)) { 488 err = __filemap_fdatawrite_range(mapping, lstart, lend, 489 WB_SYNC_ALL); 490 /* See comment of filemap_write_and_wait() */ 491 if (err != -EIO) { 492 int err2 = filemap_fdatawait_range(mapping, 493 lstart, lend); 494 if (!err) 495 err = err2; 496 } 497 } else { 498 err = filemap_check_errors(mapping); 499 } 500 return err; 501 } 502 EXPORT_SYMBOL(filemap_write_and_wait_range); 503 504 /** 505 * replace_page_cache_page - replace a pagecache page with a new one 506 * @old: page to be replaced 507 * @new: page to replace with 508 * @gfp_mask: allocation mode 509 * 510 * This function replaces a page in the pagecache with a new one. On 511 * success it acquires the pagecache reference for the new page and 512 * drops it for the old page. Both the old and new pages must be 513 * locked. This function does not add the new page to the LRU, the 514 * caller must do that. 515 * 516 * The remove + add is atomic. The only way this function can fail is 517 * memory allocation failure. 
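 *
 * A sketch of the calling side (with hypothetical pages @old and @new,
 * both already locked by the caller as required above):
 *
 *	error = replace_page_cache_page(old, new, GFP_KERNEL);
 *	if (!error)
 *		lru_cache_add_file(new);
 *
 * since, as noted above, putting the new page on the LRU is left to the
 * caller.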
518 */ 519 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 520 { 521 int error; 522 523 VM_BUG_ON_PAGE(!PageLocked(old), old); 524 VM_BUG_ON_PAGE(!PageLocked(new), new); 525 VM_BUG_ON_PAGE(new->mapping, new); 526 527 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 528 if (!error) { 529 struct address_space *mapping = old->mapping; 530 void (*freepage)(struct page *); 531 unsigned long flags; 532 533 pgoff_t offset = old->index; 534 freepage = mapping->a_ops->freepage; 535 536 get_page(new); 537 new->mapping = mapping; 538 new->index = offset; 539 540 spin_lock_irqsave(&mapping->tree_lock, flags); 541 __delete_from_page_cache(old, NULL); 542 error = radix_tree_insert(&mapping->page_tree, offset, new); 543 BUG_ON(error); 544 mapping->nrpages++; 545 546 /* 547 * hugetlb pages do not participate in page cache accounting. 548 */ 549 if (!PageHuge(new)) 550 __inc_zone_page_state(new, NR_FILE_PAGES); 551 if (PageSwapBacked(new)) 552 __inc_zone_page_state(new, NR_SHMEM); 553 spin_unlock_irqrestore(&mapping->tree_lock, flags); 554 mem_cgroup_migrate(old, new); 555 radix_tree_preload_end(); 556 if (freepage) 557 freepage(old); 558 put_page(old); 559 } 560 561 return error; 562 } 563 EXPORT_SYMBOL_GPL(replace_page_cache_page); 564 565 static int page_cache_tree_insert(struct address_space *mapping, 566 struct page *page, void **shadowp) 567 { 568 struct radix_tree_node *node; 569 void **slot; 570 int error; 571 572 error = __radix_tree_create(&mapping->page_tree, page->index, 0, 573 &node, &slot); 574 if (error) 575 return error; 576 if (*slot) { 577 void *p; 578 579 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 580 if (!radix_tree_exceptional_entry(p)) 581 return -EEXIST; 582 583 if (WARN_ON(dax_mapping(mapping))) 584 return -EINVAL; 585 586 if (shadowp) 587 *shadowp = p; 588 mapping->nrexceptional--; 589 if (node) 590 workingset_node_shadows_dec(node); 591 } 592 radix_tree_replace_slot(slot, page); 593 mapping->nrpages++; 594 if (node) { 595 workingset_node_pages_inc(node); 596 /* 597 * Don't track node that contains actual pages. 598 * 599 * Avoid acquiring the list_lru lock if already 600 * untracked. The list_empty() test is safe as 601 * node->private_list is protected by 602 * mapping->tree_lock. 603 */ 604 if (!list_empty(&node->private_list)) 605 list_lru_del(&workingset_shadow_nodes, 606 &node->private_list); 607 } 608 return 0; 609 } 610 611 static int __add_to_page_cache_locked(struct page *page, 612 struct address_space *mapping, 613 pgoff_t offset, gfp_t gfp_mask, 614 void **shadowp) 615 { 616 int huge = PageHuge(page); 617 struct mem_cgroup *memcg; 618 int error; 619 620 VM_BUG_ON_PAGE(!PageLocked(page), page); 621 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 622 623 if (!huge) { 624 error = mem_cgroup_try_charge(page, current->mm, 625 gfp_mask, &memcg, false); 626 if (error) 627 return error; 628 } 629 630 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); 631 if (error) { 632 if (!huge) 633 mem_cgroup_cancel_charge(page, memcg, false); 634 return error; 635 } 636 637 get_page(page); 638 page->mapping = mapping; 639 page->index = offset; 640 641 spin_lock_irq(&mapping->tree_lock); 642 error = page_cache_tree_insert(mapping, page, shadowp); 643 radix_tree_preload_end(); 644 if (unlikely(error)) 645 goto err_insert; 646 647 /* hugetlb pages do not participate in page cache accounting. 
*/ 648 if (!huge) 649 __inc_zone_page_state(page, NR_FILE_PAGES); 650 spin_unlock_irq(&mapping->tree_lock); 651 if (!huge) 652 mem_cgroup_commit_charge(page, memcg, false, false); 653 trace_mm_filemap_add_to_page_cache(page); 654 return 0; 655 err_insert: 656 page->mapping = NULL; 657 /* Leave page->index set: truncation relies upon it */ 658 spin_unlock_irq(&mapping->tree_lock); 659 if (!huge) 660 mem_cgroup_cancel_charge(page, memcg, false); 661 put_page(page); 662 return error; 663 } 664 665 /** 666 * add_to_page_cache_locked - add a locked page to the pagecache 667 * @page: page to add 668 * @mapping: the page's address_space 669 * @offset: page index 670 * @gfp_mask: page allocation mode 671 * 672 * This function is used to add a page to the pagecache. It must be locked. 673 * This function does not add the page to the LRU. The caller must do that. 674 */ 675 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 676 pgoff_t offset, gfp_t gfp_mask) 677 { 678 return __add_to_page_cache_locked(page, mapping, offset, 679 gfp_mask, NULL); 680 } 681 EXPORT_SYMBOL(add_to_page_cache_locked); 682 683 int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 684 pgoff_t offset, gfp_t gfp_mask) 685 { 686 void *shadow = NULL; 687 int ret; 688 689 __SetPageLocked(page); 690 ret = __add_to_page_cache_locked(page, mapping, offset, 691 gfp_mask, &shadow); 692 if (unlikely(ret)) 693 __ClearPageLocked(page); 694 else { 695 /* 696 * The page might have been evicted from cache only 697 * recently, in which case it should be activated like 698 * any other repeatedly accessed page. 699 * The exception is pages getting rewritten; evicting other 700 * data from the working set, only to cache data that will 701 * get overwritten with something else, is a waste of memory. 702 */ 703 if (!(gfp_mask & __GFP_WRITE) && 704 shadow && workingset_refault(shadow)) { 705 SetPageActive(page); 706 workingset_activation(page); 707 } else 708 ClearPageActive(page); 709 lru_cache_add(page); 710 } 711 return ret; 712 } 713 EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 714 715 #ifdef CONFIG_NUMA 716 struct page *__page_cache_alloc(gfp_t gfp) 717 { 718 int n; 719 struct page *page; 720 721 if (cpuset_do_page_mem_spread()) { 722 unsigned int cpuset_mems_cookie; 723 do { 724 cpuset_mems_cookie = read_mems_allowed_begin(); 725 n = cpuset_mem_spread_node(); 726 page = __alloc_pages_node(n, gfp, 0); 727 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); 728 729 return page; 730 } 731 return alloc_pages(gfp, 0); 732 } 733 EXPORT_SYMBOL(__page_cache_alloc); 734 #endif 735 736 /* 737 * In order to wait for pages to become available there must be 738 * waitqueues associated with pages. By using a hash table of 739 * waitqueues where the bucket discipline is to maintain all 740 * waiters on the same queue and wake all when any of the pages 741 * become available, and for the woken contexts to check to be 742 * sure the appropriate page became available, this saves space 743 * at a cost of "thundering herd" phenomena during rare hash 744 * collisions. 
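 *
 * Roughly, a sleeper and a waker pair up like this (a sketch using the
 * helpers defined below, not new API):
 *
 *	sleeper: wait_on_page_bit(page, PG_locked)
 *		   -> __wait_on_bit(page_waitqueue(page), ...)
 *	waker:   unlock_page(page)
 *		   -> wake_up_page(page, PG_locked)
 *
 * Both sides hash the same struct page pointer to the same
 * zone->wait_table bucket, so they always meet on one wait queue even
 * though unrelated pages may share that bucket.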
 */
wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}
EXPORT_SYMBOL(page_waitqueue);

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (!test_bit(bit_nr, &page->flags))
		return 0;

	return __wait_on_bit(page_waitqueue(page), &wait,
			     bit_wait_io, TASK_KILLABLE);
}

int wait_on_page_bit_killable_timeout(struct page *page,
				      int bit_nr, unsigned long timeout)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	wait.key.timeout = jiffies + timeout;
	if (!test_bit(bit_nr, &page->flags))
		return 0;
	return __wait_on_bit(page_waitqueue(page), &wait,
			     bit_wait_io_timeout, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The mb is necessary to enforce ordering between the clear_bit and the read
 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	clear_bit_unlock(PG_locked, &page->flags);
	smp_mb__after_atomic();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
840 */ 841 if (PageReclaim(page)) { 842 ClearPageReclaim(page); 843 rotate_reclaimable_page(page); 844 } 845 846 if (!test_clear_page_writeback(page)) 847 BUG(); 848 849 smp_mb__after_atomic(); 850 wake_up_page(page, PG_writeback); 851 } 852 EXPORT_SYMBOL(end_page_writeback); 853 854 /* 855 * After completing I/O on a page, call this routine to update the page 856 * flags appropriately 857 */ 858 void page_endio(struct page *page, int rw, int err) 859 { 860 if (rw == READ) { 861 if (!err) { 862 SetPageUptodate(page); 863 } else { 864 ClearPageUptodate(page); 865 SetPageError(page); 866 } 867 unlock_page(page); 868 } else { /* rw == WRITE */ 869 if (err) { 870 SetPageError(page); 871 if (page->mapping) 872 mapping_set_error(page->mapping, err); 873 } 874 end_page_writeback(page); 875 } 876 } 877 EXPORT_SYMBOL_GPL(page_endio); 878 879 /** 880 * __lock_page - get a lock on the page, assuming we need to sleep to get it 881 * @page: the page to lock 882 */ 883 void __lock_page(struct page *page) 884 { 885 struct page *page_head = compound_head(page); 886 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 887 888 __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, 889 TASK_UNINTERRUPTIBLE); 890 } 891 EXPORT_SYMBOL(__lock_page); 892 893 int __lock_page_killable(struct page *page) 894 { 895 struct page *page_head = compound_head(page); 896 DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); 897 898 return __wait_on_bit_lock(page_waitqueue(page_head), &wait, 899 bit_wait_io, TASK_KILLABLE); 900 } 901 EXPORT_SYMBOL_GPL(__lock_page_killable); 902 903 /* 904 * Return values: 905 * 1 - page is locked; mmap_sem is still held. 906 * 0 - page is not locked. 907 * mmap_sem has been released (up_read()), unless flags had both 908 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in 909 * which case mmap_sem is still held. 910 * 911 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 912 * with the page locked and the mmap_sem unperturbed. 913 */ 914 int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 915 unsigned int flags) 916 { 917 if (flags & FAULT_FLAG_ALLOW_RETRY) { 918 /* 919 * CAUTION! In this case, mmap_sem is not released 920 * even though return 0. 921 */ 922 if (flags & FAULT_FLAG_RETRY_NOWAIT) 923 return 0; 924 925 up_read(&mm->mmap_sem); 926 if (flags & FAULT_FLAG_KILLABLE) 927 wait_on_page_locked_killable(page); 928 else 929 wait_on_page_locked(page); 930 return 0; 931 } else { 932 if (flags & FAULT_FLAG_KILLABLE) { 933 int ret; 934 935 ret = __lock_page_killable(page); 936 if (ret) { 937 up_read(&mm->mmap_sem); 938 return 0; 939 } 940 } else 941 __lock_page(page); 942 return 1; 943 } 944 } 945 946 /** 947 * page_cache_next_hole - find the next hole (not-present entry) 948 * @mapping: mapping 949 * @index: index 950 * @max_scan: maximum range to search 951 * 952 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the 953 * lowest indexed hole. 954 * 955 * Returns: the index of the hole if found, otherwise returns an index 956 * outside of the set specified (in which case 'return - index >= 957 * max_scan' will be true). In rare cases of index wrap-around, 0 will 958 * be returned. 959 * 960 * page_cache_next_hole may be called under rcu_read_lock. However, 961 * like radix_tree_gang_lookup, this will not atomically search a 962 * snapshot of the tree at a single point in time. 
For example, if a 963 * hole is created at index 5, then subsequently a hole is created at 964 * index 10, page_cache_next_hole covering both indexes may return 10 965 * if called under rcu_read_lock. 966 */ 967 pgoff_t page_cache_next_hole(struct address_space *mapping, 968 pgoff_t index, unsigned long max_scan) 969 { 970 unsigned long i; 971 972 for (i = 0; i < max_scan; i++) { 973 struct page *page; 974 975 page = radix_tree_lookup(&mapping->page_tree, index); 976 if (!page || radix_tree_exceptional_entry(page)) 977 break; 978 index++; 979 if (index == 0) 980 break; 981 } 982 983 return index; 984 } 985 EXPORT_SYMBOL(page_cache_next_hole); 986 987 /** 988 * page_cache_prev_hole - find the prev hole (not-present entry) 989 * @mapping: mapping 990 * @index: index 991 * @max_scan: maximum range to search 992 * 993 * Search backwards in the range [max(index-max_scan+1, 0), index] for 994 * the first hole. 995 * 996 * Returns: the index of the hole if found, otherwise returns an index 997 * outside of the set specified (in which case 'index - return >= 998 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX 999 * will be returned. 1000 * 1001 * page_cache_prev_hole may be called under rcu_read_lock. However, 1002 * like radix_tree_gang_lookup, this will not atomically search a 1003 * snapshot of the tree at a single point in time. For example, if a 1004 * hole is created at index 10, then subsequently a hole is created at 1005 * index 5, page_cache_prev_hole covering both indexes may return 5 if 1006 * called under rcu_read_lock. 1007 */ 1008 pgoff_t page_cache_prev_hole(struct address_space *mapping, 1009 pgoff_t index, unsigned long max_scan) 1010 { 1011 unsigned long i; 1012 1013 for (i = 0; i < max_scan; i++) { 1014 struct page *page; 1015 1016 page = radix_tree_lookup(&mapping->page_tree, index); 1017 if (!page || radix_tree_exceptional_entry(page)) 1018 break; 1019 index--; 1020 if (index == ULONG_MAX) 1021 break; 1022 } 1023 1024 return index; 1025 } 1026 EXPORT_SYMBOL(page_cache_prev_hole); 1027 1028 /** 1029 * find_get_entry - find and get a page cache entry 1030 * @mapping: the address_space to search 1031 * @offset: the page cache index 1032 * 1033 * Looks up the page cache slot at @mapping & @offset. If there is a 1034 * page cache page, it is returned with an increased refcount. 1035 * 1036 * If the slot holds a shadow entry of a previously evicted page, or a 1037 * swap entry from shmem/tmpfs, it is returned. 1038 * 1039 * Otherwise, %NULL is returned. 1040 */ 1041 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1042 { 1043 void **pagep; 1044 struct page *page; 1045 1046 rcu_read_lock(); 1047 repeat: 1048 page = NULL; 1049 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 1050 if (pagep) { 1051 page = radix_tree_deref_slot(pagep); 1052 if (unlikely(!page)) 1053 goto out; 1054 if (radix_tree_exception(page)) { 1055 if (radix_tree_deref_retry(page)) 1056 goto repeat; 1057 /* 1058 * A shadow entry of a recently evicted page, 1059 * or a swap entry from shmem/tmpfs. Return 1060 * it without attempting to raise page count. 1061 */ 1062 goto out; 1063 } 1064 if (!page_cache_get_speculative(page)) 1065 goto repeat; 1066 1067 /* 1068 * Has the page moved? 1069 * This is part of the lockless pagecache protocol. See 1070 * include/linux/pagemap.h for details. 
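		 *
		 * In short, the lockless lookup above is:
		 *
		 *	1. radix_tree_deref_slot() under rcu_read_lock()
		 *	2. page_cache_get_speculative() to try to pin the page
		 *	3. re-check that the slot still points to the same
		 *	   page, otherwise drop the reference and retry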
		 */
		if (unlikely(page != *pagep)) {
			put_page(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_entry);

/**
 * find_lock_entry - locate, pin and lock a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset. If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Otherwise, %NULL is returned.
 *
 * find_lock_entry() may sleep.
 */
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	if (page && !radix_tree_exception(page)) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}
	return page;
}
EXPORT_SYMBOL(find_lock_entry);

/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: FGP flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * FGP flags modify how the page is returned.
 *
 * FGP_ACCESSED: the page will be marked accessed
 * FGP_LOCK: the page is returned locked
 * FGP_CREAT: if the page is not present, a new page is allocated using
 *	@gfp_mask and added to the page cache and the VM's LRU
 *	list. The page is returned locked and with an increased
 *	refcount. Otherwise, %NULL is returned.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
	int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	if (radix_tree_exceptional_entry(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated?
*/ 1166 if (unlikely(page->mapping != mapping)) { 1167 unlock_page(page); 1168 put_page(page); 1169 goto repeat; 1170 } 1171 VM_BUG_ON_PAGE(page->index != offset, page); 1172 } 1173 1174 if (page && (fgp_flags & FGP_ACCESSED)) 1175 mark_page_accessed(page); 1176 1177 no_page: 1178 if (!page && (fgp_flags & FGP_CREAT)) { 1179 int err; 1180 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) 1181 gfp_mask |= __GFP_WRITE; 1182 if (fgp_flags & FGP_NOFS) 1183 gfp_mask &= ~__GFP_FS; 1184 1185 page = __page_cache_alloc(gfp_mask); 1186 if (!page) 1187 return NULL; 1188 1189 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1190 fgp_flags |= FGP_LOCK; 1191 1192 /* Init accessed so avoid atomic mark_page_accessed later */ 1193 if (fgp_flags & FGP_ACCESSED) 1194 __SetPageReferenced(page); 1195 1196 err = add_to_page_cache_lru(page, mapping, offset, 1197 gfp_mask & GFP_RECLAIM_MASK); 1198 if (unlikely(err)) { 1199 put_page(page); 1200 page = NULL; 1201 if (err == -EEXIST) 1202 goto repeat; 1203 } 1204 } 1205 1206 return page; 1207 } 1208 EXPORT_SYMBOL(pagecache_get_page); 1209 1210 /** 1211 * find_get_entries - gang pagecache lookup 1212 * @mapping: The address_space to search 1213 * @start: The starting page cache index 1214 * @nr_entries: The maximum number of entries 1215 * @entries: Where the resulting entries are placed 1216 * @indices: The cache indices corresponding to the entries in @entries 1217 * 1218 * find_get_entries() will search for and return a group of up to 1219 * @nr_entries entries in the mapping. The entries are placed at 1220 * @entries. find_get_entries() takes a reference against any actual 1221 * pages it returns. 1222 * 1223 * The search returns a group of mapping-contiguous page cache entries 1224 * with ascending indexes. There may be holes in the indices due to 1225 * not-present pages. 1226 * 1227 * Any shadow entries of evicted pages, or swap entries from 1228 * shmem/tmpfs, are included in the returned array. 1229 * 1230 * find_get_entries() returns the number of pages and shadow entries 1231 * which were found. 1232 */ 1233 unsigned find_get_entries(struct address_space *mapping, 1234 pgoff_t start, unsigned int nr_entries, 1235 struct page **entries, pgoff_t *indices) 1236 { 1237 void **slot; 1238 unsigned int ret = 0; 1239 struct radix_tree_iter iter; 1240 1241 if (!nr_entries) 1242 return 0; 1243 1244 rcu_read_lock(); 1245 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1246 struct page *page; 1247 repeat: 1248 page = radix_tree_deref_slot(slot); 1249 if (unlikely(!page)) 1250 continue; 1251 if (radix_tree_exception(page)) { 1252 if (radix_tree_deref_retry(page)) { 1253 slot = radix_tree_iter_retry(&iter); 1254 continue; 1255 } 1256 /* 1257 * A shadow entry of a recently evicted page, a swap 1258 * entry from shmem/tmpfs or a DAX entry. Return it 1259 * without attempting to raise page count. 1260 */ 1261 goto export; 1262 } 1263 if (!page_cache_get_speculative(page)) 1264 goto repeat; 1265 1266 /* Has the page moved? 
		 */
		if (unlikely(page != *slot)) {
			put_page(page);
			goto repeat;
		}
export:
		indices[ret] = iter.index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();
	return ret;
}

/**
 * find_get_pages - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping. The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes. There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs. Skip
			 * over it.
			 */
			continue;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(page);
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}

	rcu_read_unlock();
	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping: The address_space to search
 * @index: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		/* We found a hole, no reason to continue */
		if (unlikely(!page))
			break;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs. Stop
			 * looking for contiguous pages.
			 */
			break;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved?
*/ 1394 if (unlikely(page != *slot)) { 1395 put_page(page); 1396 goto repeat; 1397 } 1398 1399 /* 1400 * must check mapping and index after taking the ref. 1401 * otherwise we can get both false positives and false 1402 * negatives, which is just confusing to the caller. 1403 */ 1404 if (page->mapping == NULL || page->index != iter.index) { 1405 put_page(page); 1406 break; 1407 } 1408 1409 pages[ret] = page; 1410 if (++ret == nr_pages) 1411 break; 1412 } 1413 rcu_read_unlock(); 1414 return ret; 1415 } 1416 EXPORT_SYMBOL(find_get_pages_contig); 1417 1418 /** 1419 * find_get_pages_tag - find and return pages that match @tag 1420 * @mapping: the address_space to search 1421 * @index: the starting page index 1422 * @tag: the tag index 1423 * @nr_pages: the maximum number of pages 1424 * @pages: where the resulting pages are placed 1425 * 1426 * Like find_get_pages, except we only return pages which are tagged with 1427 * @tag. We update @index to index the next page for the traversal. 1428 */ 1429 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 1430 int tag, unsigned int nr_pages, struct page **pages) 1431 { 1432 struct radix_tree_iter iter; 1433 void **slot; 1434 unsigned ret = 0; 1435 1436 if (unlikely(!nr_pages)) 1437 return 0; 1438 1439 rcu_read_lock(); 1440 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1441 &iter, *index, tag) { 1442 struct page *page; 1443 repeat: 1444 page = radix_tree_deref_slot(slot); 1445 if (unlikely(!page)) 1446 continue; 1447 1448 if (radix_tree_exception(page)) { 1449 if (radix_tree_deref_retry(page)) { 1450 slot = radix_tree_iter_retry(&iter); 1451 continue; 1452 } 1453 /* 1454 * A shadow entry of a recently evicted page. 1455 * 1456 * Those entries should never be tagged, but 1457 * this tree walk is lockless and the tags are 1458 * looked up in bulk, one radix tree node at a 1459 * time, so there is a sizable window for page 1460 * reclaim to evict a page we saw tagged. 1461 * 1462 * Skip over it. 1463 */ 1464 continue; 1465 } 1466 1467 if (!page_cache_get_speculative(page)) 1468 goto repeat; 1469 1470 /* Has the page moved? */ 1471 if (unlikely(page != *slot)) { 1472 put_page(page); 1473 goto repeat; 1474 } 1475 1476 pages[ret] = page; 1477 if (++ret == nr_pages) 1478 break; 1479 } 1480 1481 rcu_read_unlock(); 1482 1483 if (ret) 1484 *index = pages[ret - 1]->index + 1; 1485 1486 return ret; 1487 } 1488 EXPORT_SYMBOL(find_get_pages_tag); 1489 1490 /** 1491 * find_get_entries_tag - find and return entries that match @tag 1492 * @mapping: the address_space to search 1493 * @start: the starting page cache index 1494 * @tag: the tag index 1495 * @nr_entries: the maximum number of entries 1496 * @entries: where the resulting entries are placed 1497 * @indices: the cache indices corresponding to the entries in @entries 1498 * 1499 * Like find_get_entries, except we only return entries which are tagged with 1500 * @tag. 
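 *
 * A minimal calling sketch (with hypothetical caller-provided arrays
 * @entries and @indices of PAGEVEC_SIZE elements):
 *
 *	nr = find_get_entries_tag(mapping, start, PAGECACHE_TAG_DIRTY,
 *				  PAGEVEC_SIZE, entries, indices);
 *	for (i = 0; i < nr; i++)
 *		handle(entries[i], indices[i]);
 *
 * where handle() stands for whatever the caller does with each entry.
 * References taken on real pages must be dropped with put_page(); shadow,
 * swap and DAX entries carry no reference.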
1501 */ 1502 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1503 int tag, unsigned int nr_entries, 1504 struct page **entries, pgoff_t *indices) 1505 { 1506 void **slot; 1507 unsigned int ret = 0; 1508 struct radix_tree_iter iter; 1509 1510 if (!nr_entries) 1511 return 0; 1512 1513 rcu_read_lock(); 1514 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1515 &iter, start, tag) { 1516 struct page *page; 1517 repeat: 1518 page = radix_tree_deref_slot(slot); 1519 if (unlikely(!page)) 1520 continue; 1521 if (radix_tree_exception(page)) { 1522 if (radix_tree_deref_retry(page)) { 1523 slot = radix_tree_iter_retry(&iter); 1524 continue; 1525 } 1526 1527 /* 1528 * A shadow entry of a recently evicted page, a swap 1529 * entry from shmem/tmpfs or a DAX entry. Return it 1530 * without attempting to raise page count. 1531 */ 1532 goto export; 1533 } 1534 if (!page_cache_get_speculative(page)) 1535 goto repeat; 1536 1537 /* Has the page moved? */ 1538 if (unlikely(page != *slot)) { 1539 put_page(page); 1540 goto repeat; 1541 } 1542 export: 1543 indices[ret] = iter.index; 1544 entries[ret] = page; 1545 if (++ret == nr_entries) 1546 break; 1547 } 1548 rcu_read_unlock(); 1549 return ret; 1550 } 1551 EXPORT_SYMBOL(find_get_entries_tag); 1552 1553 /* 1554 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1555 * a _large_ part of the i/o request. Imagine the worst scenario: 1556 * 1557 * ---R__________________________________________B__________ 1558 * ^ reading here ^ bad block(assume 4k) 1559 * 1560 * read(R) => miss => readahead(R...B) => media error => frustrating retries 1561 * => failing the whole request => read(R) => read(R+1) => 1562 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => 1563 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => 1564 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 1565 * 1566 * It is going insane. Fix it by quickly scaling down the readahead size. 1567 */ 1568 static void shrink_readahead_size_eio(struct file *filp, 1569 struct file_ra_state *ra) 1570 { 1571 ra->ra_pages /= 4; 1572 } 1573 1574 /** 1575 * do_generic_file_read - generic file read routine 1576 * @filp: the file to read 1577 * @ppos: current file position 1578 * @iter: data destination 1579 * @written: already copied 1580 * 1581 * This is a generic file read routine, and uses the 1582 * mapping->a_ops->readpage() function for the actual low-level stuff. 1583 * 1584 * This is really ugly. But the goto's actually try to clarify some 1585 * of the logic when it comes to error handling etc. 
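 *
 * As a rough map of the labels used below:
 *
 *	find_page:		look the page up, kick readahead if missing
 *	page_ok:		page is uptodate, copy it to the iterator
 *	page_not_up_to_date[_locked]:	lock the page and re-check it
 *	readpage:		ask the filesystem to read the page in
 *	readpage_error:		a synchronous read error, bail out
 *	no_cached_page:		allocate a page and add it to the page cache
 *	out:			update the read-ahead state and return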
1586 */ 1587 static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, 1588 struct iov_iter *iter, ssize_t written) 1589 { 1590 struct address_space *mapping = filp->f_mapping; 1591 struct inode *inode = mapping->host; 1592 struct file_ra_state *ra = &filp->f_ra; 1593 pgoff_t index; 1594 pgoff_t last_index; 1595 pgoff_t prev_index; 1596 unsigned long offset; /* offset into pagecache page */ 1597 unsigned int prev_offset; 1598 int error = 0; 1599 1600 index = *ppos >> PAGE_SHIFT; 1601 prev_index = ra->prev_pos >> PAGE_SHIFT; 1602 prev_offset = ra->prev_pos & (PAGE_SIZE-1); 1603 last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; 1604 offset = *ppos & ~PAGE_MASK; 1605 1606 for (;;) { 1607 struct page *page; 1608 pgoff_t end_index; 1609 loff_t isize; 1610 unsigned long nr, ret; 1611 1612 cond_resched(); 1613 find_page: 1614 page = find_get_page(mapping, index); 1615 if (!page) { 1616 page_cache_sync_readahead(mapping, 1617 ra, filp, 1618 index, last_index - index); 1619 page = find_get_page(mapping, index); 1620 if (unlikely(page == NULL)) 1621 goto no_cached_page; 1622 } 1623 if (PageReadahead(page)) { 1624 page_cache_async_readahead(mapping, 1625 ra, filp, page, 1626 index, last_index - index); 1627 } 1628 if (!PageUptodate(page)) { 1629 /* 1630 * See comment in do_read_cache_page on why 1631 * wait_on_page_locked is used to avoid unnecessarily 1632 * serialisations and why it's safe. 1633 */ 1634 wait_on_page_locked_killable(page); 1635 if (PageUptodate(page)) 1636 goto page_ok; 1637 1638 if (inode->i_blkbits == PAGE_SHIFT || 1639 !mapping->a_ops->is_partially_uptodate) 1640 goto page_not_up_to_date; 1641 if (!trylock_page(page)) 1642 goto page_not_up_to_date; 1643 /* Did it get truncated before we got the lock? */ 1644 if (!page->mapping) 1645 goto page_not_up_to_date_locked; 1646 if (!mapping->a_ops->is_partially_uptodate(page, 1647 offset, iter->count)) 1648 goto page_not_up_to_date_locked; 1649 unlock_page(page); 1650 } 1651 page_ok: 1652 /* 1653 * i_size must be checked after we know the page is Uptodate. 1654 * 1655 * Checking i_size after the check allows us to calculate 1656 * the correct value for "nr", which means the zero-filled 1657 * part of the page is not copied back to userspace (unless 1658 * another truncate extends the file - this is desired though). 1659 */ 1660 1661 isize = i_size_read(inode); 1662 end_index = (isize - 1) >> PAGE_SHIFT; 1663 if (unlikely(!isize || index > end_index)) { 1664 put_page(page); 1665 goto out; 1666 } 1667 1668 /* nr is the maximum number of bytes to copy from this page */ 1669 nr = PAGE_SIZE; 1670 if (index == end_index) { 1671 nr = ((isize - 1) & ~PAGE_MASK) + 1; 1672 if (nr <= offset) { 1673 put_page(page); 1674 goto out; 1675 } 1676 } 1677 nr = nr - offset; 1678 1679 /* If users can be writing to this page using arbitrary 1680 * virtual addresses, take care about potential aliasing 1681 * before reading the page on the kernel side. 1682 */ 1683 if (mapping_writably_mapped(mapping)) 1684 flush_dcache_page(page); 1685 1686 /* 1687 * When a sequential read accesses a page several times, 1688 * only mark it as accessed the first time. 1689 */ 1690 if (prev_index != index || offset != prev_offset) 1691 mark_page_accessed(page); 1692 prev_index = index; 1693 1694 /* 1695 * Ok, we have the page, and it's up-to-date, so 1696 * now we can copy it to user space... 
1697 */ 1698 1699 ret = copy_page_to_iter(page, offset, nr, iter); 1700 offset += ret; 1701 index += offset >> PAGE_SHIFT; 1702 offset &= ~PAGE_MASK; 1703 prev_offset = offset; 1704 1705 put_page(page); 1706 written += ret; 1707 if (!iov_iter_count(iter)) 1708 goto out; 1709 if (ret < nr) { 1710 error = -EFAULT; 1711 goto out; 1712 } 1713 continue; 1714 1715 page_not_up_to_date: 1716 /* Get exclusive access to the page ... */ 1717 error = lock_page_killable(page); 1718 if (unlikely(error)) 1719 goto readpage_error; 1720 1721 page_not_up_to_date_locked: 1722 /* Did it get truncated before we got the lock? */ 1723 if (!page->mapping) { 1724 unlock_page(page); 1725 put_page(page); 1726 continue; 1727 } 1728 1729 /* Did somebody else fill it already? */ 1730 if (PageUptodate(page)) { 1731 unlock_page(page); 1732 goto page_ok; 1733 } 1734 1735 readpage: 1736 /* 1737 * A previous I/O error may have been due to temporary 1738 * failures, eg. multipath errors. 1739 * PG_error will be set again if readpage fails. 1740 */ 1741 ClearPageError(page); 1742 /* Start the actual read. The read will unlock the page. */ 1743 error = mapping->a_ops->readpage(filp, page); 1744 1745 if (unlikely(error)) { 1746 if (error == AOP_TRUNCATED_PAGE) { 1747 put_page(page); 1748 error = 0; 1749 goto find_page; 1750 } 1751 goto readpage_error; 1752 } 1753 1754 if (!PageUptodate(page)) { 1755 error = lock_page_killable(page); 1756 if (unlikely(error)) 1757 goto readpage_error; 1758 if (!PageUptodate(page)) { 1759 if (page->mapping == NULL) { 1760 /* 1761 * invalidate_mapping_pages got it 1762 */ 1763 unlock_page(page); 1764 put_page(page); 1765 goto find_page; 1766 } 1767 unlock_page(page); 1768 shrink_readahead_size_eio(filp, ra); 1769 error = -EIO; 1770 goto readpage_error; 1771 } 1772 unlock_page(page); 1773 } 1774 1775 goto page_ok; 1776 1777 readpage_error: 1778 /* UHHUH! A synchronous read error occurred. Report it */ 1779 put_page(page); 1780 goto out; 1781 1782 no_cached_page: 1783 /* 1784 * Ok, it wasn't cached, so we need to create a new 1785 * page.. 1786 */ 1787 page = page_cache_alloc_cold(mapping); 1788 if (!page) { 1789 error = -ENOMEM; 1790 goto out; 1791 } 1792 error = add_to_page_cache_lru(page, mapping, index, 1793 mapping_gfp_constraint(mapping, GFP_KERNEL)); 1794 if (error) { 1795 put_page(page); 1796 if (error == -EEXIST) { 1797 error = 0; 1798 goto find_page; 1799 } 1800 goto out; 1801 } 1802 goto readpage; 1803 } 1804 1805 out: 1806 ra->prev_pos = prev_index; 1807 ra->prev_pos <<= PAGE_SHIFT; 1808 ra->prev_pos |= prev_offset; 1809 1810 *ppos = ((loff_t)index << PAGE_SHIFT) + offset; 1811 file_accessed(filp); 1812 return written ? written : error; 1813 } 1814 1815 /** 1816 * generic_file_read_iter - generic filesystem read routine 1817 * @iocb: kernel I/O control block 1818 * @iter: destination for the data read 1819 * 1820 * This is the "read_iter()" routine for all filesystems 1821 * that can use the page cache directly. 
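 *
 * Filesystems typically hook this up directly in their file_operations;
 * a sketch for a hypothetical filesystem "foo":
 *
 *	const struct file_operations foo_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *	};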
1822 */ 1823 ssize_t 1824 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) 1825 { 1826 struct file *file = iocb->ki_filp; 1827 ssize_t retval = 0; 1828 size_t count = iov_iter_count(iter); 1829 1830 if (!count) 1831 goto out; /* skip atime */ 1832 1833 if (iocb->ki_flags & IOCB_DIRECT) { 1834 struct address_space *mapping = file->f_mapping; 1835 struct inode *inode = mapping->host; 1836 loff_t size; 1837 1838 size = i_size_read(inode); 1839 retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, 1840 iocb->ki_pos + count - 1); 1841 if (!retval) { 1842 struct iov_iter data = *iter; 1843 retval = mapping->a_ops->direct_IO(iocb, &data); 1844 } 1845 1846 if (retval > 0) { 1847 iocb->ki_pos += retval; 1848 iov_iter_advance(iter, retval); 1849 } 1850 1851 /* 1852 * Btrfs can have a short DIO read if we encounter 1853 * compressed extents, so if there was an error, or if 1854 * we've already read everything we wanted to, or if 1855 * there was a short read because we hit EOF, go ahead 1856 * and return. Otherwise fallthrough to buffered io for 1857 * the rest of the read. Buffered reads will not work for 1858 * DAX files, so don't bother trying. 1859 */ 1860 if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || 1861 IS_DAX(inode)) { 1862 file_accessed(file); 1863 goto out; 1864 } 1865 } 1866 1867 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); 1868 out: 1869 return retval; 1870 } 1871 EXPORT_SYMBOL(generic_file_read_iter); 1872 1873 #ifdef CONFIG_MMU 1874 /** 1875 * page_cache_read - adds requested page to the page cache if not already there 1876 * @file: file to read 1877 * @offset: page index 1878 * @gfp_mask: memory allocation flags 1879 * 1880 * This adds the requested page to the page cache if it isn't already there, 1881 * and schedules an I/O to read in its contents from disk. 1882 */ 1883 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 1884 { 1885 struct address_space *mapping = file->f_mapping; 1886 struct page *page; 1887 int ret; 1888 1889 do { 1890 page = __page_cache_alloc(gfp_mask|__GFP_COLD); 1891 if (!page) 1892 return -ENOMEM; 1893 1894 ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); 1895 if (ret == 0) 1896 ret = mapping->a_ops->readpage(file, page); 1897 else if (ret == -EEXIST) 1898 ret = 0; /* losing race to add is OK */ 1899 1900 put_page(page); 1901 1902 } while (ret == AOP_TRUNCATED_PAGE); 1903 1904 return ret; 1905 } 1906 1907 #define MMAP_LOTSAMISS (100) 1908 1909 /* 1910 * Synchronous readahead happens when we don't even find 1911 * a page in the page cache at all. 1912 */ 1913 static void do_sync_mmap_readahead(struct vm_area_struct *vma, 1914 struct file_ra_state *ra, 1915 struct file *file, 1916 pgoff_t offset) 1917 { 1918 struct address_space *mapping = file->f_mapping; 1919 1920 /* If we don't want any read-ahead, don't bother */ 1921 if (vma->vm_flags & VM_RAND_READ) 1922 return; 1923 if (!ra->ra_pages) 1924 return; 1925 1926 if (vma->vm_flags & VM_SEQ_READ) { 1927 page_cache_sync_readahead(mapping, ra, file, offset, 1928 ra->ra_pages); 1929 return; 1930 } 1931 1932 /* Avoid banging the cache line if not needed */ 1933 if (ra->mmap_miss < MMAP_LOTSAMISS * 10) 1934 ra->mmap_miss++; 1935 1936 /* 1937 * Do we miss much more than hit in this file? If so, 1938 * stop bothering with read-ahead. It will only hurt. 
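	 *
	 * MMAP_LOTSAMISS (100) is the cut-off used below. The counter is
	 * capped at ten times that above and decremented on cache hits in
	 * do_async_mmap_readahead(), so a later run of hits can bring it
	 * back under the threshold and re-enable read-around.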
1939 */ 1940 if (ra->mmap_miss > MMAP_LOTSAMISS) 1941 return; 1942 1943 /* 1944 * mmap read-around 1945 */ 1946 ra->start = max_t(long, 0, offset - ra->ra_pages / 2); 1947 ra->size = ra->ra_pages; 1948 ra->async_size = ra->ra_pages / 4; 1949 ra_submit(ra, mapping, file); 1950 } 1951 1952 /* 1953 * Asynchronous readahead happens when we find the page and PG_readahead, 1954 * so we want to possibly extend the readahead further.. 1955 */ 1956 static void do_async_mmap_readahead(struct vm_area_struct *vma, 1957 struct file_ra_state *ra, 1958 struct file *file, 1959 struct page *page, 1960 pgoff_t offset) 1961 { 1962 struct address_space *mapping = file->f_mapping; 1963 1964 /* If we don't want any read-ahead, don't bother */ 1965 if (vma->vm_flags & VM_RAND_READ) 1966 return; 1967 if (ra->mmap_miss > 0) 1968 ra->mmap_miss--; 1969 if (PageReadahead(page)) 1970 page_cache_async_readahead(mapping, ra, file, 1971 page, offset, ra->ra_pages); 1972 } 1973 1974 /** 1975 * filemap_fault - read in file data for page fault handling 1976 * @vma: vma in which the fault was taken 1977 * @vmf: struct vm_fault containing details of the fault 1978 * 1979 * filemap_fault() is invoked via the vma operations vector for a 1980 * mapped memory region to read in file data during a page fault. 1981 * 1982 * The goto's are kind of ugly, but this streamlines the normal case of having 1983 * it in the page cache, and handles the special cases reasonably without 1984 * having a lot of duplicated code. 1985 * 1986 * vma->vm_mm->mmap_sem must be held on entry. 1987 * 1988 * If our return value has VM_FAULT_RETRY set, it's because 1989 * lock_page_or_retry() returned 0. 1990 * The mmap_sem has usually been released in this case. 1991 * See __lock_page_or_retry() for the exception. 1992 * 1993 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem 1994 * has not been released. 1995 * 1996 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 1997 */ 1998 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1999 { 2000 int error; 2001 struct file *file = vma->vm_file; 2002 struct address_space *mapping = file->f_mapping; 2003 struct file_ra_state *ra = &file->f_ra; 2004 struct inode *inode = mapping->host; 2005 pgoff_t offset = vmf->pgoff; 2006 struct page *page; 2007 loff_t size; 2008 int ret = 0; 2009 2010 size = round_up(i_size_read(inode), PAGE_SIZE); 2011 if (offset >= size >> PAGE_SHIFT) 2012 return VM_FAULT_SIGBUS; 2013 2014 /* 2015 * Do we have something in the page cache already? 2016 */ 2017 page = find_get_page(mapping, offset); 2018 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { 2019 /* 2020 * We found the page, so try async readahead before 2021 * waiting for the lock. 2022 */ 2023 do_async_mmap_readahead(vma, ra, file, page, offset); 2024 } else if (!page) { 2025 /* No page in the page cache at all */ 2026 do_sync_mmap_readahead(vma, ra, file, offset); 2027 count_vm_event(PGMAJFAULT); 2028 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 2029 ret = VM_FAULT_MAJOR; 2030 retry_find: 2031 page = find_get_page(mapping, offset); 2032 if (!page) 2033 goto no_cached_page; 2034 } 2035 2036 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { 2037 put_page(page); 2038 return ret | VM_FAULT_RETRY; 2039 } 2040 2041 /* Did it get truncated? 
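 * find_get_page() ran without the page lock, so the page may have been
 * truncated from this mapping before we managed to lock it; if so, drop
 * our reference and retry the lookup.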
*/ 2042 if (unlikely(page->mapping != mapping)) { 2043 unlock_page(page); 2044 put_page(page); 2045 goto retry_find; 2046 } 2047 VM_BUG_ON_PAGE(page->index != offset, page); 2048 2049 /* 2050 * We have a locked page in the page cache, now we need to check 2051 * that it's up-to-date. If not, it is going to be due to an error. 2052 */ 2053 if (unlikely(!PageUptodate(page))) 2054 goto page_not_uptodate; 2055 2056 /* 2057 * Found the page and have a reference on it. 2058 * We must recheck i_size under page lock. 2059 */ 2060 size = round_up(i_size_read(inode), PAGE_SIZE); 2061 if (unlikely(offset >= size >> PAGE_SHIFT)) { 2062 unlock_page(page); 2063 put_page(page); 2064 return VM_FAULT_SIGBUS; 2065 } 2066 2067 vmf->page = page; 2068 return ret | VM_FAULT_LOCKED; 2069 2070 no_cached_page: 2071 /* 2072 * We're only likely to ever get here if MADV_RANDOM is in 2073 * effect. 2074 */ 2075 error = page_cache_read(file, offset, vmf->gfp_mask); 2076 2077 /* 2078 * The page we want has now been added to the page cache. 2079 * In the unlikely event that someone removed it in the 2080 * meantime, we'll just come back here and read it again. 2081 */ 2082 if (error >= 0) 2083 goto retry_find; 2084 2085 /* 2086 * An error return from page_cache_read can result if the 2087 * system is low on memory, or a problem occurs while trying 2088 * to schedule I/O. 2089 */ 2090 if (error == -ENOMEM) 2091 return VM_FAULT_OOM; 2092 return VM_FAULT_SIGBUS; 2093 2094 page_not_uptodate: 2095 /* 2096 * Umm, take care of errors if the page isn't up-to-date. 2097 * Try to re-read it _once_. We do this synchronously, 2098 * because there really aren't any performance issues here 2099 * and we need to check for errors. 2100 */ 2101 ClearPageError(page); 2102 error = mapping->a_ops->readpage(file, page); 2103 if (!error) { 2104 wait_on_page_locked(page); 2105 if (!PageUptodate(page)) 2106 error = -EIO; 2107 } 2108 put_page(page); 2109 2110 if (!error || error == AOP_TRUNCATED_PAGE) 2111 goto retry_find; 2112 2113 /* Things didn't work out. Return zero to tell the mm layer so. */ 2114 shrink_readahead_size_eio(file, ra); 2115 return VM_FAULT_SIGBUS; 2116 } 2117 EXPORT_SYMBOL(filemap_fault); 2118 2119 void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 2120 { 2121 struct radix_tree_iter iter; 2122 void **slot; 2123 struct file *file = vma->vm_file; 2124 struct address_space *mapping = file->f_mapping; 2125 loff_t size; 2126 struct page *page; 2127 unsigned long address = (unsigned long) vmf->virtual_address; 2128 unsigned long addr; 2129 pte_t *pte; 2130 2131 rcu_read_lock(); 2132 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { 2133 if (iter.index > vmf->max_pgoff) 2134 break; 2135 repeat: 2136 page = radix_tree_deref_slot(slot); 2137 if (unlikely(!page)) 2138 goto next; 2139 if (radix_tree_exception(page)) { 2140 if (radix_tree_deref_retry(page)) { 2141 slot = radix_tree_iter_retry(&iter); 2142 continue; 2143 } 2144 goto next; 2145 } 2146 2147 if (!page_cache_get_speculative(page)) 2148 goto repeat; 2149 2150 /* Has the page moved? 
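 * page_cache_get_speculative() only pinned whatever page the slot
 * pointed at earlier; under RCU the slot may since have been
 * reassigned, so recheck it and retry if it changed.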
*/ 2151 if (unlikely(page != *slot)) { 2152 put_page(page); 2153 goto repeat; 2154 } 2155 2156 if (!PageUptodate(page) || 2157 PageReadahead(page) || 2158 PageHWPoison(page)) 2159 goto skip; 2160 if (!trylock_page(page)) 2161 goto skip; 2162 2163 if (page->mapping != mapping || !PageUptodate(page)) 2164 goto unlock; 2165 2166 size = round_up(i_size_read(mapping->host), PAGE_SIZE); 2167 if (page->index >= size >> PAGE_SHIFT) 2168 goto unlock; 2169 2170 pte = vmf->pte + page->index - vmf->pgoff; 2171 if (!pte_none(*pte)) 2172 goto unlock; 2173 2174 if (file->f_ra.mmap_miss > 0) 2175 file->f_ra.mmap_miss--; 2176 addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; 2177 do_set_pte(vma, addr, page, pte, false, false, true); 2178 unlock_page(page); 2179 goto next; 2180 unlock: 2181 unlock_page(page); 2182 skip: 2183 put_page(page); 2184 next: 2185 if (iter.index == vmf->max_pgoff) 2186 break; 2187 } 2188 rcu_read_unlock(); 2189 } 2190 EXPORT_SYMBOL(filemap_map_pages); 2191 2192 int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 2193 { 2194 struct page *page = vmf->page; 2195 struct inode *inode = file_inode(vma->vm_file); 2196 int ret = VM_FAULT_LOCKED; 2197 2198 sb_start_pagefault(inode->i_sb); 2199 file_update_time(vma->vm_file); 2200 lock_page(page); 2201 if (page->mapping != inode->i_mapping) { 2202 unlock_page(page); 2203 ret = VM_FAULT_NOPAGE; 2204 goto out; 2205 } 2206 /* 2207 * We mark the page dirty already here so that when freeze is in 2208 * progress, we are guaranteed that writeback during freezing will 2209 * see the dirty page and writeprotect it again. 2210 */ 2211 set_page_dirty(page); 2212 wait_for_stable_page(page); 2213 out: 2214 sb_end_pagefault(inode->i_sb); 2215 return ret; 2216 } 2217 EXPORT_SYMBOL(filemap_page_mkwrite); 2218 2219 const struct vm_operations_struct generic_file_vm_ops = { 2220 .fault = filemap_fault, 2221 .map_pages = filemap_map_pages, 2222 .page_mkwrite = filemap_page_mkwrite, 2223 }; 2224 2225 /* This is used for a general mmap of a disk file */ 2226 2227 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2228 { 2229 struct address_space *mapping = file->f_mapping; 2230 2231 if (!mapping->a_ops->readpage) 2232 return -ENOEXEC; 2233 file_accessed(file); 2234 vma->vm_ops = &generic_file_vm_ops; 2235 return 0; 2236 } 2237 2238 /* 2239 * This is for filesystems which do not implement ->writepage. 
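 * Writable shared mappings are refused (-EINVAL below) because dirtying
 * pages through such a mapping would eventually require ->writepage to
 * write them back; read-only and private mappings remain fine.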
2240 */ 2241 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) 2242 { 2243 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2244 return -EINVAL; 2245 return generic_file_mmap(file, vma); 2246 } 2247 #else 2248 int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2249 { 2250 return -ENOSYS; 2251 } 2252 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) 2253 { 2254 return -ENOSYS; 2255 } 2256 #endif /* CONFIG_MMU */ 2257 2258 EXPORT_SYMBOL(generic_file_mmap); 2259 EXPORT_SYMBOL(generic_file_readonly_mmap); 2260 2261 static struct page *wait_on_page_read(struct page *page) 2262 { 2263 if (!IS_ERR(page)) { 2264 wait_on_page_locked(page); 2265 if (!PageUptodate(page)) { 2266 put_page(page); 2267 page = ERR_PTR(-EIO); 2268 } 2269 } 2270 return page; 2271 } 2272 2273 static struct page *do_read_cache_page(struct address_space *mapping, 2274 pgoff_t index, 2275 int (*filler)(void *, struct page *), 2276 void *data, 2277 gfp_t gfp) 2278 { 2279 struct page *page; 2280 int err; 2281 repeat: 2282 page = find_get_page(mapping, index); 2283 if (!page) { 2284 page = __page_cache_alloc(gfp | __GFP_COLD); 2285 if (!page) 2286 return ERR_PTR(-ENOMEM); 2287 err = add_to_page_cache_lru(page, mapping, index, gfp); 2288 if (unlikely(err)) { 2289 put_page(page); 2290 if (err == -EEXIST) 2291 goto repeat; 2292 /* Presumably ENOMEM for radix tree node */ 2293 return ERR_PTR(err); 2294 } 2295 2296 filler: 2297 err = filler(data, page); 2298 if (err < 0) { 2299 put_page(page); 2300 return ERR_PTR(err); 2301 } 2302 2303 page = wait_on_page_read(page); 2304 if (IS_ERR(page)) 2305 return page; 2306 goto out; 2307 } 2308 if (PageUptodate(page)) 2309 goto out; 2310 2311 /* 2312 * Page is not up to date and may be locked due to one of the following: 2313 * case a: Page is being filled and the page lock is held 2314 * case b: Read/write error clearing the page uptodate status 2315 * case c: Truncation in progress (page locked) 2316 * case d: Reclaim in progress 2317 * 2318 * Case a, the page will be up to date when the page is unlocked. 2319 * There is no need to serialise on the page lock here as the page 2320 * is pinned so the lock gives no additional protection. Even if 2321 * the page is truncated, the data is still valid if PageUptodate as 2322 * it's a read vs truncate race. 2323 * Case b, the page will not be up to date. 2324 * Case c, the page may be truncated but in itself, the data may still 2325 * be valid after IO completes as it's a read vs truncate race. The 2326 * operation must restart if the page is not uptodate on unlock but 2327 * otherwise serialising on page lock to stabilise the mapping gives 2328 * no additional guarantees to the caller as the page lock is 2329 * released before return. 2330 * Case d, similar to truncation. If reclaim holds the page lock, it 2331 * will be a race with remove_mapping that determines if the mapping 2332 * is valid on unlock but otherwise the data is valid and there is 2333 * no need to serialise with page lock. 2334 * 2335 * As the page lock gives no additional guarantee, we optimistically 2336 * wait on the page to be unlocked and check if it's up to date and 2337 * use the page if it is. Otherwise, the page lock is required to 2338 * distinguish between the different cases. The motivation is that we 2339 * avoid spurious serialisations and wakeups when multiple processes 2340 * wait on the same page for IO to complete.
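 *
 * Callers normally reach this path through read_cache_page() or
 * read_mapping_page(); a minimal, illustrative use looks like:
 *
 *	page = read_mapping_page(mapping, index, NULL);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... use the now-uptodate page ...
 *	put_page(page);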
2341 */ 2342 wait_on_page_locked(page); 2343 if (PageUptodate(page)) 2344 goto out; 2345 2346 /* Distinguish between all the cases under the safety of the lock */ 2347 lock_page(page); 2348 2349 /* Case c or d, restart the operation */ 2350 if (!page->mapping) { 2351 unlock_page(page); 2352 put_page(page); 2353 goto repeat; 2354 } 2355 2356 /* Someone else locked and filled the page in a very small window */ 2357 if (PageUptodate(page)) { 2358 unlock_page(page); 2359 goto out; 2360 } 2361 goto filler; 2362 2363 out: 2364 mark_page_accessed(page); 2365 return page; 2366 } 2367 2368 /** 2369 * read_cache_page - read into page cache, fill it if needed 2370 * @mapping: the page's address_space 2371 * @index: the page index 2372 * @filler: function to perform the read 2373 * @data: first arg to filler(data, page) function, often left as NULL 2374 * 2375 * Read into the page cache. If a page already exists, and PageUptodate() is 2376 * not set, try to fill the page and wait for it to become unlocked. 2377 * 2378 * If the page does not get brought uptodate, return -EIO. 2379 */ 2380 struct page *read_cache_page(struct address_space *mapping, 2381 pgoff_t index, 2382 int (*filler)(void *, struct page *), 2383 void *data) 2384 { 2385 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 2386 } 2387 EXPORT_SYMBOL(read_cache_page); 2388 2389 /** 2390 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 2391 * @mapping: the page's address_space 2392 * @index: the page index 2393 * @gfp: the page allocator flags to use if allocating 2394 * 2395 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 2396 * any new page allocations done using the specified allocation flags. 2397 * 2398 * If the page does not get brought uptodate, return -EIO. 2399 */ 2400 struct page *read_cache_page_gfp(struct address_space *mapping, 2401 pgoff_t index, 2402 gfp_t gfp) 2403 { 2404 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 2405 2406 return do_read_cache_page(mapping, index, filler, NULL, gfp); 2407 } 2408 EXPORT_SYMBOL(read_cache_page_gfp); 2409 2410 /* 2411 * Performs necessary checks before doing a write 2412 * 2413 * Can adjust writing position or amount of bytes to write. 2414 * Returns appropriate error code that caller should return or 2415 * zero in case that write should be allowed. 2416 */ 2417 inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 2418 { 2419 struct file *file = iocb->ki_filp; 2420 struct inode *inode = file->f_mapping->host; 2421 unsigned long limit = rlimit(RLIMIT_FSIZE); 2422 loff_t pos; 2423 2424 if (!iov_iter_count(from)) 2425 return 0; 2426 2427 /* FIXME: this is for backwards compatibility with 2.4 */ 2428 if (iocb->ki_flags & IOCB_APPEND) 2429 iocb->ki_pos = i_size_read(inode); 2430 2431 pos = iocb->ki_pos; 2432 2433 if (limit != RLIM_INFINITY) { 2434 if (iocb->ki_pos >= limit) { 2435 send_sig(SIGXFSZ, current, 0); 2436 return -EFBIG; 2437 } 2438 iov_iter_truncate(from, limit - (unsigned long)pos); 2439 } 2440 2441 /* 2442 * LFS rule 2443 */ 2444 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && 2445 !(file->f_flags & O_LARGEFILE))) { 2446 if (pos >= MAX_NON_LFS) 2447 return -EFBIG; 2448 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); 2449 } 2450 2451 /* 2452 * Are we about to exceed the fs block limit ? 2453 * 2454 * If we have written data it becomes a short write. If we have 2455 * exceeded without writing data we send a signal and return EFBIG. 
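 * For example, if pos is 100 bytes below s_maxbytes, a 4096-byte request
 * is truncated to 100 bytes and completes as a short write; if pos is
 * already at or beyond s_maxbytes, -EFBIG is returned before any data
 * is written.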
2456 * Linus frestrict idea will clean these up nicely.. 2457 */ 2458 if (unlikely(pos >= inode->i_sb->s_maxbytes)) 2459 return -EFBIG; 2460 2461 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); 2462 return iov_iter_count(from); 2463 } 2464 EXPORT_SYMBOL(generic_write_checks); 2465 2466 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2467 loff_t pos, unsigned len, unsigned flags, 2468 struct page **pagep, void **fsdata) 2469 { 2470 const struct address_space_operations *aops = mapping->a_ops; 2471 2472 return aops->write_begin(file, mapping, pos, len, flags, 2473 pagep, fsdata); 2474 } 2475 EXPORT_SYMBOL(pagecache_write_begin); 2476 2477 int pagecache_write_end(struct file *file, struct address_space *mapping, 2478 loff_t pos, unsigned len, unsigned copied, 2479 struct page *page, void *fsdata) 2480 { 2481 const struct address_space_operations *aops = mapping->a_ops; 2482 2483 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2484 } 2485 EXPORT_SYMBOL(pagecache_write_end); 2486 2487 ssize_t 2488 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) 2489 { 2490 struct file *file = iocb->ki_filp; 2491 struct address_space *mapping = file->f_mapping; 2492 struct inode *inode = mapping->host; 2493 loff_t pos = iocb->ki_pos; 2494 ssize_t written; 2495 size_t write_len; 2496 pgoff_t end; 2497 struct iov_iter data; 2498 2499 write_len = iov_iter_count(from); 2500 end = (pos + write_len - 1) >> PAGE_SHIFT; 2501 2502 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2503 if (written) 2504 goto out; 2505 2506 /* 2507 * After a write we want buffered reads to be sure to go to disk to get 2508 * the new data. We invalidate clean cached page from the region we're 2509 * about to write. We do this *before* the write so that we can return 2510 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2511 */ 2512 if (mapping->nrpages) { 2513 written = invalidate_inode_pages2_range(mapping, 2514 pos >> PAGE_SHIFT, end); 2515 /* 2516 * If a page can not be invalidated, return 0 to fall back 2517 * to buffered write. 2518 */ 2519 if (written) { 2520 if (written == -EBUSY) 2521 return 0; 2522 goto out; 2523 } 2524 } 2525 2526 data = *from; 2527 written = mapping->a_ops->direct_IO(iocb, &data); 2528 2529 /* 2530 * Finally, try again to invalidate clean pages which might have been 2531 * cached by non-direct readahead, or faulted in by get_user_pages() 2532 * if the source of the write was an mmap'ed region of the file 2533 * we're writing. Either one is a pretty crazy thing to do, 2534 * so we don't support it 100%. If this invalidation 2535 * fails, tough, the write still worked... 2536 */ 2537 if (mapping->nrpages) { 2538 invalidate_inode_pages2_range(mapping, 2539 pos >> PAGE_SHIFT, end); 2540 } 2541 2542 if (written > 0) { 2543 pos += written; 2544 iov_iter_advance(from, written); 2545 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2546 i_size_write(inode, pos); 2547 mark_inode_dirty(inode); 2548 } 2549 iocb->ki_pos = pos; 2550 } 2551 out: 2552 return written; 2553 } 2554 EXPORT_SYMBOL(generic_file_direct_write); 2555 2556 /* 2557 * Find or create a page at the given pagecache position. Return the locked 2558 * page. This function is specifically for buffered writes. 
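 *
 * A typical ->write_begin() implementation uses it roughly like this
 * (illustrative sketch only):
 *
 *	page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, flags);
 *	if (!page)
 *		return -ENOMEM;
 *	*pagep = page;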
2559 */ 2560 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2561 pgoff_t index, unsigned flags) 2562 { 2563 struct page *page; 2564 int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; 2565 2566 if (flags & AOP_FLAG_NOFS) 2567 fgp_flags |= FGP_NOFS; 2568 2569 page = pagecache_get_page(mapping, index, fgp_flags, 2570 mapping_gfp_mask(mapping)); 2571 if (page) 2572 wait_for_stable_page(page); 2573 2574 return page; 2575 } 2576 EXPORT_SYMBOL(grab_cache_page_write_begin); 2577 2578 ssize_t generic_perform_write(struct file *file, 2579 struct iov_iter *i, loff_t pos) 2580 { 2581 struct address_space *mapping = file->f_mapping; 2582 const struct address_space_operations *a_ops = mapping->a_ops; 2583 long status = 0; 2584 ssize_t written = 0; 2585 unsigned int flags = 0; 2586 2587 /* 2588 * Copies from kernel address space cannot fail (NFSD is a big user). 2589 */ 2590 if (!iter_is_iovec(i)) 2591 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2592 2593 do { 2594 struct page *page; 2595 unsigned long offset; /* Offset into pagecache page */ 2596 unsigned long bytes; /* Bytes to write to page */ 2597 size_t copied; /* Bytes copied from user */ 2598 void *fsdata; 2599 2600 offset = (pos & (PAGE_SIZE - 1)); 2601 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2602 iov_iter_count(i)); 2603 2604 again: 2605 /* 2606 * Bring in the user page that we will copy from _first_. 2607 * Otherwise there's a nasty deadlock on copying from the 2608 * same page as we're writing to, without it being marked 2609 * up-to-date. 2610 * 2611 * Not only is this an optimisation, but it is also required 2612 * to check that the address is actually valid, when atomic 2613 * usercopies are used, below. 2614 */ 2615 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2616 status = -EFAULT; 2617 break; 2618 } 2619 2620 if (fatal_signal_pending(current)) { 2621 status = -EINTR; 2622 break; 2623 } 2624 2625 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2626 &page, &fsdata); 2627 if (unlikely(status < 0)) 2628 break; 2629 2630 if (mapping_writably_mapped(mapping)) 2631 flush_dcache_page(page); 2632 2633 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2634 flush_dcache_page(page); 2635 2636 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2637 page, fsdata); 2638 if (unlikely(status < 0)) 2639 break; 2640 copied = status; 2641 2642 cond_resched(); 2643 2644 iov_iter_advance(i, copied); 2645 if (unlikely(copied == 0)) { 2646 /* 2647 * If we were unable to copy any data at all, we must 2648 * fall back to a single segment length write. 2649 * 2650 * If we didn't fallback here, we could livelock 2651 * because not all segments in the iov can be copied at 2652 * once without a pagefault. 2653 */ 2654 bytes = min_t(unsigned long, PAGE_SIZE - offset, 2655 iov_iter_single_seg_count(i)); 2656 goto again; 2657 } 2658 pos += copied; 2659 written += copied; 2660 2661 balance_dirty_pages_ratelimited(mapping); 2662 } while (iov_iter_count(i)); 2663 2664 return written ? written : status; 2665 } 2666 EXPORT_SYMBOL(generic_perform_write); 2667 2668 /** 2669 * __generic_file_write_iter - write data to a file 2670 * @iocb: IO state structure (file, offset, etc.) 2671 * @from: iov_iter with data to write 2672 * 2673 * This function does all the work needed for actually writing data to a 2674 * file. It does all basic checks, removes SUID from the file, updates 2675 * modification times and calls proper subroutines depending on whether we 2676 * do direct IO or a standard buffered write. 
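 *
 * For O_DIRECT writes that complete only partially, the remainder is
 * retried through the page cache and the affected range is then written
 * back and invalidated so the direct and cached views stay consistent.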
2677 * 2678 * It expects i_mutex to be grabbed unless we work on a block device or similar 2679 * object which does not need locking at all. 2680 * 2681 * This function does *not* take care of syncing data in case of O_SYNC write. 2682 * A caller has to handle it. This is mainly due to the fact that we want to 2683 * avoid syncing under i_mutex. 2684 */ 2685 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2686 { 2687 struct file *file = iocb->ki_filp; 2688 struct address_space * mapping = file->f_mapping; 2689 struct inode *inode = mapping->host; 2690 ssize_t written = 0; 2691 ssize_t err; 2692 ssize_t status; 2693 2694 /* We can write back this queue in page reclaim */ 2695 current->backing_dev_info = inode_to_bdi(inode); 2696 err = file_remove_privs(file); 2697 if (err) 2698 goto out; 2699 2700 err = file_update_time(file); 2701 if (err) 2702 goto out; 2703 2704 if (iocb->ki_flags & IOCB_DIRECT) { 2705 loff_t pos, endbyte; 2706 2707 written = generic_file_direct_write(iocb, from); 2708 /* 2709 * If the write stopped short of completing, fall back to 2710 * buffered writes. Some filesystems do this for writes to 2711 * holes, for example. For DAX files, a buffered write will 2712 * not succeed (even if it did, DAX does not handle dirty 2713 * page-cache pages correctly). 2714 */ 2715 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) 2716 goto out; 2717 2718 status = generic_perform_write(file, from, pos = iocb->ki_pos); 2719 /* 2720 * If generic_perform_write() returned a synchronous error 2721 * then we want to return the number of bytes which were 2722 * direct-written, or the error code if that was zero. Note 2723 * that this differs from normal direct-io semantics, which 2724 * will return -EFOO even if some bytes were written. 2725 */ 2726 if (unlikely(status < 0)) { 2727 err = status; 2728 goto out; 2729 } 2730 /* 2731 * We need to ensure that the page cache pages are written to 2732 * disk and invalidated to preserve the expected O_DIRECT 2733 * semantics. 2734 */ 2735 endbyte = pos + status - 1; 2736 err = filemap_write_and_wait_range(mapping, pos, endbyte); 2737 if (err == 0) { 2738 iocb->ki_pos = endbyte + 1; 2739 written += status; 2740 invalidate_mapping_pages(mapping, 2741 pos >> PAGE_SHIFT, 2742 endbyte >> PAGE_SHIFT); 2743 } else { 2744 /* 2745 * We don't know how much we wrote, so just return 2746 * the number of bytes which were direct-written 2747 */ 2748 } 2749 } else { 2750 written = generic_perform_write(file, from, iocb->ki_pos); 2751 if (likely(written > 0)) 2752 iocb->ki_pos += written; 2753 } 2754 out: 2755 current->backing_dev_info = NULL; 2756 return written ? written : err; 2757 } 2758 EXPORT_SYMBOL(__generic_file_write_iter); 2759 2760 /** 2761 * generic_file_write_iter - write data to a file 2762 * @iocb: IO state structure 2763 * @from: iov_iter with data to write 2764 * 2765 * This is a wrapper around __generic_file_write_iter() to be used by most 2766 * filesystems. It takes care of syncing the file in case of O_SYNC file 2767 * and acquires i_mutex as needed. 
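 *
 * As with the read side, most filesystems can use this directly in their
 * file_operations; an illustrative (made-up "myfs") hook-up:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *	};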
2768 */ 2769 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2770 { 2771 struct file *file = iocb->ki_filp; 2772 struct inode *inode = file->f_mapping->host; 2773 ssize_t ret; 2774 2775 inode_lock(inode); 2776 ret = generic_write_checks(iocb, from); 2777 if (ret > 0) 2778 ret = __generic_file_write_iter(iocb, from); 2779 inode_unlock(inode); 2780 2781 if (ret > 0) 2782 ret = generic_write_sync(iocb, ret); 2783 return ret; 2784 } 2785 EXPORT_SYMBOL(generic_file_write_iter); 2786 2787 /** 2788 * try_to_release_page() - release old fs-specific metadata on a page 2789 * 2790 * @page: the page which the kernel is trying to free 2791 * @gfp_mask: memory allocation flags (and I/O mode) 2792 * 2793 * The address_space is to try to release any data against the page 2794 * (presumably at page->private). If the release was successful, return `1'. 2795 * Otherwise return zero. 2796 * 2797 * This may also be called if PG_fscache is set on a page, indicating that the 2798 * page is known to the local caching routines. 2799 * 2800 * The @gfp_mask argument specifies whether I/O may be performed to release 2801 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 2802 * 2803 */ 2804 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2805 { 2806 struct address_space * const mapping = page->mapping; 2807 2808 BUG_ON(!PageLocked(page)); 2809 if (PageWriteback(page)) 2810 return 0; 2811 2812 if (mapping && mapping->a_ops->releasepage) 2813 return mapping->a_ops->releasepage(page, gfp_mask); 2814 return try_to_free_buffers(page); 2815 } 2816 2817 EXPORT_SYMBOL(try_to_release_page); 2818