/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999 Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_perform_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 * ->i_mmap_rwsem
 *   ->tasklist_lock		(memory_failure, collect_procs_ao)
 */

static void page_cache_tree_delete(struct address_space *mapping,
				   struct page *page, void *shadow)
{
	struct radix_tree_node *node;
	unsigned long index;
	unsigned int offset;
	unsigned int tag;
	void **slot;

	VM_BUG_ON(!PageLocked(page));

	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);

	if (shadow) {
		mapping->nrexceptional++;
		/*
		 * Make sure the nrexceptional update is committed before
		 * the nrpages update so that final truncate racing
		 * with reclaim does not see both counters 0 at the
		 * same time and miss a shadow entry.
		 */
		smp_wmb();
	}
	mapping->nrpages--;

	if (!node) {
		/* Clear direct pointer tags in root node */
		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
		radix_tree_replace_slot(slot, shadow);
		return;
	}

	/* Clear tree tags for the removed page */
	index = page->index;
	offset = index & RADIX_TREE_MAP_MASK;
	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
		if (test_bit(offset, node->tags[tag]))
			radix_tree_tag_clear(&mapping->page_tree, index, tag);
	}

	/* Delete page, swap shadow entry */
	radix_tree_replace_slot(slot, shadow);
	workingset_node_pages_dec(node);
	if (shadow)
		workingset_node_shadows_inc(node);
	else
		if (__radix_tree_delete_node(&mapping->page_tree, node))
			return;

	/*
	 * Track node that only contains shadow entries.
	 *
	 * Avoid acquiring the list_lru lock if already tracked.  The
	 * list_empty() test is safe as node->private_list is
	 * protected by mapping->tree_lock.
	 */
	if (!workingset_node_pages(node) &&
	    list_empty(&node->private_list)) {
		node->private_data = mapping;
		list_lru_add(&workingset_shadow_nodes, &node->private_list);
	}
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the mapping's tree_lock and
 * mem_cgroup_begin_page_stat().
 */
void __delete_from_page_cache(struct page *page, void *shadow,
			      struct mem_cgroup *memcg)
{
	struct address_space *mapping = page->mapping;

	trace_mm_filemap_delete_from_page_cache(page);
	/*
	 * if we're uptodate, flush out into the cleancache, otherwise
	 * invalidate any existing cleancache entries.  We can't leave
	 * stale data around in the cleancache once our page is gone
	 */
	if (PageUptodate(page) && PageMappedToDisk(page))
		cleancache_put_page(page);
	else
		cleancache_invalidate_page(mapping, page);

	VM_BUG_ON_PAGE(page_mapped(page), page);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
		int mapcount;

		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, page_to_pfn(page));
		dump_page(page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		mapcount = page_mapcount(page);
		if (mapping_exiting(mapping) &&
		    page_count(page) >= mapcount + 2) {
			/*
			 * All vmas have already been torn down, so it's
			 * a good bet that actually the page is unmapped,
			 * and we'd prefer not to leak it: if we're wrong,
			 * some other bad page check should catch it later.
			 */
			page_mapcount_reset(page);
			atomic_sub(mapcount, &page->_count);
		}
	}

	page_cache_tree_delete(mapping, page, shadow);

	page->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */

	/* hugetlb pages do not participate in page cache accounting. */
	if (!PageHuge(page))
		__dec_zone_page_state(page, NR_FILE_PAGES);
	if (PageSwapBacked(page))
		__dec_zone_page_state(page, NR_SHMEM);

	/*
	 * At this point page must be either written or cleaned by truncate.
	 * Dirty page here signals a bug and loss of unwritten data.
	 *
	 * This fixes dirty accounting after removing the page entirely but
	 * leaves PageDirty set: it has no effect for truncated page and
	 * anyway will be cleared before returning page into buddy allocator.
	 */
	if (WARN_ON_ONCE(PageDirty(page)))
		account_page_cleaned(page, mapping, memcg,
				     inode_to_wb(mapping->host));
}

/**
 * delete_from_page_cache - delete page from page cache
 * @page: the page which the kernel is trying to remove from page cache
 *
 * This must be called only on pages that have been verified to be in the page
 * cache and locked.  It will never put the page into the free list, the caller
 * has a reference on the page.
 */
void delete_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct mem_cgroup *memcg;
	unsigned long flags;

	void (*freepage)(struct page *);

	BUG_ON(!PageLocked(page));

	freepage = mapping->a_ops->freepage;

	memcg = mem_cgroup_begin_page_stat(page);
	spin_lock_irqsave(&mapping->tree_lock, flags);
	__delete_from_page_cache(page, NULL, memcg);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);

	if (freepage)
		freepage(page);
	page_cache_release(page);
}
EXPORT_SYMBOL(delete_from_page_cache);

static int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @start: offset in bytes where the range starts
 * @end: offset in bytes where the range ends (inclusive)
 * @sync_mode: enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
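 *
 * A minimal usage sketch, not taken from any particular caller (the helper
 * name here is illustrative only): kicking off a data-integrity flush of a
 * whole file could look like
 *
 *	int flush_whole_file(struct address_space *mapping)
 *	{
 *		return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX,
 *						  WB_SYNC_ALL);
 *	}
 *
 * which is essentially what __filemap_fdatawrite() below does.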
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			       loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
	ret = do_writepages(mapping, &wbc);
	wbc_detach_inode(&wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
				       int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			     loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping: target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

static int __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;

	if (end_byte < start_byte)
		goto out;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	return ret;
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping: address space structure to wait for
 * @start_byte: offset in bytes where the range starts
 * @end_byte: offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
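 *
 * As an illustrative pairing (variable names are hypothetical), callers
 * commonly start writeback on a range first and then wait on that same
 * range:
 *
 *	err = filemap_fdatawrite_range(mapping, pos, end);
 *	if (!err)
 *		err = filemap_fdatawait_range(mapping, pos, end);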
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	int ret, ret2;

	ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;

	return ret;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
void filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return;

	__filemap_fdatawait_range(mapping, 0, i_size - 1);
}

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Check error status of the address space
 * and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if ((!dax_mapping(mapping) && mapping->nrpages) ||
	    (dax_mapping(mapping) && mapping->nrexceptional)) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	} else {
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping: the address_space for the pages
 * @lstart: offset in bytes where the range starts
 * @lend: offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
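 *
 * An illustrative caller (mirroring what generic_file_read_iter() in this
 * file does before a direct read; the names are from that context):
 *
 *	retval = filemap_write_and_wait_range(mapping, pos,
 *					      pos + count - 1);
 *	if (!retval)
 *		retval = mapping->a_ops->direct_IO(iocb, &data, pos);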
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if ((!dax_mapping(mapping) && mapping->nrpages) ||
	    (dax_mapping(mapping) && mapping->nrexceptional)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
							   lstart, lend);
			if (!err)
				err = err2;
		}
	} else {
		err = filemap_check_errors(mapping);
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old: page to be replaced
 * @new: page to replace with
 * @gfp_mask: allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  The only way this function can fail is
 * memory allocation failure.
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	int error;

	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);
	VM_BUG_ON_PAGE(new->mapping, new);

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (!error) {
		struct address_space *mapping = old->mapping;
		void (*freepage)(struct page *);
		struct mem_cgroup *memcg;
		unsigned long flags;

		pgoff_t offset = old->index;
		freepage = mapping->a_ops->freepage;

		page_cache_get(new);
		new->mapping = mapping;
		new->index = offset;

		memcg = mem_cgroup_begin_page_stat(old);
		spin_lock_irqsave(&mapping->tree_lock, flags);
		__delete_from_page_cache(old, NULL, memcg);
		error = radix_tree_insert(&mapping->page_tree, offset, new);
		BUG_ON(error);
		mapping->nrpages++;

		/*
		 * hugetlb pages do not participate in page cache accounting.
		 */
		if (!PageHuge(new))
			__inc_zone_page_state(new, NR_FILE_PAGES);
		if (PageSwapBacked(new))
			__inc_zone_page_state(new, NR_SHMEM);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		mem_cgroup_end_page_stat(memcg);
		mem_cgroup_replace_page(old, new);
		radix_tree_preload_end();
		if (freepage)
			freepage(old);
		page_cache_release(old);
	}

	return error;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);

static int page_cache_tree_insert(struct address_space *mapping,
				  struct page *page, void **shadowp)
{
	struct radix_tree_node *node;
	void **slot;
	int error;

	error = __radix_tree_create(&mapping->page_tree, page->index,
				    &node, &slot);
	if (error)
		return error;
	if (*slot) {
		void *p;

		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
		if (!radix_tree_exceptional_entry(p))
			return -EEXIST;

		if (WARN_ON(dax_mapping(mapping)))
			return -EINVAL;

		if (shadowp)
			*shadowp = p;
		mapping->nrexceptional--;
		if (node)
			workingset_node_shadows_dec(node);
	}
	radix_tree_replace_slot(slot, page);
	mapping->nrpages++;
	if (node) {
		workingset_node_pages_inc(node);
		/*
		 * Don't track node that contains actual pages.
		 *
		 * Avoid acquiring the list_lru lock if already
		 * untracked.  The list_empty() test is safe as
		 * node->private_list is protected by
		 * mapping->tree_lock.
		 */
		if (!list_empty(&node->private_list))
			list_lru_del(&workingset_shadow_nodes,
				     &node->private_list);
	}
	return 0;
}

static int __add_to_page_cache_locked(struct page *page,
				      struct address_space *mapping,
				      pgoff_t offset, gfp_t gfp_mask,
				      void **shadowp)
{
	int huge = PageHuge(page);
	struct mem_cgroup *memcg;
	int error;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);

	if (!huge) {
		error = mem_cgroup_try_charge(page, current->mm,
					      gfp_mask, &memcg, false);
		if (error)
			return error;
	}

	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error) {
		if (!huge)
			mem_cgroup_cancel_charge(page, memcg, false);
		return error;
	}

	page_cache_get(page);
	page->mapping = mapping;
	page->index = offset;

	spin_lock_irq(&mapping->tree_lock);
	error = page_cache_tree_insert(mapping, page, shadowp);
	radix_tree_preload_end();
	if (unlikely(error))
		goto err_insert;

	/* hugetlb pages do not participate in page cache accounting. */
	if (!huge)
		__inc_zone_page_state(page, NR_FILE_PAGES);
	spin_unlock_irq(&mapping->tree_lock);
	if (!huge)
		mem_cgroup_commit_charge(page, memcg, false, false);
	trace_mm_filemap_add_to_page_cache(page);
	return 0;
err_insert:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	spin_unlock_irq(&mapping->tree_lock);
	if (!huge)
		mem_cgroup_cancel_charge(page, memcg, false);
	page_cache_release(page);
	return error;
}

/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page: page to add
 * @mapping: the page's address_space
 * @offset: page index
 * @gfp_mask: page allocation mode
 *
 * This function is used to add a page to the pagecache. It must be locked.
 * This function does not add the page to the LRU.  The caller must do that.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
			     pgoff_t offset, gfp_t gfp_mask)
{
	return __add_to_page_cache_locked(page, mapping, offset,
					  gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
			  pgoff_t offset, gfp_t gfp_mask)
{
	void *shadow = NULL;
	int ret;

	__SetPageLocked(page);
	ret = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
	if (unlikely(ret))
		__ClearPageLocked(page);
	else {
		/*
		 * The page might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed page.
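		 *
		 * (Descriptive note: the shadow value, when present, was
		 * stored by a previous eviction in page_cache_tree_delete();
		 * workingset_refault() only decodes it and never
		 * dereferences it as a page.)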
		 */
		if (shadow && workingset_refault(shadow)) {
			SetPageActive(page);
			workingset_activation(page);
		} else
			ClearPageActive(page);
		lru_cache_add(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	int n;
	struct page *page;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			page = __alloc_pages_node(n, gfp, 0);
		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

		return page;
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}
EXPORT_SYMBOL(page_waitqueue);

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
			      TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (!test_bit(bit_nr, &page->flags))
		return 0;

	return __wait_on_bit(page_waitqueue(page), &wait,
			     bit_wait_io, TASK_KILLABLE);
}

int wait_on_page_bit_killable_timeout(struct page *page,
				      int bit_nr, unsigned long timeout)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	wait.key.timeout = jiffies + timeout;
	if (!test_bit(bit_nr, &page->flags))
		return 0;
	return __wait_on_bit(page_waitqueue(page), &wait,
			     bit_wait_io_timeout, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
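 *
 * The usual (illustrative) caller pattern is simply:
 *
 *	lock_page(page);
 *	... operate on the page cache page ...
 *	unlock_page(page);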
 *
 * The mb is necessary to enforce ordering between the clear_bit and the read
 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	clear_bit_unlock(PG_locked, &page->flags);
	smp_mb__after_atomic();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	/*
	 * TestClearPageReclaim could be used here but it is an atomic
	 * operation and overkill in this particular case. Failing to
	 * shuffle a page marked for immediate reclaim is too mild to
	 * justify taking an atomic operation penalty at the end of
	 * every page writeback.
	 */
	if (PageReclaim(page)) {
		ClearPageReclaim(page);
		rotate_reclaimable_page(page);
	}

	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_atomic();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);

/*
 * After completing I/O on a page, call this routine to update the page
 * flags appropriately
 */
void page_endio(struct page *page, int rw, int err)
{
	if (rw == READ) {
		if (!err) {
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
	} else { /* rw == WRITE */
		if (err) {
			SetPageError(page);
			if (page->mapping)
				mapping_set_error(page->mapping, err);
		}
		end_page_writeback(page);
	}
}
EXPORT_SYMBOL_GPL(page_endio);

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 */
void __lock_page(struct page *page)
{
	struct page *page_head = compound_head(page);
	DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
			   TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *page)
{
	struct page *page_head = compound_head(page);
	DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);

	return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
				  bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

/*
 * Return values:
 * 1 - page is locked; mmap_sem is still held.
 * 0 - page is not locked.
 *     mmap_sem has been released (up_read()), unless flags had both
 *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
 *     which case mmap_sem is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
 * with the page locked and the mmap_sem unperturbed.
 */
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		/*
		 * CAUTION! In this case, mmap_sem is not released
		 * even though return 0.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return 0;

		up_read(&mm->mmap_sem);
		if (flags & FAULT_FLAG_KILLABLE)
			wait_on_page_locked_killable(page);
		else
			wait_on_page_locked(page);
		return 0;
	} else {
		if (flags & FAULT_FLAG_KILLABLE) {
			int ret;

			ret = __lock_page_killable(page);
			if (ret) {
				up_read(&mm->mmap_sem);
				return 0;
			}
		} else
			__lock_page(page);
		return 1;
	}
}

/**
 * page_cache_next_hole - find the next hole (not-present entry)
 * @mapping: mapping
 * @index: index
 * @max_scan: maximum range to search
 *
 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
 * lowest indexed hole.
 *
 * Returns: the index of the hole if found, otherwise returns an index
 * outside of the set specified (in which case 'return - index >=
 * max_scan' will be true). In rare cases of index wrap-around, 0 will
 * be returned.
 *
 * page_cache_next_hole may be called under rcu_read_lock. However,
 * like radix_tree_gang_lookup, this will not atomically search a
 * snapshot of the tree at a single point in time. For example, if a
 * hole is created at index 5, then subsequently a hole is created at
 * index 10, page_cache_next_hole covering both indexes may return 10
 * if called under rcu_read_lock.
 */
pgoff_t page_cache_next_hole(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	unsigned long i;

	for (i = 0; i < max_scan; i++) {
		struct page *page;

		page = radix_tree_lookup(&mapping->page_tree, index);
		if (!page || radix_tree_exceptional_entry(page))
			break;
		index++;
		if (index == 0)
			break;
	}

	return index;
}
EXPORT_SYMBOL(page_cache_next_hole);

/**
 * page_cache_prev_hole - find the prev hole (not-present entry)
 * @mapping: mapping
 * @index: index
 * @max_scan: maximum range to search
 *
 * Search backwards in the range [max(index-max_scan+1, 0), index] for
 * the first hole.
 *
 * Returns: the index of the hole if found, otherwise returns an index
 * outside of the set specified (in which case 'index - return >=
 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
 * will be returned.
 *
 * page_cache_prev_hole may be called under rcu_read_lock. However,
 * like radix_tree_gang_lookup, this will not atomically search a
 * snapshot of the tree at a single point in time. For example, if a
 * hole is created at index 10, then subsequently a hole is created at
 * index 5, page_cache_prev_hole covering both indexes may return 5 if
 * called under rcu_read_lock.
 */
pgoff_t page_cache_prev_hole(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	unsigned long i;

	for (i = 0; i < max_scan; i++) {
		struct page *page;

		page = radix_tree_lookup(&mapping->page_tree, index);
		if (!page || radix_tree_exceptional_entry(page))
			break;
		index--;
		if (index == ULONG_MAX)
			break;
	}

	return index;
}
EXPORT_SYMBOL(page_cache_prev_hole);

/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
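 *
 * (Illustrative note: the caller is expected to drop that refcount again
 * with page_cache_release() once it is done with the page, as the helpers
 * below such as find_lock_entry() and pagecache_get_page() do on their
 * truncation-race paths.)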
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Otherwise, %NULL is returned.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				goto repeat;
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Return
			 * it without attempting to raise page count.
			 */
			goto out;
		}
		if (!page_cache_get_speculative(page))
			goto repeat;

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_entry);

/**
 * find_lock_entry - locate, pin and lock a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Otherwise, %NULL is returned.
 *
 * find_lock_entry() may sleep.
 */
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	if (page && !radix_tree_exception(page)) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}
	return page;
}
EXPORT_SYMBOL(find_lock_entry);

/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: FGP flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * FGP flags modify how the page is returned.
 *
 * FGP_ACCESSED: the page will be marked accessed
 * FGP_LOCK: the page is returned locked
 * FGP_CREAT: If page is not present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU
 *   list. The page is returned locked and with an increased
 *   refcount. Otherwise, %NULL is returned.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
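 *
 * A usage sketch, not lifted from any particular filesystem (error handling
 * trimmed), of the common "find or create, locked" pattern:
 *
 *	page = pagecache_get_page(mapping, index,
 *				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 *				  mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 *	... use the locked, referenced page ...
 *	unlock_page(page);
 *	page_cache_release(page);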
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
				int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	page = find_get_entry(mapping, offset);
	if (radix_tree_exceptional_entry(page))
		page = NULL;
	if (!page)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				page_cache_release(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}

	if (page && (fgp_flags & FGP_ACCESSED))
		mark_page_accessed(page);

no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		err = add_to_page_cache_lru(page, mapping, offset,
					    gfp_mask & GFP_RECLAIM_MASK);
		if (unlikely(err)) {
			page_cache_release(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}

	return page;
}
EXPORT_SYMBOL(pagecache_get_page);

/**
 * find_get_entries - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page cache index
 * @nr_entries: The maximum number of entries
 * @entries: Where the resulting entries are placed
 * @indices: The cache indices corresponding to the entries in @entries
 *
 * find_get_entries() will search for and return a group of up to
 * @nr_entries entries in the mapping.  The entries are placed at
 * @entries.  find_get_entries() takes a reference against any actual
 * pages it returns.
 *
 * The search returns a group of mapping-contiguous page cache entries
 * with ascending indexes.  There may be holes in the indices due to
 * not-present pages.
 *
 * Any shadow entries of evicted pages, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * find_get_entries() returns the number of pages and shadow entries
 * which were found.
 */
unsigned find_get_entries(struct address_space *mapping,
			  pgoff_t start, unsigned int nr_entries,
			  struct page **entries, pgoff_t *indices)
{
	void **slot;
	unsigned int ret = 0;
	struct radix_tree_iter iter;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				goto restart;
			/*
			 * A shadow entry of a recently evicted page, a swap
			 * entry from shmem/tmpfs or a DAX entry. Return it
			 * without attempting to raise page count.
			 */
			goto export;
		}
		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}
export:
		indices[ret] = iter.index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();
	return ret;
}

/**
 * find_get_pages - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/*
				 * Transient condition which can only trigger
				 * when entry at index 0 moves out of or back
				 * to root: none yet gotten, safe to restart.
				 */
				WARN_ON(iter.index);
				goto restart;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Skip
			 * over it.
			 */
			continue;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}

	rcu_read_unlock();
	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping: The address_space to search
 * @index: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
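 *
 * Illustrative use (the locals are hypothetical): grabbing the first run of
 * pages backing a range and dropping the references afterwards:
 *
 *	nr = find_get_pages_contig(mapping, index, PAGEVEC_SIZE, pages);
 *	for (i = 0; i < nr; i++) {
 *		... look at pages[i] ...
 *		page_cache_release(pages[i]);
 *	}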
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned int ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		/* The hole, there is no reason to continue */
		if (unlikely(!page))
			break;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/*
				 * Transient condition which can only trigger
				 * when entry at index 0 moves out of or back
				 * to root: none yet gotten, safe to restart.
				 */
				goto restart;
			}
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Stop
			 * looking for contiguous pages.
			 */
			break;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}

		/*
		 * must check mapping and index after taking the ref.
		 * otherwise we can get both false positives and false
		 * negatives, which is just confusing to the caller.
		 */
		if (page->mapping == NULL || page->index != iter.index) {
			page_cache_release(page);
			break;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping: the address_space to search
 * @index: the starting page index
 * @tag: the tag index
 * @nr_pages: the maximum number of pages
 * @pages: where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.  We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			    int tag, unsigned int nr_pages, struct page **pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned ret = 0;

	if (unlikely(!nr_pages))
		return 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_tagged(slot, &mapping->page_tree,
				   &iter, *index, tag) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;

		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/*
				 * Transient condition which can only trigger
				 * when entry at index 0 moves out of or back
				 * to root: none yet gotten, safe to restart.
				 */
				goto restart;
			}
			/*
			 * A shadow entry of a recently evicted page.
			 *
			 * Those entries should never be tagged, but
			 * this tree walk is lockless and the tags are
			 * looked up in bulk, one radix tree node at a
			 * time, so there is a sizable window for page
			 * reclaim to evict a page we saw tagged.
			 *
			 * Skip over it.
			 */
			continue;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		if (++ret == nr_pages)
			break;
	}

	rcu_read_unlock();

	if (ret)
		*index = pages[ret - 1]->index + 1;

	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);

/**
 * find_get_entries_tag - find and return entries that match @tag
 * @mapping: the address_space to search
 * @start: the starting page cache index
 * @tag: the tag index
 * @nr_entries: the maximum number of entries
 * @entries: where the resulting entries are placed
 * @indices: the cache indices corresponding to the entries in @entries
 *
 * Like find_get_entries, except we only return entries which are tagged with
 * @tag.
 */
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
			      int tag, unsigned int nr_entries,
			      struct page **entries, pgoff_t *indices)
{
	void **slot;
	unsigned int ret = 0;
	struct radix_tree_iter iter;

	if (!nr_entries)
		return 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_tagged(slot, &mapping->page_tree,
				   &iter, start, tag) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				/*
				 * Transient condition which can only trigger
				 * when entry at index 0 moves out of or back
				 * to root: none yet gotten, safe to restart.
				 */
				goto restart;
			}

			/*
			 * A shadow entry of a recently evicted page, a swap
			 * entry from shmem/tmpfs or a DAX entry. Return it
			 * without attempting to raise page count.
			 */
			goto export;
		}
		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}
export:
		indices[ret] = iter.index;
		entries[ret] = page;
		if (++ret == nr_entries)
			break;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_entries_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
				      struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @iter: data destination
 * @written: already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
				    struct iov_iter *iter, ssize_t written)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	index = *ppos >> PAGE_CACHE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
	last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		page = find_get_page(mapping, index);
		if (!page) {
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
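		 *
		 * (Worked example, assuming 4K pages: with offset == 4000
		 * and nr == 96, a full copy makes ret == 96, so offset
		 * becomes 4096; the arithmetic below then advances index
		 * by one and wraps offset back to 0.)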
		 */

		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);
		written += ret;
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}

out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
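 *
 * Filesystems normally just plug it into their file_operations; a sketch
 * (the ops structure name here is made up):
 *
 *	const struct file_operations foo_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		...
 *	};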
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;
	loff_t pos = *ppos;

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		size_t count = iov_iter_count(iter);
		loff_t size;

		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		retval = filemap_write_and_wait_range(mapping, pos,
					pos + count - 1);
		if (!retval) {
			struct iov_iter data = *iter;
			retval = mapping->a_ops->direct_IO(iocb, &data, pos);
		}

		if (retval > 0) {
			*ppos = pos + retval;
			iov_iter_advance(iter, retval);
		}

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
		    IS_DAX(inode)) {
			file_accessed(file);
			goto out;
		}
	}

	retval = do_generic_file_read(file, ppos, iter, retval);
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file: file to read
 * @offset: page index
 * @gfp_mask: memory allocation flags
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/*
 * Synchronous readahead happens when we don't even find
 * a page in the page cache at all.
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (vma->vm_flags & VM_RAND_READ)
		return;
	if (!ra->ra_pages)
		return;

	if (vma->vm_flags & VM_SEQ_READ) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	/* Avoid banging the cache line if not needed */
	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file: file to read
 * @offset: page index
 * @gfp_mask: memory allocation flags
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/*
 * Synchronous readahead happens when we don't even find
 * a page in the page cache at all.
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (vma->vm_flags & VM_RAND_READ)
		return;
	if (!ra->ra_pages)
		return;

	if (vma->vm_flags & VM_SEQ_READ) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	/* Avoid banging the cache line if not needed */
	if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return;

	/*
	 * mmap read-around
	 */
	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
	ra_submit(ra, mapping, file);
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further..
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (vma->vm_flags & VM_RAND_READ)
		return;
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vma: vma in which the fault was taken
 * @vmf: struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_sem must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because
 * lock_page_or_retry() returned 0.
 * The mmap_sem has usually been released in this case.
 * See __lock_page_or_retry() for the exception.
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int error;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	struct page *page;
	loff_t size;
	int ret = 0;

	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
	if (offset >= size >> PAGE_CACHE_SHIFT)
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vma, ra, file, page, offset);
	} else if (!page) {
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;
	}

	/* Did it get truncated? */
	if (unlikely(page->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON_PAGE(page->index != offset, page);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
	if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset, vmf->gfp_mask);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	page_cache_release(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Things didn't work out. Return SIGBUS to tell the mm layer so. */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);
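
/*
 * Example (editorial sketch): filesystems that need their own serialization
 * against truncate or hole punching typically wrap filemap_fault() rather
 * than use it directly.  "foo" and the per-inode i_mmap_sem below are
 * hypothetical; only filemap_fault(), filemap_map_pages() and
 * filemap_page_mkwrite() are the real exported helpers.
 *
 *	static int foo_filemap_fault(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		down_read(&FOO_I(inode)->i_mmap_sem);
 *		ret = filemap_fault(vma, vmf);
 *		up_read(&FOO_I(inode)->i_mmap_sem);
 *
 *		return ret;
 *	}
 *
 *	static const struct vm_operations_struct foo_file_vm_ops = {
 *		.fault		= foo_filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= filemap_page_mkwrite,
 *	};
 */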

void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct radix_tree_iter iter;
	void **slot;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	loff_t size;
	struct page *page;
	unsigned long address = (unsigned long) vmf->virtual_address;
	unsigned long addr;
	pte_t *pte;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
		if (iter.index > vmf->max_pgoff)
			break;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			goto next;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				break;
			else
				goto next;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}

		if (!PageUptodate(page) ||
				PageReadahead(page) ||
				PageHWPoison(page))
			goto skip;
		if (!trylock_page(page))
			goto skip;

		if (page->mapping != mapping || !PageUptodate(page))
			goto unlock;

		size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
		if (page->index >= size >> PAGE_CACHE_SHIFT)
			goto unlock;

		pte = vmf->pte + page->index - vmf->pgoff;
		if (!pte_none(*pte))
			goto unlock;

		if (file->f_ra.mmap_miss > 0)
			file->f_ra.mmap_miss--;
		addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
		do_set_pte(vma, addr, page, pte, false, false);
		unlock_page(page);
		goto next;
unlock:
		unlock_page(page);
skip:
		page_cache_release(page);
next:
		if (iter.index == vmf->max_pgoff)
			break;
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(filemap_map_pages);

int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	int ret = VM_FAULT_LOCKED;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
EXPORT_SYMBOL(filemap_page_mkwrite);

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

static struct page *wait_on_page_read(struct page *page)
{
	if (!IS_ERR(page)) {
		wait_on_page_locked(page);
		if (!PageUptodate(page)) {
			page_cache_release(page);
			page = ERR_PTR(-EIO);
		}
	}
	return page;
}

static struct page *__read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp | __GFP_COLD);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			page_cache_release(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		} else {
			page = wait_on_page_read(page);
		}
	}
	return page;
}

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)

{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data, gfp);
	if (IS_ERR(page))
		return page;
	if (PageUptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		return ERR_PTR(err);
	} else {
		page = wait_on_page_read(page);
		if (IS_ERR(page))
			return page;
	}
out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: first arg to filler(data, page) function, often left as NULL
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page and wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
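
/*
 * Example (editorial sketch): a typical caller supplies a filler that reads
 * the page, marks it up to date and unlocks it, exactly as ->readpage would.
 * Everything named "foo" below is hypothetical; read_cache_page() and the
 * page-flag helpers are real.
 *
 *	static int foo_filler(void *data, struct page *page)
 *	{
 *		void *src = data;		// in-memory source, for illustration
 *		void *dst = kmap(page);
 *
 *		memcpy(dst, src + page_offset(page), PAGE_CACHE_SIZE);
 *		kunmap(page);
 *		flush_dcache_page(page);
 *		SetPageUptodate(page);
 *		unlock_page(page);
 *		return 0;
 *	}
 *
 *	page = read_cache_page(mapping, index, foo_filler, foo_src);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	...
 *	page_cache_release(page);
 *
 * Filesystems that simply want ->readpage as the filler can use
 * read_mapping_page() or read_cache_page_gfp() below instead.
 */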

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;

	return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns a negative error code that the caller should return,
 * or the number of bytes that may be written (0 when there is
 * nothing to write).
 */
inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	unsigned long limit = rlimit(RLIMIT_FSIZE);
	loff_t pos;

	if (!iov_iter_count(from))
		return 0;

	/* FIXME: this is for backwards compatibility with 2.4 */
	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

	pos = iocb->ki_pos;

	if (limit != RLIM_INFINITY) {
		if (iocb->ki_pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		iov_iter_truncate(from, limit - (unsigned long)pos);
	}

	/*
	 * LFS rule
	 */
	if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
				!(file->f_flags & O_LARGEFILE))) {
		if (pos >= MAX_NON_LFS)
			return -EFBIG;
		iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
	}

	/*
	 * Are we about to exceed the fs block limit ?
	 *
	 * If we have written data it becomes a short write. If we have
	 * exceeded without writing data we send a signal and return EFBIG.
	 * Linus' frestrict idea will clean these up nicely..
	 */
	if (unlikely(pos >= inode->i_sb->s_maxbytes))
		return -EFBIG;

	iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
	return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
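
/*
 * Example (editorial sketch): the expected calling convention.  A filesystem
 * that rolls its own ->write_iter under its own locking would do something
 * like the following (generic_file_write_iter() below is the canonical user):
 *
 *	inode_lock(inode);
 *	ret = generic_write_checks(iocb, from);
 *	if (ret > 0)
 *		ret = foo_do_the_write(iocb, from);	// hypothetical
 *	inode_unlock(inode);
 *
 * On return with ret > 0, iocb->ki_pos has been moved to EOF for O_APPEND and
 * "from" may have been shortened to honour RLIMIT_FSIZE, the non-LFS limit
 * and sb->s_maxbytes; ret == 0 means there is nothing to write.
 */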
2469 */ 2470 if (unlikely(pos >= inode->i_sb->s_maxbytes)) 2471 return -EFBIG; 2472 2473 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); 2474 return iov_iter_count(from); 2475 } 2476 EXPORT_SYMBOL(generic_write_checks); 2477 2478 int pagecache_write_begin(struct file *file, struct address_space *mapping, 2479 loff_t pos, unsigned len, unsigned flags, 2480 struct page **pagep, void **fsdata) 2481 { 2482 const struct address_space_operations *aops = mapping->a_ops; 2483 2484 return aops->write_begin(file, mapping, pos, len, flags, 2485 pagep, fsdata); 2486 } 2487 EXPORT_SYMBOL(pagecache_write_begin); 2488 2489 int pagecache_write_end(struct file *file, struct address_space *mapping, 2490 loff_t pos, unsigned len, unsigned copied, 2491 struct page *page, void *fsdata) 2492 { 2493 const struct address_space_operations *aops = mapping->a_ops; 2494 2495 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2496 } 2497 EXPORT_SYMBOL(pagecache_write_end); 2498 2499 ssize_t 2500 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) 2501 { 2502 struct file *file = iocb->ki_filp; 2503 struct address_space *mapping = file->f_mapping; 2504 struct inode *inode = mapping->host; 2505 ssize_t written; 2506 size_t write_len; 2507 pgoff_t end; 2508 struct iov_iter data; 2509 2510 write_len = iov_iter_count(from); 2511 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2512 2513 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); 2514 if (written) 2515 goto out; 2516 2517 /* 2518 * After a write we want buffered reads to be sure to go to disk to get 2519 * the new data. We invalidate clean cached page from the region we're 2520 * about to write. We do this *before* the write so that we can return 2521 * without clobbering -EIOCBQUEUED from ->direct_IO(). 2522 */ 2523 if (mapping->nrpages) { 2524 written = invalidate_inode_pages2_range(mapping, 2525 pos >> PAGE_CACHE_SHIFT, end); 2526 /* 2527 * If a page can not be invalidated, return 0 to fall back 2528 * to buffered write. 2529 */ 2530 if (written) { 2531 if (written == -EBUSY) 2532 return 0; 2533 goto out; 2534 } 2535 } 2536 2537 data = *from; 2538 written = mapping->a_ops->direct_IO(iocb, &data, pos); 2539 2540 /* 2541 * Finally, try again to invalidate clean pages which might have been 2542 * cached by non-direct readahead, or faulted in by get_user_pages() 2543 * if the source of the write was an mmap'ed region of the file 2544 * we're writing. Either one is a pretty crazy thing to do, 2545 * so we don't support it 100%. If this invalidation 2546 * fails, tough, the write still worked... 2547 */ 2548 if (mapping->nrpages) { 2549 invalidate_inode_pages2_range(mapping, 2550 pos >> PAGE_CACHE_SHIFT, end); 2551 } 2552 2553 if (written > 0) { 2554 pos += written; 2555 iov_iter_advance(from, written); 2556 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2557 i_size_write(inode, pos); 2558 mark_inode_dirty(inode); 2559 } 2560 iocb->ki_pos = pos; 2561 } 2562 out: 2563 return written; 2564 } 2565 EXPORT_SYMBOL(generic_file_direct_write); 2566 2567 /* 2568 * Find or create a page at the given pagecache position. Return the locked 2569 * page. This function is specifically for buffered writes. 
2570 */ 2571 struct page *grab_cache_page_write_begin(struct address_space *mapping, 2572 pgoff_t index, unsigned flags) 2573 { 2574 struct page *page; 2575 int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; 2576 2577 if (flags & AOP_FLAG_NOFS) 2578 fgp_flags |= FGP_NOFS; 2579 2580 page = pagecache_get_page(mapping, index, fgp_flags, 2581 mapping_gfp_mask(mapping)); 2582 if (page) 2583 wait_for_stable_page(page); 2584 2585 return page; 2586 } 2587 EXPORT_SYMBOL(grab_cache_page_write_begin); 2588 2589 ssize_t generic_perform_write(struct file *file, 2590 struct iov_iter *i, loff_t pos) 2591 { 2592 struct address_space *mapping = file->f_mapping; 2593 const struct address_space_operations *a_ops = mapping->a_ops; 2594 long status = 0; 2595 ssize_t written = 0; 2596 unsigned int flags = 0; 2597 2598 /* 2599 * Copies from kernel address space cannot fail (NFSD is a big user). 2600 */ 2601 if (!iter_is_iovec(i)) 2602 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2603 2604 do { 2605 struct page *page; 2606 unsigned long offset; /* Offset into pagecache page */ 2607 unsigned long bytes; /* Bytes to write to page */ 2608 size_t copied; /* Bytes copied from user */ 2609 void *fsdata; 2610 2611 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2612 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2613 iov_iter_count(i)); 2614 2615 again: 2616 /* 2617 * Bring in the user page that we will copy from _first_. 2618 * Otherwise there's a nasty deadlock on copying from the 2619 * same page as we're writing to, without it being marked 2620 * up-to-date. 2621 * 2622 * Not only is this an optimisation, but it is also required 2623 * to check that the address is actually valid, when atomic 2624 * usercopies are used, below. 2625 */ 2626 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2627 status = -EFAULT; 2628 break; 2629 } 2630 2631 if (fatal_signal_pending(current)) { 2632 status = -EINTR; 2633 break; 2634 } 2635 2636 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2637 &page, &fsdata); 2638 if (unlikely(status < 0)) 2639 break; 2640 2641 if (mapping_writably_mapped(mapping)) 2642 flush_dcache_page(page); 2643 2644 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2645 flush_dcache_page(page); 2646 2647 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2648 page, fsdata); 2649 if (unlikely(status < 0)) 2650 break; 2651 copied = status; 2652 2653 cond_resched(); 2654 2655 iov_iter_advance(i, copied); 2656 if (unlikely(copied == 0)) { 2657 /* 2658 * If we were unable to copy any data at all, we must 2659 * fall back to a single segment length write. 2660 * 2661 * If we didn't fallback here, we could livelock 2662 * because not all segments in the iov can be copied at 2663 * once without a pagefault. 2664 */ 2665 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2666 iov_iter_single_seg_count(i)); 2667 goto again; 2668 } 2669 pos += copied; 2670 written += copied; 2671 2672 balance_dirty_pages_ratelimited(mapping); 2673 } while (iov_iter_count(i)); 2674 2675 return written ? written : status; 2676 } 2677 EXPORT_SYMBOL(generic_perform_write); 2678 2679 /** 2680 * __generic_file_write_iter - write data to a file 2681 * @iocb: IO state structure (file, offset, etc.) 2682 * @from: iov_iter with data to write 2683 * 2684 * This function does all the work needed for actually writing data to a 2685 * file. 

ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t written = 0;
	ssize_t err;
	ssize_t status;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos, endbyte;

		written = generic_file_direct_write(iocb, from, iocb->ki_pos);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
			goto out;

		status = generic_perform_write(file, from, pos = iocb->ki_pos);
		/*
		 * If generic_perform_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (unlikely(status < 0)) {
			err = status;
			goto out;
		}
		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		endbyte = pos + status - 1;
		err = filemap_write_and_wait_range(mapping, pos, endbyte);
		if (err == 0) {
			iocb->ki_pos = endbyte + 1;
			written += status;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_CACHE_SHIFT,
						 endbyte >> PAGE_CACHE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_perform_write(file, from, iocb->ki_pos);
		if (likely(written > 0))
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);
2779 */ 2780 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2781 { 2782 struct file *file = iocb->ki_filp; 2783 struct inode *inode = file->f_mapping->host; 2784 ssize_t ret; 2785 2786 inode_lock(inode); 2787 ret = generic_write_checks(iocb, from); 2788 if (ret > 0) 2789 ret = __generic_file_write_iter(iocb, from); 2790 inode_unlock(inode); 2791 2792 if (ret > 0) { 2793 ssize_t err; 2794 2795 err = generic_write_sync(file, iocb->ki_pos - ret, ret); 2796 if (err < 0) 2797 ret = err; 2798 } 2799 return ret; 2800 } 2801 EXPORT_SYMBOL(generic_file_write_iter); 2802 2803 /** 2804 * try_to_release_page() - release old fs-specific metadata on a page 2805 * 2806 * @page: the page which the kernel is trying to free 2807 * @gfp_mask: memory allocation flags (and I/O mode) 2808 * 2809 * The address_space is to try to release any data against the page 2810 * (presumably at page->private). If the release was successful, return `1'. 2811 * Otherwise return zero. 2812 * 2813 * This may also be called if PG_fscache is set on a page, indicating that the 2814 * page is known to the local caching routines. 2815 * 2816 * The @gfp_mask argument specifies whether I/O may be performed to release 2817 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 2818 * 2819 */ 2820 int try_to_release_page(struct page *page, gfp_t gfp_mask) 2821 { 2822 struct address_space * const mapping = page->mapping; 2823 2824 BUG_ON(!PageLocked(page)); 2825 if (PageWriteback(page)) 2826 return 0; 2827 2828 if (mapping && mapping->a_ops->releasepage) 2829 return mapping->a_ops->releasepage(page, gfp_mask); 2830 return try_to_free_buffers(page); 2831 } 2832 2833 EXPORT_SYMBOL(try_to_release_page); 2834