/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_lock		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_file_buffered_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  ->i_mutex
 *    ->i_alloc_sem		(various)
 *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  (code doesn't rely on that order, so you could switch it around)
 *  ->tasklist_lock		(memory_failure, collect_procs_ao)
 *    ->i_mmap_lock
 */

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the mapping's tree_lock.
 */
void __remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	radix_tree_delete(&mapping->page_tree, page->index);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	if (PageSwapBacked(page))
		__dec_zone_page_state(page, NR_SHMEM);
	BUG_ON(page_mapped(page));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 *
	 * Fix it up by doing a final dirty accounting check after
	 * having removed the page entirely.
	 */
	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}
}

void remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;
	void (*freepage)(struct page *);

	BUG_ON(!PageLocked(page));

	freepage = mapping->a_ops->freepage;
	spin_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (freepage)
		freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);

static int sync_page(void *word)
{
	struct address_space *mapping;
	struct page *page;

	page = container_of((unsigned long *)word, struct page, flags);

	/*
	 * page_mapping() is being called without PG_locked held.
	 * Some knowledge of the state and use of the page is used to
	 * reduce the requirements down to a memory barrier.
	 * The danger here is of a stale page_mapping() return value
	 * indicating a struct address_space different from the one it's
	 * associated with when it is associated with one.
	 * After smp_mb(), it's either the correct page_mapping() for
	 * the page, or an old page_mapping() and the page's own
	 * page_mapping() has gone NULL.
	 * The ->sync_page() address_space operation must tolerate
	 * page_mapping() going NULL. By an amazing coincidence,
	 * this comes about because none of the users of the page
	 * in the ->sync_page() methods make essential use of the
	 * page_mapping(), merely passing the page down to the backing
	 * device's unplug functions when it's non-NULL, which in turn
	 * ignore it for all cases but swap, where only page_private(page) is
	 * of interest. When page_mapping() does go NULL, the entire
	 * call stack gracefully ignores the page and returns.
	 * -- wli
	 */
	smp_mb();
	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		mapping->a_ops->sync_page(page);
	io_schedule();
	return 0;
}

static int sync_page_killable(void *word)
{
	sync_page(word);
	return fatal_signal_pending(current) ? -EINTR : 0;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:	address space structure to wait for
 * @start_byte:	offset in bytes where the range starts
 * @end_byte:	offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;

	if (end_byte < start_byte)
		return 0;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping->nrpages) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned an error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But -EIO is a special case; it may indicate that the worst
		 * thing (e.g. a bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

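/*
 * Illustrative sketch (not part of the original file): a typical caller
 * flushes and waits on just the byte range it is about to operate on, for
 * example before bypassing the page cache for direct I/O against that range
 * (generic_file_aio_read() below does exactly this).  "mapping", "pos" and
 * "len" are assumed to be supplied by the caller.
 *
 *	err = filemap_write_and_wait_range(mapping, pos, pos + len - 1);
 *	if (err)
 *		return err;
 *	... the range [pos, pos + len - 1] is now clean and stable on disk ...
 */
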
/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page:	page to add
 * @mapping:	the page's address_space
 * @offset:	page index
 * @gfp_mask:	page allocation mode
 *
 * This function is used to add a page to the pagecache. It must be locked.
 * This function does not add the page to the LRU.  The caller must do that.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	VM_BUG_ON(!PageLocked(page));

	error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & GFP_RECLAIM_MASK);
	if (error)
		goto out;

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error == 0) {
		page_cache_get(page);
		page->mapping = mapping;
		page->index = offset;

		spin_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, offset, page);
		if (likely(!error)) {
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			if (PageSwapBacked(page))
				__inc_zone_page_state(page, NR_SHMEM);
			spin_unlock_irq(&mapping->tree_lock);
		} else {
			page->mapping = NULL;
			spin_unlock_irq(&mapping->tree_lock);
			mem_cgroup_uncharge_cache_page(page);
			page_cache_release(page);
		}
		radix_tree_preload_end();
	} else
		mem_cgroup_uncharge_cache_page(page);
out:
	return error;
}
EXPORT_SYMBOL(add_to_page_cache_locked);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	int ret;

	/*
	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
	 * before shmem_readpage has a chance to mark them as SwapBacked: they
	 * need to go on the anon lru below, and mem_cgroup_cache_charge
	 * (called in add_to_page_cache) needs to know where they're going too.
	 */
	if (mapping_cap_swap_backed(mapping))
		SetPageSwapBacked(page);

	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0) {
		if (page_is_file_cache(page))
			lru_cache_add_file(page);
		else
			lru_cache_add_anon(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	int n;
	struct page *page;

	if (cpuset_do_page_mem_spread()) {
		get_mems_allowed();
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		put_mems_allowed();
		return page;
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

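/*
 * Illustrative sketch (not part of the original file): the usual way to
 * populate the cache with these helpers is to allocate a page, insert it
 * with add_to_page_cache_lru(), and then ask the filesystem to fill it via
 * ->readpage().  This mirrors what page_cache_read() below does; "mapping",
 * "file" and "index" are assumed to come from the caller.
 *
 *	struct page *page = __page_cache_alloc(mapping_gfp_mask(mapping));
 *	if (!page)
 *		return -ENOMEM;
 *	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
 *	if (err == 0)
 *		err = mapping->a_ops->readpage(file, page);
 *	page_cache_release(page);	(drop our ref; the cache keeps its own)
 */
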
static int __sleep_on_page_lock(void *word)
{
	io_schedule();
	return 0;
}

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

static inline void wake_up_page(struct page *page, int bit)
{
	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The mb is necessary to enforce ordering between the clear_bit and the read
 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));
	clear_bit_unlock(PG_locked, &page->flags);
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	if (TestClearPageReclaim(page))
		rotate_reclaimable_page(page);

	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);

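/*
 * Illustrative sketch (not part of the original file): the usual pairing of
 * the locking primitives above.  A caller that holds a reference on a
 * pagecache page locks it, re-checks that the page was not truncated while
 * it slept, does its work, and then unlocks it.  "page" and "mapping" are
 * assumed to come from a prior find_get_page()-style lookup.
 *
 *	lock_page(page);
 *	if (page->mapping != mapping) {		(truncated while we slept)
 *		unlock_page(page);
 *		page_cache_release(page);
 *		goto retry;
 *	}
 *	... operate on the locked page ...
 *	unlock_page(page);
 */
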
/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	return __wait_on_bit_lock(page_waitqueue(page), &wait,
					sync_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

/**
 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 * @page: the page to lock
 *
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 */
void __lock_page_nosync(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
							TASK_UNINTERRUPTIBLE);
}

int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
		__lock_page(page);
		return 1;
	} else {
		up_read(&mm->mmap_sem);
		wait_on_page_locked(page);
		return 0;
	}
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_deref_retry(page))
			goto repeat;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_page);

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Locates the desired pagecache page, locks it, increments its reference
 * count and returns its address.
 *
 * Returns zero if the page was not present. find_lock_page() may sleep.
 */
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

repeat:
	page = find_get_page(mapping, offset);
	if (page) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		VM_BUG_ON(page->index != offset);
	}
	return page;
}
EXPORT_SYMBOL(find_lock_page);

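/*
 * Illustrative sketch (not part of the original file): a read-only peek at
 * the page cache with find_get_page().  The lookup takes a reference which
 * the caller must drop; "mapping" and "index" are assumed to be supplied by
 * the caller, and a NULL return simply means the page is not cached.
 *
 *	struct page *page = find_get_page(mapping, index);
 *	if (page) {
 *		... inspect the (unlocked) page ...
 *		page_cache_release(page);
 *	}
 */
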
/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or zero on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		pgoff_t index, gfp_t gfp_mask)
{
	struct page *page;
	int err;
repeat:
	page = find_lock_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;
		/*
		 * We want a regular kernel memory (not highmem or DMA etc)
		 * allocation for the radix tree nodes, but we need to honour
		 * the context-specific requirements the caller has asked for.
		 * GFP_RECLAIM_MASK collects those requirements.
		 */
		err = add_to_page_cache_lru(page, mapping, index,
			(gfp_mask & GFP_RECLAIM_MASK));
		if (unlikely(err)) {
			page_cache_release(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}
	return page;
}
EXPORT_SYMBOL(find_or_create_page);

/**
 * find_get_pages - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, start, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page)) {
			if (ret)
				start = pages[ret-1]->index;
			goto restart;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}
	rcu_read_unlock();
	return ret;
}

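/*
 * Illustrative sketch (not part of the original file): walking a mapping
 * with the gang lookup above.  Each returned page carries a reference that
 * the caller must drop; "mapping" is assumed to be supplied by the caller.
 *
 *	struct page *pages[16];
 *	pgoff_t index = 0;
 *	unsigned nr, i;
 *
 *	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
 *		index = pages[nr - 1]->index + 1;	(next batch starts here)
 *		for (i = 0; i < nr; i++) {
 *			... use pages[i] ...
 *			page_cache_release(pages[i]);
 *		}
 *	}
 */
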
/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping:	The address_space to search
 * @index:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, index, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page))
			goto restart;

		if (page->mapping == NULL || page->index != index)
			break;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
		index++;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping:	the address_space to search
 * @index:	the starting page index
 * @tag:	the tag index
 * @nr_pages:	the maximum number of pages
 * @pages:	where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.  We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
				(void ***)pages, *index, nr_pages, tag);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;
		if (radix_tree_deref_retry(page))
			goto restart;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}
	rcu_read_unlock();

	if (ret)
		*index = pages[ret - 1]->index + 1;

	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);

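/*
 * Illustrative sketch (not part of the original file): the tagged variant is
 * what writeback-style walkers build on.  Waiting on every page currently
 * tagged for writeback could look roughly like this, mirroring what
 * filemap_fdatawait_range() above does with a pagevec; "mapping" is assumed
 * to be supplied by the caller.
 *
 *	struct page *pages[16];
 *	pgoff_t index = 0;
 *	unsigned nr, i;
 *
 *	while ((nr = find_get_pages_tag(mapping, &index,
 *				PAGECACHE_TAG_WRITEBACK, 16, pages)) != 0) {
 *		for (i = 0; i < nr; i++) {
 *			wait_on_page_writeback(pages[i]);
 *			page_cache_release(pages[i]);
 *		}
 *	}
 */
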
/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (trylock_page(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/**
 * do_generic_file_read - generic file read routine
 * @filp:	the file to read
 * @ppos:	current file position
 * @desc:	read_descriptor
 * @actor:	read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
		read_descriptor_t *desc, read_actor_t actor)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error;

	index = *ppos >> PAGE_CACHE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		page = find_get_page(mapping, index);
		if (!page) {
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
								desc, offset))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);
		if (!page) {
			desc->error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST)
				goto find_page;
			desc->error = error;
			goto out;
		}
		goto readpage;
	}

out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}

int file_read_actor(read_descriptor_t *desc, struct page *page,
			unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);
		left = __copy_to_user_inatomic(desc->arg.buf,
						kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;
	desc->written += size;
	desc->arg.buf += size;
	return size;
}

/*
 * Performs necessary checks before doing a write
 * @iov:	io vector request
 * @nr_segs:	number of segments in the iovec
 * @count:	number of bytes to write
 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
 *
 * Adjust number of segments and amount of bytes to write (nr_segs should be
 * properly initialized first). Returns appropriate error code that caller
 * should return or zero in case that write should be allowed.
 */
int generic_segment_checks(const struct iovec *iov,
			unsigned long *nr_segs, size_t *count, int access_flags)
{
	unsigned long seg;
	size_t cnt = 0;
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
EXPORT_SYMBOL(generic_segment_checks);

/**
 * generic_file_aio_read - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iov:	io vector request
 * @nr_segs:	number of segments in the iovec
 * @pos:	current file position
 *
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg = 0;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	count = 0;
	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = filemap_write_and_wait_range(mapping, pos,
					pos + iov_length(iov, nr_segs) - 1);
			if (!retval) {
				retval = mapping->a_ops->direct_IO(READ, iocb,
							iov, pos, nr_segs);
			}
			if (retval > 0) {
				*ppos = pos + retval;
				count -= retval;
			}

			/*
			 * Btrfs can have a short DIO read if we encounter
			 * compressed extents, so if there was an error, or if
			 * we've already read everything we wanted to, or if
			 * there was a short read because we hit EOF, go ahead
			 * and return.  Otherwise fallthrough to buffered io for
			 * the rest of the read.
			 */
			if (retval < 0 || !count || *ppos >= size) {
				file_accessed(filp);
				goto out;
			}
		}
	}

	count = retval;
	for (seg = 0; seg < nr_segs; seg++) {
		read_descriptor_t desc;
		loff_t offset = 0;

		/*
		 * If we did a short DIO read we need to skip the section of the
		 * iov that we've already read data into.
		 */
		if (count) {
			if (count > iov[seg].iov_len) {
				count -= iov[seg].iov_len;
				continue;
			}
			offset = count;
			count = 0;
		}

		desc.written = 0;
		desc.arg.buf = iov[seg].iov_base + offset;
		desc.count = iov[seg].iov_len - offset;
		if (desc.count == 0)
			continue;
		desc.error = 0;
		do_generic_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;
		if (desc.error) {
			retval = retval ?: desc.error;
			break;
		}
		if (desc.count > 0)
			break;
	}
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);

static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
	     pgoff_t index, unsigned long nr)
{
	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
		return -EINVAL;

	force_page_cache_readahead(mapping, filp, index, nr);
	return 0;
}

SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct file *file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			struct address_space *mapping = file->f_mapping;
			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
			unsigned long len = end - start + 1;
			ret = do_readahead(mapping, file, start, len);
		}
		fput(file);
	}
	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
{
	return SYSC_readahead((int) fd, offset, (size_t) count);
}
SYSCALL_ALIAS(sys_readahead, SyS_readahead);
#endif

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file:	file to read
 * @offset:	page index
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = page_cache_alloc_cold(mapping);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/*
 * Synchronous readahead happens when we don't even find
 * a page in the page cache at all.
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	unsigned long ra_pages;
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;

	if (VM_SequentialReadHint(vma) ||
			offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	if (ra->mmap_miss < INT_MAX)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return;

	/*
	 * mmap read-around
	 */
	ra_pages = max_sane_readahead(ra->ra_pages);
	if (ra_pages) {
		ra->start = max_t(long, 0, offset - ra_pages/2);
		ra->size = ra_pages;
		ra->async_size = 0;
		ra_submit(ra, mapping, file);
	}
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further..
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vma:	vma in which the fault was taken
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int error;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	struct page *page;
	pgoff_t size;
	int ret = 0;

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (offset >= size)
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vma, ra, file, page, offset);
	} else {
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;
	}

	/* Did it get truncated? */
	if (unlikely(page->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON(page->index != offset);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(offset >= size)) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	page_cache_release(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Things didn't work out. Return zero to tell the mm layer so. */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}

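/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * relies on the generic pagecache routines typically just points its
 * file_operations at the helpers defined here.  The structure name
 * "example_file_operations" is hypothetical; the member assignments mirror
 * what simple disk filesystems of this era use.
 *
 *	const struct file_operations example_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read		= do_sync_read,
 *		.aio_read	= generic_file_aio_read,
 *		.mmap		= generic_file_mmap,
 *	};
 */
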
/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

static struct page *__read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp | __GFP_COLD);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
		if (unlikely(err)) {
			page_cache_release(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	return page;
}

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)

{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data, gfp);
	if (IS_ERR(page))
		return page;
	if (PageUptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		return ERR_PTR(err);
	}
out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page_async - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	destination for read data
 *
 * Same as read_cache_page, but don't wait for page to become unlocked
 * after submitting it to the filler.
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page but don't wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_async(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page_async);

static struct page *wait_on_page_read(struct page *page)
{
	if (!IS_ERR(page)) {
		wait_on_page_locked(page);
		if (!PageUptodate(page)) {
			page_cache_release(page);
			page = ERR_PTR(-EIO);
		}
	}
	return page;
}

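/*
 * Illustrative sketch (not part of the original file): callers normally
 * reach this machinery through read_cache_page() (or the read_mapping_page()
 * wrapper in pagemap.h), handing in the filesystem's ->readpage as the
 * filler and using the IS_ERR() convention on the result, e.g.:
 *
 *	page = read_cache_page(mapping, index,
 *			(filler_t *)mapping->a_ops->readpage, NULL);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... page is uptodate here ...
 *	page_cache_release(page);
 *
 * The NULL "data" argument mirrors read_cache_page_gfp() below, which passes
 * no private data to the filler.
 */
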
/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags. Note
 * that the Radix tree operations will still use GFP_KERNEL, so you can't
 * expect to do this atomically or anything like that - but you can pass in
 * other page requirements.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;

	return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
}
EXPORT_SYMBOL(read_cache_page_gfp);

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:	the page's address_space
 * @index:	the page index
 * @filler:	function to perform the read
 * @data:	destination for read data
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page then wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
	mode_t mode = dentry->d_inode->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

static int __remove_suid(struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	return notify_change(dentry, &newattrs);
}

int file_remove_suid(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	int killsuid = should_remove_suid(dentry);
	int killpriv = security_inode_need_killpriv(dentry);
	int error = 0;

	if (killpriv < 0)
		return killpriv;
	if (killpriv)
		error = security_inode_killpriv(dentry);
	if (!error && killsuid)
		error = __remove_suid(dentry, killsuid);

	return error;
}
EXPORT_SYMBOL(file_remove_suid);

static size_t __iovec_copy_from_user_inatomic(char *vaddr,
			const struct iovec *iov, size_t base, size_t bytes)
{
	size_t copied = 0, left = 0;

	while (bytes) {
		char __user *buf = iov->iov_base + base;
		int copy = min(bytes, iov->iov_len - base);

		base = 0;
		left = __copy_from_user_inatomic(vaddr, buf, copy);
		copied += copy;
		bytes -= copy;
		vaddr += copy;
		iov++;

		if (unlikely(left))
			break;
	}
	return copied - left;
}

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then return the number
 * of bytes which were copied.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr;
	size_t copied;

	BUG_ON(!in_atomic());
	kaddr = kmap_atomic(page, KM_USER0);
	if (likely(i->nr_segs == 1)) {
		int left;
		char __user *buf = i->iov->iov_base + i->iov_offset;
		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
		copied = bytes - left;
	} else {
		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
						i->iov, i->iov_offset, bytes);
	}
	kunmap_atomic(kaddr, KM_USER0);

	return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);

/*
 * This has the same side effects and return value as
 * iov_iter_copy_from_user_atomic().
 * The difference is that it attempts to resolve faults.
 * Page must not be locked.
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
                        const struct iovec *iov, size_t base, size_t bytes)
{
        size_t copied = 0, left = 0;

        while (bytes) {
                char __user *buf = iov->iov_base + base;
                int copy = min(bytes, iov->iov_len - base);

                base = 0;
                left = __copy_from_user_inatomic(vaddr, buf, copy);
                copied += copy;
                bytes -= copy;
                vaddr += copy;
                iov++;

                if (unlikely(left))
                        break;
        }
        return copied - left;
}

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then return the number
 * of bytes which were copied.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr;
        size_t copied;

        BUG_ON(!in_atomic());
        kaddr = kmap_atomic(page, KM_USER0);
        if (likely(i->nr_segs == 1)) {
                int left;
                char __user *buf = i->iov->iov_base + i->iov_offset;
                left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
                copied = bytes - left;
        } else {
                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
                                                i->iov, i->iov_offset, bytes);
        }
        kunmap_atomic(kaddr, KM_USER0);

        return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);

/*
 * This has the same side effects and return value as
 * iov_iter_copy_from_user_atomic().
 * The difference is that it attempts to resolve faults.
 * Page must not be locked.
 */
size_t iov_iter_copy_from_user(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr;
        size_t copied;

        kaddr = kmap(page);
        if (likely(i->nr_segs == 1)) {
                int left;
                char __user *buf = i->iov->iov_base + i->iov_offset;
                left = __copy_from_user(kaddr + offset, buf, bytes);
                copied = bytes - left;
        } else {
                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
                                                i->iov, i->iov_offset, bytes);
        }
        kunmap(page);
        return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user);

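/*
 * The multi-segment helper above walks the iovec array and stops at the
 * first partial copy.  Below is a minimal userspace sketch of the same
 * walk, with memcpy() standing in for __copy_from_user_inatomic() (so it
 * cannot take a fault) and assuming the iovec array covers at least
 * 'bytes' bytes; the function name is hypothetical.
 *
 *      #include <string.h>
 *      #include <sys/uio.h>
 *
 *      static size_t gather_from_iovec(char *dst, const struct iovec *iov,
 *                                      size_t base, size_t bytes)
 *      {
 *              size_t copied = 0;
 *
 *              while (bytes) {
 *                      size_t chunk = iov->iov_len - base;
 *
 *                      if (chunk > bytes)
 *                              chunk = bytes;
 *                      memcpy(dst, (char *)iov->iov_base + base, chunk);
 *                      copied += chunk;
 *                      bytes -= chunk;
 *                      dst += chunk;
 *                      base = 0;       // only the first segment has an offset
 *                      iov++;
 *              }
 *              return copied;
 *      }
 */
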
void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
        BUG_ON(i->count < bytes);

        if (likely(i->nr_segs == 1)) {
                i->iov_offset += bytes;
                i->count -= bytes;
        } else {
                const struct iovec *iov = i->iov;
                size_t base = i->iov_offset;

                /*
                 * The !iov->iov_len check ensures we skip over unlikely
                 * zero-length segments (without overrunning the iovec).
                 */
                while (bytes || unlikely(i->count && !iov->iov_len)) {
                        int copy;

                        copy = min(bytes, iov->iov_len - base);
                        BUG_ON(!i->count || i->count < copy);
                        i->count -= copy;
                        bytes -= copy;
                        base += copy;
                        if (iov->iov_len == base) {
                                iov++;
                                base = 0;
                        }
                }
                i->iov = iov;
                i->iov_offset = base;
        }
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Fault in the first iovec of the given iov_iter, to a maximum length
 * of bytes. Returns 0 on success, or non-zero if the memory could not be
 * accessed (ie. because it is an invalid address).
 *
 * writev-intensive code may want this to prefault several iovecs -- that
 * would be possible (callers must not rely on the fact that _only_ the
 * first iovec will be faulted with the current implementation).
 */
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
        char __user *buf = i->iov->iov_base + i->iov_offset;
        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
        return fault_in_pages_readable(buf, bytes);
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(struct iov_iter *i)
{
        const struct iovec *iov = i->iov;
        if (i->nr_segs == 1)
                return i->count;
        else
                return min(i->count, iov->iov_len - i->iov_offset);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

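/*
 * iov_iter_advance() moves a (segment, offset, remaining) cursor forward,
 * stepping over zero-length segments.  A userspace sketch of the same
 * bookkeeping, assuming the iovec array covers at least 'count' bytes;
 * struct iov_cursor and cursor_advance() are hypothetical names.
 *
 *      #include <stddef.h>
 *      #include <sys/uio.h>
 *
 *      struct iov_cursor {
 *              const struct iovec *iov;        // current segment
 *              size_t iov_offset;              // offset into current segment
 *              size_t count;                   // bytes remaining overall
 *      };
 *
 *      static void cursor_advance(struct iov_cursor *c, size_t bytes)
 *      {
 *              const struct iovec *iov = c->iov;
 *              size_t base = c->iov_offset;
 *
 *              while (bytes || (c->count && iov->iov_len == base)) {
 *                      size_t step = iov->iov_len - base;
 *
 *                      if (step > bytes)
 *                              step = bytes;
 *                      c->count -= step;
 *                      bytes -= step;
 *                      base += step;
 *                      if (base == iov->iov_len) {     // exhausted or empty
 *                              iov++;
 *                              base = 0;
 *                      }
 *              }
 *              c->iov = iov;
 *              c->iov_offset = base;
 *      }
 */
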
/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns an appropriate error code that the caller should return, or
 * zero if the write should be allowed.
 */
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
        struct inode *inode = file->f_mapping->host;
        unsigned long limit = rlimit(RLIMIT_FSIZE);

        if (unlikely(*pos < 0))
                return -EINVAL;

        if (!isblk) {
                /* FIXME: this is for backwards compatibility with 2.4 */
                if (file->f_flags & O_APPEND)
                        *pos = i_size_read(inode);

                if (limit != RLIM_INFINITY) {
                        if (*pos >= limit) {
                                send_sig(SIGXFSZ, current, 0);
                                return -EFBIG;
                        }
                        if (*count > limit - (typeof(limit))*pos) {
                                *count = limit - (typeof(limit))*pos;
                        }
                }
        }

        /*
         * LFS rule
         */
        if (unlikely(*pos + *count > MAX_NON_LFS &&
                                !(file->f_flags & O_LARGEFILE))) {
                if (*pos >= MAX_NON_LFS) {
                        return -EFBIG;
                }
                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
                        *count = MAX_NON_LFS - (unsigned long)*pos;
                }
        }

        /*
         * Are we about to exceed the fs block limit ?
         *
         * If we have written data it becomes a short write. If we have
         * exceeded without writing data we send a signal and return EFBIG.
         * Linus frestrict idea will clean these up nicely..
         */
        if (likely(!isblk)) {
                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
                        if (*count || *pos > inode->i_sb->s_maxbytes) {
                                return -EFBIG;
                        }
                        /* zero-length writes at ->s_maxbytes are OK */
                }

                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
                        *count = inode->i_sb->s_maxbytes - *pos;
        } else {
#ifdef CONFIG_BLOCK
                loff_t isize;
                if (bdev_read_only(I_BDEV(inode)))
                        return -EPERM;
                isize = i_size_read(inode);
                if (*pos >= isize) {
                        if (*count || *pos > isize)
                                return -ENOSPC;
                }

                if (*pos + *count > isize)
                        *count = isize - *pos;
#else
                return -EPERM;
#endif
        }
        return 0;
}
EXPORT_SYMBOL(generic_write_checks);

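/*
 * Each limit handled above (RLIMIT_FSIZE, MAX_NON_LFS, s_maxbytes) follows
 * the same pattern: refuse a write that starts at or past the limit,
 * otherwise shorten it so it ends at the limit.  A self-contained userspace
 * sketch of that pattern (the RLIMIT_FSIZE case additionally raises SIGXFSZ
 * and rejects even zero-length writes at the limit); the function name is
 * hypothetical.
 *
 *      #include <errno.h>
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      static int clamp_write_to_limit(int64_t pos, size_t *count,
 *                                      uint64_t limit)
 *      {
 *              if ((uint64_t)pos >= limit)
 *                      // only a zero-length write exactly at the limit is OK
 *                      return (*count || (uint64_t)pos > limit) ? -EFBIG : 0;
 *              if (*count > limit - (uint64_t)pos)
 *                      *count = limit - (uint64_t)pos; // becomes a short write
 *              return 0;
 *      }
 */
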
int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        const struct address_space_operations *aops = mapping->a_ops;

        return aops->write_begin(file, mapping, pos, len, flags,
                                                        pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);

int pagecache_write_end(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        const struct address_space_operations *aops = mapping->a_ops;

        mark_page_accessed(page);
        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);

ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
                size_t count, size_t ocount)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        ssize_t written;
        size_t write_len;
        pgoff_t end;

        if (count != ocount)
                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);

        write_len = iov_length(iov, *nr_segs);
        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;

        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
        if (written)
                goto out;

        /*
         * After a write we want buffered reads to be sure to go to disk to get
         * the new data. We invalidate clean cached page from the region we're
         * about to write. We do this *before* the write so that we can return
         * without clobbering -EIOCBQUEUED from ->direct_IO().
         */
        if (mapping->nrpages) {
                written = invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_CACHE_SHIFT, end);
                /*
                 * If a page cannot be invalidated, return 0 to fall back
                 * to buffered write.
                 */
                if (written) {
                        if (written == -EBUSY)
                                return 0;
                        goto out;
                }
        }

        written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);

        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing. Either one is a pretty crazy thing to do,
         * so we don't support it 100%. If this invalidation
         * fails, tough, the write still worked...
         */
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_CACHE_SHIFT, end);
        }

        if (written > 0) {
                pos += written;
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
                }
                *ppos = pos;
        }
out:
        return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
{
        int status;
        struct page *page;
        gfp_t gfp_notmask = 0;
        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
repeat:
        page = find_lock_page(mapping, index);
        if (likely(page))
                return page;

        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
        if (!page)
                return NULL;
        status = add_to_page_cache_lru(page, mapping, index,
                                                GFP_KERNEL & ~gfp_notmask);
        if (unlikely(status)) {
                page_cache_release(page);
                if (status == -EEXIST)
                        goto repeat;
                return NULL;
        }
        return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);

static ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;
        unsigned int flags = 0;

        /*
         * Copies from kernel address space cannot fail (NFSD is a big user).
         */
        if (segment_eq(get_fs(), KERNEL_DS))
                flags |= AOP_FLAG_UNINTERRUPTIBLE;

        do {
                struct page *page;
                unsigned long offset;   /* Offset into pagecache page */
                unsigned long bytes;    /* Bytes to write to page */
                size_t copied;          /* Bytes copied from user */
                void *fsdata;

                offset = (pos & (PAGE_CACHE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_count(i));

again:

                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 *
                 * Not only is this an optimisation, but it is also required
                 * to check that the address is actually valid, when atomic
                 * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
                flush_dcache_page(page);

                mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
                        break;
                copied = status;

                cond_resched();

                iov_iter_advance(i, copied);
                if (unlikely(copied == 0)) {
                        /*
                         * If we were unable to copy any data at all, we must
                         * fall back to a single segment length write.
                         *
                         * If we didn't fall back here, we could livelock
                         * because not all segments in the iov can be copied at
                         * once without a pagefault.
                         */
                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_single_seg_count(i));
                        goto again;
                }
                pos += copied;
                written += copied;

                balance_dirty_pages_ratelimited(mapping);

        } while (iov_iter_count(i));

        return written ? written : status;
}

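/*
 * The copy loop above only ever attempts as much as it has faulted in, and
 * when an atomic copy makes no progress at all it retries with a smaller
 * amount (a single iovec segment) rather than looping forever.  The
 * self-contained userspace sketch below shows the same "shrink and retry on
 * zero progress" shape; try_copy_fn and copy_with_retry() are hypothetical,
 * and halving stands in for the kernel's fall-back to one segment.
 *
 *      #include <stddef.h>
 *
 *      // may copy fewer bytes than asked for, even zero, without failing
 *      typedef size_t (*try_copy_fn)(void *dst, const void *src, size_t len);
 *
 *      static size_t copy_with_retry(void *dst, const void *src, size_t len,
 *                                    size_t chunk, try_copy_fn try_copy)
 *      {
 *              size_t done = 0;
 *
 *              while (done < len) {
 *                      size_t want = len - done;
 *                      size_t copied;
 *
 *                      if (want > chunk)
 *                              want = chunk;
 *                      copied = try_copy((char *)dst + done,
 *                                        (const char *)src + done, want);
 *                      if (copied == 0) {
 *                              if (want == 1)
 *                                      break;          // no progress possible
 *                              chunk = want / 2;       // shrink and retry
 *                              continue;
 *                      }
 *                      done += copied;
 *              }
 *              return done;
 *      }
 */
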
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos, loff_t *ppos,
                size_t count, ssize_t written)
{
        struct file *file = iocb->ki_filp;
        ssize_t status;
        struct iov_iter i;

        iov_iter_init(&i, iov, nr_segs, count, written);
        status = generic_perform_write(file, &i, pos);

        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
        }

        return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);

/**
 * __generic_file_aio_write - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @iov: vector with data to write
 * @nr_segs: number of segments in the vector
 * @ppos: position where to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
        struct inode *inode = mapping->host;
        loff_t pos;
        ssize_t written;
        ssize_t err;

        ocount = 0;
        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (err)
                return err;

        count = ocount;
        pos = *ppos;

        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
        written = 0;

        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;

        if (count == 0)
                goto out;

        err = file_remove_suid(file);
        if (err)
                goto out;

        file_update_time(file);

        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
                ssize_t written_buffered;

                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                        ppos, count, ocount);
                if (written < 0 || written == count)
                        goto out;
                /*
                 * direct-io write to a hole: fall through to buffered I/O
                 * for completing the rest of the request.
                 */
                pos += written;
                count -= written;
                written_buffered = generic_file_buffered_write(iocb, iov,
                                                nr_segs, pos, ppos, count,
                                                written);
                /*
                 * If generic_file_buffered_write() returned a synchronous error
                 * then we want to return the number of bytes which were
                 * direct-written, or the error code if that was zero. Note
                 * that this differs from normal direct-io semantics, which
                 * will return -EFOO even if some bytes were written.
                 */
                if (written_buffered < 0) {
                        err = written_buffered;
                        goto out;
                }

                /*
                 * We need to ensure that the page cache pages are written to
                 * disk and invalidated to preserve the expected O_DIRECT
                 * semantics.
                 */
                endbyte = pos + written_buffered - written - 1;
                err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
                if (err == 0) {
                        written = written_buffered;
                        invalidate_mapping_pages(mapping,
                                                 pos >> PAGE_CACHE_SHIFT,
                                                 endbyte >> PAGE_CACHE_SHIFT);
                } else {
                        /*
                         * We don't know how much we wrote, so just return
                         * the number of bytes which were direct-written
                         */
                }
        } else {
                written = generic_file_buffered_write(iocb, iov, nr_segs,
                                pos, ppos, count, written);
        }
out:
        current->backing_dev_info = NULL;
        return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_aio_write);

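/*
 * For the O_DIRECT fall-back path above, the result rule is: report total
 * progress if any bytes were written, otherwise report the buffered error.
 * A small sketch of that rule with hypothetical names ('buffered_tail' is
 * the outcome of the buffered attempt for the remainder, or a negative
 * errno):
 *
 *      static ssize_t combine_write_results(ssize_t direct_written,
 *                                           ssize_t buffered_tail)
 *      {
 *              if (buffered_tail < 0)
 *                      return direct_written > 0 ? direct_written
 *                                                : buffered_tail;
 *              return direct_written + buffered_tail;
 *      }
 */
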
/**
 * generic_file_aio_write - write data to a file
 * @iocb: IO state structure
 * @iov: vector with data to write
 * @nr_segs: number of segments in the vector
 * @pos: position in file where to write
 *
 * This is a wrapper around __generic_file_aio_write() to be used by most
 * filesystems. It takes care of syncing the file in the O_SYNC case and
 * acquires i_mutex as needed.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;

        BUG_ON(iocb->ki_pos != pos);

        mutex_lock(&inode->i_mutex);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;

                err = generic_write_sync(file, pos, ret);
                if (err < 0 && ret > 0)
                        ret = err;
        }
        return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 *
 * @page: the page which the kernel is trying to free
 * @gfp_mask: memory allocation flags (and I/O mode)
 *
 * The address_space is asked to try to release any data held against the
 * page (presumably at page->private). If the release was successful,
 * return `1'. Otherwise return zero.
 *
 * This may also be called if PG_fscache is set on a page, indicating that the
 * page is known to the local caching routines.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
 *
 */
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
        struct address_space * const mapping = page->mapping;

        BUG_ON(!PageLocked(page));
        if (PageWriteback(page))
                return 0;

        if (mapping && mapping->a_ops->releasepage)
                return mapping->a_ops->releasepage(page, gfp_mask);
        return try_to_free_buffers(page);
}

EXPORT_SYMBOL(try_to_release_page);
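
/*
 * For reference, a sketch of how a simple filesystem of this kernel
 * generation might wire the generic write path above into its
 * file_operations; the struct name is illustrative, and a real filesystem
 * would also provide .open, .release, ioctls and so on.
 *
 *      const struct file_operations example_file_operations = {
 *              .llseek         = generic_file_llseek,
 *              .read           = do_sync_read,
 *              .write          = do_sync_write,
 *              .aio_read       = generic_file_aio_read,
 *              .aio_write      = generic_file_aio_write,
 *              .mmap           = generic_file_mmap,
 *              .fsync          = generic_file_fsync,
 *      };
 */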