/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h>	/* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>	/* for page_is_file_cache() */
#include "internal.h"

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h>	/* for try_to_free_buffers */

#include <asm/mman.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_lock		(truncate_pagecache)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->i_mutex			(generic_file_buffered_write)
 *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
 *
 *  ->i_mutex
 *    ->i_alloc_sem		(various)
 *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  (code doesn't rely on that order, so you could switch it around)
 *  ->tasklist_lock		(memory_failure, collect_procs_ao)
 *    ->i_mmap_lock
 */

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the mapping's tree_lock.
 */
void __delete_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	radix_tree_delete(&mapping->page_tree, page->index);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	if (PageSwapBacked(page))
		__dec_zone_page_state(page, NR_SHMEM);
	BUG_ON(page_mapped(page));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 *
	 * Fix it up by doing a final dirty accounting check after
	 * having removed the page entirely.
	 */
	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}
}

/**
 * delete_from_page_cache - delete page from page cache
 * @page: the page which the kernel is trying to remove from page cache
 *
 * This must be called only on pages that have been verified to be in the page
 * cache and locked.  It will never put the page into the free list; the caller
 * has a reference on the page.
 */
void delete_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;
	void (*freepage)(struct page *);

	BUG_ON(!PageLocked(page));

	freepage = mapping->a_ops->freepage;
	spin_lock_irq(&mapping->tree_lock);
	__delete_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);

	if (freepage)
		freepage(page);
	page_cache_release(page);
}
EXPORT_SYMBOL(delete_from_page_cache);
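
/*
 * Illustrative sketch (not part of the original file): how a caller might
 * drop a known page from the cache while respecting the locking rules
 * documented above.  The helper name and its arguments are hypothetical.
 */
static void example_drop_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_lock_page(mapping, index);

	if (!page)
		return;
	/* recheck under the page lock; truncation may have beaten us to it */
	if (page->mapping == mapping && !page_mapped(page))
		delete_from_page_cache(page);
	unlock_page(page);
	page_cache_release(page);
}
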
static int sync_page(void *word)
{
	struct address_space *mapping;
	struct page *page;

	page = container_of((unsigned long *)word, struct page, flags);

	/*
	 * page_mapping() is being called without PG_locked held.
	 * Some knowledge of the state and use of the page is used to
	 * reduce the requirements down to a memory barrier.
	 * The danger here is of a stale page_mapping() return value
	 * indicating a struct address_space different from the one it's
	 * associated with when it is associated with one.
	 * After smp_mb(), it's either the correct page_mapping() for
	 * the page, or an old page_mapping() and the page's own
	 * page_mapping() has gone NULL.
	 * The ->sync_page() address_space operation must tolerate
	 * page_mapping() going NULL. By an amazing coincidence,
	 * this comes about because none of the users of the page
	 * in the ->sync_page() methods make essential use of the
	 * page_mapping(), merely passing the page down to the backing
	 * device's unplug functions when it's non-NULL, which in turn
	 * ignore it for all cases but swap, where only page_private(page) is
	 * of interest. When page_mapping() does go NULL, the entire
	 * call stack gracefully ignores the page and returns.
	 * -- wli
	 */
	smp_mb();
	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		mapping->a_ops->sync_page(page);
	io_schedule();
	return 0;
}

static int sync_page_killable(void *word)
{
	sync_page(word);
	return fatal_signal_pending(current) ? -EINTR : 0;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @start: offset in bytes where the range starts
 * @end: offset in bytes where the range ends (inclusive)
 * @sync_mode: enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			     loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping: target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping: address space structure to wait for
 * @start_byte: offset in bytes where the range starts
 * @end_byte: offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;

	if (end_byte < start_byte)
		return 0;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping->nrpages) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned an error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But -EIO is a special case: it may indicate the worst
		 * thing (e.g. a bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping: the address_space for the pages
 * @lstart: offset in bytes where the range starts
 * @lend: offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = filemap_fdatawait_range(mapping,
						lstart, lend);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
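
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * fsync-style data path can push and wait on just the affected byte range
 * using the function above.  The helper name is hypothetical, and a real
 * implementation also has to sync metadata.
 */
static int example_fsync_data_range(struct file *file, loff_t start, loff_t end)
{
	return filemap_write_and_wait_range(file->f_mapping, start, end);
}
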
/**
 * replace_page_cache_page - replace a pagecache page with a new one
 * @old: page to be replaced
 * @new: page to replace with
 * @gfp_mask: allocation mode
 *
 * This function replaces a page in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new page and
 * drops it for the old page.  Both the old and new pages must be
 * locked.  This function does not add the new page to the LRU; the
 * caller must do that.
 *
 * The remove + add is atomic.  The only way this function can fail is
 * memory allocation failure.
 */
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	int error;
	struct mem_cgroup *memcg = NULL;

	VM_BUG_ON(!PageLocked(old));
	VM_BUG_ON(!PageLocked(new));
	VM_BUG_ON(new->mapping);

	/*
	 * This is not page migration, but prepare_migration and
	 * end_migration does enough work for charge replacement.
	 *
	 * In the longer term we probably want a specialized function
	 * for moving the charge from old to new in a more efficient
	 * manner.
	 */
	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
	if (error)
		return error;

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (!error) {
		struct address_space *mapping = old->mapping;
		void (*freepage)(struct page *);

		pgoff_t offset = old->index;
		freepage = mapping->a_ops->freepage;

		page_cache_get(new);
		new->mapping = mapping;
		new->index = offset;

		spin_lock_irq(&mapping->tree_lock);
		__delete_from_page_cache(old);
		error = radix_tree_insert(&mapping->page_tree, offset, new);
		BUG_ON(error);
		mapping->nrpages++;
		__inc_zone_page_state(new, NR_FILE_PAGES);
		if (PageSwapBacked(new))
			__inc_zone_page_state(new, NR_SHMEM);
		spin_unlock_irq(&mapping->tree_lock);
		radix_tree_preload_end();
		if (freepage)
			freepage(old);
		page_cache_release(old);
		mem_cgroup_end_migration(memcg, old, new, true);
	} else {
		mem_cgroup_end_migration(memcg, old, new, false);
	}

	return error;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);

/**
 * add_to_page_cache_locked - add a locked page to the pagecache
 * @page: page to add
 * @mapping: the page's address_space
 * @offset: page index
 * @gfp_mask: page allocation mode
 *
 * This function is used to add a page to the pagecache.  The page must be
 * locked.  This function does not add the page to the LRU.  The caller must
 * do that.
 */
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	VM_BUG_ON(!PageLocked(page));

	error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & GFP_RECLAIM_MASK);
	if (error)
		goto out;

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error == 0) {
		page_cache_get(page);
		page->mapping = mapping;
		page->index = offset;

		spin_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, offset, page);
		if (likely(!error)) {
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			if (PageSwapBacked(page))
				__inc_zone_page_state(page, NR_SHMEM);
			spin_unlock_irq(&mapping->tree_lock);
		} else {
			page->mapping = NULL;
			spin_unlock_irq(&mapping->tree_lock);
			mem_cgroup_uncharge_cache_page(page);
			page_cache_release(page);
		}
		radix_tree_preload_end();
	} else
		mem_cgroup_uncharge_cache_page(page);
out:
	return error;
}
EXPORT_SYMBOL(add_to_page_cache_locked);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	int ret;

	/*
	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
	 * before shmem_readpage has a chance to mark them as SwapBacked: they
	 * need to go on the anon lru below, and mem_cgroup_cache_charge
	 * (called in add_to_page_cache) needs to know where they're going too.
	 */
	if (mapping_cap_swap_backed(mapping))
		SetPageSwapBacked(page);

	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0) {
		if (page_is_file_cache(page))
			lru_cache_add_file(page);
		else
			lru_cache_add_anon(page);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	int n;
	struct page *page;

	if (cpuset_do_page_mem_spread()) {
		get_mems_allowed();
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		put_mems_allowed();
		return page;
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

static int __sleep_on_page_lock(void *word)
{
	io_schedule();
	return 0;
}

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

static inline void wake_up_page(struct page *page, int bit)
{
	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

/**
 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 * @page: Page defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 */
void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
{
	wait_queue_head_t *q = page_waitqueue(page);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, waiter);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The mb is necessary to enforce ordering between the clear_bit and the read
 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));
	clear_bit_unlock(PG_locked, &page->flags);
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	if (TestClearPageReclaim(page))
		rotate_reclaimable_page(page);

	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
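
/*
 * Illustrative sketch (not part of the original file): what a filesystem's
 * read-completion path typically does with the page lock and the wakeup
 * machinery above - record the result and unlock, which wakes anyone
 * sleeping in lock_page()/wait_on_page_locked().  The helper name is
 * hypothetical.
 */
static void example_end_read(struct page *page, int uptodate)
{
	if (uptodate)
		SetPageUptodate(page);
	else
		ClearPageUptodate(page);
	unlock_page(page);
}
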
/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	return __wait_on_bit_lock(page_waitqueue(page), &wait,
					sync_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

/**
 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 * @page: the page to lock
 *
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 */
void __lock_page_nosync(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
							TASK_UNINTERRUPTIBLE);
}

int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
			 unsigned int flags)
{
	if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
		__lock_page(page);
		return 1;
	} else {
		if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
			up_read(&mm->mmap_sem);
			wait_on_page_locked(page);
		}
		return 0;
	}
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_deref_retry(page))
			goto repeat;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();

	return page;
}
EXPORT_SYMBOL(find_get_page);

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Locates the desired pagecache page, locks it, increments its reference
 * count and returns its address.
 *
 * Returns %NULL if the page was not present.  find_lock_page() may sleep.
 */
struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

repeat:
	page = find_get_page(mapping, offset);
	if (page) {
		lock_page(page);
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		VM_BUG_ON(page->index != offset);
	}
	return page;
}
EXPORT_SYMBOL(find_lock_page);
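
/*
 * Illustrative sketch (not part of the original file): probing the page
 * cache without blocking, then dropping the reference that find_get_page()
 * took.  The helper name is hypothetical.
 */
static bool example_page_is_cached_uptodate(struct address_space *mapping,
					    pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);
	bool ret = false;

	if (page) {
		ret = PageUptodate(page);
		page_cache_release(page);
	}
	return ret;
}
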
/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or %NULL on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		pgoff_t index, gfp_t gfp_mask)
{
	struct page *page;
	int err;
repeat:
	page = find_lock_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;
		/*
		 * We want a regular kernel memory (not highmem or DMA etc)
		 * allocation for the radix tree nodes, but we need to honour
		 * the context-specific requirements the caller has asked for.
		 * GFP_RECLAIM_MASK collects those requirements.
		 */
		err = add_to_page_cache_lru(page, mapping, index,
			(gfp_mask & GFP_RECLAIM_MASK));
		if (unlikely(err)) {
			page_cache_release(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}
	return page;
}
EXPORT_SYMBOL(find_or_create_page);
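
/*
 * Illustrative sketch (not part of the original file): the common
 * "create-if-missing, then fill" pattern built on find_or_create_page().
 * The fill step is only hinted at, and the helper name is hypothetical.
 */
static struct page *example_get_filled_page(struct address_space *mapping,
					    pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return NULL;
	if (!PageUptodate(page)) {
		/* ... fill the page with data here ... */
		SetPageUptodate(page);
	}
	unlock_page(page);
	return page;	/* caller drops the ref with page_cache_release() */
}
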
/**
 * find_get_pages - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, start, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;

		/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
		 */
		if (radix_tree_deref_retry(page)) {
			WARN_ON(start | i);
			goto restart;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}

	/*
	 * If all entries were removed before we could secure them,
	 * try again, because callers stop trying once 0 is returned.
	 */
	if (unlikely(!ret && nr_found))
		goto restart;
	rcu_read_unlock();
	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping: The address_space to search
 * @index: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
				(void ***)pages, index, nr_pages);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;

		/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
		 */
		if (radix_tree_deref_retry(page))
			goto restart;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		/*
		 * must check mapping and index after taking the ref.
		 * otherwise we can get both false positives and false
		 * negatives, which is just confusing to the caller.
		 */
		if (page->mapping == NULL || page->index != index) {
			page_cache_release(page);
			break;
		}

		pages[ret] = page;
		ret++;
		index++;
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
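
/*
 * Illustrative sketch (not part of the original file): walking a mapping in
 * batches with find_get_pages() and dropping the references it took.  The
 * per-page "inspect" step is left as a comment; the name is hypothetical.
 */
static void example_walk_mapping(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int i, nr;

	while ((nr = find_get_pages(mapping, index, 16, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... inspect pages[i] here ... */
			index = pages[i]->index + 1;
			page_cache_release(pages[i]);
		}
	}
}
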
/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping: the address_space to search
 * @index: the starting page index
 * @tag: the tag index
 * @nr_pages: the maximum number of pages
 * @pages: where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.  We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;
	unsigned int nr_found;

	rcu_read_lock();
restart:
	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
				(void ***)pages, *index, nr_pages, tag);
	ret = 0;
	for (i = 0; i < nr_found; i++) {
		struct page *page;
repeat:
		page = radix_tree_deref_slot((void **)pages[i]);
		if (unlikely(!page))
			continue;

		/*
		 * This can only trigger when the entry at index 0 moves out
		 * of or back to the root: none yet gotten, safe to restart.
		 */
		if (radix_tree_deref_retry(page))
			goto restart;

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *((void **)pages[i]))) {
			page_cache_release(page);
			goto repeat;
		}

		pages[ret] = page;
		ret++;
	}

	/*
	 * If all entries were removed before we could secure them,
	 * try again, because callers stop trying once 0 is returned.
	 */
	if (unlikely(!ret && nr_found))
		goto restart;
	rcu_read_unlock();

	if (ret)
		*index = pages[ret - 1]->index + 1;

	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (trylock_page(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *                    ^ reading here                   ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @desc: read_descriptor
 * @actor: read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
		read_descriptor_t *desc, read_actor_t actor)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error;

	index = *ppos >> PAGE_CACHE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		page = find_get_page(mapping, index);
		if (!page) {
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
								desc, offset))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);
		if (!page) {
			desc->error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST)
				goto find_page;
			desc->error = error;
			goto out;
		}
		goto readpage;
	}

out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}

int file_read_actor(read_descriptor_t *desc, struct page *page,
			unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);
		left = __copy_to_user_inatomic(desc->arg.buf,
						kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;
	desc->written += size;
	desc->arg.buf += size;
	return size;
}

/*
 * Performs necessary checks before doing a write
 * @iov:	io vector request
 * @nr_segs:	number of segments in the iovec
 * @count:	number of bytes to write
 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
 *
 * Adjust number of segments and amount of bytes to write (nr_segs should be
 * properly initialized first). Returns appropriate error code that caller
 * should return or zero in case that write should be allowed.
 */
int generic_segment_checks(const struct iovec *iov,
			unsigned long *nr_segs, size_t *count, int access_flags)
{
	unsigned long seg;
	size_t cnt = 0;
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
EXPORT_SYMBOL(generic_segment_checks);
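
/*
 * Illustrative sketch (not part of the original file): how a read or write
 * path might validate its iovec before doing any work, mirroring the use of
 * generic_segment_checks() in the generic paths.  The helper name is
 * hypothetical.
 */
static ssize_t example_check_iovec(const struct iovec *iov,
				   unsigned long nr_segs)
{
	size_t count;
	int err;

	/* VERIFY_READ: the kernel will be reading from the user buffers */
	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (err)
		return err;
	return count;	/* number of bytes the caller may try to transfer */
}
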
/**
 * generic_file_aio_read - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @pos: current file position
 *
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg = 0;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	count = 0;
	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = filemap_write_and_wait_range(mapping, pos,
					pos + iov_length(iov, nr_segs) - 1);
			if (!retval) {
				retval = mapping->a_ops->direct_IO(READ, iocb,
							iov, pos, nr_segs);
			}
			if (retval > 0) {
				*ppos = pos + retval;
				count -= retval;
			}

			/*
			 * Btrfs can have a short DIO read if we encounter
			 * compressed extents, so if there was an error, or if
			 * we've already read everything we wanted to, or if
			 * there was a short read because we hit EOF, go ahead
			 * and return.  Otherwise fallthrough to buffered io for
			 * the rest of the read.
			 */
			if (retval < 0 || !count || *ppos >= size) {
				file_accessed(filp);
				goto out;
			}
		}
	}

	count = retval;
	for (seg = 0; seg < nr_segs; seg++) {
		read_descriptor_t desc;
		loff_t offset = 0;

		/*
		 * If we did a short DIO read we need to skip the section of the
		 * iov that we've already read data into.
		 */
		if (count) {
			if (count > iov[seg].iov_len) {
				count -= iov[seg].iov_len;
				continue;
			}
			offset = count;
			count = 0;
		}

		desc.written = 0;
		desc.arg.buf = iov[seg].iov_base + offset;
		desc.count = iov[seg].iov_len - offset;
		if (desc.count == 0)
			continue;
		desc.error = 0;
		do_generic_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;
		if (desc.error) {
			retval = retval ?: desc.error;
			break;
		}
		if (desc.count > 0)
			break;
	}
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
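
/*
 * Illustrative sketch (not part of the original file): a minimal set of
 * file_operations for a filesystem that reads through the page cache.
 * "example_file_operations" is hypothetical; real filesystems typically
 * also wire up .write/.aio_write, .fsync and friends.
 */
static const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
};
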
static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
	     pgoff_t index, unsigned long nr)
{
	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
		return -EINVAL;

	force_page_cache_readahead(mapping, filp, index, nr);
	return 0;
}

SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct file *file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			struct address_space *mapping = file->f_mapping;
			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
			unsigned long len = end - start + 1;
			ret = do_readahead(mapping, file, start, len);
		}
		fput(file);
	}
	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
{
	return SYSC_readahead((int) fd, offset, (size_t) count);
}
SYSCALL_ALIAS(sys_readahead, SyS_readahead);
#endif

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file: file to read
 * @offset: page index
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = page_cache_alloc_cold(mapping);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/*
 * Synchronous readahead happens when we don't even find
 * a page in the page cache at all.
 */
static void do_sync_mmap_readahead(struct vm_area_struct *vma,
				   struct file_ra_state *ra,
				   struct file *file,
				   pgoff_t offset)
{
	unsigned long ra_pages;
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;

	if (VM_SequentialReadHint(vma) ||
			offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
		return;
	}

	if (ra->mmap_miss < INT_MAX)
		ra->mmap_miss++;

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		return;

	/*
	 * mmap read-around
	 */
	ra_pages = max_sane_readahead(ra->ra_pages);
	if (ra_pages) {
		ra->start = max_t(long, 0, offset - ra_pages/2);
		ra->size = ra_pages;
		ra->async_size = 0;
		ra_submit(ra, mapping, file);
	}
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further..
 */
static void do_async_mmap_readahead(struct vm_area_struct *vma,
				    struct file_ra_state *ra,
				    struct file *file,
				    struct page *page,
				    pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		return;
	if (ra->mmap_miss > 0)
		ra->mmap_miss--;
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, ra, file,
					   page, offset, ra->ra_pages);
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vma: vma in which the fault was taken
 * @vmf: struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int error;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	struct page *page;
	pgoff_t size;
	int ret = 0;

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (offset >= size)
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page)) {
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vma, ra, file, page, offset);
	} else {
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;
	}

	/* Did it get truncated? */
	if (unlikely(page->mapping != mapping)) {
		unlock_page(page);
		put_page(page);
		goto retry_find;
	}
	VM_BUG_ON(page->index != offset);

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(offset >= size)) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	if (!error) {
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	page_cache_release(page);

	/* Things didn't work out. Return SIGBUS to tell the mm layer so. */
	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);
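
/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * needs its own ->fault wrapper can still reuse filemap_fault() for the
 * common case.  All "example_*" names are hypothetical.
 */
static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* ... filesystem-specific preparation could go here ... */
	return filemap_fault(vma, vmf);
}

static const struct vm_operations_struct example_vm_ops = {
	.fault	= example_fault,
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &example_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}
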
const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

static struct page *__read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp | __GFP_COLD);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
		if (unlikely(err)) {
			page_cache_release(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	return page;
}

static struct page *do_read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data,
				gfp_t gfp)

{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data, gfp);
	if (IS_ERR(page))
		return page;
	if (PageUptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		return ERR_PTR(err);
	}
out:
	mark_page_accessed(page);
	return page;
}

/**
 * read_cache_page_async - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: first argument passed to @filler
 *
 * Same as read_cache_page, but don't wait for the page to become unlocked
 * after submitting it to the filler.
 *
 * Read into the page cache.  If a page already exists, and PageUptodate() is
 * not set, try to fill the page but don't wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_async(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page_async);

static struct page *wait_on_page_read(struct page *page)
{
	if (!IS_ERR(page)) {
		wait_on_page_locked(page);
		if (!PageUptodate(page)) {
			page_cache_release(page);
			page = ERR_PTR(-EIO);
		}
	}
	return page;
}

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.  Note
 * that the radix tree operations will still use GFP_KERNEL, so you can't
 * expect to do this atomically or anything like that - but you can pass in
 * other page requirements.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;

	return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
}
EXPORT_SYMBOL(read_cache_page_gfp);

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: first argument passed to @filler
 *
 * Read into the page cache.  If a page already exists, and PageUptodate() is
 * not set, try to fill the page then wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);
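
/*
 * Illustrative sketch (not part of the original file): pulling one page of an
 * inode's data through the page cache, using the mapping's own ->readpage as
 * the filler - essentially what the read_mapping_page() helper boils down to.
 * The helper name is hypothetical and the returned page is uptodate and
 * unlocked on success.
 */
static int example_read_one_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page = read_cache_page(mapping, index,
				(filler_t *)mapping->a_ops->readpage, NULL);

	if (IS_ERR(page))
		return PTR_ERR(page);
	/* ... use the page here ... */
	page_cache_release(page);
	return 0;
}
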
/*
 * The logic we want is
 *
 *      if suid or (sgid and xgrp)
 *              remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
        mode_t mode = dentry->d_inode->i_mode;
        int kill = 0;

        /* suid always must be killed */
        if (unlikely(mode & S_ISUID))
                kill = ATTR_KILL_SUID;

        /*
         * sgid without any exec bits is just a mandatory locking mark; leave
         * it alone. If some exec bits are set, it's a real sgid; kill it.
         */
        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
                kill |= ATTR_KILL_SGID;

        if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
                return kill;

        return 0;
}
EXPORT_SYMBOL(should_remove_suid);

static int __remove_suid(struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        return notify_change(dentry, &newattrs);
}

int file_remove_suid(struct file *file)
{
        struct dentry *dentry = file->f_path.dentry;
        int killsuid = should_remove_suid(dentry);
        int killpriv = security_inode_need_killpriv(dentry);
        int error = 0;

        if (killpriv < 0)
                return killpriv;
        if (killpriv)
                error = security_inode_killpriv(dentry);
        if (!error && killsuid)
                error = __remove_suid(dentry, killsuid);

        return error;
}
EXPORT_SYMBOL(file_remove_suid);

static size_t __iovec_copy_from_user_inatomic(char *vaddr,
                        const struct iovec *iov, size_t base, size_t bytes)
{
        size_t copied = 0, left = 0;

        while (bytes) {
                char __user *buf = iov->iov_base + base;
                int copy = min(bytes, iov->iov_len - base);

                base = 0;
                left = __copy_from_user_inatomic(vaddr, buf, copy);
                copied += copy;
                bytes -= copy;
                vaddr += copy;
                iov++;

                if (unlikely(left))
                        break;
        }
        return copied - left;
}

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then return the
 * number of bytes which were successfully copied before the fault.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr;
        size_t copied;

        BUG_ON(!in_atomic());
        kaddr = kmap_atomic(page, KM_USER0);
        if (likely(i->nr_segs == 1)) {
                int left;
                char __user *buf = i->iov->iov_base + i->iov_offset;
                left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
                copied = bytes - left;
        } else {
                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
                                                i->iov, i->iov_offset, bytes);
        }
        kunmap_atomic(kaddr, KM_USER0);

        return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
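/*
 * Illustrative sketch, not part of the original file: the atomic variant
 * above is meant to be called with page faults disabled, typically while
 * holding the locked pagecache page returned by ->write_begin.  A short
 * copy is not an error; the caller prefaults the source and retries, which
 * is exactly the pattern generic_perform_write() below uses:
 *
 *      if (iov_iter_fault_in_readable(i, bytes))
 *              return -EFAULT;
 *      ...
 *      pagefault_disable();
 *      copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 *      pagefault_enable();
 *      ...
 *      iov_iter_advance(i, copied);    (and retry if copied == 0)
 */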
/*
 * This has the same side effects and return value as
 * iov_iter_copy_from_user_atomic().
 * The difference is that it attempts to resolve faults.
 * The page must not be locked.
 */
size_t iov_iter_copy_from_user(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes)
{
        char *kaddr;
        size_t copied;

        kaddr = kmap(page);
        if (likely(i->nr_segs == 1)) {
                int left;
                char __user *buf = i->iov->iov_base + i->iov_offset;
                left = __copy_from_user(kaddr + offset, buf, bytes);
                copied = bytes - left;
        } else {
                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
                                                i->iov, i->iov_offset, bytes);
        }
        kunmap(page);
        return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user);

void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
        BUG_ON(i->count < bytes);

        if (likely(i->nr_segs == 1)) {
                i->iov_offset += bytes;
                i->count -= bytes;
        } else {
                const struct iovec *iov = i->iov;
                size_t base = i->iov_offset;

                /*
                 * The !iov->iov_len check ensures we skip over unlikely
                 * zero-length segments (without overrunning the iovec).
                 */
                while (bytes || unlikely(i->count && !iov->iov_len)) {
                        int copy;

                        copy = min(bytes, iov->iov_len - base);
                        BUG_ON(!i->count || i->count < copy);
                        i->count -= copy;
                        bytes -= copy;
                        base += copy;
                        if (iov->iov_len == base) {
                                iov++;
                                base = 0;
                        }
                }
                i->iov = iov;
                i->iov_offset = base;
        }
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Fault in the first iovec of the given iov_iter, to a maximum length
 * of bytes. Returns 0 on success, or non-zero if the memory could not be
 * accessed (i.e. because it is an invalid address).
 *
 * writev-intensive code may want this to prefault several iovecs -- that
 * would be possible (callers must not rely on the fact that _only_ the
 * first iovec will be faulted with the current implementation).
 */
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
        char __user *buf = i->iov->iov_base + i->iov_offset;
        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
        return fault_in_pages_readable(buf, bytes);
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(struct iov_iter *i)
{
        const struct iovec *iov = i->iov;
        if (i->nr_segs == 1)
                return i->count;
        else
                return min(i->count, iov->iov_len - i->iov_offset);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);
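/*
 * Worked example, not part of the original file (the values are made up):
 * with two segments of 3 and 5 bytes, i.e. nr_segs == 2, count == 8 and
 * iov_offset == 0, a call to iov_iter_advance(i, 4) consumes all of the
 * first segment and one byte of the second, leaving:
 *
 *      i->iov          -> the second iovec
 *      i->iov_offset   == 1
 *      i->count        == 4
 *
 * iov_iter_single_seg_count(i) then returns min(4, 5 - 1) == 4, the bytes
 * still available in the current segment.
 */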
/*
 * Performs necessary checks before doing a write
 *
 * Can adjust the writing position or the number of bytes to write.
 * Returns an appropriate error code that the caller should return, or
 * zero if the write should be allowed.
 */
inline int generic_write_checks(struct file *file, loff_t *pos,
                                size_t *count, int isblk)
{
        struct inode *inode = file->f_mapping->host;
        unsigned long limit = rlimit(RLIMIT_FSIZE);

        if (unlikely(*pos < 0))
                return -EINVAL;

        if (!isblk) {
                /* FIXME: this is for backwards compatibility with 2.4 */
                if (file->f_flags & O_APPEND)
                        *pos = i_size_read(inode);

                if (limit != RLIM_INFINITY) {
                        if (*pos >= limit) {
                                send_sig(SIGXFSZ, current, 0);
                                return -EFBIG;
                        }
                        if (*count > limit - (typeof(limit))*pos) {
                                *count = limit - (typeof(limit))*pos;
                        }
                }
        }

        /*
         * LFS rule
         */
        if (unlikely(*pos + *count > MAX_NON_LFS &&
                                !(file->f_flags & O_LARGEFILE))) {
                if (*pos >= MAX_NON_LFS) {
                        return -EFBIG;
                }
                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
                        *count = MAX_NON_LFS - (unsigned long)*pos;
                }
        }

        /*
         * Are we about to exceed the fs block limit?
         *
         * If we have written data it becomes a short write. If we have
         * exceeded without writing data we send a signal and return EFBIG.
         * Linus' frestrict idea will clean these up nicely..
         */
        if (likely(!isblk)) {
                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
                        if (*count || *pos > inode->i_sb->s_maxbytes) {
                                return -EFBIG;
                        }
                        /* zero-length writes at ->s_maxbytes are OK */
                }

                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
                        *count = inode->i_sb->s_maxbytes - *pos;
        } else {
#ifdef CONFIG_BLOCK
                loff_t isize;
                if (bdev_read_only(I_BDEV(inode)))
                        return -EPERM;
                isize = i_size_read(inode);
                if (*pos >= isize) {
                        if (*count || *pos > isize)
                                return -ENOSPC;
                }

                if (*pos + *count > isize)
                        *count = isize - *pos;
#else
                return -EPERM;
#endif
        }
        return 0;
}
EXPORT_SYMBOL(generic_write_checks);

int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        const struct address_space_operations *aops = mapping->a_ops;

        return aops->write_begin(file, mapping, pos, len, flags,
                                                        pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);

int pagecache_write_end(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        const struct address_space_operations *aops = mapping->a_ops;

        mark_page_accessed(page);
        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
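/*
 * Illustrative sketch, not part of the original file: a caller that wants
 * to update a small region of pagecache by hand (rather than going through
 * generic_file_buffered_write()) can combine the helpers above roughly as
 * follows, assuming the range does not cross a page boundary and with
 * error handling abbreviated:
 *
 *      struct page *page;
 *      void *fsdata;
 *      size_t count = len;
 *      int err;
 *
 *      err = generic_write_checks(file, &pos, &count,
 *                                      S_ISBLK(inode->i_mode));
 *      if (err || !count)
 *              return err;
 *      err = pagecache_write_begin(file, mapping, pos, count, 0,
 *                                      &page, &fsdata);
 *      if (err)
 *              return err;
 *      ... copy "count" bytes into the page at pos & (PAGE_CACHE_SIZE - 1) ...
 *      err = pagecache_write_end(file, mapping, pos, count, count,
 *                                      page, fsdata);
 *
 * pagecache_write_end() returns the number of bytes the filesystem
 * accepted (or a negative error); with the usual ->write_end
 * implementations it also unlocks and releases the page.
 */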
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
                size_t count, size_t ocount)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        ssize_t written;
        size_t write_len;
        pgoff_t end;

        if (count != ocount)
                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);

        write_len = iov_length(iov, *nr_segs);
        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;

        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
        if (written)
                goto out;

        /*
         * After a write we want buffered reads to be sure to go to disk to
         * get the new data.  We invalidate clean cached pages from the region
         * we're about to write.  We do this *before* the write so that we can
         * return without clobbering -EIOCBQUEUED from ->direct_IO().
         */
        if (mapping->nrpages) {
                written = invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_CACHE_SHIFT, end);
                /*
                 * If a page cannot be invalidated, return 0 to fall back
                 * to buffered write.
                 */
                if (written) {
                        if (written == -EBUSY)
                                return 0;
                        goto out;
                }
        }

        written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);

        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do,
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         */
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_CACHE_SHIFT, end);
        }

        if (written > 0) {
                pos += written;
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
                }
                *ppos = pos;
        }
out:
        return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
{
        int status;
        struct page *page;
        gfp_t gfp_notmask = 0;

        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
repeat:
        page = find_lock_page(mapping, index);
        if (page)
                return page;

        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
        if (!page)
                return NULL;
        status = add_to_page_cache_lru(page, mapping, index,
                                                GFP_KERNEL & ~gfp_notmask);
        if (unlikely(status)) {
                page_cache_release(page);
                if (status == -EEXIST)
                        goto repeat;
                return NULL;
        }
        return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
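/*
 * Illustrative sketch, not part of the original file: a minimal ->write_begin
 * for an in-memory filesystem can be little more than a call to
 * grab_cache_page_write_begin() (this is roughly what simple_write_begin()
 * in fs/libfs.c does); "examplefs" is hypothetical:
 *
 *      static int examplefs_write_begin(struct file *file,
 *                      struct address_space *mapping, loff_t pos,
 *                      unsigned len, unsigned flags,
 *                      struct page **pagep, void **fsdata)
 *      {
 *              struct page *page;
 *              pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 *
 *              page = grab_cache_page_write_begin(mapping, index, flags);
 *              if (!page)
 *                      return -ENOMEM;
 *              *pagep = page;
 *              return 0;
 *      }
 *
 * The matching ->write_end must mark the page uptodate and dirty as needed,
 * then unlock and release it (see simple_write_end()).
 */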
static ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;
        unsigned int flags = 0;

        /*
         * Copies from kernel address space cannot fail (NFSD is a big user).
         */
        if (segment_eq(get_fs(), KERNEL_DS))
                flags |= AOP_FLAG_UNINTERRUPTIBLE;

        do {
                struct page *page;
                unsigned long offset;   /* Offset into pagecache page */
                unsigned long bytes;    /* Bytes to write to page */
                size_t copied;          /* Bytes copied from user */
                void *fsdata;

                offset = (pos & (PAGE_CACHE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_count(i));

again:
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 *
                 * Not only is this an optimisation, but it is also required
                 * to check that the address is actually valid, when atomic
                 * usercopies are used, below.
                 */
                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                        status = -EFAULT;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
                flush_dcache_page(page);

                mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
                        break;
                copied = status;

                cond_resched();

                iov_iter_advance(i, copied);
                if (unlikely(copied == 0)) {
                        /*
                         * If we were unable to copy any data at all, we must
                         * fall back to a single segment length write.
                         *
                         * If we didn't fall back here, we could livelock
                         * because not all segments in the iov can be copied at
                         * once without a pagefault.
                         */
                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_single_seg_count(i));
                        goto again;
                }
                pos += copied;
                written += copied;

                balance_dirty_pages_ratelimited(mapping);

        } while (iov_iter_count(i));

        return written ? written : status;
}

ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos, loff_t *ppos,
                size_t count, ssize_t written)
{
        struct file *file = iocb->ki_filp;
        ssize_t status;
        struct iov_iter i;

        iov_iter_init(&i, iov, nr_segs, count, written);
        status = generic_perform_write(file, &i, pos);

        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
        }

        return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);

/**
 * __generic_file_aio_write - write data to a file
 * @iocb:       IO state structure (file, offset, etc.)
 * @iov:        vector with data to write
 * @nr_segs:    number of segments in the vector
 * @ppos:       position where to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_mutex to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_mutex.
 */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
        struct inode *inode = mapping->host;
        loff_t pos;
        ssize_t written;
        ssize_t err;

        ocount = 0;
        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (err)
                return err;

        count = ocount;
        pos = *ppos;

        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
        written = 0;

        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
                goto out;

        if (count == 0)
                goto out;

        err = file_remove_suid(file);
        if (err)
                goto out;

        file_update_time(file);

        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
                loff_t endbyte;
                ssize_t written_buffered;

                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
                                                        ppos, count, ocount);
                if (written < 0 || written == count)
                        goto out;
                /*
                 * direct-io write to a hole: fall through to buffered I/O
                 * for completing the rest of the request.
                 */
                pos += written;
                count -= written;
                written_buffered = generic_file_buffered_write(iocb, iov,
                                                nr_segs, pos, ppos, count,
                                                written);
                /*
                 * If generic_file_buffered_write() returned a synchronous error
                 * then we want to return the number of bytes which were
                 * direct-written, or the error code if that was zero. Note
                 * that this differs from normal direct-io semantics, which
                 * will return -EFOO even if some bytes were written.
                 */
                if (written_buffered < 0) {
                        err = written_buffered;
                        goto out;
                }

                /*
                 * We need to ensure that the page cache pages are written to
                 * disk and invalidated to preserve the expected O_DIRECT
                 * semantics.
                 */
                endbyte = pos + written_buffered - written - 1;
                err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
                if (err == 0) {
                        written = written_buffered;
                        invalidate_mapping_pages(mapping,
                                                 pos >> PAGE_CACHE_SHIFT,
                                                 endbyte >> PAGE_CACHE_SHIFT);
                } else {
                        /*
                         * We don't know how much we wrote, so just return
                         * the number of bytes which were direct-written
                         */
                }
        } else {
                written = generic_file_buffered_write(iocb, iov, nr_segs,
                                                pos, ppos, count, written);
        }
out:
        current->backing_dev_info = NULL;
        return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_aio_write);

/**
 * generic_file_aio_write - write data to a file
 * @iocb:       IO state structure
 * @iov:        vector with data to write
 * @nr_segs:    number of segments in the vector
 * @pos:        position in file where to write
 *
 * This is a wrapper around __generic_file_aio_write() to be used by most
 * filesystems. It takes care of syncing the file if it is opened with
 * O_SYNC and acquires i_mutex as needed.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;

        BUG_ON(iocb->ki_pos != pos);

        mutex_lock(&inode->i_mutex);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;

                err = generic_write_sync(file, pos, ret);
                if (err < 0 && ret > 0)
                        ret = err;
        }
        return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);

/**
 * try_to_release_page() - release old fs-specific metadata on a page
 *
 * @page:       the page which the kernel is trying to free
 * @gfp_mask:   memory allocation flags (and I/O mode)
 *
 * The address_space is asked to try to release any data held against the
 * page (presumably at page->private). If the release was successful,
 * return `1'. Otherwise return zero.
 *
 * This may also be called if PG_fscache is set on a page, indicating that the
 * page is known to the local caching routines.
 *
 * The @gfp_mask argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
 *
 */
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
        struct address_space * const mapping = page->mapping;

        BUG_ON(!PageLocked(page));
        if (PageWriteback(page))
                return 0;

        if (mapping && mapping->a_ops->releasepage)
                return mapping->a_ops->releasepage(page, gfp_mask);
        return try_to_free_buffers(page);
}

EXPORT_SYMBOL(try_to_release_page);
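/*
 * Illustrative sketch, not part of the original file: callers in the
 * reclaim and invalidation paths only attempt this on pages that actually
 * carry private filesystem state, and they honour the gfp constraints of
 * their context.  Loosely modelled on the invalidation path, where neither
 * I/O nor blocking is allowed and a 0 gfp_mask is therefore passed:
 *
 *      if (page_has_private(page) && !try_to_release_page(page, 0))
 *              return 0;       (buffers are busy, cannot drop the page)
 *
 * A filesystem that needs more than try_to_free_buffers() hooks in via
 * a_ops->releasepage, which must return non-zero only when the private
 * data really has been released.
 */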