xref: /linux/mm/filemap.c (revision 6b2d2cec1081a979e0efd6a1e9559e5a01a3c10e)
1 /*
2  *	linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-1999  Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/compiler.h>
15 #include <linux/fs.h>
16 #include <linux/uaccess.h>
17 #include <linux/aio.h>
18 #include <linux/capability.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/mman.h>
23 #include <linux/pagemap.h>
24 #include <linux/file.h>
25 #include <linux/uio.h>
26 #include <linux/hash.h>
27 #include <linux/writeback.h>
28 #include <linux/backing-dev.h>
29 #include <linux/pagevec.h>
30 #include <linux/blkdev.h>
32 #include <linux/security.h>
33 #include <linux/syscalls.h>
34 #include <linux/cpuset.h>
35 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
36 #include "internal.h"
37 
38 /*
39  * FIXME: remove all knowledge of the buffer layer from the core VM
40  */
41 #include <linux/buffer_head.h> /* for generic_osync_inode */
42 
43 #include <asm/mman.h>
44 
45 static ssize_t
46 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 	loff_t offset, unsigned long nr_segs);
48 
49 /*
50  * Shared mappings implemented 30.11.1994. It's not fully working yet,
51  * though.
52  *
53  * Shared mappings now work. 15.8.1995  Bruno.
54  *
55  * finished 'unifying' the page and buffer cache and SMP-threaded the
56  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
57  *
58  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
59  */
60 
61 /*
62  * Lock ordering:
63  *
64  *  ->i_mmap_lock		(vmtruncate)
65  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
66  *      ->swap_lock		(exclusive_swap_page, others)
67  *        ->mapping->tree_lock
68  *          ->zone.lock
69  *
70  *  ->i_mutex
71  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
72  *
73  *  ->mmap_sem
74  *    ->i_mmap_lock
75  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
76  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
77  *
78  *  ->mmap_sem
79  *    ->lock_page		(access_process_vm)
80  *
81  *  ->i_mutex			(generic_file_buffered_write)
82  *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
83  *
84  *  ->i_mutex
85  *    ->i_alloc_sem             (various)
86  *
87  *  ->inode_lock
88  *    ->sb_lock			(fs/fs-writeback.c)
89  *    ->mapping->tree_lock	(__sync_single_inode)
90  *
91  *  ->i_mmap_lock
92  *    ->anon_vma.lock		(vma_adjust)
93  *
94  *  ->anon_vma.lock
95  *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
96  *
97  *  ->page_table_lock or pte_lock
98  *    ->swap_lock		(try_to_unmap_one)
99  *    ->private_lock		(try_to_unmap_one)
100  *    ->tree_lock		(try_to_unmap_one)
101  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
102  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
103  *    ->private_lock		(page_remove_rmap->set_page_dirty)
104  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
105  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
106  *    ->inode_lock		(zap_pte_range->set_page_dirty)
107  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
108  *
109  *  ->task->proc_lock
110  *    ->dcache_lock		(proc_pid_lookup)
111  */
112 
113 /*
114  * Remove a page from the page cache. The caller has to make
115  * sure the page is locked and that nobody else uses it - or that usage
116  * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
117  */
118 void __remove_from_page_cache(struct page *page)
119 {
120 	struct address_space *mapping = page->mapping;
121 
122 	radix_tree_delete(&mapping->page_tree, page->index);
123 	page->mapping = NULL;
124 	mapping->nrpages--;
125 	__dec_zone_page_state(page, NR_FILE_PAGES);
126 	BUG_ON(page_mapped(page));
127 
128 	/*
129 	 * Some filesystems seem to re-dirty the page even after
130 	 * the VM has canceled the dirty bit (e.g. ext3 journaling).
131 	 *
132 	 * Fix it up by doing a final dirty accounting check after
133 	 * having removed the page entirely.
134 	 */
135 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
136 		dec_zone_page_state(page, NR_FILE_DIRTY);
137 		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
138 	}
139 }
140 
141 void remove_from_page_cache(struct page *page)
142 {
143 	struct address_space *mapping = page->mapping;
144 
145 	BUG_ON(!PageLocked(page));
146 
147 	write_lock_irq(&mapping->tree_lock);
148 	__remove_from_page_cache(page);
149 	write_unlock_irq(&mapping->tree_lock);
150 }
151 
152 static int sync_page(void *word)
153 {
154 	struct address_space *mapping;
155 	struct page *page;
156 
157 	page = container_of((unsigned long *)word, struct page, flags);
158 
159 	/*
160 	 * page_mapping() is being called without PG_locked held.
161 	 * Some knowledge of the state and use of the page is used to
162 	 * reduce the requirements down to a memory barrier.
163 	 * The danger here is of a stale page_mapping() return value
164 	 * indicating a struct address_space different from the one the
165 	 * page is actually associated with, when it is associated with one.
166 	 * After smp_mb(), it's either the correct page_mapping() for
167 	 * the page, or an old page_mapping() and the page's own
168 	 * page_mapping() has gone NULL.
169 	 * The ->sync_page() address_space operation must tolerate
170 	 * page_mapping() going NULL. By an amazing coincidence,
171 	 * this comes about because none of the users of the page
172 	 * in the ->sync_page() methods make essential use of the
173 	 * page_mapping(), merely passing the page down to the backing
174 	 * device's unplug functions when it's non-NULL, which in turn
175 	 * ignore it for all cases but swap, where only page_private(page) is
176 	 * of interest. When page_mapping() does go NULL, the entire
177 	 * call stack gracefully ignores the page and returns.
178 	 * -- wli
179 	 */
180 	smp_mb();
181 	mapping = page_mapping(page);
182 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
183 		mapping->a_ops->sync_page(page);
184 	io_schedule();
185 	return 0;
186 }
187 
188 /**
189  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
190  * @mapping:	address space structure to write
191  * @start:	offset in bytes where the range starts
192  * @end:	offset in bytes where the range ends (inclusive)
193  * @sync_mode:	enable synchronous operation
194  *
195  * Start writeback against all of a mapping's dirty pages that lie
196  * within the byte offsets <start, end> inclusive.
197  *
198  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
199  * opposed to a regular memory cleansing writeback.  The difference between
200  * these two operations is that if a dirty page/buffer is encountered, it must
201  * be waited upon, and not just skipped over.
202  */
203 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
204 				loff_t end, int sync_mode)
205 {
206 	int ret;
207 	struct writeback_control wbc = {
208 		.sync_mode = sync_mode,
209 		.nr_to_write = mapping->nrpages * 2,
210 		.range_start = start,
211 		.range_end = end,
212 	};
213 
214 	if (!mapping_cap_writeback_dirty(mapping))
215 		return 0;
216 
217 	ret = do_writepages(mapping, &wbc);
218 	return ret;
219 }
220 
221 static inline int __filemap_fdatawrite(struct address_space *mapping,
222 	int sync_mode)
223 {
224 	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
225 }
226 
227 int filemap_fdatawrite(struct address_space *mapping)
228 {
229 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
230 }
231 EXPORT_SYMBOL(filemap_fdatawrite);
232 
233 static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
234 				loff_t end)
235 {
236 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
237 }
238 
239 /**
240  * filemap_flush - mostly a non-blocking flush
241  * @mapping:	target address_space
242  *
243  * This is a mostly non-blocking flush.  Not suitable for data-integrity
244  * purposes - I/O may not be started against all dirty pages.
245  */
246 int filemap_flush(struct address_space *mapping)
247 {
248 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
249 }
250 EXPORT_SYMBOL(filemap_flush);
251 
252 /**
253  * wait_on_page_writeback_range - wait for writeback to complete
254  * @mapping:	target address_space
255  * @start:	beginning page index
256  * @end:	ending page index
257  *
258  * Wait for writeback to complete against pages indexed by start->end
259  * inclusive
260  */
261 int wait_on_page_writeback_range(struct address_space *mapping,
262 				pgoff_t start, pgoff_t end)
263 {
264 	struct pagevec pvec;
265 	int nr_pages;
266 	int ret = 0;
267 	pgoff_t index;
268 
269 	if (end < start)
270 		return 0;
271 
272 	pagevec_init(&pvec, 0);
273 	index = start;
274 	while ((index <= end) &&
275 			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
276 			PAGECACHE_TAG_WRITEBACK,
277 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
278 		unsigned i;
279 
280 		for (i = 0; i < nr_pages; i++) {
281 			struct page *page = pvec.pages[i];
282 
283 			/* until radix tree lookup accepts end_index */
284 			if (page->index > end)
285 				continue;
286 
287 			wait_on_page_writeback(page);
288 			if (PageError(page))
289 				ret = -EIO;
290 		}
291 		pagevec_release(&pvec);
292 		cond_resched();
293 	}
294 
295 	/* Check for outstanding write errors */
296 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
297 		ret = -ENOSPC;
298 	if (test_and_clear_bit(AS_EIO, &mapping->flags))
299 		ret = -EIO;
300 
301 	return ret;
302 }
303 
304 /**
305  * sync_page_range - write and wait on all pages in the passed range
306  * @inode:	target inode
307  * @mapping:	target address_space
308  * @pos:	beginning offset in bytes to write
309  * @count:	number of bytes to write
310  *
311  * Write and wait upon all the pages in the passed range.  This is a "data
312  * integrity" operation.  It waits upon in-flight writeout before starting and
313  * waiting upon new writeout.  If there was an IO error, return it.
314  *
315  * We need to re-take i_mutex during the generic_osync_inode list walk because
316  * it is otherwise livelockable.
317  */
318 int sync_page_range(struct inode *inode, struct address_space *mapping,
319 			loff_t pos, loff_t count)
320 {
321 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
322 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
323 	int ret;
324 
325 	if (!mapping_cap_writeback_dirty(mapping) || !count)
326 		return 0;
327 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
328 	if (ret == 0) {
329 		mutex_lock(&inode->i_mutex);
330 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
331 		mutex_unlock(&inode->i_mutex);
332 	}
333 	if (ret == 0)
334 		ret = wait_on_page_writeback_range(mapping, start, end);
335 	return ret;
336 }
337 EXPORT_SYMBOL(sync_page_range);
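
/*
 * A minimal sketch (not part of the original file) of how an O_SYNC-aware
 * write path might use sync_page_range(): after a buffered write of
 * @written bytes at @pos has succeeded, push the dirty pages out and wait
 * for them.  The helper name is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static ssize_t example_osync_after_write(struct file *file, loff_t pos,
					 ssize_t written)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;

	if (written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
		ssize_t err;

		/* write out and wait upon pos..pos+written-1 (data integrity) */
		err = sync_page_range(inode, mapping, pos, written);
		if (err < 0)
			written = err;
	}
	return written;
}
#endif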
338 
339 /**
340  * sync_page_range_nolock - write & wait on all pages in the passed range without i_mutex
341  * @inode:	target inode
342  * @mapping:	target address_space
343  * @pos:	beginning offset in bytes to write
344  * @count:	number of bytes to write
345  *
346  * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
347  * as it forces O_SYNC writers to different parts of the same file
348  * to be serialised right until io completion.
349  */
350 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
351 			   loff_t pos, loff_t count)
352 {
353 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
354 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
355 	int ret;
356 
357 	if (!mapping_cap_writeback_dirty(mapping) || !count)
358 		return 0;
359 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
360 	if (ret == 0)
361 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
362 	if (ret == 0)
363 		ret = wait_on_page_writeback_range(mapping, start, end);
364 	return ret;
365 }
366 EXPORT_SYMBOL(sync_page_range_nolock);
367 
368 /**
369  * filemap_fdatawait - wait for all under-writeback pages to complete
370  * @mapping: address space structure to wait for
371  *
372  * Walk the list of under-writeback pages of the given address space
373  * and wait for all of them.
374  */
375 int filemap_fdatawait(struct address_space *mapping)
376 {
377 	loff_t i_size = i_size_read(mapping->host);
378 
379 	if (i_size == 0)
380 		return 0;
381 
382 	return wait_on_page_writeback_range(mapping, 0,
383 				(i_size - 1) >> PAGE_CACHE_SHIFT);
384 }
385 EXPORT_SYMBOL(filemap_fdatawait);
386 
387 int filemap_write_and_wait(struct address_space *mapping)
388 {
389 	int err = 0;
390 
391 	if (mapping->nrpages) {
392 		err = filemap_fdatawrite(mapping);
393 		/*
394 		 * Even if the above returned error, the pages may be
395 		 * written partially (e.g. -ENOSPC), so we wait for it.
396 		 * But the -EIO is special case, it may indicate the worst
397 		 * But -EIO is a special case: it may indicate that something
398 		 * seriously bad (e.g. a bug) happened, so we avoid waiting for it.
399 		if (err != -EIO) {
400 			int err2 = filemap_fdatawait(mapping);
401 			if (!err)
402 				err = err2;
403 		}
404 	}
405 	return err;
406 }
407 EXPORT_SYMBOL(filemap_write_and_wait);
408 
409 /**
410  * filemap_write_and_wait_range - write out & wait on a file range
411  * @mapping:	the address_space for the pages
412  * @lstart:	offset in bytes where the range starts
413  * @lend:	offset in bytes where the range ends (inclusive)
414  *
415  * Write out and wait upon file offsets lstart->lend, inclusive.
416  *
417  * Note that `lend' is inclusive (describes the last byte to be written) so
418  * that this function can be used to write to the very end-of-file (lend = -1).
419  */
420 int filemap_write_and_wait_range(struct address_space *mapping,
421 				 loff_t lstart, loff_t lend)
422 {
423 	int err = 0;
424 
425 	if (mapping->nrpages) {
426 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
427 						 WB_SYNC_ALL);
428 		/* See comment of filemap_write_and_wait() */
429 		if (err != -EIO) {
430 			int err2 = wait_on_page_writeback_range(mapping,
431 						lstart >> PAGE_CACHE_SHIFT,
432 						lend >> PAGE_CACHE_SHIFT);
433 			if (!err)
434 				err = err2;
435 		}
436 	}
437 	return err;
438 }
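
/*
 * A minimal sketch (not from the original source) of flushing a byte range
 * for data integrity with filemap_write_and_wait_range().  Note that the
 * end offset is inclusive, so the last byte of a @len-byte region starting
 * at @pos is @pos + @len - 1.  The helper name is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static int example_flush_byte_range(struct address_space *mapping,
				    loff_t pos, loff_t len)
{
	if (len == 0)
		return 0;
	return filemap_write_and_wait_range(mapping, pos, pos + len - 1);
}
#endif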
439 
440 /**
441  * add_to_page_cache - add newly allocated pagecache pages
442  * @page:	page to add
443  * @mapping:	the page's address_space
444  * @offset:	page index
445  * @gfp_mask:	page allocation mode
446  *
447  * This function is used to add newly allocated pagecache pages;
448  * the page is new, so we can just run SetPageLocked() against it.
449  * The other page state flags were set by rmqueue().
450  *
451  * This function does not add the page to the LRU.  The caller must do that.
452  */
453 int add_to_page_cache(struct page *page, struct address_space *mapping,
454 		pgoff_t offset, gfp_t gfp_mask)
455 {
456 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
457 
458 	if (error == 0) {
459 		write_lock_irq(&mapping->tree_lock);
460 		error = radix_tree_insert(&mapping->page_tree, offset, page);
461 		if (!error) {
462 			page_cache_get(page);
463 			SetPageLocked(page);
464 			page->mapping = mapping;
465 			page->index = offset;
466 			mapping->nrpages++;
467 			__inc_zone_page_state(page, NR_FILE_PAGES);
468 		}
469 		write_unlock_irq(&mapping->tree_lock);
470 		radix_tree_preload_end();
471 	}
472 	return error;
473 }
474 EXPORT_SYMBOL(add_to_page_cache);
475 
476 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
477 				pgoff_t offset, gfp_t gfp_mask)
478 {
479 	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
480 	if (ret == 0)
481 		lru_cache_add(page);
482 	return ret;
483 }
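
/*
 * A minimal sketch (not part of the original file) of the usual pattern for
 * populating the page cache with add_to_page_cache_lru(): allocate a page,
 * try to insert it at @offset, and treat -EEXIST as "somebody else beat us
 * to it".  On success the new page comes back locked, so it must be filled
 * (here: zeroed), marked uptodate and unlocked.  The helper name is a
 * hypothetical example, not an API from this file.
 */
#if 0	/* illustrative example -- not compiled */
static int example_add_zeroed_page(struct address_space *mapping,
				   pgoff_t offset)
{
	struct page *page = page_cache_alloc_cold(mapping);
	int ret;

	if (!page)
		return -ENOMEM;

	ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
	if (ret == 0) {
		void *kaddr = kmap_atomic(page, KM_USER0);

		memset(kaddr, 0, PAGE_CACHE_SIZE);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);
		SetPageUptodate(page);
		unlock_page(page);
	} else if (ret == -EEXIST) {
		ret = 0;	/* raced with another insertion; not an error */
	}

	/* drop the allocation reference; the page cache holds its own */
	page_cache_release(page);
	return ret;
}
#endif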
484 
485 #ifdef CONFIG_NUMA
486 struct page *__page_cache_alloc(gfp_t gfp)
487 {
488 	if (cpuset_do_page_mem_spread()) {
489 		int n = cpuset_mem_spread_node();
490 		return alloc_pages_node(n, gfp, 0);
491 	}
492 	return alloc_pages(gfp, 0);
493 }
494 EXPORT_SYMBOL(__page_cache_alloc);
495 #endif
496 
497 static int __sleep_on_page_lock(void *word)
498 {
499 	io_schedule();
500 	return 0;
501 }
502 
503 /*
504  * In order to wait for pages to become available there must be
505  * waitqueues associated with pages.  Rather than embedding a waitqueue
506  * in every struct page, a hash table of waitqueues is used: all waiters
507  * for pages that hash to the same bucket share one queue, and all of
508  * them are woken whenever any of those pages becomes available.  Each
509  * woken context then re-checks that the page it cares about really did
510  * become available.  This saves space at the cost of "thundering herd"
511  * phenomena during rare hash collisions.
512  */
513 static wait_queue_head_t *page_waitqueue(struct page *page)
514 {
515 	const struct zone *zone = page_zone(page);
516 
517 	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
518 }
519 
520 static inline void wake_up_page(struct page *page, int bit)
521 {
522 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
523 }
524 
525 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
526 {
527 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
528 
529 	if (test_bit(bit_nr, &page->flags))
530 		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
531 							TASK_UNINTERRUPTIBLE);
532 }
533 EXPORT_SYMBOL(wait_on_page_bit);
534 
535 /**
536  * unlock_page - unlock a locked page
537  * @page: the page
538  *
539  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
540  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
541  * mechanism between PageLocked pages and PageWriteback pages is shared.
542  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
543  *
544  * The first mb is necessary to safely close the critical section opened by the
545  * TestSetPageLocked(), the second mb is necessary to enforce ordering between
546  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
547  * parallel wait_on_page_locked()).
548  */
549 void fastcall unlock_page(struct page *page)
550 {
551 	smp_mb__before_clear_bit();
552 	if (!TestClearPageLocked(page))
553 		BUG();
554 	smp_mb__after_clear_bit();
555 	wake_up_page(page, PG_locked);
556 }
557 EXPORT_SYMBOL(unlock_page);
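
/*
 * A minimal sketch (not from the original source) of the canonical
 * lock_page()/unlock_page() critical section: take the page lock, recheck
 * that the page was not truncated away while we slept, do the work, then
 * unlock.  This mirrors what find_lock_page() and read_cache_page_async()
 * below do; the helper name is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static int example_with_page_locked(struct address_space *mapping,
				    struct page *page)
{
	lock_page(page);
	if (page->mapping != mapping) {
		/* truncated while we waited for the lock */
		unlock_page(page);
		return -EAGAIN;
	}

	/* ... operate on the locked, still-attached page here ... */

	unlock_page(page);
	return 0;
}
#endif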
558 
559 /**
560  * end_page_writeback - end writeback against a page
561  * @page: the page
562  */
563 void end_page_writeback(struct page *page)
564 {
565 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
566 		if (!test_clear_page_writeback(page))
567 			BUG();
568 	}
569 	smp_mb__after_clear_bit();
570 	wake_up_page(page, PG_writeback);
571 }
572 EXPORT_SYMBOL(end_page_writeback);
573 
574 /**
575  * __lock_page - get a lock on the page, assuming we need to sleep to get it
576  * @page: the page to lock
577  *
578  * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
579  * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
580  * chances are that on the second loop, the block layer's plug list is empty,
581  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
582  */
583 void fastcall __lock_page(struct page *page)
584 {
585 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
586 
587 	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
588 							TASK_UNINTERRUPTIBLE);
589 }
590 EXPORT_SYMBOL(__lock_page);
591 
592 /*
593  * Variant of lock_page that does not require the caller to hold a reference
594  * on the page's mapping.
595  */
596 void fastcall __lock_page_nosync(struct page *page)
597 {
598 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
599 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
600 							TASK_UNINTERRUPTIBLE);
601 }
602 
603 /**
604  * find_get_page - find and get a page reference
605  * @mapping: the address_space to search
606  * @offset: the page index
607  *
608  * Is there a pagecache struct page at the given (mapping, offset) tuple?
609  * If yes, increment its refcount and return it; if no, return NULL.
610  */
611 struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
612 {
613 	struct page *page;
614 
615 	read_lock_irq(&mapping->tree_lock);
616 	page = radix_tree_lookup(&mapping->page_tree, offset);
617 	if (page)
618 		page_cache_get(page);
619 	read_unlock_irq(&mapping->tree_lock);
620 	return page;
621 }
622 EXPORT_SYMBOL(find_get_page);
623 
624 /**
625  * find_lock_page - locate, pin and lock a pagecache page
626  * @mapping: the address_space to search
627  * @offset: the page index
628  *
629  * Locates the desired pagecache page, locks it, increments its reference
630  * count and returns its address.
631  *
632  * Returns %NULL if the page was not present. find_lock_page() may sleep.
633  */
634 struct page *find_lock_page(struct address_space *mapping,
635 				pgoff_t offset)
636 {
637 	struct page *page;
638 
639 repeat:
640 	read_lock_irq(&mapping->tree_lock);
641 	page = radix_tree_lookup(&mapping->page_tree, offset);
642 	if (page) {
643 		page_cache_get(page);
644 		if (TestSetPageLocked(page)) {
645 			read_unlock_irq(&mapping->tree_lock);
646 			__lock_page(page);
647 
648 			/* Has the page been truncated while we slept? */
649 			if (unlikely(page->mapping != mapping)) {
650 				unlock_page(page);
651 				page_cache_release(page);
652 				goto repeat;
653 			}
654 			VM_BUG_ON(page->index != offset);
655 			goto out;
656 		}
657 	}
658 	read_unlock_irq(&mapping->tree_lock);
659 out:
660 	return page;
661 }
662 EXPORT_SYMBOL(find_lock_page);
663 
664 /**
665  * find_or_create_page - locate or add a pagecache page
666  * @mapping: the page's address_space
667  * @index: the page's index into the mapping
668  * @gfp_mask: page allocation mode
669  *
670  * Locates a page in the pagecache.  If the page is not present, a new page
671  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
672  * LRU list.  The returned page is locked and has its reference count
673  * incremented.
674  *
675  * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
676  * allocation!
677  *
678  * find_or_create_page() returns the desired page's address, or %NULL on
679  * memory exhaustion.
680  */
681 struct page *find_or_create_page(struct address_space *mapping,
682 		pgoff_t index, gfp_t gfp_mask)
683 {
684 	struct page *page;
685 	int err;
686 repeat:
687 	page = find_lock_page(mapping, index);
688 	if (!page) {
689 		page = __page_cache_alloc(gfp_mask);
690 		if (!page)
691 			return NULL;
692 		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
693 		if (unlikely(err)) {
694 			page_cache_release(page);
695 			page = NULL;
696 			if (err == -EEXIST)
697 				goto repeat;
698 		}
699 	}
700 	return page;
701 }
702 EXPORT_SYMBOL(find_or_create_page);
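
/*
 * A minimal sketch (not part of the original file) of typical
 * find_or_create_page() usage: the returned page is locked and carries an
 * extra reference, so the caller must unlock and release it when done.
 * The helper name is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static int example_touch_index(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* ... initialise or examine the locked page here ... */

	unlock_page(page);
	page_cache_release(page);
	return 0;
}
#endif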
703 
704 /**
705  * find_get_pages - gang pagecache lookup
706  * @mapping:	The address_space to search
707  * @start:	The starting page index
708  * @nr_pages:	The maximum number of pages
709  * @pages:	Where the resulting pages are placed
710  *
711  * find_get_pages() will search for and return a group of up to
712  * @nr_pages pages in the mapping.  The pages are placed at @pages.
713  * find_get_pages() takes a reference against the returned pages.
714  *
715  * The search returns a group of mapping-contiguous pages with ascending
716  * indexes.  There may be holes in the indices due to not-present pages.
717  *
718  * find_get_pages() returns the number of pages which were found.
719  */
720 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
721 			    unsigned int nr_pages, struct page **pages)
722 {
723 	unsigned int i;
724 	unsigned int ret;
725 
726 	read_lock_irq(&mapping->tree_lock);
727 	ret = radix_tree_gang_lookup(&mapping->page_tree,
728 				(void **)pages, start, nr_pages);
729 	for (i = 0; i < ret; i++)
730 		page_cache_get(pages[i]);
731 	read_unlock_irq(&mapping->tree_lock);
732 	return ret;
733 }
734 
735 /**
736  * find_get_pages_contig - gang contiguous pagecache lookup
737  * @mapping:	The address_space to search
738  * @index:	The starting page index
739  * @nr_pages:	The maximum number of pages
740  * @pages:	Where the resulting pages are placed
741  *
742  * find_get_pages_contig() works exactly like find_get_pages(), except
743  * that the returned pages are guaranteed to have contiguous indices.
744  *
745  * find_get_pages_contig() returns the number of pages which were found.
746  */
747 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
748 			       unsigned int nr_pages, struct page **pages)
749 {
750 	unsigned int i;
751 	unsigned int ret;
752 
753 	read_lock_irq(&mapping->tree_lock);
754 	ret = radix_tree_gang_lookup(&mapping->page_tree,
755 				(void **)pages, index, nr_pages);
756 	for (i = 0; i < ret; i++) {
757 		if (pages[i]->mapping == NULL || pages[i]->index != index)
758 			break;
759 
760 		page_cache_get(pages[i]);
761 		index++;
762 	}
763 	read_unlock_irq(&mapping->tree_lock);
764 	return i;
765 }
766 EXPORT_SYMBOL(find_get_pages_contig);
767 
768 /**
769  * find_get_pages_tag - find and return pages that match @tag
770  * @mapping:	the address_space to search
771  * @index:	the starting page index
772  * @tag:	the tag index
773  * @nr_pages:	the maximum number of pages
774  * @pages:	where the resulting pages are placed
775  *
776  * Like find_get_pages, except we only return pages which are tagged with
777  * @tag.   We update @index to index the next page for the traversal.
778  */
779 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
780 			int tag, unsigned int nr_pages, struct page **pages)
781 {
782 	unsigned int i;
783 	unsigned int ret;
784 
785 	read_lock_irq(&mapping->tree_lock);
786 	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
787 				(void **)pages, *index, nr_pages, tag);
788 	for (i = 0; i < ret; i++)
789 		page_cache_get(pages[i]);
790 	if (ret)
791 		*index = pages[ret - 1]->index + 1;
792 	read_unlock_irq(&mapping->tree_lock);
793 	return ret;
794 }
795 EXPORT_SYMBOL(find_get_pages_tag);
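
/*
 * A minimal sketch (not from the original source) of walking all dirty
 * pages of a mapping with find_get_pages_tag().  Each returned page has had
 * its reference count raised, so every page must be released.  The helper
 * name is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static void example_walk_dirty_pages(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int nr, i;

	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
					ARRAY_SIZE(pages), pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... inspect pages[i] here ... */
			page_cache_release(pages[i]);
		}
		cond_resched();
	}
}
#endif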
796 
797 /**
798  * grab_cache_page_nowait - returns locked page at given index in given cache
799  * @mapping: target address_space
800  * @index: the page index
801  *
802  * Same as grab_cache_page(), but do not wait if the page is unavailable.
803  * This is intended for speculative data generators, where the data can
804  * be regenerated if the page couldn't be grabbed.  This routine should
805  * be safe to call while holding the lock for another page.
806  *
807  * Clear __GFP_FS when allocating the page to avoid recursion into the fs
808  * and deadlock against the caller's locked page.
809  */
810 struct page *
811 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
812 {
813 	struct page *page = find_get_page(mapping, index);
814 
815 	if (page) {
816 		if (!TestSetPageLocked(page))
817 			return page;
818 		page_cache_release(page);
819 		return NULL;
820 	}
821 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
822 	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
823 		page_cache_release(page);
824 		page = NULL;
825 	}
826 	return page;
827 }
828 EXPORT_SYMBOL(grab_cache_page_nowait);
829 
830 /*
831  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
832  * a _large_ part of the i/o request. Imagine the worst scenario:
833  *
834  *      ---R__________________________________________B__________
835  *         ^ reading here                             ^ bad block(assume 4k)
836  *
837  * read(R) => miss => readahead(R...B) => media error => frustrating retries
838  * => failing the whole request => read(R) => read(R+1) =>
839  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
840  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
841  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
842  *
843  * It is going insane. Fix it by quickly scaling down the readahead size.
844  */
845 static void shrink_readahead_size_eio(struct file *filp,
846 					struct file_ra_state *ra)
847 {
848 	if (!ra->ra_pages)
849 		return;
850 
851 	ra->ra_pages /= 4;
852 }
853 
854 /**
855  * do_generic_mapping_read - generic file read routine
856  * @mapping:	address_space to be read
857  * @ra:		file's readahead state
858  * @filp:	the file to read
859  * @ppos:	current file position
860  * @desc:	read_descriptor
861  * @actor:	read method
862  *
863  * This is a generic file read routine, and uses the
864  * mapping->a_ops->readpage() function for the actual low-level stuff.
865  *
866  * This is really ugly. But the goto's actually try to clarify some
867  * of the logic when it comes to error handling etc.
868  *
869  * Note the struct file* is only passed for the use of readpage.
870  * It may be NULL.
871  */
872 void do_generic_mapping_read(struct address_space *mapping,
873 			     struct file_ra_state *ra,
874 			     struct file *filp,
875 			     loff_t *ppos,
876 			     read_descriptor_t *desc,
877 			     read_actor_t actor)
878 {
879 	struct inode *inode = mapping->host;
880 	pgoff_t index;
881 	pgoff_t last_index;
882 	pgoff_t prev_index;
883 	unsigned long offset;      /* offset into pagecache page */
884 	unsigned int prev_offset;
885 	int error;
886 
887 	index = *ppos >> PAGE_CACHE_SHIFT;
888 	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
889 	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
890 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
891 	offset = *ppos & ~PAGE_CACHE_MASK;
892 
893 	for (;;) {
894 		struct page *page;
895 		pgoff_t end_index;
896 		loff_t isize;
897 		unsigned long nr, ret;
898 
899 		cond_resched();
900 find_page:
901 		page = find_get_page(mapping, index);
902 		if (!page) {
903 			page_cache_sync_readahead(mapping,
904 					ra, filp,
905 					index, last_index - index);
906 			page = find_get_page(mapping, index);
907 			if (unlikely(page == NULL))
908 				goto no_cached_page;
909 		}
910 		if (PageReadahead(page)) {
911 			page_cache_async_readahead(mapping,
912 					ra, filp, page,
913 					index, last_index - index);
914 		}
915 		if (!PageUptodate(page))
916 			goto page_not_up_to_date;
917 page_ok:
918 		/*
919 		 * i_size must be checked after we know the page is Uptodate.
920 		 *
921 		 * Checking i_size after the PageUptodate check allows us to calculate
922 		 * the correct value for "nr", which means the zero-filled
923 		 * part of the page is not copied back to userspace (unless
924 		 * another truncate extends the file - this is desired though).
925 		 */
926 
927 		isize = i_size_read(inode);
928 		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
929 		if (unlikely(!isize || index > end_index)) {
930 			page_cache_release(page);
931 			goto out;
932 		}
933 
934 		/* nr is the maximum number of bytes to copy from this page */
935 		nr = PAGE_CACHE_SIZE;
936 		if (index == end_index) {
937 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
938 			if (nr <= offset) {
939 				page_cache_release(page);
940 				goto out;
941 			}
942 		}
943 		nr = nr - offset;
944 
945 		/* If users can be writing to this page using arbitrary
946 		 * virtual addresses, take care about potential aliasing
947 		 * before reading the page on the kernel side.
948 		 */
949 		if (mapping_writably_mapped(mapping))
950 			flush_dcache_page(page);
951 
952 		/*
953 		 * When a sequential read accesses a page several times,
954 		 * only mark it as accessed the first time.
955 		 */
956 		if (prev_index != index || offset != prev_offset)
957 			mark_page_accessed(page);
958 		prev_index = index;
959 
960 		/*
961 		 * Ok, we have the page, and it's up-to-date, so
962 		 * now we can copy it to user space...
963 		 *
964 		 * The actor routine returns how many bytes were actually used..
965 		 * NOTE! This may not be the same as how much of a user buffer
966 		 * we filled up (we may be padding etc), so we can only update
967 		 * "pos" here (the actor routine has to update the user buffer
968 		 * pointers and the remaining count).
969 		 */
970 		ret = actor(desc, page, offset, nr);
971 		offset += ret;
972 		index += offset >> PAGE_CACHE_SHIFT;
973 		offset &= ~PAGE_CACHE_MASK;
974 		prev_offset = offset;
975 
976 		page_cache_release(page);
977 		if (ret == nr && desc->count)
978 			continue;
979 		goto out;
980 
981 page_not_up_to_date:
982 		/* Get exclusive access to the page ... */
983 		lock_page(page);
984 
985 		/* Did it get truncated before we got the lock? */
986 		if (!page->mapping) {
987 			unlock_page(page);
988 			page_cache_release(page);
989 			continue;
990 		}
991 
992 		/* Did somebody else fill it already? */
993 		if (PageUptodate(page)) {
994 			unlock_page(page);
995 			goto page_ok;
996 		}
997 
998 readpage:
999 		/* Start the actual read. The read will unlock the page. */
1000 		error = mapping->a_ops->readpage(filp, page);
1001 
1002 		if (unlikely(error)) {
1003 			if (error == AOP_TRUNCATED_PAGE) {
1004 				page_cache_release(page);
1005 				goto find_page;
1006 			}
1007 			goto readpage_error;
1008 		}
1009 
1010 		if (!PageUptodate(page)) {
1011 			lock_page(page);
1012 			if (!PageUptodate(page)) {
1013 				if (page->mapping == NULL) {
1014 					/*
1015 					 * invalidate_inode_pages got it
1016 					 */
1017 					unlock_page(page);
1018 					page_cache_release(page);
1019 					goto find_page;
1020 				}
1021 				unlock_page(page);
1022 				error = -EIO;
1023 				shrink_readahead_size_eio(filp, ra);
1024 				goto readpage_error;
1025 			}
1026 			unlock_page(page);
1027 		}
1028 
1029 		goto page_ok;
1030 
1031 readpage_error:
1032 		/* UHHUH! A synchronous read error occurred. Report it */
1033 		desc->error = error;
1034 		page_cache_release(page);
1035 		goto out;
1036 
1037 no_cached_page:
1038 		/*
1039 		 * Ok, it wasn't cached, so we need to create a new
1040 		 * page..
1041 		 */
1042 		page = page_cache_alloc_cold(mapping);
1043 		if (!page) {
1044 			desc->error = -ENOMEM;
1045 			goto out;
1046 		}
1047 		error = add_to_page_cache_lru(page, mapping,
1048 						index, GFP_KERNEL);
1049 		if (error) {
1050 			page_cache_release(page);
1051 			if (error == -EEXIST)
1052 				goto find_page;
1053 			desc->error = error;
1054 			goto out;
1055 		}
1056 		goto readpage;
1057 	}
1058 
1059 out:
1060 	ra->prev_pos = prev_index;
1061 	ra->prev_pos <<= PAGE_CACHE_SHIFT;
1062 	ra->prev_pos |= prev_offset;
1063 
1064 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1065 	if (filp)
1066 		file_accessed(filp);
1067 }
1068 EXPORT_SYMBOL(do_generic_mapping_read);
1069 
1070 int file_read_actor(read_descriptor_t *desc, struct page *page,
1071 			unsigned long offset, unsigned long size)
1072 {
1073 	char *kaddr;
1074 	unsigned long left, count = desc->count;
1075 
1076 	if (size > count)
1077 		size = count;
1078 
1079 	/*
1080 	 * Faults on the destination of a read are common, so do it before
1081 	 * taking the kmap.
1082 	 */
1083 	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1084 		kaddr = kmap_atomic(page, KM_USER0);
1085 		left = __copy_to_user_inatomic(desc->arg.buf,
1086 						kaddr + offset, size);
1087 		kunmap_atomic(kaddr, KM_USER0);
1088 		if (left == 0)
1089 			goto success;
1090 	}
1091 
1092 	/* Do it the slow way */
1093 	kaddr = kmap(page);
1094 	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1095 	kunmap(page);
1096 
1097 	if (left) {
1098 		size -= left;
1099 		desc->error = -EFAULT;
1100 	}
1101 success:
1102 	desc->count = count - size;
1103 	desc->written += size;
1104 	desc->arg.buf += size;
1105 	return size;
1106 }
1107 
1108 /*
1109  * generic_segment_checks - perform checks on an iovec before a read or write
1110  * @iov:	io vector request
1111  * @nr_segs:	number of segments in the iovec
1112  * @count:	number of bytes to transfer (set by this function)
1113  * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1114  *
1115  * Adjusts the number of segments and the number of bytes to transfer
1116  * (@nr_segs should be properly initialized first).  Returns the error code
1117  * that the caller should return, or zero if the transfer should be allowed.
1118  */
1119 int generic_segment_checks(const struct iovec *iov,
1120 			unsigned long *nr_segs, size_t *count, int access_flags)
1121 {
1122 	unsigned long   seg;
1123 	size_t cnt = 0;
1124 	for (seg = 0; seg < *nr_segs; seg++) {
1125 		const struct iovec *iv = &iov[seg];
1126 
1127 		/*
1128 		 * If any segment has a negative length, or the cumulative
1129 		 * length ever wraps negative then return -EINVAL.
1130 		 */
1131 		cnt += iv->iov_len;
1132 		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1133 			return -EINVAL;
1134 		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1135 			continue;
1136 		if (seg == 0)
1137 			return -EFAULT;
1138 		*nr_segs = seg;
1139 		cnt -= iv->iov_len;	/* This segment is no good */
1140 		break;
1141 	}
1142 	*count = cnt;
1143 	return 0;
1144 }
1145 EXPORT_SYMBOL(generic_segment_checks);
1146 
1147 /**
1148  * generic_file_aio_read - generic filesystem read routine
1149  * @iocb:	kernel I/O control block
1150  * @iov:	io vector request
1151  * @nr_segs:	number of segments in the iovec
1152  * @pos:	current file position
1153  *
1154  * This is the "read()" routine for all filesystems
1155  * that can use the page cache directly.
1156  */
1157 ssize_t
1158 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1159 		unsigned long nr_segs, loff_t pos)
1160 {
1161 	struct file *filp = iocb->ki_filp;
1162 	ssize_t retval;
1163 	unsigned long seg;
1164 	size_t count;
1165 	loff_t *ppos = &iocb->ki_pos;
1166 
1167 	count = 0;
1168 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1169 	if (retval)
1170 		return retval;
1171 
1172 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1173 	if (filp->f_flags & O_DIRECT) {
1174 		loff_t size;
1175 		struct address_space *mapping;
1176 		struct inode *inode;
1177 
1178 		mapping = filp->f_mapping;
1179 		inode = mapping->host;
1180 		retval = 0;
1181 		if (!count)
1182 			goto out; /* skip atime */
1183 		size = i_size_read(inode);
1184 		if (pos < size) {
1185 			retval = generic_file_direct_IO(READ, iocb,
1186 						iov, pos, nr_segs);
1187 			if (retval > 0)
1188 				*ppos = pos + retval;
1189 		}
1190 		if (likely(retval != 0)) {
1191 			file_accessed(filp);
1192 			goto out;
1193 		}
1194 	}
1195 
1196 	retval = 0;
1197 	if (count) {
1198 		for (seg = 0; seg < nr_segs; seg++) {
1199 			read_descriptor_t desc;
1200 
1201 			desc.written = 0;
1202 			desc.arg.buf = iov[seg].iov_base;
1203 			desc.count = iov[seg].iov_len;
1204 			if (desc.count == 0)
1205 				continue;
1206 			desc.error = 0;
1207 			do_generic_file_read(filp,ppos,&desc,file_read_actor);
1208 			retval += desc.written;
1209 			if (desc.error) {
1210 				retval = retval ?: desc.error;
1211 				break;
1212 			}
1213 			if (desc.count > 0)
1214 				break;
1215 		}
1216 	}
1217 out:
1218 	return retval;
1219 }
1220 EXPORT_SYMBOL(generic_file_aio_read);
1221 
1222 static ssize_t
1223 do_readahead(struct address_space *mapping, struct file *filp,
1224 	     pgoff_t index, unsigned long nr)
1225 {
1226 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1227 		return -EINVAL;
1228 
1229 	force_page_cache_readahead(mapping, filp, index,
1230 					max_sane_readahead(nr));
1231 	return 0;
1232 }
1233 
1234 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1235 {
1236 	ssize_t ret;
1237 	struct file *file;
1238 
1239 	ret = -EBADF;
1240 	file = fget(fd);
1241 	if (file) {
1242 		if (file->f_mode & FMODE_READ) {
1243 			struct address_space *mapping = file->f_mapping;
1244 			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1245 			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1246 			unsigned long len = end - start + 1;
1247 			ret = do_readahead(mapping, file, start, len);
1248 		}
1249 		fput(file);
1250 	}
1251 	return ret;
1252 }
1253 
1254 #ifdef CONFIG_MMU
1255 /**
1256  * page_cache_read - adds requested page to the page cache if not already there
1257  * @file:	file to read
1258  * @offset:	page index
1259  *
1260  * This adds the requested page to the page cache if it isn't already there,
1261  * and schedules an I/O to read in its contents from disk.
1262  */
1263 static int fastcall page_cache_read(struct file * file, pgoff_t offset)
1264 {
1265 	struct address_space *mapping = file->f_mapping;
1266 	struct page *page;
1267 	int ret;
1268 
1269 	do {
1270 		page = page_cache_alloc_cold(mapping);
1271 		if (!page)
1272 			return -ENOMEM;
1273 
1274 		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1275 		if (ret == 0)
1276 			ret = mapping->a_ops->readpage(file, page);
1277 		else if (ret == -EEXIST)
1278 			ret = 0; /* losing race to add is OK */
1279 
1280 		page_cache_release(page);
1281 
1282 	} while (ret == AOP_TRUNCATED_PAGE);
1283 
1284 	return ret;
1285 }
1286 
1287 #define MMAP_LOTSAMISS  (100)
1288 
1289 /**
1290  * filemap_fault - read in file data for page fault handling
1291  * @vma:	vma in which the fault was taken
1292  * @vmf:	struct vm_fault containing details of the fault
1293  *
1294  * filemap_fault() is invoked via the vma operations vector for a
1295  * mapped memory region to read in file data during a page fault.
1296  *
1297  * The goto's are kind of ugly, but this streamlines the normal case of having
1298  * it in the page cache, and handles the special cases reasonably without
1299  * having a lot of duplicated code.
1300  */
1301 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1302 {
1303 	int error;
1304 	struct file *file = vma->vm_file;
1305 	struct address_space *mapping = file->f_mapping;
1306 	struct file_ra_state *ra = &file->f_ra;
1307 	struct inode *inode = mapping->host;
1308 	struct page *page;
1309 	unsigned long size;
1310 	int did_readaround = 0;
1311 	int ret = 0;
1312 
1313 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1314 	if (vmf->pgoff >= size)
1315 		return VM_FAULT_SIGBUS;
1316 
1317 	/* If we don't want any read-ahead, don't bother */
1318 	if (VM_RandomReadHint(vma))
1319 		goto no_cached_page;
1320 
1321 	/*
1322 	 * Do we have something in the page cache already?
1323 	 */
1324 retry_find:
1325 	page = find_lock_page(mapping, vmf->pgoff);
1326 	/*
1327 	 * For sequential accesses, we use the generic readahead logic.
1328 	 */
1329 	if (VM_SequentialReadHint(vma)) {
1330 		if (!page) {
1331 			page_cache_sync_readahead(mapping, ra, file,
1332 							   vmf->pgoff, 1);
1333 			page = find_lock_page(mapping, vmf->pgoff);
1334 			if (!page)
1335 				goto no_cached_page;
1336 		}
1337 		if (PageReadahead(page)) {
1338 			page_cache_async_readahead(mapping, ra, file, page,
1339 							   vmf->pgoff, 1);
1340 		}
1341 	}
1342 
1343 	if (!page) {
1344 		unsigned long ra_pages;
1345 
1346 		ra->mmap_miss++;
1347 
1348 		/*
1349 		 * Do we miss much more than hit in this file? If so,
1350 		 * stop bothering with read-ahead. It will only hurt.
1351 		 */
1352 		if (ra->mmap_miss > MMAP_LOTSAMISS)
1353 			goto no_cached_page;
1354 
1355 		/*
1356 		 * To keep the pgmajfault counter straight, we need to
1357 		 * check did_readaround, as this is an inner loop.
1358 		 */
1359 		if (!did_readaround) {
1360 			ret = VM_FAULT_MAJOR;
1361 			count_vm_event(PGMAJFAULT);
1362 		}
1363 		did_readaround = 1;
1364 		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1365 		if (ra_pages) {
1366 			pgoff_t start = 0;
1367 
1368 			if (vmf->pgoff > ra_pages / 2)
1369 				start = vmf->pgoff - ra_pages / 2;
1370 			do_page_cache_readahead(mapping, file, start, ra_pages);
1371 		}
1372 		page = find_lock_page(mapping, vmf->pgoff);
1373 		if (!page)
1374 			goto no_cached_page;
1375 	}
1376 
1377 	if (!did_readaround)
1378 		ra->mmap_miss--;
1379 
1380 	/*
1381 	 * We have a locked page in the page cache, now we need to check
1382 	 * that it's up-to-date. If not, it is going to be due to an error.
1383 	 */
1384 	if (unlikely(!PageUptodate(page)))
1385 		goto page_not_uptodate;
1386 
1387 	/* Must recheck i_size under page lock */
1388 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1389 	if (unlikely(vmf->pgoff >= size)) {
1390 		unlock_page(page);
1391 		page_cache_release(page);
1392 		return VM_FAULT_SIGBUS;
1393 	}
1394 
1395 	/*
1396 	 * Found the page and have a reference on it.
1397 	 */
1398 	mark_page_accessed(page);
1399 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1400 	vmf->page = page;
1401 	return ret | VM_FAULT_LOCKED;
1402 
1403 no_cached_page:
1404 	/*
1405 	 * We're only likely to ever get here if MADV_RANDOM is in
1406 	 * effect.
1407 	 */
1408 	error = page_cache_read(file, vmf->pgoff);
1409 
1410 	/*
1411 	 * The page we want has now been added to the page cache.
1412 	 * In the unlikely event that someone removed it in the
1413 	 * meantime, we'll just come back here and read it again.
1414 	 */
1415 	if (error >= 0)
1416 		goto retry_find;
1417 
1418 	/*
1419 	 * An error return from page_cache_read can result if the
1420 	 * system is low on memory, or a problem occurs while trying
1421 	 * to schedule I/O.
1422 	 */
1423 	if (error == -ENOMEM)
1424 		return VM_FAULT_OOM;
1425 	return VM_FAULT_SIGBUS;
1426 
1427 page_not_uptodate:
1428 	/* IO error path */
1429 	if (!did_readaround) {
1430 		ret = VM_FAULT_MAJOR;
1431 		count_vm_event(PGMAJFAULT);
1432 	}
1433 
1434 	/*
1435 	 * Umm, take care of errors if the page isn't up-to-date.
1436 	 * Try to re-read it _once_. We do this synchronously,
1437 	 * because there really aren't any performance issues here
1438 	 * and we need to check for errors.
1439 	 */
1440 	ClearPageError(page);
1441 	error = mapping->a_ops->readpage(file, page);
1442 	page_cache_release(page);
1443 
1444 	if (!error || error == AOP_TRUNCATED_PAGE)
1445 		goto retry_find;
1446 
1447 	/* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */
1448 	shrink_readahead_size_eio(file, ra);
1449 	return VM_FAULT_SIGBUS;
1450 }
1451 EXPORT_SYMBOL(filemap_fault);
1452 
1453 struct vm_operations_struct generic_file_vm_ops = {
1454 	.fault		= filemap_fault,
1455 };
1456 
1457 /* This is used for a general mmap of a disk file */
1458 
1459 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1460 {
1461 	struct address_space *mapping = file->f_mapping;
1462 
1463 	if (!mapping->a_ops->readpage)
1464 		return -ENOEXEC;
1465 	file_accessed(file);
1466 	vma->vm_ops = &generic_file_vm_ops;
1467 	vma->vm_flags |= VM_CAN_NONLINEAR;
1468 	return 0;
1469 }
1470 
1471 /*
1472  * This is for filesystems which do not implement ->writepage.
1473  */
1474 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1475 {
1476 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1477 		return -EINVAL;
1478 	return generic_file_mmap(file, vma);
1479 }
1480 #else
1481 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1482 {
1483 	return -ENOSYS;
1484 }
1485 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1486 {
1487 	return -ENOSYS;
1488 }
1489 #endif /* CONFIG_MMU */
1490 
1491 EXPORT_SYMBOL(generic_file_mmap);
1492 EXPORT_SYMBOL(generic_file_readonly_mmap);
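
/*
 * A minimal sketch (not part of the original file) of how a filesystem
 * typically wires these helpers up: pointing its file_operations ->mmap at
 * generic_file_mmap() gives it page-cache backed mappings serviced by
 * filemap_fault().  The structure name and the exact set of methods shown
 * are illustrative assumptions.
 */
#if 0	/* illustrative example -- not compiled */
static const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
};
#endif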
1493 
1494 static struct page *__read_cache_page(struct address_space *mapping,
1495 				pgoff_t index,
1496 				int (*filler)(void *,struct page*),
1497 				void *data)
1498 {
1499 	struct page *page;
1500 	int err;
1501 repeat:
1502 	page = find_get_page(mapping, index);
1503 	if (!page) {
1504 		page = page_cache_alloc_cold(mapping);
1505 		if (!page)
1506 			return ERR_PTR(-ENOMEM);
1507 		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1508 		if (unlikely(err)) {
1509 			page_cache_release(page);
1510 			if (err == -EEXIST)
1511 				goto repeat;
1512 			/* Presumably ENOMEM for radix tree node */
1513 			return ERR_PTR(err);
1514 		}
1515 		err = filler(data, page);
1516 		if (err < 0) {
1517 			page_cache_release(page);
1518 			page = ERR_PTR(err);
1519 		}
1520 	}
1521 	return page;
1522 }
1523 
1524 /*
1525  * Same as read_cache_page, but don't wait for page to become unlocked
1526  * after submitting it to the filler.
1527  */
1528 struct page *read_cache_page_async(struct address_space *mapping,
1529 				pgoff_t index,
1530 				int (*filler)(void *,struct page*),
1531 				void *data)
1532 {
1533 	struct page *page;
1534 	int err;
1535 
1536 retry:
1537 	page = __read_cache_page(mapping, index, filler, data);
1538 	if (IS_ERR(page))
1539 		return page;
1540 	if (PageUptodate(page))
1541 		goto out;
1542 
1543 	lock_page(page);
1544 	if (!page->mapping) {
1545 		unlock_page(page);
1546 		page_cache_release(page);
1547 		goto retry;
1548 	}
1549 	if (PageUptodate(page)) {
1550 		unlock_page(page);
1551 		goto out;
1552 	}
1553 	err = filler(data, page);
1554 	if (err < 0) {
1555 		page_cache_release(page);
1556 		return ERR_PTR(err);
1557 	}
1558 out:
1559 	mark_page_accessed(page);
1560 	return page;
1561 }
1562 EXPORT_SYMBOL(read_cache_page_async);
1563 
1564 /**
1565  * read_cache_page - read into page cache, fill it if needed
1566  * @mapping:	the page's address_space
1567  * @index:	the page index
1568  * @filler:	function to perform the read
1569  * @data:	first argument passed to the @filler function
1570  *
1571  * Read into the page cache. If a page already exists, and PageUptodate() is
1572  * not set, try to fill the page then wait for it to become unlocked.
1573  *
1574  * If the page does not get brought uptodate, return -EIO.
1575  */
1576 struct page *read_cache_page(struct address_space *mapping,
1577 				pgoff_t index,
1578 				int (*filler)(void *,struct page*),
1579 				void *data)
1580 {
1581 	struct page *page;
1582 
1583 	page = read_cache_page_async(mapping, index, filler, data);
1584 	if (IS_ERR(page))
1585 		goto out;
1586 	wait_on_page_locked(page);
1587 	if (!PageUptodate(page)) {
1588 		page_cache_release(page);
1589 		page = ERR_PTR(-EIO);
1590 	}
1591  out:
1592 	return page;
1593 }
1594 EXPORT_SYMBOL(read_cache_page);
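
/*
 * A minimal sketch (not from the original source) of typical
 * read_cache_page() usage: read one page of an inode's data, using the
 * mapping's own ->readpage as the filler.  The function-pointer cast
 * follows the read_mapping_page() convention in pagemap.h; the helper name
 * is hypothetical.
 */
#if 0	/* illustrative example -- not compiled */
static struct page *example_get_uptodate_page(struct address_space *mapping,
					      pgoff_t index, struct file *file)
{
	struct page *page;

	page = read_cache_page(mapping, index,
			(int (*)(void *, struct page *))mapping->a_ops->readpage,
			file);
	if (IS_ERR(page))
		return page;		/* typically -EIO or -ENOMEM */

	/* page is uptodate and holds a reference; the caller releases it */
	return page;
}
#endif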
1595 
1596 /*
1597  * The logic we want is
1598  *
1599  *	if suid or (sgid and xgrp)
1600  *		remove privs
1601  */
1602 int should_remove_suid(struct dentry *dentry)
1603 {
1604 	mode_t mode = dentry->d_inode->i_mode;
1605 	int kill = 0;
1606 
1607 	/* suid always must be killed */
1608 	if (unlikely(mode & S_ISUID))
1609 		kill = ATTR_KILL_SUID;
1610 
1611 	/*
1612 	 * sgid without any exec bits is just a mandatory locking mark; leave
1613 	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
1614 	 */
1615 	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1616 		kill |= ATTR_KILL_SGID;
1617 
1618 	if (unlikely(kill && !capable(CAP_FSETID)))
1619 		return kill;
1620 
1621 	return 0;
1622 }
1623 EXPORT_SYMBOL(should_remove_suid);
1624 
1625 int __remove_suid(struct dentry *dentry, int kill)
1626 {
1627 	struct iattr newattrs;
1628 
1629 	newattrs.ia_valid = ATTR_FORCE | kill;
1630 	return notify_change(dentry, &newattrs);
1631 }
1632 
1633 int remove_suid(struct dentry *dentry)
1634 {
1635 	int killsuid = should_remove_suid(dentry);
1636 	int killpriv = security_inode_need_killpriv(dentry);
1637 	int error = 0;
1638 
1639 	if (killpriv < 0)
1640 		return killpriv;
1641 	if (killpriv)
1642 		error = security_inode_killpriv(dentry);
1643 	if (!error && killsuid)
1644 		error = __remove_suid(dentry, killsuid);
1645 
1646 	return error;
1647 }
1648 EXPORT_SYMBOL(remove_suid);
1649 
1650 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1651 			const struct iovec *iov, size_t base, size_t bytes)
1652 {
1653 	size_t copied = 0, left = 0;
1654 
1655 	while (bytes) {
1656 		char __user *buf = iov->iov_base + base;
1657 		int copy = min(bytes, iov->iov_len - base);
1658 
1659 		base = 0;
1660 		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1661 		copied += copy;
1662 		bytes -= copy;
1663 		vaddr += copy;
1664 		iov++;
1665 
1666 		if (unlikely(left))
1667 			break;
1668 	}
1669 	return copied - left;
1670 }
1671 
1672 /*
1673  * Copy as much as we can into the page and return the number of bytes which
1674  * were successfully copied.  If a fault is encountered then return the
1675  * number of bytes which were copied before the fault.
1676  */
1677 size_t iov_iter_copy_from_user_atomic(struct page *page,
1678 		struct iov_iter *i, unsigned long offset, size_t bytes)
1679 {
1680 	char *kaddr;
1681 	size_t copied;
1682 
1683 	BUG_ON(!in_atomic());
1684 	kaddr = kmap_atomic(page, KM_USER0);
1685 	if (likely(i->nr_segs == 1)) {
1686 		int left;
1687 		char __user *buf = i->iov->iov_base + i->iov_offset;
1688 		left = __copy_from_user_inatomic_nocache(kaddr + offset,
1689 							buf, bytes);
1690 		copied = bytes - left;
1691 	} else {
1692 		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1693 						i->iov, i->iov_offset, bytes);
1694 	}
1695 	kunmap_atomic(kaddr, KM_USER0);
1696 
1697 	return copied;
1698 }
1699 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1700 
1701 /*
1702  * This has the same side effects and return value as
1703  * iov_iter_copy_from_user_atomic().
1704  * The difference is that it attempts to resolve faults.
1705  * Page must not be locked.
1706  */
1707 size_t iov_iter_copy_from_user(struct page *page,
1708 		struct iov_iter *i, unsigned long offset, size_t bytes)
1709 {
1710 	char *kaddr;
1711 	size_t copied;
1712 
1713 	kaddr = kmap(page);
1714 	if (likely(i->nr_segs == 1)) {
1715 		int left;
1716 		char __user *buf = i->iov->iov_base + i->iov_offset;
1717 		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1718 		copied = bytes - left;
1719 	} else {
1720 		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1721 						i->iov, i->iov_offset, bytes);
1722 	}
1723 	kunmap(page);
1724 	return copied;
1725 }
1726 EXPORT_SYMBOL(iov_iter_copy_from_user);
1727 
1728 static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
1729 {
1730 	if (likely(i->nr_segs == 1)) {
1731 		i->iov_offset += bytes;
1732 	} else {
1733 		const struct iovec *iov = i->iov;
1734 		size_t base = i->iov_offset;
1735 
1736 		while (bytes) {
1737 			int copy = min(bytes, iov->iov_len - base);
1738 
1739 			bytes -= copy;
1740 			base += copy;
1741 			if (iov->iov_len == base) {
1742 				iov++;
1743 				base = 0;
1744 			}
1745 		}
1746 		i->iov = iov;
1747 		i->iov_offset = base;
1748 	}
1749 }
1750 
1751 void iov_iter_advance(struct iov_iter *i, size_t bytes)
1752 {
1753 	BUG_ON(i->count < bytes);
1754 
1755 	__iov_iter_advance_iov(i, bytes);
1756 	i->count -= bytes;
1757 }
1758 EXPORT_SYMBOL(iov_iter_advance);
1759 
1760 /*
1761  * Fault in the first iovec of the given iov_iter, to a maximum length
1762  * of bytes. Returns 0 on success, or non-zero if the memory could not be
1763  * accessed (i.e. because it is an invalid address).
1764  *
1765  * writev-intensive code may want this to prefault several iovecs -- that
1766  * would be possible (callers must not rely on the fact that _only_ the
1767  * first iovec will be faulted with the current implementation).
1768  */
1769 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1770 {
1771 	char __user *buf = i->iov->iov_base + i->iov_offset;
1772 	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1773 	return fault_in_pages_readable(buf, bytes);
1774 }
1775 EXPORT_SYMBOL(iov_iter_fault_in_readable);
1776 
1777 /*
1778  * Return the count of just the current iov_iter segment.
1779  */
1780 size_t iov_iter_single_seg_count(struct iov_iter *i)
1781 {
1782 	const struct iovec *iov = i->iov;
1783 	if (i->nr_segs == 1)
1784 		return i->count;
1785 	else
1786 		return min(i->count, iov->iov_len - i->iov_offset);
1787 }
1788 EXPORT_SYMBOL(iov_iter_single_seg_count);
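
/*
 * A minimal sketch (not part of the original file) of the intended usage
 * pattern for the iov_iter helpers above: prefault the user buffer, attempt
 * the atomic copy with pagefaults disabled, fall back to the sleeping
 * variant on a short copy, and advance the iterator by what was actually
 * copied.  The helper name is hypothetical; the page is assumed to be
 * unlocked so that the non-atomic fallback is permitted.
 */
#if 0	/* illustrative example -- not compiled */
static size_t example_copy_iov_to_page(struct page *page, struct iov_iter *i,
				       unsigned long offset, size_t bytes)
{
	size_t copied;

	if (iov_iter_fault_in_readable(i, bytes))
		return 0;		/* user buffer is not accessible */

	pagefault_disable();
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	pagefault_enable();

	if (unlikely(copied < bytes))	/* faulted mid-copy despite prefault */
		copied = iov_iter_copy_from_user(page, i, offset, bytes);

	iov_iter_advance(i, copied);
	return copied;
}
#endif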
1789 
1790 /*
1791  * Performs necessary checks before doing a write
1792  *
1793  * Can adjust the writing position or the number of bytes to write.
1794  * Returns the appropriate error code that the caller should return, or
1795  * zero if the write should be allowed.
1796  */
1797 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1798 {
1799 	struct inode *inode = file->f_mapping->host;
1800 	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1801 
1802 	if (unlikely(*pos < 0))
1803 		return -EINVAL;
1804 
1805 	if (!isblk) {
1806 		/* FIXME: this is for backwards compatibility with 2.4 */
1807 		if (file->f_flags & O_APPEND)
1808 			*pos = i_size_read(inode);
1809 
1810 		if (limit != RLIM_INFINITY) {
1811 			if (*pos >= limit) {
1812 				send_sig(SIGXFSZ, current, 0);
1813 				return -EFBIG;
1814 			}
1815 			if (*count > limit - (typeof(limit))*pos) {
1816 				*count = limit - (typeof(limit))*pos;
1817 			}
1818 		}
1819 	}
1820 
1821 	/*
1822 	 * LFS rule
1823 	 */
1824 	if (unlikely(*pos + *count > MAX_NON_LFS &&
1825 				!(file->f_flags & O_LARGEFILE))) {
1826 		if (*pos >= MAX_NON_LFS) {
1827 			return -EFBIG;
1828 		}
1829 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1830 			*count = MAX_NON_LFS - (unsigned long)*pos;
1831 		}
1832 	}
1833 
1834 	/*
1835 	 * Are we about to exceed the fs block limit?
1836 	 *
1837 	 * If we have written data it becomes a short write.  If we have
1838 	 * exceeded without writing data we send a signal and return EFBIG.
1839 	 * Linus' frestrict idea will clean these up nicely.
1840 	 */
1841 	if (likely(!isblk)) {
1842 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1843 			if (*count || *pos > inode->i_sb->s_maxbytes) {
1844 				return -EFBIG;
1845 			}
1846 			/* zero-length writes at ->s_maxbytes are OK */
1847 		}
1848 
1849 		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1850 			*count = inode->i_sb->s_maxbytes - *pos;
1851 	} else {
1852 #ifdef CONFIG_BLOCK
1853 		loff_t isize;
1854 		if (bdev_read_only(I_BDEV(inode)))
1855 			return -EPERM;
1856 		isize = i_size_read(inode);
1857 		if (*pos >= isize) {
1858 			if (*count || *pos > isize)
1859 				return -ENOSPC;
1860 		}
1861 
1862 		if (*pos + *count > isize)
1863 			*count = isize - *pos;
1864 #else
1865 		return -EPERM;
1866 #endif
1867 	}
1868 	return 0;
1869 }
1870 EXPORT_SYMBOL(generic_write_checks);
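
/*
 * Illustrative sketch (not part of the original file, never compiled):
 * the calling convention of generic_write_checks() for a write path
 * that does its own I/O.  example_write() is a made-up name; the real
 * caller in this file is __generic_file_aio_write_nolock() below.
 */
#if 0
static ssize_t example_write(struct file *file, size_t len, loff_t *ppos)
{
	struct inode *inode = file->f_mapping->host;
	loff_t pos = *ppos;
	size_t count = len;
	int err;

	/* May move 'pos' (O_APPEND) or shrink 'count'; zero means proceed */
	err = generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (err)
		return err;
	if (count == 0)
		return 0;

	/* ... write 'count' bytes at 'pos', then advance *ppos ... */
	return count;
}
#endif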
1871 
1872 int pagecache_write_begin(struct file *file, struct address_space *mapping,
1873 				loff_t pos, unsigned len, unsigned flags,
1874 				struct page **pagep, void **fsdata)
1875 {
1876 	const struct address_space_operations *aops = mapping->a_ops;
1877 
1878 	if (aops->write_begin) {
1879 		return aops->write_begin(file, mapping, pos, len, flags,
1880 							pagep, fsdata);
1881 	} else {
1882 		int ret;
1883 		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1884 		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1885 		struct inode *inode = mapping->host;
1886 		struct page *page;
1887 again:
1888 		page = __grab_cache_page(mapping, index);
1889 		*pagep = page;
1890 		if (!page)
1891 			return -ENOMEM;
1892 
1893 		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1894 			/*
1895 			 * There is no way to resolve a short write situation
1896 			 * for a !Uptodate page (except by the double copying
1897 			 * the caller does, as in generic_perform_write_2copy).
1898 			 *
1899 			 * Instead, we have to bring it uptodate here.
1900 			 */
1901 			ret = aops->readpage(file, page);
1902 			page_cache_release(page);
1903 			if (ret) {
1904 				if (ret == AOP_TRUNCATED_PAGE)
1905 					goto again;
1906 				return ret;
1907 			}
1908 			goto again;
1909 		}
1910 
1911 		ret = aops->prepare_write(file, page, offset, offset+len);
1912 		if (ret) {
1913 			unlock_page(page);
1914 			page_cache_release(page);
1915 			if (pos + len > inode->i_size)
1916 				vmtruncate(inode, inode->i_size);
1917 		}
1918 		return ret;
1919 	}
1920 }
1921 EXPORT_SYMBOL(pagecache_write_begin);
1922 
1923 int pagecache_write_end(struct file *file, struct address_space *mapping,
1924 				loff_t pos, unsigned len, unsigned copied,
1925 				struct page *page, void *fsdata)
1926 {
1927 	const struct address_space_operations *aops = mapping->a_ops;
1928 	int ret;
1929 
1930 	if (aops->write_end) {
1931 		mark_page_accessed(page);
1932 		ret = aops->write_end(file, mapping, pos, len, copied,
1933 							page, fsdata);
1934 	} else {
1935 		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1936 		struct inode *inode = mapping->host;
1937 
1938 		flush_dcache_page(page);
1939 		ret = aops->commit_write(file, page, offset, offset+len);
1940 		unlock_page(page);
1941 		mark_page_accessed(page);
1942 		page_cache_release(page);
1943 
1944 		if (ret < 0) {
1945 			if (pos + len > inode->i_size)
1946 				vmtruncate(inode, inode->i_size);
1947 		} else if (ret > 0)
1948 			ret = min_t(size_t, copied, ret);
1949 		else
1950 			ret = copied;
1951 	}
1952 
1953 	return ret;
1954 }
1955 EXPORT_SYMBOL(pagecache_write_end);
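
/*
 * Illustrative sketch (not part of the original file, never compiled):
 * how the two helpers above pair up for an in-kernel write of one
 * short, page-contained chunk.  example_kernel_write() is a made-up
 * name; real users are filesystems writing pagecache data from kernel
 * context.
 */
#if 0
static int example_kernel_write(struct file *file, loff_t pos,
				const char *data, unsigned len)
{
	struct address_space *mapping = file->f_mapping;
	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
	struct page *page;
	void *fsdata;
	char *kaddr;
	int status;

	/* 'len' must not cross a page boundary in this simple sketch */
	status = pagecache_write_begin(file, mapping, pos, len,
					AOP_FLAG_UNINTERRUPTIBLE,
					&page, &fsdata);
	if (status)
		return status;

	/* The source is kernel memory, so this copy cannot fault */
	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + offset, data, len);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	status = pagecache_write_end(file, mapping, pos, len, len,
					page, fsdata);
	return status < 0 ? status : 0;
}
#endif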
1956 
1957 ssize_t
1958 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1959 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1960 		size_t count, size_t ocount)
1961 {
1962 	struct file	*file = iocb->ki_filp;
1963 	struct address_space *mapping = file->f_mapping;
1964 	struct inode	*inode = mapping->host;
1965 	ssize_t		written;
1966 
1967 	if (count != ocount)
1968 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1969 
1970 	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1971 	if (written > 0) {
1972 		loff_t end = pos + written;
1973 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1974 			i_size_write(inode, end);
1975 			mark_inode_dirty(inode);
1976 		}
1977 		*ppos = end;
1978 	}
1979 
1980 	/*
1981 	 * Sync the fs metadata but not the minor inode changes, and of
1982 	 * course not the data, as we did direct DMA for the IO.
1983 	 * i_mutex is held, which protects generic_osync_inode() from
1984 	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
1985 	 */
1986 	if ((written >= 0 || written == -EIOCBQUEUED) &&
1987 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1988 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1989 		if (err < 0)
1990 			written = err;
1991 	}
1992 	return written;
1993 }
1994 EXPORT_SYMBOL(generic_file_direct_write);
1995 
1996 /*
1997  * Find or create a page at the given pagecache position. Return the locked
1998  * page. This function is specifically for buffered writes.
1999  */
2000 struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
2001 {
2002 	int status;
2003 	struct page *page;
2004 repeat:
2005 	page = find_lock_page(mapping, index);
2006 	if (likely(page))
2007 		return page;
2008 
2009 	page = page_cache_alloc(mapping);
2010 	if (!page)
2011 		return NULL;
2012 	status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
2013 	if (unlikely(status)) {
2014 		page_cache_release(page);
2015 		if (status == -EEXIST)
2016 			goto repeat;
2017 		return NULL;
2018 	}
2019 	return page;
2020 }
2021 EXPORT_SYMBOL(__grab_cache_page);
2022 
2023 static ssize_t generic_perform_write_2copy(struct file *file,
2024 				struct iov_iter *i, loff_t pos)
2025 {
2026 	struct address_space *mapping = file->f_mapping;
2027 	const struct address_space_operations *a_ops = mapping->a_ops;
2028 	struct inode *inode = mapping->host;
2029 	long status = 0;
2030 	ssize_t written = 0;
2031 
2032 	do {
2033 		struct page *src_page;
2034 		struct page *page;
2035 		pgoff_t index;		/* Pagecache index for current page */
2036 		unsigned long offset;	/* Offset into pagecache page */
2037 		unsigned long bytes;	/* Bytes to write to page */
2038 		size_t copied;		/* Bytes copied from user */
2039 
2040 		offset = (pos & (PAGE_CACHE_SIZE - 1));
2041 		index = pos >> PAGE_CACHE_SHIFT;
2042 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2043 						iov_iter_count(i));
2044 
2045 		/*
2046 		 * a non-NULL src_page indicates that we're doing the
2047 		 * copy via a temporary bounce page and kmap.
2048 		 */
2049 		src_page = NULL;
2050 
2051 		/*
2052 		 * Bring in the user page that we will copy from _first_.
2053 		 * Otherwise there's a nasty deadlock on copying from the
2054 		 * same page as we're writing to, without it being marked
2055 		 * up-to-date.
2056 		 *
2057 		 * Not only is this an optimisation, but it is also required
2058 		 * to check that the address is actually valid, when atomic
2059 		 * usercopies are used, below.
2060 		 */
2061 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2062 			status = -EFAULT;
2063 			break;
2064 		}
2065 
2066 		page = __grab_cache_page(mapping, index);
2067 		if (!page) {
2068 			status = -ENOMEM;
2069 			break;
2070 		}
2071 
2072 		/*
2073 		 * non-uptodate pages cannot cope with short copies, and we
2074 		 * cannot take a pagefault with the destination page locked.
2075 		 * So pin the source page to copy it.
2076 		 */
2077 		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2078 			unlock_page(page);
2079 
2080 			src_page = alloc_page(GFP_KERNEL);
2081 			if (!src_page) {
2082 				page_cache_release(page);
2083 				status = -ENOMEM;
2084 				break;
2085 			}
2086 
2087 			/*
2088 			 * We cannot call get_user_pages() with the page locked,
2089 			 * for the same reason that we cannot take a page fault
2090 			 * with the page locked (as explained below).
2091 			 */
2092 			copied = iov_iter_copy_from_user(src_page, i,
2093 								offset, bytes);
2094 			if (unlikely(copied == 0)) {
2095 				status = -EFAULT;
2096 				page_cache_release(page);
2097 				page_cache_release(src_page);
2098 				break;
2099 			}
2100 			bytes = copied;
2101 
2102 			lock_page(page);
2103 			/*
2104 			 * Can't handle the page going uptodate here, because
2105 			 * that means we would use non-atomic usercopies, which
2106 			 * zero out the tail of the page, which can cause
2107 			 * zeroes to become transiently visible. We could just
2108 			 * use a non-zeroing copy, but the APIs aren't too
2109 			 * consistent.
2110 			 */
2111 			if (unlikely(!page->mapping || PageUptodate(page))) {
2112 				unlock_page(page);
2113 				page_cache_release(page);
2114 				page_cache_release(src_page);
2115 				continue;
2116 			}
2117 		}
2118 
2119 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
2120 		if (unlikely(status))
2121 			goto fs_write_aop_error;
2122 
2123 		if (!src_page) {
2124 			/*
2125 			 * Must not enter the pagefault handler here, because
2126 			 * we hold the page lock, so we might recursively
2127 			 * deadlock on the same lock, or get an ABBA deadlock
2128 			 * against a different lock, or against the mmap_sem
2129 			 * (which nests outside the page lock).  So increment
2130 			 * preempt count, and use _atomic usercopies.
2131 			 *
2132 			 * The page is uptodate so we are OK to encounter a
2133 			 * short copy: if unmodified parts of the page are
2134 			 * marked dirty and written out to disk, it doesn't
2135 			 * really matter.
2136 			 */
2137 			pagefault_disable();
2138 			copied = iov_iter_copy_from_user_atomic(page, i,
2139 								offset, bytes);
2140 			pagefault_enable();
2141 		} else {
2142 			void *src, *dst;
2143 			src = kmap_atomic(src_page, KM_USER0);
2144 			dst = kmap_atomic(page, KM_USER1);
2145 			memcpy(dst + offset, src + offset, bytes);
2146 			kunmap_atomic(dst, KM_USER1);
2147 			kunmap_atomic(src, KM_USER0);
2148 			copied = bytes;
2149 		}
2150 		flush_dcache_page(page);
2151 
2152 		status = a_ops->commit_write(file, page, offset, offset+bytes);
2153 		if (unlikely(status < 0))
2154 			goto fs_write_aop_error;
2155 		if (unlikely(status > 0)) /* filesystem did partial write */
2156 			copied = min_t(size_t, copied, status);
2157 
2158 		unlock_page(page);
2159 		mark_page_accessed(page);
2160 		page_cache_release(page);
2161 		if (src_page)
2162 			page_cache_release(src_page);
2163 
2164 		iov_iter_advance(i, copied);
2165 		pos += copied;
2166 		written += copied;
2167 
2168 		balance_dirty_pages_ratelimited(mapping);
2169 		cond_resched();
2170 		continue;
2171 
2172 fs_write_aop_error:
2173 		unlock_page(page);
2174 		page_cache_release(page);
2175 		if (src_page)
2176 			page_cache_release(src_page);
2177 
2178 		/*
2179 		 * prepare_write() may have instantiated a few blocks
2180 		 * outside i_size.  Trim these off again. Don't need
2181 		 * i_size_read because we hold i_mutex.
2182 		 */
2183 		if (pos + bytes > inode->i_size)
2184 			vmtruncate(inode, inode->i_size);
2185 		break;
2186 	} while (iov_iter_count(i));
2187 
2188 	return written ? written : status;
2189 }
2190 
2191 static ssize_t generic_perform_write(struct file *file,
2192 				struct iov_iter *i, loff_t pos)
2193 {
2194 	struct address_space *mapping = file->f_mapping;
2195 	const struct address_space_operations *a_ops = mapping->a_ops;
2196 	long status = 0;
2197 	ssize_t written = 0;
2198 	unsigned int flags = 0;
2199 
2200 	/*
2201 	 * Copies from kernel address space cannot fail (NFSD is a big user).
2202 	 */
2203 	if (segment_eq(get_fs(), KERNEL_DS))
2204 		flags |= AOP_FLAG_UNINTERRUPTIBLE;
2205 
2206 	do {
2207 		struct page *page;
2208 		pgoff_t index;		/* Pagecache index for current page */
2209 		unsigned long offset;	/* Offset into pagecache page */
2210 		unsigned long bytes;	/* Bytes to write to page */
2211 		size_t copied;		/* Bytes copied from user */
2212 		void *fsdata;
2213 
2214 		offset = (pos & (PAGE_CACHE_SIZE - 1));
2215 		index = pos >> PAGE_CACHE_SHIFT;
2216 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2217 						iov_iter_count(i));
2218 
2219 again:
2220 
2221 		/*
2222 		 * Bring in the user page that we will copy from _first_.
2223 		 * Otherwise there's a nasty deadlock on copying from the
2224 		 * same page as we're writing to, without it being marked
2225 		 * up-to-date.
2226 		 *
2227 		 * Not only is this an optimisation, but it is also required
2228 		 * to check that the address is actually valid, when atomic
2229 		 * usercopies are used, below.
2230 		 */
2231 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2232 			status = -EFAULT;
2233 			break;
2234 		}
2235 
2236 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2237 						&page, &fsdata);
2238 		if (unlikely(status))
2239 			break;
2240 
2241 		pagefault_disable();
2242 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2243 		pagefault_enable();
2244 		flush_dcache_page(page);
2245 
2246 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
2247 						page, fsdata);
2248 		if (unlikely(status < 0))
2249 			break;
2250 		copied = status;
2251 
2252 		cond_resched();
2253 
2254 		if (unlikely(copied == 0)) {
2255 			/*
2256 			 * If we were unable to copy any data at all, we must
2257 			 * fall back to a single-segment-length write.
2258 			 *
2259 			 * If we didn't fall back here, we could livelock
2260 			 * because not all segments in the iov can be copied at
2261 			 * once without a pagefault.
2262 			 */
2263 			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2264 						iov_iter_single_seg_count(i));
2265 			goto again;
2266 		}
2267 		iov_iter_advance(i, copied);
2268 		pos += copied;
2269 		written += copied;
2270 
2271 		balance_dirty_pages_ratelimited(mapping);
2272 
2273 	} while (iov_iter_count(i));
2274 
2275 	return written ? written : status;
2276 }
2277 
2278 ssize_t
2279 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2280 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
2281 		size_t count, ssize_t written)
2282 {
2283 	struct file *file = iocb->ki_filp;
2284 	struct address_space *mapping = file->f_mapping;
2285 	const struct address_space_operations *a_ops = mapping->a_ops;
2286 	struct inode *inode = mapping->host;
2287 	ssize_t status;
2288 	struct iov_iter i;
2289 
2290 	iov_iter_init(&i, iov, nr_segs, count, written);
2291 	if (a_ops->write_begin)
2292 		status = generic_perform_write(file, &i, pos);
2293 	else
2294 		status = generic_perform_write_2copy(file, &i, pos);
2295 
2296 	if (likely(status >= 0)) {
2297 		written += status;
2298 		*ppos = pos + status;
2299 
2300 		/*
2301 		 * For now, when the user asks for O_SYNC, we'll actually give
2302 		 * O_DSYNC
2303 		 */
2304 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2305 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
2306 				status = generic_osync_inode(inode, mapping,
2307 						OSYNC_METADATA|OSYNC_DATA);
2308 		}
2309 	}
2310 
2311 	/*
2312 	 * If we get here for O_DIRECT writes then we must have fallen through
2313 	 * to buffered writes (block instantiation inside i_size).  So we sync
2314 	 * the file data here, to try to honour O_DIRECT expectations.
2315 	 */
2316 	if (unlikely(file->f_flags & O_DIRECT) && written)
2317 		status = filemap_write_and_wait(mapping);
2318 
2319 	return written ? written : status;
2320 }
2321 EXPORT_SYMBOL(generic_file_buffered_write);
2322 
2323 static ssize_t
2324 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2325 				unsigned long nr_segs, loff_t *ppos)
2326 {
2327 	struct file *file = iocb->ki_filp;
2328 	struct address_space * mapping = file->f_mapping;
2329 	size_t ocount;		/* original count */
2330 	size_t count;		/* after file limit checks */
2331 	struct inode 	*inode = mapping->host;
2332 	loff_t		pos;
2333 	ssize_t		written;
2334 	ssize_t		err;
2335 
2336 	ocount = 0;
2337 	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2338 	if (err)
2339 		return err;
2340 
2341 	count = ocount;
2342 	pos = *ppos;
2343 
2344 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2345 
2346 	/* We can write back this queue in page reclaim */
2347 	current->backing_dev_info = mapping->backing_dev_info;
2348 	written = 0;
2349 
2350 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2351 	if (err)
2352 		goto out;
2353 
2354 	if (count == 0)
2355 		goto out;
2356 
2357 	err = remove_suid(file->f_path.dentry);
2358 	if (err)
2359 		goto out;
2360 
2361 	file_update_time(file);
2362 
2363 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2364 	if (unlikely(file->f_flags & O_DIRECT)) {
2365 		loff_t endbyte;
2366 		ssize_t written_buffered;
2367 
2368 		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2369 							ppos, count, ocount);
2370 		if (written < 0 || written == count)
2371 			goto out;
2372 		/*
2373 		 * direct-io write to a hole: fall through to buffered I/O
2374 		 * to complete the rest of the request.
2375 		 */
2376 		pos += written;
2377 		count -= written;
2378 		written_buffered = generic_file_buffered_write(iocb, iov,
2379 						nr_segs, pos, ppos, count,
2380 						written);
2381 		/*
2382 		 * If generic_file_buffered_write() returned a synchronous error
2383 		 * then we want to return the number of bytes which were
2384 		 * direct-written, or the error code if that was zero.  Note
2385 		 * that this differs from normal direct-io semantics, which
2386 		 * will return -EFOO even if some bytes were written.
2387 		 */
2388 		if (written_buffered < 0) {
2389 			err = written_buffered;
2390 			goto out;
2391 		}
2392 
2393 		/*
2394 		 * We need to ensure that the page cache pages are written to
2395 		 * disk and invalidated to preserve the expected O_DIRECT
2396 		 * semantics.
2397 		 */
2398 		endbyte = pos + written_buffered - written - 1;
2399 		err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2400 					    SYNC_FILE_RANGE_WAIT_BEFORE|
2401 					    SYNC_FILE_RANGE_WRITE|
2402 					    SYNC_FILE_RANGE_WAIT_AFTER);
2403 		if (err == 0) {
2404 			written = written_buffered;
2405 			invalidate_mapping_pages(mapping,
2406 						 pos >> PAGE_CACHE_SHIFT,
2407 						 endbyte >> PAGE_CACHE_SHIFT);
2408 		} else {
2409 			/*
2410 			 * We don't know how much we wrote, so just return
2411 			 * the number of bytes which were direct-written
2412 			 */
2413 		}
2414 	} else {
2415 		written = generic_file_buffered_write(iocb, iov, nr_segs,
2416 				pos, ppos, count, written);
2417 	}
2418 out:
2419 	current->backing_dev_info = NULL;
2420 	return written ? written : err;
2421 }
2422 
2423 ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2424 		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2425 {
2426 	struct file *file = iocb->ki_filp;
2427 	struct address_space *mapping = file->f_mapping;
2428 	struct inode *inode = mapping->host;
2429 	ssize_t ret;
2430 
2431 	BUG_ON(iocb->ki_pos != pos);
2432 
2433 	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2434 			&iocb->ki_pos);
2435 
2436 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2437 		ssize_t err;
2438 
2439 		err = sync_page_range_nolock(inode, mapping, pos, ret);
2440 		if (err < 0)
2441 			ret = err;
2442 	}
2443 	return ret;
2444 }
2445 EXPORT_SYMBOL(generic_file_aio_write_nolock);
2446 
2447 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2448 		unsigned long nr_segs, loff_t pos)
2449 {
2450 	struct file *file = iocb->ki_filp;
2451 	struct address_space *mapping = file->f_mapping;
2452 	struct inode *inode = mapping->host;
2453 	ssize_t ret;
2454 
2455 	BUG_ON(iocb->ki_pos != pos);
2456 
2457 	mutex_lock(&inode->i_mutex);
2458 	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2459 			&iocb->ki_pos);
2460 	mutex_unlock(&inode->i_mutex);
2461 
2462 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2463 		ssize_t err;
2464 
2465 		err = sync_page_range(inode, mapping, pos, ret);
2466 		if (err < 0)
2467 			ret = err;
2468 	}
2469 	return ret;
2470 }
2471 EXPORT_SYMBOL(generic_file_aio_write);
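
/*
 * Illustrative sketch (not part of the original file, never compiled):
 * how a filesystem typically plugs the generic read/write paths above
 * into its file_operations.  example_file_operations is a made-up
 * name; compare the ext2/ext3 file_operations of this era.
 */
#if 0
static const struct file_operations example_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};
#endif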
2472 
2473 /*
2474  * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
2475  * went wrong during pagecache shootdown.
2476  */
2477 static ssize_t
2478 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2479 	loff_t offset, unsigned long nr_segs)
2480 {
2481 	struct file *file = iocb->ki_filp;
2482 	struct address_space *mapping = file->f_mapping;
2483 	ssize_t retval;
2484 	size_t write_len;
2485 	pgoff_t end = 0; /* silence gcc */
2486 
2487 	/*
2488 	 * If it's a write, unmap all mmappings of the file up-front.  This
2489 	 * will cause any pte dirty bits to be propagated into the pageframes
2490 	 * for the subsequent filemap_write_and_wait().
2491 	 */
2492 	if (rw == WRITE) {
2493 		write_len = iov_length(iov, nr_segs);
2494 		end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2495 		if (mapping_mapped(mapping))
2496 			unmap_mapping_range(mapping, offset, write_len, 0);
2497 	}
2498 
2499 	retval = filemap_write_and_wait(mapping);
2500 	if (retval)
2501 		goto out;
2502 
2503 	/*
2504 	 * After a write we want buffered reads to be sure to go to disk to get
2505 	 * the new data.  We invalidate clean cached pages from the region we're
2506 	 * about to write.  We do this *before* the write so that we can return
2507 	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2508 	 */
2509 	if (rw == WRITE && mapping->nrpages) {
2510 		retval = invalidate_inode_pages2_range(mapping,
2511 					offset >> PAGE_CACHE_SHIFT, end);
2512 		if (retval)
2513 			goto out;
2514 	}
2515 
2516 	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2517 
2518 	/*
2519 	 * Finally, try again to invalidate clean pages which might have been
2520 	 * cached by non-direct readahead, or faulted in by get_user_pages()
2521 	 * if the source of the write was an mmap'ed region of the file
2522 	 * we're writing.  Either one is a pretty crazy thing to do,
2523 	 * so we don't support it 100%.  If this invalidation
2524 	 * fails, tough, the write still worked...
2525 	 */
2526 	if (rw == WRITE && mapping->nrpages) {
2527 		invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2528 	}
2529 out:
2530 	return retval;
2531 }
2532 
2533 /**
2534  * try_to_release_page() - release old fs-specific metadata on a page
2535  *
2536  * @page: the page which the kernel is trying to free
2537  * @gfp_mask: memory allocation flags (and I/O mode)
2538  *
2539  * The address_space is asked to try to release any data held against the
2540  * page (presumably at page->private).  If the release was successful,
2541  * return `1'.  Otherwise return zero.
2542  *
2543  * The @gfp_mask argument specifies whether I/O may be performed to release
2544  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
2545  *
2546  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2547  */
2548 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2549 {
2550 	struct address_space * const mapping = page->mapping;
2551 
2552 	BUG_ON(!PageLocked(page));
2553 	if (PageWriteback(page))
2554 		return 0;
2555 
2556 	if (mapping && mapping->a_ops->releasepage)
2557 		return mapping->a_ops->releasepage(page, gfp_mask);
2558 	return try_to_free_buffers(page);
2559 }
2560 
2561 EXPORT_SYMBOL(try_to_release_page);
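
/*
 * Illustrative sketch (not part of the original file, never compiled):
 * a minimal ->releasepage instance of the contract documented above,
 * for a filesystem whose only page-private data is buffer heads.
 * example_releasepage() is a made-up name.
 */
#if 0
static int example_releasepage(struct page *page, gfp_t gfp_mask)
{
	/*
	 * try_to_release_page() has already verified the page lock and
	 * that no writeback is in flight, so we only need to try to
	 * strip whatever hangs off page->private.
	 */
	if (PagePrivate(page))
		return try_to_free_buffers(page);
	return 1;	/* nothing attached: the page may be freed */
}
#endif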
2562