xref: /linux/mm/filemap.c (revision d8327c784b51b57dac2c26cfad87dce0d68dfd98)
1 /*
2  *	linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-1999  Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/config.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/compiler.h>
16 #include <linux/fs.h>
17 #include <linux/aio.h>
18 #include <linux/capability.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/mman.h>
23 #include <linux/pagemap.h>
24 #include <linux/file.h>
25 #include <linux/uio.h>
26 #include <linux/hash.h>
27 #include <linux/writeback.h>
28 #include <linux/pagevec.h>
29 #include <linux/blkdev.h>
30 #include <linux/security.h>
31 #include <linux/syscalls.h>
32 #include "filemap.h"
33 /*
34  * FIXME: remove all knowledge of the buffer layer from the core VM
35  */
36 #include <linux/buffer_head.h> /* for generic_osync_inode */
37 
38 #include <asm/uaccess.h>
39 #include <asm/mman.h>
40 
41 static ssize_t
42 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
43 	loff_t offset, unsigned long nr_segs);
44 
45 /*
46  * Shared mappings implemented 30.11.1994. It's not fully working yet,
47  * though.
48  *
49  * Shared mappings now work. 15.8.1995  Bruno.
50  *
51  * finished 'unifying' the page and buffer cache and SMP-threaded the
52  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
53  *
54  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
55  */
56 
57 /*
58  * Lock ordering:
59  *
60  *  ->i_mmap_lock		(vmtruncate)
61  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
62  *      ->swap_lock		(exclusive_swap_page, others)
63  *        ->mapping->tree_lock
64  *
65  *  ->i_mutex
66  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
67  *
68  *  ->mmap_sem
69  *    ->i_mmap_lock
70  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
71  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
72  *
73  *  ->mmap_sem
74  *    ->lock_page		(access_process_vm)
75  *
76  *  ->mmap_sem
77  *    ->i_mutex			(msync)
78  *
79  *  ->i_mutex
80  *    ->i_alloc_sem             (various)
81  *
82  *  ->inode_lock
83  *    ->sb_lock			(fs/fs-writeback.c)
84  *    ->mapping->tree_lock	(__sync_single_inode)
85  *
86  *  ->i_mmap_lock
87  *    ->anon_vma.lock		(vma_adjust)
88  *
89  *  ->anon_vma.lock
90  *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
91  *
92  *  ->page_table_lock or pte_lock
93  *    ->swap_lock		(try_to_unmap_one)
94  *    ->private_lock		(try_to_unmap_one)
95  *    ->tree_lock		(try_to_unmap_one)
96  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
97  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
98  *    ->private_lock		(page_remove_rmap->set_page_dirty)
99  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
100  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
101  *    ->inode_lock		(zap_pte_range->set_page_dirty)
102  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
103  *
104  *  ->task->proc_lock
105  *    ->dcache_lock		(proc_pid_lookup)
106  */
107 
108 /*
109  * Remove a page from the page cache and free it. Caller has to make
110  * sure the page is locked and that nobody else uses it - or that usage
111  * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
112  */
113 void __remove_from_page_cache(struct page *page)
114 {
115 	struct address_space *mapping = page->mapping;
116 
117 	radix_tree_delete(&mapping->page_tree, page->index);
118 	page->mapping = NULL;
119 	mapping->nrpages--;
120 	pagecache_acct(-1);
121 }
122 
123 void remove_from_page_cache(struct page *page)
124 {
125 	struct address_space *mapping = page->mapping;
126 
127 	BUG_ON(!PageLocked(page));
128 
129 	write_lock_irq(&mapping->tree_lock);
130 	__remove_from_page_cache(page);
131 	write_unlock_irq(&mapping->tree_lock);
132 }
133 
134 static int sync_page(void *word)
135 {
136 	struct address_space *mapping;
137 	struct page *page;
138 
139 	page = container_of((unsigned long *)word, struct page, flags);
140 
141 	/*
142 	 * page_mapping() is being called without PG_locked held.
143 	 * Some knowledge of the state and use of the page is used to
144 	 * reduce the requirements down to a memory barrier.
145 	 * The danger here is of a stale page_mapping() return value
146 	 * indicating a struct address_space different from the one it's
147 	 * associated with when it is associated with one.
148 	 * After smp_mb(), it's either the correct page_mapping() for
149 	 * the page, or an old page_mapping() and the page's own
150 	 * page_mapping() has gone NULL.
151 	 * The ->sync_page() address_space operation must tolerate
152 	 * page_mapping() going NULL. By an amazing coincidence,
153 	 * this comes about because none of the users of the page
154 	 * in the ->sync_page() methods make essential use of the
155 	 * page_mapping(), merely passing the page down to the backing
156 	 * device's unplug functions when it's non-NULL, which in turn
157 	 * ignore it for all cases but swap, where only page_private(page) is
158 	 * of interest. When page_mapping() does go NULL, the entire
159 	 * call stack gracefully ignores the page and returns.
160 	 * -- wli
161 	 */
162 	smp_mb();
163 	mapping = page_mapping(page);
164 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
165 		mapping->a_ops->sync_page(page);
166 	io_schedule();
167 	return 0;
168 }
169 
170 /**
171  * __filemap_fdatawrite_range - start writeback against all of a mapping's
172  * dirty pages that lie within the byte offsets <start, end>
173  * @mapping:	address space structure to write
174  * @start:	offset in bytes where the range starts
175  * @end:	offset in bytes where the range ends
176  * @sync_mode:	WB_SYNC_ALL for data integrity, WB_SYNC_NONE for best effort
177  *
178  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
179  * opposed to a regular memory-cleansing writeback.  The difference between
180  * these two operations is that if a dirty page/buffer is encountered, it must
181  * be waited upon, and not just skipped over.
182  */
183 static int __filemap_fdatawrite_range(struct address_space *mapping,
184 	loff_t start, loff_t end, int sync_mode)
185 {
186 	int ret;
187 	struct writeback_control wbc = {
188 		.sync_mode = sync_mode,
189 		.nr_to_write = mapping->nrpages * 2,
190 		.start = start,
191 		.end = end,
192 	};
193 
194 	if (!mapping_cap_writeback_dirty(mapping))
195 		return 0;
196 
197 	ret = do_writepages(mapping, &wbc);
198 	return ret;
199 }
200 
201 static inline int __filemap_fdatawrite(struct address_space *mapping,
202 	int sync_mode)
203 {
204 	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
205 }
206 
207 int filemap_fdatawrite(struct address_space *mapping)
208 {
209 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
210 }
211 EXPORT_SYMBOL(filemap_fdatawrite);
212 
213 static int filemap_fdatawrite_range(struct address_space *mapping,
214 	loff_t start, loff_t end)
215 {
216 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
217 }
218 
219 /*
220  * This is a mostly non-blocking flush.  Not suitable for data-integrity
221  * purposes - I/O may not be started against all dirty pages.
222  */
223 int filemap_flush(struct address_space *mapping)
224 {
225 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
226 }
227 EXPORT_SYMBOL(filemap_flush);
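
/*
 * Example usage (a sketch, not code from this file): the WB_SYNC_ALL /
 * WB_SYNC_NONE distinction above is the one callers care about.  A
 * hypothetical "make the data safe" path wants I/O started against every
 * dirty page, while an opportunistic flusher can tolerate skipped pages:
 *
 *	err = filemap_fdatawrite(mapping);	// WB_SYNC_ALL: data integrity
 *	...
 *	filemap_flush(mapping);			// WB_SYNC_NONE: best effort
 *
 * Neither call waits for the I/O to complete; see filemap_fdatawait() and
 * filemap_write_and_wait() below for that.
 */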
228 
229 /*
230  * Wait for writeback to complete against pages indexed by start->end
231  * inclusive
232  */
233 static int wait_on_page_writeback_range(struct address_space *mapping,
234 				pgoff_t start, pgoff_t end)
235 {
236 	struct pagevec pvec;
237 	int nr_pages;
238 	int ret = 0;
239 	pgoff_t index;
240 
241 	if (end < start)
242 		return 0;
243 
244 	pagevec_init(&pvec, 0);
245 	index = start;
246 	while ((index <= end) &&
247 			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
248 			PAGECACHE_TAG_WRITEBACK,
249 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
250 		unsigned i;
251 
252 		for (i = 0; i < nr_pages; i++) {
253 			struct page *page = pvec.pages[i];
254 
255 			/* until radix tree lookup accepts end_index */
256 			if (page->index > end)
257 				continue;
258 
259 			wait_on_page_writeback(page);
260 			if (PageError(page))
261 				ret = -EIO;
262 		}
263 		pagevec_release(&pvec);
264 		cond_resched();
265 	}
266 
267 	/* Check for outstanding write errors */
268 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
269 		ret = -ENOSPC;
270 	if (test_and_clear_bit(AS_EIO, &mapping->flags))
271 		ret = -EIO;
272 
273 	return ret;
274 }
275 
276 /*
277  * Write and wait upon all the pages in the passed range.  This is a "data
278  * integrity" operation.  It waits upon in-flight writeout before starting and
279  * waiting upon new writeout.  If there was an IO error, return it.
280  *
281  * We need to re-take i_mutex during the generic_osync_inode list walk because
282  * it is otherwise livelockable.
283  */
284 int sync_page_range(struct inode *inode, struct address_space *mapping,
285 			loff_t pos, loff_t count)
286 {
287 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
288 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
289 	int ret;
290 
291 	if (!mapping_cap_writeback_dirty(mapping) || !count)
292 		return 0;
293 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
294 	if (ret == 0) {
295 		mutex_lock(&inode->i_mutex);
296 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
297 		mutex_unlock(&inode->i_mutex);
298 	}
299 	if (ret == 0)
300 		ret = wait_on_page_writeback_range(mapping, start, end);
301 	return ret;
302 }
303 EXPORT_SYMBOL(sync_page_range);
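
/*
 * Example usage (a sketch, not code from this file): an O_SYNC/IS_SYNC()
 * write path would typically flush just the byte range it wrote, roughly:
 *
 *	written = generic_file_buffered_write(iocb, iov, nr_segs, pos,
 *						ppos, count, written);
 *	if (written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 *		ssize_t err = sync_page_range(inode, mapping, pos, written);
 *		if (err < 0)
 *			written = err;
 *	}
 */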
304 
305 /*
306  * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
307  * as it forces O_SYNC writers to different parts of the same file
308  * to be serialised right until io completion.
309  */
310 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
311 			   loff_t pos, loff_t count)
312 {
313 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
314 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
315 	int ret;
316 
317 	if (!mapping_cap_writeback_dirty(mapping) || !count)
318 		return 0;
319 	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
320 	if (ret == 0)
321 		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
322 	if (ret == 0)
323 		ret = wait_on_page_writeback_range(mapping, start, end);
324 	return ret;
325 }
326 EXPORT_SYMBOL(sync_page_range_nolock);
327 
328 /**
329  * filemap_fdatawait - walk the list of under-writeback pages of the given
330  *     address space and wait for all of them.
331  *
332  * @mapping: address space structure to wait for
333  */
334 int filemap_fdatawait(struct address_space *mapping)
335 {
336 	loff_t i_size = i_size_read(mapping->host);
337 
338 	if (i_size == 0)
339 		return 0;
340 
341 	return wait_on_page_writeback_range(mapping, 0,
342 				(i_size - 1) >> PAGE_CACHE_SHIFT);
343 }
344 EXPORT_SYMBOL(filemap_fdatawait);
345 
346 int filemap_write_and_wait(struct address_space *mapping)
347 {
348 	int err = 0;
349 
350 	if (mapping->nrpages) {
351 		err = filemap_fdatawrite(mapping);
352 		/*
353 		 * Even if the above returned error, the pages may be
354 		 * written partially (e.g. -ENOSPC), so we wait for it.
355 		 * But -EIO is a special case: it may indicate that the worst
356 		 * thing (e.g. a bug) happened, so we avoid waiting for it.
357 		 */
358 		if (err != -EIO) {
359 			int err2 = filemap_fdatawait(mapping);
360 			if (!err)
361 				err = err2;
362 		}
363 	}
364 	return err;
365 }
366 EXPORT_SYMBOL(filemap_write_and_wait);
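
/*
 * Example usage (a sketch with a made-up name): filemap_write_and_wait() is
 * the usual building block for ->fsync() style operations that need every
 * dirty page written and the writeback completed before proceeding:
 *
 *	static int example_sync_mapping(struct inode *inode)
 *	{
 *		// 0 on success, -EIO/-ENOSPC if writeback failed
 *		return filemap_write_and_wait(inode->i_mapping);
 *	}
 */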
367 
368 int filemap_write_and_wait_range(struct address_space *mapping,
369 				 loff_t lstart, loff_t lend)
370 {
371 	int err = 0;
372 
373 	if (mapping->nrpages) {
374 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
375 						 WB_SYNC_ALL);
376 		/* See comment of filemap_write_and_wait() */
377 		if (err != -EIO) {
378 			int err2 = wait_on_page_writeback_range(mapping,
379 						lstart >> PAGE_CACHE_SHIFT,
380 						lend >> PAGE_CACHE_SHIFT);
381 			if (!err)
382 				err = err2;
383 		}
384 	}
385 	return err;
386 }
387 
388 /*
389  * This function is used to add newly allocated pagecache pages:
390  * the page is new, so we can just run SetPageLocked() against it.
391  * The other page state flags were set by rmqueue().
392  *
393  * This function does not add the page to the LRU.  The caller must do that.
394  */
395 int add_to_page_cache(struct page *page, struct address_space *mapping,
396 		pgoff_t offset, gfp_t gfp_mask)
397 {
398 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
399 
400 	if (error == 0) {
401 		write_lock_irq(&mapping->tree_lock);
402 		error = radix_tree_insert(&mapping->page_tree, offset, page);
403 		if (!error) {
404 			page_cache_get(page);
405 			SetPageLocked(page);
406 			page->mapping = mapping;
407 			page->index = offset;
408 			mapping->nrpages++;
409 			pagecache_acct(1);
410 		}
411 		write_unlock_irq(&mapping->tree_lock);
412 		radix_tree_preload_end();
413 	}
414 	return error;
415 }
416 
417 EXPORT_SYMBOL(add_to_page_cache);
418 
419 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
420 				pgoff_t offset, gfp_t gfp_mask)
421 {
422 	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
423 	if (ret == 0)
424 		lru_cache_add(page);
425 	return ret;
426 }
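
/*
 * Example usage (a sketch, mirroring page_cache_read() further down): the
 * usual pattern is to allocate a page, try to insert it, and treat -EEXIST
 * as "somebody else added it first":
 *
 *	page = page_cache_alloc_cold(mapping);
 *	if (!page)
 *		return -ENOMEM;
 *	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
 *	if (error == 0)
 *		error = mapping->a_ops->readpage(file, page);
 *	else if (error == -EEXIST)
 *		error = 0;		// lost the race; the page is there
 *	page_cache_release(page);
 */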
427 
428 /*
429  * In order to wait for pages to become available there must be
430  * waitqueues associated with pages. Rather than one waitqueue per
431  * page, a hash table of waitqueues is used: all waiters for pages
432  * that hash to the same bucket share one queue and are all woken
433  * when any of those pages becomes available; each woken context then
434  * re-checks whether the page it actually cares about is available.
435  * This saves space at the cost of "thundering herd" phenomena during
436  * rare hash collisions.
437  */
438 static wait_queue_head_t *page_waitqueue(struct page *page)
439 {
440 	const struct zone *zone = page_zone(page);
441 
442 	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
443 }
444 
445 static inline void wake_up_page(struct page *page, int bit)
446 {
447 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
448 }
449 
450 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
451 {
452 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
453 
454 	if (test_bit(bit_nr, &page->flags))
455 		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
456 							TASK_UNINTERRUPTIBLE);
457 }
458 EXPORT_SYMBOL(wait_on_page_bit);
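
/*
 * The common waiters are thin wrappers around wait_on_page_bit().  For
 * reference, wait_on_page_locked() and wait_on_page_writeback() look
 * roughly like this in include/linux/pagemap.h (paraphrased here, not
 * defined in this file):
 *
 *	static inline void wait_on_page_locked(struct page *page)
 *	{
 *		if (PageLocked(page))
 *			wait_on_page_bit(page, PG_locked);
 *	}
 *
 *	static inline void wait_on_page_writeback(struct page *page)
 *	{
 *		if (PageWriteback(page))
 *			wait_on_page_bit(page, PG_writeback);
 *	}
 */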
459 
460 /**
461  * unlock_page() - unlock a locked page
462  *
463  * @page: the page
464  *
465  * Unlocks the page and wakes up sleepers in wait_on_page_locked().
466  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
467  * mechanism between PageLocked pages and PageWriteback pages is shared.
468  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
469  *
470  * The first mb is necessary to safely close the critical section opened by the
471  * TestSetPageLocked(), the second mb is necessary to enforce ordering between
472  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
473  * parallel wait_on_page_locked()).
474  */
475 void fastcall unlock_page(struct page *page)
476 {
477 	smp_mb__before_clear_bit();
478 	if (!TestClearPageLocked(page))
479 		BUG();
480 	smp_mb__after_clear_bit();
481 	wake_up_page(page, PG_locked);
482 }
483 EXPORT_SYMBOL(unlock_page);
484 
485 /*
486  * End writeback against a page.
487  */
488 void end_page_writeback(struct page *page)
489 {
490 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
491 		if (!test_clear_page_writeback(page))
492 			BUG();
493 	}
494 	smp_mb__after_clear_bit();
495 	wake_up_page(page, PG_writeback);
496 }
497 EXPORT_SYMBOL(end_page_writeback);
498 
499 /*
500  * Get a lock on the page, assuming we need to sleep to get it.
501  *
502  * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
503  * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
504  * chances are that on the second loop, the block layer's plug list is empty,
505  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
506  */
507 void fastcall __lock_page(struct page *page)
508 {
509 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
510 
511 	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
512 							TASK_UNINTERRUPTIBLE);
513 }
514 EXPORT_SYMBOL(__lock_page);
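
/*
 * Callers normally use lock_page(), which only falls back to __lock_page()
 * on contention; roughly, from include/linux/pagemap.h (paraphrased):
 *
 *	static inline void lock_page(struct page *page)
 *	{
 *		might_sleep();
 *		if (TestSetPageLocked(page))
 *			__lock_page(page);
 *	}
 */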
515 
516 /*
517  * a rather lightweight function, finding and getting a reference to a
518  * hashed page atomically.
519  */
520 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
521 {
522 	struct page *page;
523 
524 	read_lock_irq(&mapping->tree_lock);
525 	page = radix_tree_lookup(&mapping->page_tree, offset);
526 	if (page)
527 		page_cache_get(page);
528 	read_unlock_irq(&mapping->tree_lock);
529 	return page;
530 }
531 
532 EXPORT_SYMBOL(find_get_page);
533 
534 /*
535  * Same as above, but trylock it instead of incrementing the count.
536  */
537 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
538 {
539 	struct page *page;
540 
541 	read_lock_irq(&mapping->tree_lock);
542 	page = radix_tree_lookup(&mapping->page_tree, offset);
543 	if (page && TestSetPageLocked(page))
544 		page = NULL;
545 	read_unlock_irq(&mapping->tree_lock);
546 	return page;
547 }
548 
549 EXPORT_SYMBOL(find_trylock_page);
550 
551 /**
552  * find_lock_page - locate, pin and lock a pagecache page
553  *
554  * @mapping: the address_space to search
555  * @offset: the page index
556  *
557  * Locates the desired pagecache page, locks it, increments its reference
558  * count and returns its address.
559  *
560  * Returns NULL if the page was not present. find_lock_page() may sleep.
561  */
562 struct page *find_lock_page(struct address_space *mapping,
563 				unsigned long offset)
564 {
565 	struct page *page;
566 
567 	read_lock_irq(&mapping->tree_lock);
568 repeat:
569 	page = radix_tree_lookup(&mapping->page_tree, offset);
570 	if (page) {
571 		page_cache_get(page);
572 		if (TestSetPageLocked(page)) {
573 			read_unlock_irq(&mapping->tree_lock);
574 			__lock_page(page);
575 			read_lock_irq(&mapping->tree_lock);
576 
577 			/* Has the page been truncated while we slept? */
578 			if (unlikely(page->mapping != mapping ||
579 				     page->index != offset)) {
580 				unlock_page(page);
581 				page_cache_release(page);
582 				goto repeat;
583 			}
584 		}
585 	}
586 	read_unlock_irq(&mapping->tree_lock);
587 	return page;
588 }
589 
590 EXPORT_SYMBOL(find_lock_page);
591 
592 /**
593  * find_or_create_page - locate or add a pagecache page
594  *
595  * @mapping: the page's address_space
596  * @index: the page's index into the mapping
597  * @gfp_mask: page allocation mode
598  *
599  * Locates a page in the pagecache.  If the page is not present, a new page
600  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
601  * LRU list.  The returned page is locked and has its reference count
602  * incremented.
603  *
604  * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
605  * allocation!
606  *
607  * find_or_create_page() returns the desired page's address, or NULL on
608  * memory exhaustion.
609  */
610 struct page *find_or_create_page(struct address_space *mapping,
611 		unsigned long index, gfp_t gfp_mask)
612 {
613 	struct page *page, *cached_page = NULL;
614 	int err;
615 repeat:
616 	page = find_lock_page(mapping, index);
617 	if (!page) {
618 		if (!cached_page) {
619 			cached_page = alloc_page(gfp_mask);
620 			if (!cached_page)
621 				return NULL;
622 		}
623 		err = add_to_page_cache_lru(cached_page, mapping,
624 					index, gfp_mask);
625 		if (!err) {
626 			page = cached_page;
627 			cached_page = NULL;
628 		} else if (err == -EEXIST)
629 			goto repeat;
630 	}
631 	if (cached_page)
632 		page_cache_release(cached_page);
633 	return page;
634 }
635 
636 EXPORT_SYMBOL(find_or_create_page);
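
/*
 * Most callers reach this through the grab_cache_page() helper, which
 * simply supplies the mapping's own allocation flags; roughly, from
 * include/linux/pagemap.h (paraphrased):
 *
 *	#define grab_cache_page(mapping, index) \
 *		find_or_create_page(mapping, index, mapping_gfp_mask(mapping))
 */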
637 
638 /**
639  * find_get_pages - gang pagecache lookup
640  * @mapping:	The address_space to search
641  * @start:	The starting page index
642  * @nr_pages:	The maximum number of pages
643  * @pages:	Where the resulting pages are placed
644  *
645  * find_get_pages() will search for and return a group of up to
646  * @nr_pages pages in the mapping.  The pages are placed at @pages.
647  * find_get_pages() takes a reference against the returned pages.
648  *
649  * The search returns a group of mapping-contiguous pages with ascending
650  * indexes.  There may be holes in the indices due to not-present pages.
651  *
652  * find_get_pages() returns the number of pages which were found.
653  */
654 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
655 			    unsigned int nr_pages, struct page **pages)
656 {
657 	unsigned int i;
658 	unsigned int ret;
659 
660 	read_lock_irq(&mapping->tree_lock);
661 	ret = radix_tree_gang_lookup(&mapping->page_tree,
662 				(void **)pages, start, nr_pages);
663 	for (i = 0; i < ret; i++)
664 		page_cache_get(pages[i]);
665 	read_unlock_irq(&mapping->tree_lock);
666 	return ret;
667 }
668 
669 /*
670  * Like find_get_pages, except we only return pages which are tagged with
671  * `tag'.   We update *index to index the next page for the traversal.
672  */
673 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
674 			int tag, unsigned int nr_pages, struct page **pages)
675 {
676 	unsigned int i;
677 	unsigned int ret;
678 
679 	read_lock_irq(&mapping->tree_lock);
680 	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
681 				(void **)pages, *index, nr_pages, tag);
682 	for (i = 0; i < ret; i++)
683 		page_cache_get(pages[i]);
684 	if (ret)
685 		*index = pages[ret - 1]->index + 1;
686 	read_unlock_irq(&mapping->tree_lock);
687 	return ret;
688 }
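
/*
 * Example usage (a sketch): a caller of the gang lookups owns one reference
 * per returned page and must drop every one of them:
 *
 *	struct page *pages[16];
 *	unsigned i, nr = find_get_pages(mapping, index, 16, pages);
 *
 *	for (i = 0; i < nr; i++) {
 *		// ... examine pages[i] ...
 *		page_cache_release(pages[i]);
 *	}
 *
 * Most in-tree users go through the pagevec wrappers instead, e.g.
 * pagevec_lookup_tag() as used by wait_on_page_writeback_range() above.
 */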
689 
690 /*
691  * Same as grab_cache_page, but do not wait if the page is unavailable.
692  * This is intended for speculative data generators, where the data can
693  * be regenerated if the page couldn't be grabbed.  This routine should
694  * be safe to call while holding the lock for another page.
695  *
696  * Clear __GFP_FS when allocating the page to avoid recursion into the fs
697  * and deadlock against the caller's locked page.
698  */
699 struct page *
700 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
701 {
702 	struct page *page = find_get_page(mapping, index);
703 	gfp_t gfp_mask;
704 
705 	if (page) {
706 		if (!TestSetPageLocked(page))
707 			return page;
708 		page_cache_release(page);
709 		return NULL;
710 	}
711 	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
712 	page = alloc_pages(gfp_mask, 0);
713 	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
714 		page_cache_release(page);
715 		page = NULL;
716 	}
717 	return page;
718 }
719 
720 EXPORT_SYMBOL(grab_cache_page_nowait);
721 
722 /*
723  * This is a generic file read routine, and uses the
724  * mapping->a_ops->readpage() function for the actual low-level
725  * stuff.
726  *
727  * This is really ugly. But the goto's actually try to clarify some
728  * of the logic when it comes to error handling etc.
729  *
730  * Note the struct file* is only passed for the use of readpage.  It may be
731  * NULL.
732  */
733 void do_generic_mapping_read(struct address_space *mapping,
734 			     struct file_ra_state *_ra,
735 			     struct file *filp,
736 			     loff_t *ppos,
737 			     read_descriptor_t *desc,
738 			     read_actor_t actor)
739 {
740 	struct inode *inode = mapping->host;
741 	unsigned long index;
742 	unsigned long end_index;
743 	unsigned long offset;
744 	unsigned long last_index;
745 	unsigned long next_index;
746 	unsigned long prev_index;
747 	loff_t isize;
748 	struct page *cached_page;
749 	int error;
750 	struct file_ra_state ra = *_ra;
751 
752 	cached_page = NULL;
753 	index = *ppos >> PAGE_CACHE_SHIFT;
754 	next_index = index;
755 	prev_index = ra.prev_page;
756 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
757 	offset = *ppos & ~PAGE_CACHE_MASK;
758 
759 	isize = i_size_read(inode);
760 	if (!isize)
761 		goto out;
762 
763 	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
764 	for (;;) {
765 		struct page *page;
766 		unsigned long nr, ret;
767 
768 		/* nr is the maximum number of bytes to copy from this page */
769 		nr = PAGE_CACHE_SIZE;
770 		if (index >= end_index) {
771 			if (index > end_index)
772 				goto out;
773 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
774 			if (nr <= offset) {
775 				goto out;
776 			}
777 		}
778 		nr = nr - offset;
779 
780 		cond_resched();
781 		if (index == next_index)
782 			next_index = page_cache_readahead(mapping, &ra, filp,
783 					index, last_index - index);
784 
785 find_page:
786 		page = find_get_page(mapping, index);
787 		if (unlikely(page == NULL)) {
788 			handle_ra_miss(mapping, &ra, index);
789 			goto no_cached_page;
790 		}
791 		if (!PageUptodate(page))
792 			goto page_not_up_to_date;
793 page_ok:
794 
795 		/* If users can be writing to this page using arbitrary
796 		 * virtual addresses, take care about potential aliasing
797 		 * before reading the page on the kernel side.
798 		 */
799 		if (mapping_writably_mapped(mapping))
800 			flush_dcache_page(page);
801 
802 		/*
803 		 * When (part of) the same page is read multiple times
804 		 * in succession, only mark it as accessed the first time.
805 		 */
806 		if (prev_index != index)
807 			mark_page_accessed(page);
808 		prev_index = index;
809 
810 		/*
811 		 * Ok, we have the page, and it's up-to-date, so
812 		 * now we can copy it to user space...
813 		 *
814 		 * The actor routine returns how many bytes were actually used..
815 		 * NOTE! This may not be the same as how much of a user buffer
816 		 * we filled up (we may be padding etc), so we can only update
817 		 * "pos" here (the actor routine has to update the user buffer
818 		 * pointers and the remaining count).
819 		 */
820 		ret = actor(desc, page, offset, nr);
821 		offset += ret;
822 		index += offset >> PAGE_CACHE_SHIFT;
823 		offset &= ~PAGE_CACHE_MASK;
824 
825 		page_cache_release(page);
826 		if (ret == nr && desc->count)
827 			continue;
828 		goto out;
829 
830 page_not_up_to_date:
831 		/* Get exclusive access to the page ... */
832 		lock_page(page);
833 
834 		/* Did it get unhashed before we got the lock? */
835 		if (!page->mapping) {
836 			unlock_page(page);
837 			page_cache_release(page);
838 			continue;
839 		}
840 
841 		/* Did somebody else fill it already? */
842 		if (PageUptodate(page)) {
843 			unlock_page(page);
844 			goto page_ok;
845 		}
846 
847 readpage:
848 		/* Start the actual read. The read will unlock the page. */
849 		error = mapping->a_ops->readpage(filp, page);
850 
851 		if (unlikely(error)) {
852 			if (error == AOP_TRUNCATED_PAGE) {
853 				page_cache_release(page);
854 				goto find_page;
855 			}
856 			goto readpage_error;
857 		}
858 
859 		if (!PageUptodate(page)) {
860 			lock_page(page);
861 			if (!PageUptodate(page)) {
862 				if (page->mapping == NULL) {
863 					/*
864 					 * invalidate_inode_pages got it
865 					 */
866 					unlock_page(page);
867 					page_cache_release(page);
868 					goto find_page;
869 				}
870 				unlock_page(page);
871 				error = -EIO;
872 				goto readpage_error;
873 			}
874 			unlock_page(page);
875 		}
876 
877 		/*
878 		 * i_size must be checked after we have done ->readpage.
879 		 *
880 		 * Checking i_size after the readpage allows us to calculate
881 		 * the correct value for "nr", which means the zero-filled
882 		 * part of the page is not copied back to userspace (unless
883 		 * another truncate extends the file - this is desired though).
884 		 */
885 		isize = i_size_read(inode);
886 		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
887 		if (unlikely(!isize || index > end_index)) {
888 			page_cache_release(page);
889 			goto out;
890 		}
891 
892 		/* nr is the maximum number of bytes to copy from this page */
893 		nr = PAGE_CACHE_SIZE;
894 		if (index == end_index) {
895 			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
896 			if (nr <= offset) {
897 				page_cache_release(page);
898 				goto out;
899 			}
900 		}
901 		nr = nr - offset;
902 		goto page_ok;
903 
904 readpage_error:
905 		/* UHHUH! A synchronous read error occurred. Report it */
906 		desc->error = error;
907 		page_cache_release(page);
908 		goto out;
909 
910 no_cached_page:
911 		/*
912 		 * Ok, it wasn't cached, so we need to create a new
913 		 * page..
914 		 */
915 		if (!cached_page) {
916 			cached_page = page_cache_alloc_cold(mapping);
917 			if (!cached_page) {
918 				desc->error = -ENOMEM;
919 				goto out;
920 			}
921 		}
922 		error = add_to_page_cache_lru(cached_page, mapping,
923 						index, GFP_KERNEL);
924 		if (error) {
925 			if (error == -EEXIST)
926 				goto find_page;
927 			desc->error = error;
928 			goto out;
929 		}
930 		page = cached_page;
931 		cached_page = NULL;
932 		goto readpage;
933 	}
934 
935 out:
936 	*_ra = ra;
937 
938 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
939 	if (cached_page)
940 		page_cache_release(cached_page);
941 	if (filp)
942 		file_accessed(filp);
943 }
944 
945 EXPORT_SYMBOL(do_generic_mapping_read);
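
/*
 * do_generic_file_read(), used below, is a thin inline wrapper that feeds a
 * file's own mapping and readahead state into do_generic_mapping_read();
 * roughly, from include/linux/fs.h (paraphrased here, not defined in this
 * file):
 *
 *	static inline void do_generic_file_read(struct file *filp, loff_t *ppos,
 *						read_descriptor_t *desc,
 *						read_actor_t actor)
 *	{
 *		do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp,
 *					ppos, desc, actor);
 *	}
 */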
946 
947 int file_read_actor(read_descriptor_t *desc, struct page *page,
948 			unsigned long offset, unsigned long size)
949 {
950 	char *kaddr;
951 	unsigned long left, count = desc->count;
952 
953 	if (size > count)
954 		size = count;
955 
956 	/*
957 	 * Faults on the destination of a read are common, so do it before
958 	 * taking the kmap.
959 	 */
960 	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
961 		kaddr = kmap_atomic(page, KM_USER0);
962 		left = __copy_to_user_inatomic(desc->arg.buf,
963 						kaddr + offset, size);
964 		kunmap_atomic(kaddr, KM_USER0);
965 		if (left == 0)
966 			goto success;
967 	}
968 
969 	/* Do it the slow way */
970 	kaddr = kmap(page);
971 	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
972 	kunmap(page);
973 
974 	if (left) {
975 		size -= left;
976 		desc->error = -EFAULT;
977 	}
978 success:
979 	desc->count = count - size;
980 	desc->written += size;
981 	desc->arg.buf += size;
982 	return size;
983 }
984 
985 /*
986  * This is the "read()" routine for all filesystems
987  * that can use the page cache directly.
988  */
989 ssize_t
990 __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
991 		unsigned long nr_segs, loff_t *ppos)
992 {
993 	struct file *filp = iocb->ki_filp;
994 	ssize_t retval;
995 	unsigned long seg;
996 	size_t count;
997 
998 	count = 0;
999 	for (seg = 0; seg < nr_segs; seg++) {
1000 		const struct iovec *iv = &iov[seg];
1001 
1002 		/*
1003 		 * If any segment has a negative length, or the cumulative
1004 		 * length ever wraps negative then return -EINVAL.
1005 		 */
1006 		count += iv->iov_len;
1007 		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1008 			return -EINVAL;
1009 		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1010 			continue;
1011 		if (seg == 0)
1012 			return -EFAULT;
1013 		nr_segs = seg;
1014 		count -= iv->iov_len;	/* This segment is no good */
1015 		break;
1016 	}
1017 
1018 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1019 	if (filp->f_flags & O_DIRECT) {
1020 		loff_t pos = *ppos, size;
1021 		struct address_space *mapping;
1022 		struct inode *inode;
1023 
1024 		mapping = filp->f_mapping;
1025 		inode = mapping->host;
1026 		retval = 0;
1027 		if (!count)
1028 			goto out; /* skip atime */
1029 		size = i_size_read(inode);
1030 		if (pos < size) {
1031 			retval = generic_file_direct_IO(READ, iocb,
1032 						iov, pos, nr_segs);
1033 			if (retval > 0 && !is_sync_kiocb(iocb))
1034 				retval = -EIOCBQUEUED;
1035 			if (retval > 0)
1036 				*ppos = pos + retval;
1037 		}
1038 		file_accessed(filp);
1039 		goto out;
1040 	}
1041 
1042 	retval = 0;
1043 	if (count) {
1044 		for (seg = 0; seg < nr_segs; seg++) {
1045 			read_descriptor_t desc;
1046 
1047 			desc.written = 0;
1048 			desc.arg.buf = iov[seg].iov_base;
1049 			desc.count = iov[seg].iov_len;
1050 			if (desc.count == 0)
1051 				continue;
1052 			desc.error = 0;
1053 			do_generic_file_read(filp,ppos,&desc,file_read_actor);
1054 			retval += desc.written;
1055 			if (desc.error) {
1056 				retval = retval ?: desc.error;
1057 				break;
1058 			}
1059 		}
1060 	}
1061 out:
1062 	return retval;
1063 }
1064 
1065 EXPORT_SYMBOL(__generic_file_aio_read);
1066 
1067 ssize_t
1068 generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1069 {
1070 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1071 
1072 	BUG_ON(iocb->ki_pos != pos);
1073 	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1074 }
1075 
1076 EXPORT_SYMBOL(generic_file_aio_read);
1077 
1078 ssize_t
1079 generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1080 {
1081 	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1082 	struct kiocb kiocb;
1083 	ssize_t ret;
1084 
1085 	init_sync_kiocb(&kiocb, filp);
1086 	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1087 	if (-EIOCBQUEUED == ret)
1088 		ret = wait_on_sync_kiocb(&kiocb);
1089 	return ret;
1090 }
1091 
1092 EXPORT_SYMBOL(generic_file_read);
1093 
1094 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1095 {
1096 	ssize_t written;
1097 	unsigned long count = desc->count;
1098 	struct file *file = desc->arg.data;
1099 
1100 	if (size > count)
1101 		size = count;
1102 
1103 	written = file->f_op->sendpage(file, page, offset,
1104 				       size, &file->f_pos, size<count);
1105 	if (written < 0) {
1106 		desc->error = written;
1107 		written = 0;
1108 	}
1109 	desc->count = count - written;
1110 	desc->written += written;
1111 	return written;
1112 }
1113 
1114 ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1115 			 size_t count, read_actor_t actor, void *target)
1116 {
1117 	read_descriptor_t desc;
1118 
1119 	if (!count)
1120 		return 0;
1121 
1122 	desc.written = 0;
1123 	desc.count = count;
1124 	desc.arg.data = target;
1125 	desc.error = 0;
1126 
1127 	do_generic_file_read(in_file, ppos, &desc, actor);
1128 	if (desc.written)
1129 		return desc.written;
1130 	return desc.error;
1131 }
1132 
1133 EXPORT_SYMBOL(generic_file_sendfile);
1134 
1135 static ssize_t
1136 do_readahead(struct address_space *mapping, struct file *filp,
1137 	     unsigned long index, unsigned long nr)
1138 {
1139 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1140 		return -EINVAL;
1141 
1142 	force_page_cache_readahead(mapping, filp, index,
1143 					max_sane_readahead(nr));
1144 	return 0;
1145 }
1146 
1147 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1148 {
1149 	ssize_t ret;
1150 	struct file *file;
1151 
1152 	ret = -EBADF;
1153 	file = fget(fd);
1154 	if (file) {
1155 		if (file->f_mode & FMODE_READ) {
1156 			struct address_space *mapping = file->f_mapping;
1157 			unsigned long start = offset >> PAGE_CACHE_SHIFT;
1158 			unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1159 			unsigned long len = end - start + 1;
1160 			ret = do_readahead(mapping, file, start, len);
1161 		}
1162 		fput(file);
1163 	}
1164 	return ret;
1165 }
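
/*
 * From user space this is the readahead(2) system call.  A hypothetical
 * caller that wants a whole file pre-faulted into the page cache might do
 * (user-space sketch only):
 *
 *	struct stat st;
 *
 *	if (fstat(fd, &st) == 0)
 *		readahead(fd, 0, st.st_size);
 */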
1166 
1167 #ifdef CONFIG_MMU
1168 /*
1169  * This adds the requested page to the page cache if it isn't already there,
1170  * and schedules an I/O to read in its contents from disk.
1171  */
1172 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1173 static int fastcall page_cache_read(struct file * file, unsigned long offset)
1174 {
1175 	struct address_space *mapping = file->f_mapping;
1176 	struct page *page;
1177 	int ret;
1178 
1179 	do {
1180 		page = page_cache_alloc_cold(mapping);
1181 		if (!page)
1182 			return -ENOMEM;
1183 
1184 		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1185 		if (ret == 0)
1186 			ret = mapping->a_ops->readpage(file, page);
1187 		else if (ret == -EEXIST)
1188 			ret = 0; /* losing race to add is OK */
1189 
1190 		page_cache_release(page);
1191 
1192 	} while (ret == AOP_TRUNCATED_PAGE);
1193 
1194 	return ret;
1195 }
1196 
1197 #define MMAP_LOTSAMISS  (100)
1198 
1199 /*
1200  * filemap_nopage() is invoked via the vma operations vector for a
1201  * mapped memory region to read in file data during a page fault.
1202  *
1203  * The goto's are kind of ugly, but this streamlines the normal case of having
1204  * it in the page cache, and handles the special cases reasonably without
1205  * having a lot of duplicated code.
1206  */
1207 struct page *filemap_nopage(struct vm_area_struct *area,
1208 				unsigned long address, int *type)
1209 {
1210 	int error;
1211 	struct file *file = area->vm_file;
1212 	struct address_space *mapping = file->f_mapping;
1213 	struct file_ra_state *ra = &file->f_ra;
1214 	struct inode *inode = mapping->host;
1215 	struct page *page;
1216 	unsigned long size, pgoff;
1217 	int did_readaround = 0, majmin = VM_FAULT_MINOR;
1218 
1219 	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1220 
1221 retry_all:
1222 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1223 	if (pgoff >= size)
1224 		goto outside_data_content;
1225 
1226 	/* If we don't want any read-ahead, don't bother */
1227 	if (VM_RandomReadHint(area))
1228 		goto no_cached_page;
1229 
1230 	/*
1231 	 * The readahead code wants to be told about each and every page
1232 	 * so it can build and shrink its windows appropriately
1233 	 *
1234 	 * For sequential accesses, we use the generic readahead logic.
1235 	 */
1236 	if (VM_SequentialReadHint(area))
1237 		page_cache_readahead(mapping, ra, file, pgoff, 1);
1238 
1239 	/*
1240 	 * Do we have something in the page cache already?
1241 	 */
1242 retry_find:
1243 	page = find_get_page(mapping, pgoff);
1244 	if (!page) {
1245 		unsigned long ra_pages;
1246 
1247 		if (VM_SequentialReadHint(area)) {
1248 			handle_ra_miss(mapping, ra, pgoff);
1249 			goto no_cached_page;
1250 		}
1251 		ra->mmap_miss++;
1252 
1253 		/*
1254 		 * Do we miss much more than hit in this file? If so,
1255 		 * stop bothering with read-ahead. It will only hurt.
1256 		 */
1257 		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1258 			goto no_cached_page;
1259 
1260 		/*
1261 		 * To keep the pgmajfault counter straight, we need to
1262 		 * check did_readaround, as this is an inner loop.
1263 		 */
1264 		if (!did_readaround) {
1265 			majmin = VM_FAULT_MAJOR;
1266 			inc_page_state(pgmajfault);
1267 		}
1268 		did_readaround = 1;
1269 		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1270 		if (ra_pages) {
1271 			pgoff_t start = 0;
1272 
1273 			if (pgoff > ra_pages / 2)
1274 				start = pgoff - ra_pages / 2;
1275 			do_page_cache_readahead(mapping, file, start, ra_pages);
1276 		}
1277 		page = find_get_page(mapping, pgoff);
1278 		if (!page)
1279 			goto no_cached_page;
1280 	}
1281 
1282 	if (!did_readaround)
1283 		ra->mmap_hit++;
1284 
1285 	/*
1286 	 * Ok, found a page in the page cache, now we need to check
1287 	 * that it's up-to-date.
1288 	 */
1289 	if (!PageUptodate(page))
1290 		goto page_not_uptodate;
1291 
1292 success:
1293 	/*
1294 	 * Found the page and have a reference on it.
1295 	 */
1296 	mark_page_accessed(page);
1297 	if (type)
1298 		*type = majmin;
1299 	return page;
1300 
1301 outside_data_content:
1302 	/*
1303 	 * An external ptracer can access pages that normally aren't
1304 	 * accessible..
1305 	 * accessible.
1306 	if (area->vm_mm == current->mm)
1307 		return NULL;
1308 	/* Fall through to the non-read-ahead case */
1309 no_cached_page:
1310 	/*
1311 	 * We're only likely to ever get here if MADV_RANDOM is in
1312 	 * effect.
1313 	 */
1314 	error = page_cache_read(file, pgoff);
1315 	grab_swap_token();
1316 
1317 	/*
1318 	 * The page we want has now been added to the page cache.
1319 	 * In the unlikely event that someone removed it in the
1320 	 * meantime, we'll just come back here and read it again.
1321 	 */
1322 	if (error >= 0)
1323 		goto retry_find;
1324 
1325 	/*
1326 	 * An error return from page_cache_read can result if the
1327 	 * system is low on memory, or a problem occurs while trying
1328 	 * to schedule I/O.
1329 	 */
1330 	if (error == -ENOMEM)
1331 		return NOPAGE_OOM;
1332 	return NULL;
1333 
1334 page_not_uptodate:
1335 	if (!did_readaround) {
1336 		majmin = VM_FAULT_MAJOR;
1337 		inc_page_state(pgmajfault);
1338 	}
1339 	lock_page(page);
1340 
1341 	/* Did it get unhashed while we waited for it? */
1342 	if (!page->mapping) {
1343 		unlock_page(page);
1344 		page_cache_release(page);
1345 		goto retry_all;
1346 	}
1347 
1348 	/* Did somebody else get it up-to-date? */
1349 	if (PageUptodate(page)) {
1350 		unlock_page(page);
1351 		goto success;
1352 	}
1353 
1354 	error = mapping->a_ops->readpage(file, page);
1355 	if (!error) {
1356 		wait_on_page_locked(page);
1357 		if (PageUptodate(page))
1358 			goto success;
1359 	} else if (error == AOP_TRUNCATED_PAGE) {
1360 		page_cache_release(page);
1361 		goto retry_find;
1362 	}
1363 
1364 	/*
1365 	 * Umm, take care of errors if the page isn't up-to-date.
1366 	 * Try to re-read it _once_. We do this synchronously,
1367 	 * because there really aren't any performance issues here
1368 	 * and we need to check for errors.
1369 	 */
1370 	lock_page(page);
1371 
1372 	/* Somebody truncated the page on us? */
1373 	if (!page->mapping) {
1374 		unlock_page(page);
1375 		page_cache_release(page);
1376 		goto retry_all;
1377 	}
1378 
1379 	/* Somebody else successfully read it in? */
1380 	if (PageUptodate(page)) {
1381 		unlock_page(page);
1382 		goto success;
1383 	}
1384 	ClearPageError(page);
1385 	error = mapping->a_ops->readpage(file, page);
1386 	if (!error) {
1387 		wait_on_page_locked(page);
1388 		if (PageUptodate(page))
1389 			goto success;
1390 	} else if (error == AOP_TRUNCATED_PAGE) {
1391 		page_cache_release(page);
1392 		goto retry_find;
1393 	}
1394 
1395 	/*
1396 	 * Things didn't work out. Return zero to tell the
1397 	 * mm layer so, possibly freeing the page cache page first.
1398 	 */
1399 	page_cache_release(page);
1400 	return NULL;
1401 }
1402 
1403 EXPORT_SYMBOL(filemap_nopage);
1404 
1405 static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1406 					int nonblock)
1407 {
1408 	struct address_space *mapping = file->f_mapping;
1409 	struct page *page;
1410 	int error;
1411 
1412 	/*
1413 	 * Do we have something in the page cache already?
1414 	 */
1415 retry_find:
1416 	page = find_get_page(mapping, pgoff);
1417 	if (!page) {
1418 		if (nonblock)
1419 			return NULL;
1420 		goto no_cached_page;
1421 	}
1422 
1423 	/*
1424 	 * Ok, found a page in the page cache, now we need to check
1425 	 * that it's up-to-date.
1426 	 */
1427 	if (!PageUptodate(page)) {
1428 		if (nonblock) {
1429 			page_cache_release(page);
1430 			return NULL;
1431 		}
1432 		goto page_not_uptodate;
1433 	}
1434 
1435 success:
1436 	/*
1437 	 * Found the page and have a reference on it.
1438 	 */
1439 	mark_page_accessed(page);
1440 	return page;
1441 
1442 no_cached_page:
1443 	error = page_cache_read(file, pgoff);
1444 
1445 	/*
1446 	 * The page we want has now been added to the page cache.
1447 	 * In the unlikely event that someone removed it in the
1448 	 * meantime, we'll just come back here and read it again.
1449 	 */
1450 	if (error >= 0)
1451 		goto retry_find;
1452 
1453 	/*
1454 	 * An error return from page_cache_read can result if the
1455 	 * system is low on memory, or a problem occurs while trying
1456 	 * to schedule I/O.
1457 	 */
1458 	return NULL;
1459 
1460 page_not_uptodate:
1461 	lock_page(page);
1462 
1463 	/* Did it get unhashed while we waited for it? */
1464 	if (!page->mapping) {
1465 		unlock_page(page);
1466 		goto err;
1467 	}
1468 
1469 	/* Did somebody else get it up-to-date? */
1470 	if (PageUptodate(page)) {
1471 		unlock_page(page);
1472 		goto success;
1473 	}
1474 
1475 	error = mapping->a_ops->readpage(file, page);
1476 	if (!error) {
1477 		wait_on_page_locked(page);
1478 		if (PageUptodate(page))
1479 			goto success;
1480 	} else if (error == AOP_TRUNCATED_PAGE) {
1481 		page_cache_release(page);
1482 		goto retry_find;
1483 	}
1484 
1485 	/*
1486 	 * Umm, take care of errors if the page isn't up-to-date.
1487 	 * Try to re-read it _once_. We do this synchronously,
1488 	 * because there really aren't any performance issues here
1489 	 * and we need to check for errors.
1490 	 */
1491 	lock_page(page);
1492 
1493 	/* Somebody truncated the page on us? */
1494 	if (!page->mapping) {
1495 		unlock_page(page);
1496 		goto err;
1497 	}
1498 	/* Somebody else successfully read it in? */
1499 	if (PageUptodate(page)) {
1500 		unlock_page(page);
1501 		goto success;
1502 	}
1503 
1504 	ClearPageError(page);
1505 	error = mapping->a_ops->readpage(file, page);
1506 	if (!error) {
1507 		wait_on_page_locked(page);
1508 		if (PageUptodate(page))
1509 			goto success;
1510 	} else if (error == AOP_TRUNCATED_PAGE) {
1511 		page_cache_release(page);
1512 		goto retry_find;
1513 	}
1514 
1515 	/*
1516 	 * Things didn't work out. Return zero to tell the
1517 	 * mm layer so, possibly freeing the page cache page first.
1518 	 */
1519 err:
1520 	page_cache_release(page);
1521 
1522 	return NULL;
1523 }
1524 
1525 int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1526 		unsigned long len, pgprot_t prot, unsigned long pgoff,
1527 		int nonblock)
1528 {
1529 	struct file *file = vma->vm_file;
1530 	struct address_space *mapping = file->f_mapping;
1531 	struct inode *inode = mapping->host;
1532 	unsigned long size;
1533 	struct mm_struct *mm = vma->vm_mm;
1534 	struct page *page;
1535 	int err;
1536 
1537 	if (!nonblock)
1538 		force_page_cache_readahead(mapping, vma->vm_file,
1539 					pgoff, len >> PAGE_CACHE_SHIFT);
1540 
1541 repeat:
1542 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1543 	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1544 		return -EINVAL;
1545 
1546 	page = filemap_getpage(file, pgoff, nonblock);
1547 
1548 	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
1549 	 * done in shmem_populate calling shmem_getpage */
1550 	if (!page && !nonblock)
1551 		return -ENOMEM;
1552 
1553 	if (page) {
1554 		err = install_page(mm, vma, addr, page, prot);
1555 		if (err) {
1556 			page_cache_release(page);
1557 			return err;
1558 		}
1559 	} else if (vma->vm_flags & VM_NONLINEAR) {
1560 		/* No page was found just because we can't read it in now (being
1561 		 * here implies nonblock != 0), but the page may exist, so set
1562 		 * the PTE to fault it in later. */
1563 		err = install_file_pte(mm, vma, addr, pgoff, prot);
1564 		if (err)
1565 			return err;
1566 	}
1567 
1568 	len -= PAGE_SIZE;
1569 	addr += PAGE_SIZE;
1570 	pgoff++;
1571 	if (len)
1572 		goto repeat;
1573 
1574 	return 0;
1575 }
1576 EXPORT_SYMBOL(filemap_populate);
1577 
1578 struct vm_operations_struct generic_file_vm_ops = {
1579 	.nopage		= filemap_nopage,
1580 	.populate	= filemap_populate,
1581 };
1582 
1583 /* This is used for a general mmap of a disk file */
1584 
1585 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1586 {
1587 	struct address_space *mapping = file->f_mapping;
1588 
1589 	if (!mapping->a_ops->readpage)
1590 		return -ENOEXEC;
1591 	file_accessed(file);
1592 	vma->vm_ops = &generic_file_vm_ops;
1593 	return 0;
1594 }
1595 
1596 /*
1597  * This is for filesystems which do not implement ->writepage.
1598  */
1599 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1600 {
1601 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1602 		return -EINVAL;
1603 	return generic_file_mmap(file, vma);
1604 }
1605 #else
1606 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1607 {
1608 	return -ENOSYS;
1609 }
1610 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1611 {
1612 	return -ENOSYS;
1613 }
1614 #endif /* CONFIG_MMU */
1615 
1616 EXPORT_SYMBOL(generic_file_mmap);
1617 EXPORT_SYMBOL(generic_file_readonly_mmap);
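
/*
 * Example (a sketch with a made-up name): a simple disk-based filesystem
 * can wire the generic routines in this file straight into its
 * file_operations:
 *
 *	static struct file_operations example_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read		= generic_file_read,
 *		.write		= generic_file_write,
 *		.aio_read	= generic_file_aio_read,
 *		.aio_write	= generic_file_aio_write,
 *		.mmap		= generic_file_mmap,
 *		.sendfile	= generic_file_sendfile,
 *	};
 */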
1618 
1619 static inline struct page *__read_cache_page(struct address_space *mapping,
1620 				unsigned long index,
1621 				int (*filler)(void *,struct page*),
1622 				void *data)
1623 {
1624 	struct page *page, *cached_page = NULL;
1625 	int err;
1626 repeat:
1627 	page = find_get_page(mapping, index);
1628 	if (!page) {
1629 		if (!cached_page) {
1630 			cached_page = page_cache_alloc_cold(mapping);
1631 			if (!cached_page)
1632 				return ERR_PTR(-ENOMEM);
1633 		}
1634 		err = add_to_page_cache_lru(cached_page, mapping,
1635 					index, GFP_KERNEL);
1636 		if (err == -EEXIST)
1637 			goto repeat;
1638 		if (err < 0) {
1639 			/* Presumably ENOMEM for radix tree node */
1640 			page_cache_release(cached_page);
1641 			return ERR_PTR(err);
1642 		}
1643 		page = cached_page;
1644 		cached_page = NULL;
1645 		err = filler(data, page);
1646 		if (err < 0) {
1647 			page_cache_release(page);
1648 			page = ERR_PTR(err);
1649 		}
1650 	}
1651 	if (cached_page)
1652 		page_cache_release(cached_page);
1653 	return page;
1654 }
1655 
1656 /*
1657  * Read into the page cache. If a page already exists,
1658  * and PageUptodate() is not set, try to fill the page.
1659  */
1660 struct page *read_cache_page(struct address_space *mapping,
1661 				unsigned long index,
1662 				int (*filler)(void *,struct page*),
1663 				void *data)
1664 {
1665 	struct page *page;
1666 	int err;
1667 
1668 retry:
1669 	page = __read_cache_page(mapping, index, filler, data);
1670 	if (IS_ERR(page))
1671 		goto out;
1672 	mark_page_accessed(page);
1673 	if (PageUptodate(page))
1674 		goto out;
1675 
1676 	lock_page(page);
1677 	if (!page->mapping) {
1678 		unlock_page(page);
1679 		page_cache_release(page);
1680 		goto retry;
1681 	}
1682 	if (PageUptodate(page)) {
1683 		unlock_page(page);
1684 		goto out;
1685 	}
1686 	err = filler(data, page);
1687 	if (err < 0) {
1688 		page_cache_release(page);
1689 		page = ERR_PTR(err);
1690 	}
1691  out:
1692 	return page;
1693 }
1694 
1695 EXPORT_SYMBOL(read_cache_page);
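
/*
 * Example usage (a sketch with a made-up name): filesystems commonly pass
 * their own ->readpage as the filler, e.g. a directory-in-pagecache helper
 * along these lines:
 *
 *	static struct page *example_get_page(struct address_space *mapping,
 *					     unsigned long n)
 *	{
 *		struct page *page = read_cache_page(mapping, n,
 *				(filler_t *)mapping->a_ops->readpage, NULL);
 *		if (!IS_ERR(page)) {
 *			wait_on_page_locked(page);
 *			if (!PageUptodate(page)) {
 *				page_cache_release(page);
 *				return ERR_PTR(-EIO);
 *			}
 *		}
 *		return page;
 *	}
 */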
1696 
1697 /*
1698  * If the page was newly created, increment its refcount and add it to the
1699  * caller's lru-buffering pagevec.  This function is specifically for
1700  * generic_file_write().
1701  */
1702 static inline struct page *
1703 __grab_cache_page(struct address_space *mapping, unsigned long index,
1704 			struct page **cached_page, struct pagevec *lru_pvec)
1705 {
1706 	int err;
1707 	struct page *page;
1708 repeat:
1709 	page = find_lock_page(mapping, index);
1710 	if (!page) {
1711 		if (!*cached_page) {
1712 			*cached_page = page_cache_alloc(mapping);
1713 			if (!*cached_page)
1714 				return NULL;
1715 		}
1716 		err = add_to_page_cache(*cached_page, mapping,
1717 					index, GFP_KERNEL);
1718 		if (err == -EEXIST)
1719 			goto repeat;
1720 		if (err == 0) {
1721 			page = *cached_page;
1722 			page_cache_get(page);
1723 			if (!pagevec_add(lru_pvec, page))
1724 				__pagevec_lru_add(lru_pvec);
1725 			*cached_page = NULL;
1726 		}
1727 	}
1728 	return page;
1729 }
1730 
1731 /*
1732  * The logic we want is
1733  *
1734  *	if suid or (sgid and xgrp)
1735  *		remove privs
1736  */
1737 int remove_suid(struct dentry *dentry)
1738 {
1739 	mode_t mode = dentry->d_inode->i_mode;
1740 	int kill = 0;
1741 	int result = 0;
1742 
1743 	/* suid always must be killed */
1744 	if (unlikely(mode & S_ISUID))
1745 		kill = ATTR_KILL_SUID;
1746 
1747 	/*
1748 	 * sgid without any exec bits is just a mandatory locking mark; leave
1749 	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
1750 	 */
1751 	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1752 		kill |= ATTR_KILL_SGID;
1753 
1754 	if (unlikely(kill && !capable(CAP_FSETID))) {
1755 		struct iattr newattrs;
1756 
1757 		newattrs.ia_valid = ATTR_FORCE | kill;
1758 		result = notify_change(dentry, &newattrs);
1759 	}
1760 	return result;
1761 }
1762 EXPORT_SYMBOL(remove_suid);
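
/*
 * Example usage (a sketch): write paths call this once the write is known
 * to be going ahead, before any data is copied:
 *
 *	err = remove_suid(file->f_dentry);
 *	if (err)
 *		goto out;
 *	// ... copy the data into the page cache ...
 */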
1763 
1764 size_t
1765 __filemap_copy_from_user_iovec(char *vaddr,
1766 			const struct iovec *iov, size_t base, size_t bytes)
1767 {
1768 	size_t copied = 0, left = 0;
1769 
1770 	while (bytes) {
1771 		char __user *buf = iov->iov_base + base;
1772 		int copy = min(bytes, iov->iov_len - base);
1773 
1774 		base = 0;
1775 		left = __copy_from_user_inatomic(vaddr, buf, copy);
1776 		copied += copy;
1777 		bytes -= copy;
1778 		vaddr += copy;
1779 		iov++;
1780 
1781 		if (unlikely(left)) {
1782 			/* zero the rest of the target like __copy_from_user */
1783 			if (bytes)
1784 				memset(vaddr, 0, bytes);
1785 			break;
1786 		}
1787 	}
1788 	return copied - left;
1789 }
1790 
1791 /*
1792  * Performs necessary checks before doing a write
1793  *
1794  * Can adjust writing position or number of bytes to write.
1795  * Returns an appropriate error code that the caller should return, or
1796  * zero in case the write should be allowed.
1797  */
1798 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1799 {
1800 	struct inode *inode = file->f_mapping->host;
1801 	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1802 
1803 	if (unlikely(*pos < 0))
1804 		return -EINVAL;
1805 
1806 	if (!isblk) {
1807 		/* FIXME: this is for backwards compatibility with 2.4 */
1808 		if (file->f_flags & O_APPEND)
1809 			*pos = i_size_read(inode);
1810 
1811 		if (limit != RLIM_INFINITY) {
1812 			if (*pos >= limit) {
1813 				send_sig(SIGXFSZ, current, 0);
1814 				return -EFBIG;
1815 			}
1816 			if (*count > limit - (typeof(limit))*pos) {
1817 				*count = limit - (typeof(limit))*pos;
1818 			}
1819 		}
1820 	}
1821 
1822 	/*
1823 	 * LFS rule
1824 	 */
1825 	if (unlikely(*pos + *count > MAX_NON_LFS &&
1826 				!(file->f_flags & O_LARGEFILE))) {
1827 		if (*pos >= MAX_NON_LFS) {
1828 			send_sig(SIGXFSZ, current, 0);
1829 			return -EFBIG;
1830 		}
1831 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1832 			*count = MAX_NON_LFS - (unsigned long)*pos;
1833 		}
1834 	}
1835 
1836 	/*
1837 	 * Are we about to exceed the fs block limit?
1838 	 *
1839 	 * If we have written data it becomes a short write.  If we have
1840 	 * exceeded without writing data we send a signal and return EFBIG.
1841 	 * Linus' frestrict idea will clean these up nicely.
1842 	 */
1843 	if (likely(!isblk)) {
1844 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1845 			if (*count || *pos > inode->i_sb->s_maxbytes) {
1846 				send_sig(SIGXFSZ, current, 0);
1847 				return -EFBIG;
1848 			}
1849 			/* zero-length writes at ->s_maxbytes are OK */
1850 		}
1851 
1852 		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1853 			*count = inode->i_sb->s_maxbytes - *pos;
1854 	} else {
1855 		loff_t isize;
1856 		if (bdev_read_only(I_BDEV(inode)))
1857 			return -EPERM;
1858 		isize = i_size_read(inode);
1859 		if (*pos >= isize) {
1860 			if (*count || *pos > isize)
1861 				return -ENOSPC;
1862 		}
1863 
1864 		if (*pos + *count > isize)
1865 			*count = isize - *pos;
1866 	}
1867 	return 0;
1868 }
1869 EXPORT_SYMBOL(generic_write_checks);
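
/*
 * Example usage (a sketch, variable names assumed): the generic write path
 * applies these checks before choosing the direct-IO or buffered route:
 *
 *	size_t count = ocount;		// total bytes in the iovec
 *	loff_t pos = *ppos;
 *
 *	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 *	if (err)
 *		goto out;
 *	if (count == 0)
 *		goto out;		// nothing left to write
 */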
1870 
1871 ssize_t
1872 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1873 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1874 		size_t count, size_t ocount)
1875 {
1876 	struct file	*file = iocb->ki_filp;
1877 	struct address_space *mapping = file->f_mapping;
1878 	struct inode	*inode = mapping->host;
1879 	ssize_t		written;
1880 
1881 	if (count != ocount)
1882 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1883 
1884 	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1885 	if (written > 0) {
1886 		loff_t end = pos + written;
1887 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1888 			i_size_write(inode,  end);
1889 			mark_inode_dirty(inode);
1890 		}
1891 		*ppos = end;
1892 	}
1893 
1894 	/*
1895 	 * Sync the fs metadata but not the minor inode changes and
1896 	 * of course not the data as we did direct DMA for the IO.
1897 	 * i_mutex is held, which protects generic_osync_inode() from
1898 	 * livelocking.
1899 	 */
1900 	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1901 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1902 		if (err < 0)
1903 			written = err;
1904 	}
1905 	if (written == count && !is_sync_kiocb(iocb))
1906 		written = -EIOCBQUEUED;
1907 	return written;
1908 }
1909 EXPORT_SYMBOL(generic_file_direct_write);
1910 
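/*
 * generic_file_buffered_write() - write data through the pagecache.
 *
 * Copies the user data a page at a time: fault in the source page, grab
 * (or create) the pagecache page, prepare_write(), copy from userspace,
 * commit_write(), then throttle via balance_dirty_pages_ratelimited().
 * For O_SYNC/IS_SYNC files the range may be pushed out with
 * generic_osync_inode(), and an O_DIRECT write that fell back to this
 * buffered path gets its data synced with filemap_write_and_wait().
 * Returns the number of bytes written, or a negative errno if nothing
 * could be written.
 */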
1911 ssize_t
1912 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1913 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
1914 		size_t count, ssize_t written)
1915 {
1916 	struct file *file = iocb->ki_filp;
1917 	struct address_space * mapping = file->f_mapping;
1918 	struct address_space_operations *a_ops = mapping->a_ops;
1919 	struct inode 	*inode = mapping->host;
1920 	long		status = 0;
1921 	struct page	*page;
1922 	struct page	*cached_page = NULL;
1923 	size_t		bytes;
1924 	struct pagevec	lru_pvec;
1925 	const struct iovec *cur_iov = iov; /* current iovec */
1926 	size_t		iov_base = 0;	   /* offset in the current iovec */
1927 	char __user	*buf;
1928 
1929 	pagevec_init(&lru_pvec, 0);
1930 
1931 	/*
1932 	 * Handle a partial DIO write: adjust cur_iov if needed.
1933 	 */
1934 	if (likely(nr_segs == 1))
1935 		buf = iov->iov_base + written;
1936 	else {
1937 		filemap_set_next_iovec(&cur_iov, &iov_base, written);
1938 		buf = cur_iov->iov_base + iov_base;
1939 	}
1940 
1941 	do {
1942 		unsigned long index;
1943 		unsigned long offset;
1944 		unsigned long maxlen;
1945 		size_t copied;
1946 
1947 		offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
1948 		index = pos >> PAGE_CACHE_SHIFT;
1949 		bytes = PAGE_CACHE_SIZE - offset;
1950 		if (bytes > count)
1951 			bytes = count;
1952 
1953 		/*
1954 		 * Bring in the user page that we will copy from _first_.
1955 		 * Otherwise there's a nasty deadlock on copying from the
1956 		 * same page as we're writing to, without it being marked
1957 		 * up-to-date.
1958 		 */
1959 		maxlen = cur_iov->iov_len - iov_base;
1960 		if (maxlen > bytes)
1961 			maxlen = bytes;
1962 		fault_in_pages_readable(buf, maxlen);
1963 
1964 		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
1965 		if (!page) {
1966 			status = -ENOMEM;
1967 			break;
1968 		}
1969 
1970 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
1971 		if (unlikely(status)) {
1972 			loff_t isize = i_size_read(inode);
1973 
1974 			if (status != AOP_TRUNCATED_PAGE)
1975 				unlock_page(page);
1976 			page_cache_release(page);
1977 			if (status == AOP_TRUNCATED_PAGE)
1978 				continue;
1979 			/*
1980 			 * prepare_write() may have instantiated a few blocks
1981 			 * outside i_size.  Trim these off again.
1982 			 */
1983 			if (pos + bytes > isize)
1984 				vmtruncate(inode, isize);
1985 			break;
1986 		}
1987 		if (likely(nr_segs == 1))
1988 			copied = filemap_copy_from_user(page, offset,
1989 							buf, bytes);
1990 		else
1991 			copied = filemap_copy_from_user_iovec(page, offset,
1992 						cur_iov, iov_base, bytes);
1993 		flush_dcache_page(page);
1994 		status = a_ops->commit_write(file, page, offset, offset+bytes);
1995 		if (status == AOP_TRUNCATED_PAGE) {
1996 			page_cache_release(page);
1997 			continue;
1998 		}
1999 		if (likely(copied > 0)) {
2000 			if (!status)
2001 				status = copied;
2002 
2003 			if (status >= 0) {
2004 				written += status;
2005 				count -= status;
2006 				pos += status;
2007 				buf += status;
2008 				if (unlikely(nr_segs > 1)) {
2009 					filemap_set_next_iovec(&cur_iov,
2010 							&iov_base, status);
2011 					if (count)
2012 						buf = cur_iov->iov_base +
2013 							iov_base;
2014 				} else {
2015 					iov_base += status;
2016 				}
2017 			}
2018 		}
2019 		if (unlikely(copied != bytes))
2020 			if (status >= 0)
2021 				status = -EFAULT;
2022 		unlock_page(page);
2023 		mark_page_accessed(page);
2024 		page_cache_release(page);
2025 		if (status < 0)
2026 			break;
2027 		balance_dirty_pages_ratelimited(mapping);
2028 		cond_resched();
2029 	} while (count);
2030 	*ppos = pos;
2031 
2032 	if (cached_page)
2033 		page_cache_release(cached_page);
2034 
2035 	/*
2036 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2037 	 */
2038 	if (likely(status >= 0)) {
2039 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2040 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
2041 				status = generic_osync_inode(inode, mapping,
2042 						OSYNC_METADATA|OSYNC_DATA);
2043 		}
2044 	}
2045 
2046 	/*
2047 	 * If we get here for O_DIRECT writes then we must have fallen through
2048 	 * to buffered writes (block instantiation inside i_size).  So we sync
2049 	 * the file data here, to try to honour O_DIRECT expectations.
2050 	 */
2051 	if (unlikely(file->f_flags & O_DIRECT) && written)
2052 		status = filemap_write_and_wait(mapping);
2053 
2054 	pagevec_lru_add(&lru_pvec);
2055 	return written ? written : status;
2056 }
2057 EXPORT_SYMBOL(generic_file_buffered_write);
2058 
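/*
 * __generic_file_aio_write_nolock() - common core of the generic write path.
 *
 * Validates the iovec (no negative segment lengths, no wrap of the total,
 * segments must be readable), applies generic_write_checks(), strips the
 * suid/sgid bits, updates the file times, and then either goes
 * direct-to-BIO for O_DIRECT (falling back to buffered writes for whatever
 * the direct I/O could not complete, e.g. a write into a hole) or goes
 * through generic_file_buffered_write().
 */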
2059 static ssize_t
2060 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2061 				unsigned long nr_segs, loff_t *ppos)
2062 {
2063 	struct file *file = iocb->ki_filp;
2064 	struct address_space * mapping = file->f_mapping;
2065 	size_t ocount;		/* original count */
2066 	size_t count;		/* after file limit checks */
2067 	struct inode 	*inode = mapping->host;
2068 	unsigned long	seg;
2069 	loff_t		pos;
2070 	ssize_t		written;
2071 	ssize_t		err;
2072 
2073 	ocount = 0;
2074 	for (seg = 0; seg < nr_segs; seg++) {
2075 		const struct iovec *iv = &iov[seg];
2076 
2077 		/*
2078 		 * If any segment has a negative length, or the cumulative
2079 		 * length ever wraps negative then return -EINVAL.
2080 		 */
2081 		ocount += iv->iov_len;
2082 		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2083 			return -EINVAL;
2084 		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2085 			continue;
2086 		if (seg == 0)
2087 			return -EFAULT;
2088 		nr_segs = seg;
2089 		ocount -= iv->iov_len;	/* This segment is no good */
2090 		break;
2091 	}
2092 
2093 	count = ocount;
2094 	pos = *ppos;
2095 
2096 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2097 
2098 	/* We can write back this queue in page reclaim */
2099 	current->backing_dev_info = mapping->backing_dev_info;
2100 	written = 0;
2101 
2102 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2103 	if (err)
2104 		goto out;
2105 
2106 	if (count == 0)
2107 		goto out;
2108 
2109 	err = remove_suid(file->f_dentry);
2110 	if (err)
2111 		goto out;
2112 
2113 	file_update_time(file);
2114 
2115 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2116 	if (unlikely(file->f_flags & O_DIRECT)) {
2117 		written = generic_file_direct_write(iocb, iov,
2118 				&nr_segs, pos, ppos, count, ocount);
2119 		if (written < 0 || written == count)
2120 			goto out;
2121 		/*
2122 		 * direct-io write to a hole: fall through to buffered I/O
2123 		 * for completing the rest of the request.
2124 		 */
2125 		pos += written;
2126 		count -= written;
2127 	}
2128 
2129 	written = generic_file_buffered_write(iocb, iov, nr_segs,
2130 			pos, ppos, count, written);
2131 out:
2132 	current->backing_dev_info = NULL;
2133 	return written ? written : err;
2134 }
2136 
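/*
 * generic_file_aio_write_nolock() - exported wrapper around
 * __generic_file_aio_write_nolock() for callers that do their own locking:
 * no i_mutex is taken here.  For O_SYNC/IS_SYNC files the written range is
 * pushed out with sync_page_range_nolock().
 */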
2137 ssize_t
2138 generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2139 				unsigned long nr_segs, loff_t *ppos)
2140 {
2141 	struct file *file = iocb->ki_filp;
2142 	struct address_space *mapping = file->f_mapping;
2143 	struct inode *inode = mapping->host;
2144 	ssize_t ret;
2145 	loff_t pos = *ppos;
2146 
2147 	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2148 
2149 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2150 		int err;
2151 
2152 		err = sync_page_range_nolock(inode, mapping, pos, ret);
2153 		if (err < 0)
2154 			ret = err;
2155 	}
2156 	return ret;
2157 }
EXPORT_SYMBOL(generic_file_aio_write_nolock);
2158 
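/*
 * Synchronous helper: run __generic_file_aio_write_nolock() under a sync
 * kiocb and wait for completion if the I/O got queued (-EIOCBQUEUED).
 */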
2159 static ssize_t
2160 __generic_file_write_nolock(struct file *file, const struct iovec *iov,
2161 				unsigned long nr_segs, loff_t *ppos)
2162 {
2163 	struct kiocb kiocb;
2164 	ssize_t ret;
2165 
2166 	init_sync_kiocb(&kiocb, file);
2167 	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2168 	if (ret == -EIOCBQUEUED)
2169 		ret = wait_on_sync_kiocb(&kiocb);
2170 	return ret;
2171 }
2172 
2173 ssize_t
2174 generic_file_write_nolock(struct file *file, const struct iovec *iov,
2175 				unsigned long nr_segs, loff_t *ppos)
2176 {
2177 	struct kiocb kiocb;
2178 	ssize_t ret;
2179 
2180 	init_sync_kiocb(&kiocb, file);
2181 	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2182 	if (-EIOCBQUEUED == ret)
2183 		ret = wait_on_sync_kiocb(&kiocb);
2184 	return ret;
2185 }
2186 EXPORT_SYMBOL(generic_file_write_nolock);
2187 
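/*
 * generic_file_aio_write() - aio write entry point for a single user buffer.
 *
 * Requires iocb->ki_pos == pos.  Takes i_mutex around the actual write and,
 * for O_SYNC/IS_SYNC files, syncs the written range with sync_page_range().
 */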
2188 ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2189 			       size_t count, loff_t pos)
2190 {
2191 	struct file *file = iocb->ki_filp;
2192 	struct address_space *mapping = file->f_mapping;
2193 	struct inode *inode = mapping->host;
2194 	ssize_t ret;
2195 	struct iovec local_iov = { .iov_base = (void __user *)buf,
2196 					.iov_len = count };
2197 
2198 	BUG_ON(iocb->ki_pos != pos);
2199 
2200 	mutex_lock(&inode->i_mutex);
2201 	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2202 						&iocb->ki_pos);
2203 	mutex_unlock(&inode->i_mutex);
2204 
2205 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2206 		ssize_t err;
2207 
2208 		err = sync_page_range(inode, mapping, pos, ret);
2209 		if (err < 0)
2210 			ret = err;
2211 	}
2212 	return ret;
2213 }
2214 EXPORT_SYMBOL(generic_file_aio_write);
2215 
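/*
 * generic_file_write() - classic write(2) path for pagecache-based
 * filesystems.  Wraps the user buffer in a one-segment iovec, takes
 * i_mutex, and syncs the written range for O_SYNC/IS_SYNC files.
 */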
2216 ssize_t generic_file_write(struct file *file, const char __user *buf,
2217 			   size_t count, loff_t *ppos)
2218 {
2219 	struct address_space *mapping = file->f_mapping;
2220 	struct inode *inode = mapping->host;
2221 	ssize_t	ret;
2222 	struct iovec local_iov = { .iov_base = (void __user *)buf,
2223 					.iov_len = count };
2224 
2225 	mutex_lock(&inode->i_mutex);
2226 	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2227 	mutex_unlock(&inode->i_mutex);
2228 
2229 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2230 		ssize_t err;
2231 
2232 		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2233 		if (err < 0)
2234 			ret = err;
2235 	}
2236 	return ret;
2237 }
2238 EXPORT_SYMBOL(generic_file_write);
2239 
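/*
 * generic_file_readv() - vectored read: synchronous wrapper around
 * __generic_file_aio_read().
 */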
2240 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2241 			unsigned long nr_segs, loff_t *ppos)
2242 {
2243 	struct kiocb kiocb;
2244 	ssize_t ret;
2245 
2246 	init_sync_kiocb(&kiocb, filp);
2247 	ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2248 	if (-EIOCBQUEUED == ret)
2249 		ret = wait_on_sync_kiocb(&kiocb);
2250 	return ret;
2251 }
2252 EXPORT_SYMBOL(generic_file_readv);
2253 
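/*
 * generic_file_writev() - vectored write: takes i_mutex, writes via
 * __generic_file_write_nolock() and syncs the written range for
 * O_SYNC/IS_SYNC files.
 */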
2254 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2255 			unsigned long nr_segs, loff_t *ppos)
2256 {
2257 	struct address_space *mapping = file->f_mapping;
2258 	struct inode *inode = mapping->host;
2259 	ssize_t ret;
2260 
2261 	mutex_lock(&inode->i_mutex);
2262 	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2263 	mutex_unlock(&inode->i_mutex);
2264 
2265 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2266 		int err;
2267 
2268 		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2269 		if (err < 0)
2270 			ret = err;
2271 	}
2272 	return ret;
2273 }
2274 EXPORT_SYMBOL(generic_file_writev);
2275 
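/*
 * Illustrative sketch (an assumption for documentation purposes, not part
 * of this file): a simple disk filesystem typically plugs the generic
 * routines above straight into its file_operations.  The name
 * "examplefs_file_operations" is hypothetical.
 *
 *	static struct file_operations examplefs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read		= generic_file_read,
 *		.write		= generic_file_write,
 *		.aio_read	= generic_file_aio_read,
 *		.aio_write	= generic_file_aio_write,
 *		.readv		= generic_file_readv,
 *		.writev		= generic_file_writev,
 *		.mmap		= generic_file_mmap,
 *	};
 */
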
2276 /*
2277  * Called under i_mutex for writes to S_ISREG files.  Returns -EIO if something
2278  * went wrong during pagecache shootdown.
2279  */
2280 static ssize_t
2281 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2282 	loff_t offset, unsigned long nr_segs)
2283 {
2284 	struct file *file = iocb->ki_filp;
2285 	struct address_space *mapping = file->f_mapping;
2286 	ssize_t retval;
2287 	size_t write_len = 0;
2288 
2289 	/*
2290 	 * If it's a write, unmap all mmappings of the file up-front.  This
2291 	 * will cause any pte dirty bits to be propagated into the pageframes
2292 	 * for the subsequent filemap_write_and_wait().
2293 	 */
2294 	if (rw == WRITE) {
2295 		write_len = iov_length(iov, nr_segs);
2296 		if (mapping_mapped(mapping))
2297 			unmap_mapping_range(mapping, offset, write_len, 0);
2298 	}
2299 
2300 	retval = filemap_write_and_wait(mapping);
2301 	if (retval == 0) {
2302 		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2303 						offset, nr_segs);
2304 		if (rw == WRITE && mapping->nrpages) {
2305 			pgoff_t end = (offset + write_len - 1)
2306 						>> PAGE_CACHE_SHIFT;
2307 			int err = invalidate_inode_pages2_range(mapping,
2308 					offset >> PAGE_CACHE_SHIFT, end);
2309 			if (err)
2310 				retval = err;
2311 		}
2312 	}
2313 	return retval;
2314 }
2315