xref: /linux/mm/memory.c (revision 7f3edee81fbd49114c28057512906f169caa0bed)
1 /*
2  *  linux/mm/memory.c
3  *
4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5  */
6 
7 /*
8  * demand-loading started 01.12.91 - seems it is high on the list of
9  * things wanted, and it should be easy to implement. - Linus
10  */
11 
12 /*
13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14  * pages started 02.12.91, seems to work. - Linus.
15  *
16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17  * would have taken more than the 6M I have free, but it worked well as
18  * far as I could see.
19  *
20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21  */
22 
23 /*
24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
25  * thought has to go into this. Oh, well..
26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
27  *		Found it. Everything seems to work now.
28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
29  */
30 
31 /*
32  * 05.04.94  -  Multi-page memory management added for v1.1.
33  * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
34  *
35  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
36  *		(Gerhard.Wichert@pdb.siemens.de)
37  *
38  * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39  */
40 
41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h>
43 #include <linux/hugetlb.h>
44 #include <linux/mman.h>
45 #include <linux/swap.h>
46 #include <linux/highmem.h>
47 #include <linux/pagemap.h>
48 #include <linux/rmap.h>
49 #include <linux/module.h>
50 #include <linux/delayacct.h>
51 #include <linux/init.h>
52 #include <linux/writeback.h>
53 
54 #include <asm/pgalloc.h>
55 #include <asm/uaccess.h>
56 #include <asm/tlb.h>
57 #include <asm/tlbflush.h>
58 #include <asm/pgtable.h>
59 
60 #include <linux/swapops.h>
61 #include <linux/elf.h>
62 
63 #ifndef CONFIG_NEED_MULTIPLE_NODES
64 /* use the per-pgdat data instead for discontigmem - mbligh */
/*
 * Global page-frame bookkeeping, defined and exported to modules only
 * when CONFIG_NEED_MULTIPLE_NODES is not set.
 * NOTE(review): presumably mem_map is the flat struct page array and
 * max_mapnr bounds the valid indices into it -- confirm against users.
 */
65 unsigned long max_mapnr;
66 struct page *mem_map;
67 
68 EXPORT_SYMBOL(max_mapnr);
69 EXPORT_SYMBOL(mem_map);
70 #endif
71 
/* NOTE(review): presumably the number of physical pages in the system -- confirm. */
72 unsigned long num_physpages;
73 /*
74  * A number of key systems in x86 including ioremap() rely on the assumption
75  * that high_memory defines the upper bound on direct map memory, the end
76  * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
77  * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
78  * and ZONE_HIGHMEM.
79  */
80 void * high_memory;
81 
/* Both variables are made available to modules. */
82 EXPORT_SYMBOL(num_physpages);
83 EXPORT_SYMBOL(high_memory);
84 
/*
 * Address-space randomization switch.  Initialized to 1; the "norandmaps"
 * boot-parameter handler below resets it to 0.
 */
85 int randomize_va_space __read_mostly = 1;
86 
/*
 * Handler registered for the "norandmaps" kernel parameter.  The option
 * string @s is not looked at.  Always returns 1.
 */
87 static int __init disable_randmaps(char *s)
88 {
89 	randomize_va_space = 0;
90 	return 1;
91 }
92 __setup("norandmaps", disable_randmaps);
93 
94 
95 /*
96  * If a p?d_bad entry is found while walking page tables, report
97  * the error, before resetting entry to p?d_none.  Usually (but
98  * very seldom) called out from the p?d_none_or_clear_bad macros.
99  */
100 
/* Report the bad pgd entry through pgd_ERROR(), then clear it. */
101 void pgd_clear_bad(pgd_t *pgd)
102 {
103 	pgd_ERROR(*pgd);
104 	pgd_clear(pgd);
105 }
106 
/* Report the bad pud entry through pud_ERROR(), then clear it. */
107 void pud_clear_bad(pud_t *pud)
108 {
109 	pud_ERROR(*pud);
110 	pud_clear(pud);
111 }
112 
/* Report the bad pmd entry through pmd_ERROR(), then clear it. */
113 void pmd_clear_bad(pmd_t *pmd)
114 {
115 	pmd_ERROR(*pmd);
116 	pmd_clear(pmd);
117 }
118 
119 /*
120  * Note: this doesn't free the actual pages themselves. That
121  * has been handled earlier when unmapping all the memory regions.
122  */
/*
 * Detach the pte page referenced by @pmd and hand it to the mmu_gather
 * @tlb for freeing, updating the page-table accounting of tlb->mm.
 */
123 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
124 {
	/* Look the pte page up before the pmd entry pointing at it is cleared. */
125 	struct page *page = pmd_page(*pmd);
126 	pmd_clear(pmd);
127 	pte_lock_deinit(page);
128 	pte_free_tlb(tlb, page);
	/* Reverse the NR_PAGETABLE / nr_ptes accounting done by __pte_alloc(). */
129 	dec_zone_page_state(page, NR_PAGETABLE);
130 	tlb->mm->nr_ptes--;
131 }
132 
133 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
134 				unsigned long addr, unsigned long end,
135 				unsigned long floor, unsigned long ceiling)
136 {
137 	pmd_t *pmd;
138 	unsigned long next;
139 	unsigned long start;
140 
141 	start = addr;
142 	pmd = pmd_offset(pud, addr);
143 	do {
144 		next = pmd_addr_end(addr, end);
145 		if (pmd_none_or_clear_bad(pmd))
146 			continue;
147 		free_pte_range(tlb, pmd);
148 	} while (pmd++, addr = next, addr != end);
149 
150 	start &= PUD_MASK;
151 	if (start < floor)
152 		return;
153 	if (ceiling) {
154 		ceiling &= PUD_MASK;
155 		if (!ceiling)
156 			return;
157 	}
158 	if (end - 1 > ceiling - 1)
159 		return;
160 
161 	pmd = pmd_offset(pud, start);
162 	pud_clear(pud);
163 	pmd_free_tlb(tlb, pmd);
164 }
165 
166 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
167 				unsigned long addr, unsigned long end,
168 				unsigned long floor, unsigned long ceiling)
169 {
170 	pud_t *pud;
171 	unsigned long next;
172 	unsigned long start;
173 
174 	start = addr;
175 	pud = pud_offset(pgd, addr);
176 	do {
177 		next = pud_addr_end(addr, end);
178 		if (pud_none_or_clear_bad(pud))
179 			continue;
180 		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
181 	} while (pud++, addr = next, addr != end);
182 
183 	start &= PGDIR_MASK;
184 	if (start < floor)
185 		return;
186 	if (ceiling) {
187 		ceiling &= PGDIR_MASK;
188 		if (!ceiling)
189 			return;
190 	}
191 	if (end - 1 > ceiling - 1)
192 		return;
193 
194 	pud = pud_offset(pgd, start);
195 	pgd_clear(pgd);
196 	pud_free_tlb(tlb, pud);
197 }
198 
199 /*
200  * This function frees user-level page tables of a process.
201  *
202  * Must be called with pagetable lock held.
203  */
204 void free_pgd_range(struct mmu_gather **tlb,
205 			unsigned long addr, unsigned long end,
206 			unsigned long floor, unsigned long ceiling)
207 {
208 	pgd_t *pgd;
209 	unsigned long next;
210 	unsigned long start;
211 
212 	/*
213 	 * The next few lines have given us lots of grief...
214 	 *
215 	 * Why are we testing PMD* at this top level?  Because often
216 	 * there will be no work to do at all, and we'd prefer not to
217 	 * go all the way down to the bottom just to discover that.
218 	 *
219 	 * Why all these "- 1"s?  Because 0 represents both the bottom
220 	 * of the address space and the top of it (using -1 for the
221 	 * top wouldn't help much: the masks would do the wrong thing).
222 	 * The rule is that addr 0 and floor 0 refer to the bottom of
223 	 * the address space, but end 0 and ceiling 0 refer to the top
224 	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
225 	 * that end 0 case should be mythical).
226 	 *
227 	 * Wherever addr is brought up or ceiling brought down, we must
228 	 * be careful to reject "the opposite 0" before it confuses the
229 	 * subsequent tests.  But what about where end is brought down
230 	 * by PMD_SIZE below? no, end can't go down to 0 there.
231 	 *
232 	 * Whereas we round start (addr) and ceiling down, by different
233 	 * masks at different levels, in order to test whether a table
234 	 * now has no other vmas using it, so can be freed, we don't
235 	 * bother to round floor or end up - the tests don't need that.
236 	 */
237 
238 	addr &= PMD_MASK;
239 	if (addr < floor) {
240 		addr += PMD_SIZE;
241 		if (!addr)
242 			return;
243 	}
244 	if (ceiling) {
245 		ceiling &= PMD_MASK;
246 		if (!ceiling)
247 			return;
248 	}
249 	if (end - 1 > ceiling - 1)
250 		end -= PMD_SIZE;
251 	if (addr > end - 1)
252 		return;
253 
254 	start = addr;
255 	pgd = pgd_offset((*tlb)->mm, addr);
256 	do {
257 		next = pgd_addr_end(addr, end);
258 		if (pgd_none_or_clear_bad(pgd))
259 			continue;
260 		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
261 	} while (pgd++, addr = next, addr != end);
262 }
263 
/*
 * Free the page tables of every vma on the list starting at @vma.
 *
 * Each vma is first unlinked from its anon_vma and from its file mapping.
 * Hugetlb vmas are handled one at a time through hugetlb_free_pgd_range();
 * other vmas go through free_pgd_range().  @floor is passed through
 * unchanged; the ceiling given to each call is the start of the following
 * vma when there is one, and @ceiling otherwise.
 */
264 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
265 		unsigned long floor, unsigned long ceiling)
266 {
267 	while (vma) {
268 		struct vm_area_struct *next = vma->vm_next;
		/* Remember where this run starts; vma may advance below. */
269 		unsigned long addr = vma->vm_start;
270 
271 		/*
272 		 * Hide vma from rmap and vmtruncate before freeing pgtables
273 		 */
274 		anon_vma_unlink(vma);
275 		unlink_file_vma(vma);
276 
277 		if (is_vm_hugetlb_page(vma)) {
278 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
279 				floor, next? next->vm_start: ceiling);
280 		} else {
281 			/*
282 			 * Optimization: gather nearby vmas into one call down
283 			 */
			/*
			 * "Nearby" means the next vma starts no more than
			 * PMD_SIZE past the end of the current one and is
			 * not a hugetlb vma.
			 */
284 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
285 			       && !is_vm_hugetlb_page(next)) {
286 				vma = next;
287 				next = vma->vm_next;
288 				anon_vma_unlink(vma);
289 				unlink_file_vma(vma);
290 			}
291 			free_pgd_range(tlb, addr, vma->vm_end,
292 				floor, next? next->vm_start: ceiling);
293 		}
294 		vma = next;
295 	}
296 }
297 
298 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
299 {
300 	struct page *new = pte_alloc_one(mm, address);
301 	if (!new)
302 		return -ENOMEM;
303 
304 	pte_lock_init(new);
305 	spin_lock(&mm->page_table_lock);
306 	if (pmd_present(*pmd)) {	/* Another has populated it */
307 		pte_lock_deinit(new);
308 		pte_free(new);
309 	} else {
310 		mm->nr_ptes++;
311 		inc_zone_page_state(new, NR_PAGETABLE);
312 		pmd_populate(mm, pmd, new);
313 	}
314 	spin_unlock(&mm->page_table_lock);
315 	return 0;
316 }
317 
318 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
319 {
320 	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
321 	if (!new)
322 		return -ENOMEM;
323 
324 	spin_lock(&init_mm.page_table_lock);
325 	if (pmd_present(*pmd))		/* Another has populated it */
326 		pte_free_kernel(new);
327 	else
328 		pmd_populate_kernel(&init_mm, pmd, new);
329 	spin_unlock(&init_mm.page_table_lock);
330 	return 0;
331 }
332 
/*
 * Apply the given file and anon rss deltas to @mm, skipping any delta that
 * is zero.
 * NOTE(review): add_mm_counter() is given the same token twice, which
 * looks like (counter name, amount) -- so the parameter names presumably
 * have to match the counter names in struct mm_struct; confirm before
 * renaming either parameter.
 */
333 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
334 {
335 	if (file_rss)
336 		add_mm_counter(mm, file_rss, file_rss);
337 	if (anon_rss)
338 		add_mm_counter(mm, anon_rss, anon_rss);
339 }
340 
341 /*
342  * This function is called to print an error when a bad pte
343  * is found. For example, we might have a PFN-mapped pte in
344  * a region that doesn't allow it.
345  *
346  * The calling function must still handle the error.
347  */
348 void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
349 {
350 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
351 			"vm_flags = %lx, vaddr = %lx\n",
352 		(long long)pte_val(pte),
353 		(vma->vm_mm == current->mm ? current->comm : "???"),
354 		vma->vm_flags, vaddr);
355 	dump_stack();
356 }
357 
358 static inline int is_cow_mapping(unsigned int flags)
359 {
360 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
361 }
362 
363 /*
364  * This function gets the "struct page" associated with a pte.
365  *
366  * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
367  * will have each page table entry just pointing to a raw page frame
368  * number, and as far as the VM layer is concerned, those do not have
369  * pages associated with them - even if the PFN might point to memory
370  * that otherwise is perfectly fine and has a "struct page".
371  *
372  * The way we recognize those mappings is through the rules set up
373  * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
374  * and the vm_pgoff will point to the first PFN mapped: thus every
375  * page that is a raw mapping will always honor the rule
376  *
377  *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
378  *
379  * and if that isn't true, the page has been COW'ed (in which case it
380  * _does_ have a "struct page" associated with it even if it is in a
381  * VM_PFNMAP range).
382  */
383 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
384 {
385 	unsigned long pfn = pte_pfn(pte);
386 
387 	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
388 		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
389 		if (pfn == vma->vm_pgoff + off)
390 			return NULL;
391 		if (!is_cow_mapping(vma->vm_flags))
392 			return NULL;
393 	}
394 
395 #ifdef CONFIG_DEBUG_VM
396 	/*
397 	 * Add some anal sanity checks for now. Eventually,
398 	 * we should just do "return pfn_to_page(pfn)", but
399 	 * in the meantime we check that we get a valid pfn,
400 	 * and that the resulting page looks ok.
401 	 */
402 	if (unlikely(!pfn_valid(pfn))) {
403 		print_bad_pte(vma, pte, addr);
404 		return NULL;
405 	}
406 #endif
407 
408 	/*
409 	 * NOTE! We still have PageReserved() pages in the page
410 	 * tables.
411 	 *
412 	 * The PAGE_ZERO() pages and various VDSO mappings can
413 	 * cause them to exist.
414 	 */
415 	return pfn_to_page(pfn);
416 }
417 
418 /*
419  * copy one vm_area from one task to the other. Assumes the page tables
420  * already present in the new task to be cleared in the whole range
421  * covered by this vma.
422  */
423 
/*
 * Copy the single pte *@src_pte (for address @addr in @vma) into
 * *@dst_pte.  The caller is expected to skip empty ptes; see
 * copy_pte_range().
 *
 * @rss is a two-element counter array: index 0 is bumped for a copied
 * non-anonymous page, index 1 for an anonymous one.
 */
424 static inline void
425 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
426 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
427 		unsigned long addr, int *rss)
428 {
429 	unsigned long vm_flags = vma->vm_flags;
430 	pte_t pte = *src_pte;
431 	struct page *page;
432 
433 	/* pte contains position in swap or file, so copy. */
434 	if (unlikely(!pte_present(pte))) {
435 		if (!pte_file(pte)) {
436 			swp_entry_t entry = pte_to_swp_entry(pte);
437 
438 			swap_duplicate(entry);
439 			/* make sure dst_mm is on swapoff's mmlist. */
			/* Unlocked check first, repeated under mmlist_lock. */
440 			if (unlikely(list_empty(&dst_mm->mmlist))) {
441 				spin_lock(&mmlist_lock);
442 				if (list_empty(&dst_mm->mmlist))
443 					list_add(&dst_mm->mmlist,
444 						 &src_mm->mmlist);
445 				spin_unlock(&mmlist_lock);
446 			}
447 			if (is_write_migration_entry(entry) &&
448 					is_cow_mapping(vm_flags)) {
449 				/*
450 				 * COW mappings require pages in both parent
451 				 * and child to be set to read.
452 				 */
453 				make_migration_entry_read(&entry);
454 				pte = swp_entry_to_pte(entry);
455 				set_pte_at(src_mm, addr, src_pte, pte);
456 			}
457 		}
458 		goto out_set_pte;
459 	}
460 
461 	/*
462 	 * If it's a COW mapping, write protect it both
463 	 * in the parent and the child
464 	 */
465 	if (is_cow_mapping(vm_flags)) {
466 		ptep_set_wrprotect(src_mm, addr, src_pte);
467 		pte = pte_wrprotect(pte);
468 	}
469 
470 	/*
471 	 * If it's a shared mapping, mark it clean in
472 	 * the child
473 	 */
474 	if (vm_flags & VM_SHARED)
475 		pte = pte_mkclean(pte);
476 	pte = pte_mkold(pte);
477 
	/* Only pages with a struct page get a reference, rmap and rss update. */
478 	page = vm_normal_page(vma, addr, pte);
479 	if (page) {
480 		get_page(page);
481 		page_dup_rmap(page, vma, addr);
482 		rss[!!PageAnon(page)]++;
483 	}
484 
485 out_set_pte:
486 	set_pte_at(dst_mm, addr, dst_pte, pte);
487 }
488 
/*
 * Copy the ptes for [addr, end) from @src_pmd to @dst_pmd, allocating the
 * destination pte table if necessary.
 *
 * Both pte locks are held while copying.  After at least 32 units of
 * progress (1 per empty pte, 8 per copied pte) the loop checks whether a
 * reschedule or lock break is wanted; if so it drops both locks, calls
 * cond_resched() and restarts from the current address.
 *
 * Returns 0 on success, -ENOMEM if the destination pte table could not be
 * allocated.
 */
489 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
490 		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
491 		unsigned long addr, unsigned long end)
492 {
493 	pte_t *src_pte, *dst_pte;
494 	spinlock_t *src_ptl, *dst_ptl;
495 	int progress = 0;
496 	int rss[2];
497 
498 again:
	/* rss[0] / rss[1]: file / anon pages copied since the locks were taken. */
499 	rss[1] = rss[0] = 0;
500 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
501 	if (!dst_pte)
502 		return -ENOMEM;
503 	src_pte = pte_offset_map_nested(src_pmd, addr);
504 	src_ptl = pte_lockptr(src_mm, src_pmd);
505 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
506 	arch_enter_lazy_mmu_mode();
507 
508 	do {
509 		/*
510 		 * We are holding two locks at this point - either of them
511 		 * could generate latencies in another task on another CPU.
512 		 */
513 		if (progress >= 32) {
514 			progress = 0;
515 			if (need_resched() ||
516 			    need_lockbreak(src_ptl) ||
517 			    need_lockbreak(dst_ptl))
518 				break;
519 		}
520 		if (pte_none(*src_pte)) {
521 			progress++;
522 			continue;
523 		}
524 		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
525 		progress += 8;
526 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
527 
528 	arch_leave_lazy_mmu_mode();
529 	spin_unlock(src_ptl);
	/*
	 * The "- 1" steps back to the last entry visited when the loop ran
	 * to completion.
	 */
530 	pte_unmap_nested(src_pte - 1);
531 	add_mm_rss(dst_mm, rss[0], rss[1]);
532 	pte_unmap_unlock(dst_pte - 1, dst_ptl);
533 	cond_resched();
	/* Left the loop early: retake the locks and carry on from addr. */
534 	if (addr != end)
535 		goto again;
536 	return 0;
537 }
538 
539 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
540 		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
541 		unsigned long addr, unsigned long end)
542 {
543 	pmd_t *src_pmd, *dst_pmd;
544 	unsigned long next;
545 
546 	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
547 	if (!dst_pmd)
548 		return -ENOMEM;
549 	src_pmd = pmd_offset(src_pud, addr);
550 	do {
551 		next = pmd_addr_end(addr, end);
552 		if (pmd_none_or_clear_bad(src_pmd))
553 			continue;
554 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
555 						vma, addr, next))
556 			return -ENOMEM;
557 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
558 	return 0;
559 }
560 
561 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
562 		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
563 		unsigned long addr, unsigned long end)
564 {
565 	pud_t *src_pud, *dst_pud;
566 	unsigned long next;
567 
568 	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
569 	if (!dst_pud)
570 		return -ENOMEM;
571 	src_pud = pud_offset(src_pgd, addr);
572 	do {
573 		next = pud_addr_end(addr, end);
574 		if (pud_none_or_clear_bad(src_pud))
575 			continue;
576 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
577 						vma, addr, next))
578 			return -ENOMEM;
579 	} while (dst_pud++, src_pud++, addr = next, addr != end);
580 	return 0;
581 }
582 
583 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
584 		struct vm_area_struct *vma)
585 {
586 	pgd_t *src_pgd, *dst_pgd;
587 	unsigned long next;
588 	unsigned long addr = vma->vm_start;
589 	unsigned long end = vma->vm_end;
590 
591 	/*
592 	 * Don't copy ptes where a page fault will fill them correctly.
593 	 * Fork becomes much lighter when there are big shared or private
594 	 * readonly mappings. The tradeoff is that copy_page_range is more
595 	 * efficient than faulting.
596 	 */
597 	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
598 		if (!vma->anon_vma)
599 			return 0;
600 	}
601 
602 	if (is_vm_hugetlb_page(vma))
603 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
604 
605 	dst_pgd = pgd_offset(dst_mm, addr);
606 	src_pgd = pgd_offset(src_mm, addr);
607 	do {
608 		next = pgd_addr_end(addr, end);
609 		if (pgd_none_or_clear_bad(src_pgd))
610 			continue;
611 		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
612 						vma, addr, next))
613 			return -ENOMEM;
614 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
615 	return 0;
616 }
617 
/*
 * Zap the ptes for [addr, end) below @pmd, under the pte lock.
 *
 * *@zap_work is the remaining work budget: it drops by 1 for every empty
 * pte and by PAGE_SIZE for every non-empty one, and the loop stops once
 * it is no longer positive.  A non-NULL @details restricts what is
 * zapped (see the comments in the body).
 *
 * Returns the address at which zapping stopped: @end, or an earlier
 * address if the budget ran out.
 */
618 static unsigned long zap_pte_range(struct mmu_gather *tlb,
619 				struct vm_area_struct *vma, pmd_t *pmd,
620 				unsigned long addr, unsigned long end,
621 				long *zap_work, struct zap_details *details)
622 {
623 	struct mm_struct *mm = tlb->mm;
624 	pte_t *pte;
625 	spinlock_t *ptl;
	/* Negative page counts, applied to mm once the loop is done. */
626 	int file_rss = 0;
627 	int anon_rss = 0;
628 
629 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
630 	arch_enter_lazy_mmu_mode();
631 	do {
632 		pte_t ptent = *pte;
633 		if (pte_none(ptent)) {
634 			(*zap_work)--;
635 			continue;
636 		}
637 
638 		(*zap_work) -= PAGE_SIZE;
639 
640 		if (pte_present(ptent)) {
641 			struct page *page;
642 
643 			page = vm_normal_page(vma, addr, ptent);
644 			if (unlikely(details) && page) {
645 				/*
646 				 * unmap_shared_mapping_pages() wants to
647 				 * invalidate cache without truncating:
648 				 * unmap shared but keep private pages.
649 				 */
650 				if (details->check_mapping &&
651 				    details->check_mapping != page->mapping)
652 					continue;
653 				/*
654 				 * Each page->index must be checked when
655 				 * invalidating or truncating nonlinear.
656 				 */
657 				if (details->nonlinear_vma &&
658 				    (page->index < details->first_index ||
659 				     page->index > details->last_index))
660 					continue;
661 			}
662 			ptent = ptep_get_and_clear_full(mm, addr, pte,
663 							tlb->fullmm);
664 			tlb_remove_tlb_entry(tlb, pte, addr);
			/* No struct page behind the pte: clearing it was all there is to do. */
665 			if (unlikely(!page))
666 				continue;
			/*
			 * Nonlinear vma and the page is not at its linear
			 * position: leave a file pte recording page->index.
			 */
667 			if (unlikely(details) && details->nonlinear_vma
668 			    && linear_page_index(details->nonlinear_vma,
669 						addr) != page->index)
670 				set_pte_at(mm, addr, pte,
671 					   pgoff_to_pte(page->index));
672 			if (PageAnon(page))
673 				anon_rss--;
674 			else {
				/* Carry dirty/young state from the pte over to the page. */
675 				if (pte_dirty(ptent))
676 					set_page_dirty(page);
677 				if (pte_young(ptent))
678 					SetPageReferenced(page);
679 				file_rss--;
680 			}
681 			page_remove_rmap(page, vma);
682 			tlb_remove_page(tlb, page);
683 			continue;
684 		}
685 		/*
686 		 * If details->check_mapping, we leave swap entries;
687 		 * if details->nonlinear_vma, we leave file entries.
688 		 */
689 		if (unlikely(details))
690 			continue;
691 		if (!pte_file(ptent))
692 			free_swap_and_cache(pte_to_swp_entry(ptent));
693 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
694 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
695 
696 	add_mm_rss(mm, file_rss, anon_rss);
697 	arch_leave_lazy_mmu_mode();
698 	pte_unmap_unlock(pte - 1, ptl);
699 
700 	return addr;
701 }
702 
703 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
704 				struct vm_area_struct *vma, pud_t *pud,
705 				unsigned long addr, unsigned long end,
706 				long *zap_work, struct zap_details *details)
707 {
708 	pmd_t *pmd;
709 	unsigned long next;
710 
711 	pmd = pmd_offset(pud, addr);
712 	do {
713 		next = pmd_addr_end(addr, end);
714 		if (pmd_none_or_clear_bad(pmd)) {
715 			(*zap_work)--;
716 			continue;
717 		}
718 		next = zap_pte_range(tlb, vma, pmd, addr, next,
719 						zap_work, details);
720 	} while (pmd++, addr = next, (addr != end && *zap_work > 0));
721 
722 	return addr;
723 }
724 
725 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
726 				struct vm_area_struct *vma, pgd_t *pgd,
727 				unsigned long addr, unsigned long end,
728 				long *zap_work, struct zap_details *details)
729 {
730 	pud_t *pud;
731 	unsigned long next;
732 
733 	pud = pud_offset(pgd, addr);
734 	do {
735 		next = pud_addr_end(addr, end);
736 		if (pud_none_or_clear_bad(pud)) {
737 			(*zap_work)--;
738 			continue;
739 		}
740 		next = zap_pmd_range(tlb, vma, pud, addr, next,
741 						zap_work, details);
742 	} while (pud++, addr = next, (addr != end && *zap_work > 0));
743 
744 	return addr;
745 }
746 
747 static unsigned long unmap_page_range(struct mmu_gather *tlb,
748 				struct vm_area_struct *vma,
749 				unsigned long addr, unsigned long end,
750 				long *zap_work, struct zap_details *details)
751 {
752 	pgd_t *pgd;
753 	unsigned long next;
754 
755 	if (details && !details->check_mapping && !details->nonlinear_vma)
756 		details = NULL;
757 
758 	BUG_ON(addr >= end);
759 	tlb_start_vma(tlb, vma);
760 	pgd = pgd_offset(vma->vm_mm, addr);
761 	do {
762 		next = pgd_addr_end(addr, end);
763 		if (pgd_none_or_clear_bad(pgd)) {
764 			(*zap_work)--;
765 			continue;
766 		}
767 		next = zap_pud_range(tlb, vma, pgd, addr, next,
768 						zap_work, details);
769 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
770 	tlb_end_vma(tlb, vma);
771 
772 	return addr;
773 }
774 
/*
 * Work budget, in bytes, that unmap_vmas() hands out between two
 * reschedule/lock-break checks: zap_pte_range() charges PAGE_SIZE for
 * every non-empty pte and 1 for every empty entry.
 */
775 #ifdef CONFIG_PREEMPT
776 # define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
777 #else
778 /* No preempt: go for improved straight-line efficiency */
779 # define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
780 #endif
781 
782 /**
783  * unmap_vmas - unmap a range of memory covered by a list of vma's
784  * @tlbp: address of the caller's struct mmu_gather
785  * @vma: the starting vma
786  * @start_addr: virtual address at which to start unmapping
787  * @end_addr: virtual address at which to end unmapping
788  * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
789  * @details: details of nonlinear truncation or shared cache invalidation
790  *
791  * Returns the end address of the unmapping (restart addr if interrupted).
792  *
793  * Unmap all pages in the vma list.
794  *
795  * We aim to not hold locks for too long (for scheduling latency reasons).
796  * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
797  * return the ending mmu_gather to the caller.
798  *
799  * Only addresses between `start_addr' and `end_addr' will be unmapped.
800  *
801  * The VMA list must be sorted in ascending virtual address order.
802  *
803  * unmap_vmas() assumes that the caller will flush the whole unmapped address
804  * range after unmap_vmas() returns.  So the only responsibility here is to
805  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
806  * drops the lock and schedules.
807  */
808 unsigned long unmap_vmas(struct mmu_gather **tlbp,
809 		struct vm_area_struct *vma, unsigned long start_addr,
810 		unsigned long end_addr, unsigned long *nr_accounted,
811 		struct zap_details *details)
812 {
	/* Remaining budget before the next flush / reschedule check. */
813 	long zap_work = ZAP_BLOCK_SIZE;
814 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
815 	int tlb_start_valid = 0;
816 	unsigned long start = start_addr;
817 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
	/* Saved so that a fresh mmu_gather can be set up the same way. */
818 	int fullmm = (*tlbp)->fullmm;
819 
820 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
821 		unsigned long end;
822 
		/* Clamp the requested range to this vma; skip it if nothing is left. */
823 		start = max(vma->vm_start, start_addr);
824 		if (start >= vma->vm_end)
825 			continue;
826 		end = min(vma->vm_end, end_addr);
827 		if (end <= vma->vm_start)
828 			continue;
829 
830 		if (vma->vm_flags & VM_ACCOUNT)
831 			*nr_accounted += (end - start) >> PAGE_SHIFT;
832 
833 		while (start != end) {
			/* First address unmapped since the last tlb_finish_mmu(). */
834 			if (!tlb_start_valid) {
835 				tlb_start = start;
836 				tlb_start_valid = 1;
837 			}
838 
839 			if (unlikely(is_vm_hugetlb_page(vma))) {
840 				unmap_hugepage_range(vma, start, end);
841 				zap_work -= (end - start) /
842 						(HPAGE_SIZE / PAGE_SIZE);
843 				start = end;
844 			} else
845 				start = unmap_page_range(*tlbp, vma,
846 						start, end, &zap_work, details);
847 
			/* Budget left over: this vma must be finished, move on. */
848 			if (zap_work > 0) {
849 				BUG_ON(start != end);
850 				break;
851 			}
852 
			/* Budget used up: flush what has been unmapped so far. */
853 			tlb_finish_mmu(*tlbp, tlb_start, start);
854 
855 			if (need_resched() ||
856 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
				/*
				 * With an i_mmap_lock supplied, bail out: the
				 * caller sees *tlbp == NULL and gets the
				 * restart address as return value.
				 */
857 				if (i_mmap_lock) {
858 					*tlbp = NULL;
859 					goto out;
860 				}
861 				cond_resched();
862 			}
863 
			/* Start a new gather and a new budget for the next block. */
864 			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
865 			tlb_start_valid = 0;
866 			zap_work = ZAP_BLOCK_SIZE;
867 		}
868 	}
869 out:
870 	return start;	/* which is now the end (or restart) address */
871 }
872 
873 /**
874  * zap_page_range - remove user pages in a given range
875  * @vma: vm_area_struct holding the applicable pages
876  * @address: starting address of pages to zap
877  * @size: number of bytes to zap
878  * @details: details of nonlinear truncation or shared cache invalidation
879  */
880 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
881 		unsigned long size, struct zap_details *details)
882 {
883 	struct mm_struct *mm = vma->vm_mm;
884 	struct mmu_gather *tlb;
885 	unsigned long end = address + size;
886 	unsigned long nr_accounted = 0;
887 
888 	lru_add_drain();
889 	tlb = tlb_gather_mmu(mm, 0);
890 	update_hiwater_rss(mm);
891 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
892 	if (tlb)
893 		tlb_finish_mmu(tlb, address, end);
894 	return end;
895 }
896 
897 /*
898  * Do a quick page-table lookup for a single page.
899  */
/*
 * @vma:     vma containing @address; vma->vm_mm supplies the page tables
 * @address: user virtual address to look up
 * @flags:   FOLL_* bits, as tested in the body:
 *             FOLL_WRITE  only accept a pte that is writable
 *             FOLL_GET    take a reference on the returned page
 *             FOLL_TOUCH  mark the page accessed (and dirty it for a
 *                         FOLL_WRITE lookup if it is not dirty yet)
 *             FOLL_ANON   return ZERO_PAGE(0) when no page table exists
 *
 * Returns the page, or NULL if none could be returned under these rules.
 */
900 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
901 			unsigned int flags)
902 {
903 	pgd_t *pgd;
904 	pud_t *pud;
905 	pmd_t *pmd;
906 	pte_t *ptep, pte;
907 	spinlock_t *ptl;
908 	struct page *page;
909 	struct mm_struct *mm = vma->vm_mm;
910 
	/* Any result that is not an error pointer is returned as is. */
911 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
912 	if (!IS_ERR(page)) {
913 		BUG_ON(flags & FOLL_GET);
914 		goto out;
915 	}
916 
917 	page = NULL;
918 	pgd = pgd_offset(mm, address);
919 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
920 		goto no_page_table;
921 
922 	pud = pud_offset(pgd, address);
923 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
924 		goto no_page_table;
925 
926 	pmd = pmd_offset(pud, address);
927 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
928 		goto no_page_table;
929 
930 	if (pmd_huge(*pmd)) {
931 		BUG_ON(flags & FOLL_GET);
932 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
933 		goto out;
934 	}
935 
936 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
937 	if (!ptep)
938 		goto out;
939 
	/* From here on every exit goes through "unlock"; page is still NULL. */
940 	pte = *ptep;
941 	if (!pte_present(pte))
942 		goto unlock;
943 	if ((flags & FOLL_WRITE) && !pte_write(pte))
944 		goto unlock;
945 	page = vm_normal_page(vma, address, pte);
946 	if (unlikely(!page))
947 		goto unlock;
948 
949 	if (flags & FOLL_GET)
950 		get_page(page);
951 	if (flags & FOLL_TOUCH) {
952 		if ((flags & FOLL_WRITE) &&
953 		    !pte_dirty(pte) && !PageDirty(page))
954 			set_page_dirty(page);
955 		mark_page_accessed(page);
956 	}
957 unlock:
958 	pte_unmap_unlock(ptep, ptl);
959 out:
960 	return page;
961 
962 no_page_table:
963 	/*
964 	 * When core dumping an enormous anonymous area that nobody
965 	 * has touched so far, we don't want to allocate page tables.
966 	 */
967 	if (flags & FOLL_ANON) {
968 		page = ZERO_PAGE(0);
969 		if (flags & FOLL_GET)
970 			get_page(page);
971 		BUG_ON(flags & FOLL_WRITE);
972 	}
973 	return page;
974 }
975 
976 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
977 		unsigned long start, int len, int write, int force,
978 		struct page **pages, struct vm_area_struct **vmas)
979 {
980 	int i;
981 	unsigned int vm_flags;
982 
983 	/*
984 	 * Require read or write permissions.
985 	 * If 'force' is set, we only require the "MAY" flags.
986 	 */
987 	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
988 	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
989 	i = 0;
990 
991 	do {
992 		struct vm_area_struct *vma;
993 		unsigned int foll_flags;
994 
995 		vma = find_extend_vma(mm, start);
996 		if (!vma && in_gate_area(tsk, start)) {
997 			unsigned long pg = start & PAGE_MASK;
998 			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
999 			pgd_t *pgd;
1000 			pud_t *pud;
1001 			pmd_t *pmd;
1002 			pte_t *pte;
1003 			if (write) /* user gate pages are read-only */
1004 				return i ? : -EFAULT;
1005 			if (pg > TASK_SIZE)
1006 				pgd = pgd_offset_k(pg);
1007 			else
1008 				pgd = pgd_offset_gate(mm, pg);
1009 			BUG_ON(pgd_none(*pgd));
1010 			pud = pud_offset(pgd, pg);
1011 			BUG_ON(pud_none(*pud));
1012 			pmd = pmd_offset(pud, pg);
1013 			if (pmd_none(*pmd))
1014 				return i ? : -EFAULT;
1015 			pte = pte_offset_map(pmd, pg);
1016 			if (pte_none(*pte)) {
1017 				pte_unmap(pte);
1018 				return i ? : -EFAULT;
1019 			}
1020 			if (pages) {
1021 				struct page *page = vm_normal_page(gate_vma, start, *pte);
1022 				pages[i] = page;
1023 				if (page)
1024 					get_page(page);
1025 			}
1026 			pte_unmap(pte);
1027 			if (vmas)
1028 				vmas[i] = gate_vma;
1029 			i++;
1030 			start += PAGE_SIZE;
1031 			len--;
1032 			continue;
1033 		}
1034 
1035 		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
1036 				|| !(vm_flags & vma->vm_flags))
1037 			return i ? : -EFAULT;
1038 
1039 		if (is_vm_hugetlb_page(vma)) {
1040 			i = follow_hugetlb_page(mm, vma, pages, vmas,
1041 						&start, &len, i, write);
1042 			continue;
1043 		}
1044 
1045 		foll_flags = FOLL_TOUCH;
1046 		if (pages)
1047 			foll_flags |= FOLL_GET;
1048 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
1049 		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
1050 					!vma->vm_ops->fault)))
1051 			foll_flags |= FOLL_ANON;
1052 
1053 		do {
1054 			struct page *page;
1055 
1056 			/*
1057 			 * If tsk is ooming, cut off its access to large memory
1058 			 * allocations. It has a pending SIGKILL, but it can't
1059 			 * be processed until returning to user space.
1060 			 */
1061 			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
1062 				return -ENOMEM;
1063 
1064 			if (write)
1065 				foll_flags |= FOLL_WRITE;
1066 
1067 			cond_resched();
1068 			while (!(page = follow_page(vma, start, foll_flags))) {
1069 				int ret;
1070 				ret = handle_mm_fault(mm, vma, start,
1071 						foll_flags & FOLL_WRITE);
1072 				if (ret & VM_FAULT_ERROR) {
1073 					if (ret & VM_FAULT_OOM)
1074 						return i ? i : -ENOMEM;
1075 					else if (ret & VM_FAULT_SIGBUS)
1076 						return i ? i : -EFAULT;
1077 					BUG();
1078 				}
1079 				if (ret & VM_FAULT_MAJOR)
1080 					tsk->maj_flt++;
1081 				else
1082 					tsk->min_flt++;
1083 
1084 				/*
1085 				 * The VM_FAULT_WRITE bit tells us that
1086 				 * do_wp_page has broken COW when necessary,
1087 				 * even if maybe_mkwrite decided not to set
1088 				 * pte_write. We can thus safely do subsequent
1089 				 * page lookups as if they were reads.
1090 				 */
1091 				if (ret & VM_FAULT_WRITE)
1092 					foll_flags &= ~FOLL_WRITE;
1093 
1094 				cond_resched();
1095 			}
1096 			if (pages) {
1097 				pages[i] = page;
1098 
1099 				flush_anon_page(vma, page, start);
1100 				flush_dcache_page(page);
1101 			}
1102 			if (vmas)
1103 				vmas[i] = vma;
1104 			i++;
1105 			start += PAGE_SIZE;
1106 			len--;
1107 		} while (len && start < vma->vm_end);
1108 	} while (len);
1109 	return i;
1110 }
1111 EXPORT_SYMBOL(get_user_pages);
1112 
1113 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1114 {
1115 	pgd_t * pgd = pgd_offset(mm, addr);
1116 	pud_t * pud = pud_alloc(mm, pgd, addr);
1117 	if (pud) {
1118 		pmd_t * pmd = pmd_alloc(mm, pud, addr);
1119 		if (pmd)
1120 			return pte_alloc_map_lock(mm, pmd, addr, ptl);
1121 	}
1122 	return NULL;
1123 }
1124 
1125 /*
1126  * This is the old fallback for page remapping.
1127  *
1128  * For historical reasons, it only allows reserved pages. Only
1129  * old drivers should use this, and they needed to mark their
1130  * pages reserved for the old functions anyway.
1131  */
1132 static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1133 {
1134 	int retval;
1135 	pte_t *pte;
1136 	spinlock_t *ptl;
1137 
1138 	retval = -EINVAL;
1139 	if (PageAnon(page))
1140 		goto out;
1141 	retval = -ENOMEM;
1142 	flush_dcache_page(page);
1143 	pte = get_locked_pte(mm, addr, &ptl);
1144 	if (!pte)
1145 		goto out;
1146 	retval = -EBUSY;
1147 	if (!pte_none(*pte))
1148 		goto out_unlock;
1149 
1150 	/* Ok, finally just insert the thing.. */
1151 	get_page(page);
1152 	inc_mm_counter(mm, file_rss);
1153 	page_add_file_rmap(page);
1154 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
1155 
1156 	retval = 0;
1157 out_unlock:
1158 	pte_unmap_unlock(pte, ptl);
1159 out:
1160 	return retval;
1161 }
1162 
1163 /**
1164  * vm_insert_page - insert single page into user vma
1165  * @vma: user vma to map to
1166  * @addr: target user address of this page
1167  * @page: source kernel page
1168  *
1169  * This allows drivers to insert individual pages they've allocated
1170  * into a user vma.
1171  *
1172  * The page has to be a nice clean _individual_ kernel allocation.
1173  * If you allocate a compound page, you need to have marked it as
1174  * such (__GFP_COMP), or manually just split the page up yourself
1175  * (see split_page()).
1176  *
1177  * NOTE! Traditionally this was done with "remap_pfn_range()" which
1178  * took an arbitrary page protection parameter. This doesn't allow
1179  * that. Your vma protection will have to be set up correctly, which
1180  * means that if you want a shared writable mapping, you'd better
1181  * ask for a shared writable mapping!
1182  *
1183  * The page does not need to be reserved.
1184  */
1185 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1186 {
1187 	if (addr < vma->vm_start || addr >= vma->vm_end)
1188 		return -EFAULT;
1189 	if (!page_count(page))
1190 		return -EINVAL;
1191 	vma->vm_flags |= VM_INSERTPAGE;
1192 	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1193 }
1194 EXPORT_SYMBOL(vm_insert_page);
1195 
1196 /**
1197  * vm_insert_pfn - insert single pfn into user vma
1198  * @vma: user vma to map to
1199  * @addr: target user address of this page
1200  * @pfn: source kernel pfn
1201  *
1202  * Similar to vm_inert_page, this allows drivers to insert individual pages
1203  * they've allocated into a user vma. Same comments apply.
1204  *
1205  * This function should only be called from a vm_ops->fault handler, and
1206  * in that case the handler should return NULL.
1207  */
1208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1209 		unsigned long pfn)
1210 {
1211 	struct mm_struct *mm = vma->vm_mm;
1212 	int retval;
1213 	pte_t *pte, entry;
1214 	spinlock_t *ptl;
1215 
1216 	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
1217 	BUG_ON(is_cow_mapping(vma->vm_flags));
1218 
1219 	retval = -ENOMEM;
1220 	pte = get_locked_pte(mm, addr, &ptl);
1221 	if (!pte)
1222 		goto out;
1223 	retval = -EBUSY;
1224 	if (!pte_none(*pte))
1225 		goto out_unlock;
1226 
1227 	/* Ok, finally just insert the thing.. */
1228 	entry = pfn_pte(pfn, vma->vm_page_prot);
1229 	set_pte_at(mm, addr, pte, entry);
1230 	update_mmu_cache(vma, addr, entry);
1231 
1232 	retval = 0;
1233 out_unlock:
1234 	pte_unmap_unlock(pte, ptl);
1235 
1236 out:
1237 	return retval;
1238 }
1239 EXPORT_SYMBOL(vm_insert_pfn);
1240 
1241 /*
1242  * maps a range of physical memory into the requested pages. the old
1243  * mappings are removed. any references to nonexistent pages results
1244  * in null mappings (currently treated as "copy-on-access")
1245  */
1246 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1247 			unsigned long addr, unsigned long end,
1248 			unsigned long pfn, pgprot_t prot)
1249 {
1250 	pte_t *pte;
1251 	spinlock_t *ptl;
1252 
1253 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1254 	if (!pte)
1255 		return -ENOMEM;
1256 	arch_enter_lazy_mmu_mode();
1257 	do {
1258 		BUG_ON(!pte_none(*pte));
1259 		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1260 		pfn++;
1261 	} while (pte++, addr += PAGE_SIZE, addr != end);
1262 	arch_leave_lazy_mmu_mode();
1263 	pte_unmap_unlock(pte - 1, ptl);
1264 	return 0;
1265 }
1266 
1267 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1268 			unsigned long addr, unsigned long end,
1269 			unsigned long pfn, pgprot_t prot)
1270 {
1271 	pmd_t *pmd;
1272 	unsigned long next;
1273 
1274 	pfn -= addr >> PAGE_SHIFT;
1275 	pmd = pmd_alloc(mm, pud, addr);
1276 	if (!pmd)
1277 		return -ENOMEM;
1278 	do {
1279 		next = pmd_addr_end(addr, end);
1280 		if (remap_pte_range(mm, pmd, addr, next,
1281 				pfn + (addr >> PAGE_SHIFT), prot))
1282 			return -ENOMEM;
1283 	} while (pmd++, addr = next, addr != end);
1284 	return 0;
1285 }
1286 
1287 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1288 			unsigned long addr, unsigned long end,
1289 			unsigned long pfn, pgprot_t prot)
1290 {
1291 	pud_t *pud;
1292 	unsigned long next;
1293 
1294 	pfn -= addr >> PAGE_SHIFT;
1295 	pud = pud_alloc(mm, pgd, addr);
1296 	if (!pud)
1297 		return -ENOMEM;
1298 	do {
1299 		next = pud_addr_end(addr, end);
1300 		if (remap_pmd_range(mm, pud, addr, next,
1301 				pfn + (addr >> PAGE_SHIFT), prot))
1302 			return -ENOMEM;
1303 	} while (pud++, addr = next, addr != end);
1304 	return 0;
1305 }
1306 
1307 /**
1308  * remap_pfn_range - remap kernel memory to userspace
1309  * @vma: user vma to map to
1310  * @addr: target user address to start at
1311  * @pfn: physical address of kernel memory
1312  * @size: size of map area
1313  * @prot: page protection flags for this mapping
1314  *
1315  *  Note: this is only safe if the mm semaphore is held when called.
1316  */
1317 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1318 		    unsigned long pfn, unsigned long size, pgprot_t prot)
1319 {
1320 	pgd_t *pgd;
1321 	unsigned long next;
1322 	unsigned long end = addr + PAGE_ALIGN(size);
1323 	struct mm_struct *mm = vma->vm_mm;
1324 	int err;
1325 
1326 	/*
1327 	 * Physically remapped pages are special. Tell the
1328 	 * rest of the world about it:
1329 	 *   VM_IO tells people not to look at these pages
1330 	 *	(accesses can have side effects).
1331 	 *   VM_RESERVED is specified all over the place, because
1332 	 *	in 2.4 it kept swapout's vma scan off this vma; but
1333 	 *	in 2.6 the LRU scan won't even find its pages, so this
1334 	 *	flag means no more than count its pages in reserved_vm,
1335 	 * 	and omit it from core dump, even when VM_IO turned off.
1336 	 *   VM_PFNMAP tells the core MM that the base pages are just
1337 	 *	raw PFN mappings, and do not have a "struct page" associated
1338 	 *	with them.
1339 	 *
1340 	 * There's a horrible special case to handle copy-on-write
1341 	 * behaviour that some programs depend on. We mark the "original"
1342 	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1343 	 */
1344 	if (is_cow_mapping(vma->vm_flags)) {
1345 		if (addr != vma->vm_start || end != vma->vm_end)
1346 			return -EINVAL;
1347 		vma->vm_pgoff = pfn;
1348 	}
1349 
1350 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1351 
1352 	BUG_ON(addr >= end);
1353 	pfn -= addr >> PAGE_SHIFT;
1354 	pgd = pgd_offset(mm, addr);
1355 	flush_cache_range(vma, addr, end);
1356 	do {
1357 		next = pgd_addr_end(addr, end);
1358 		err = remap_pud_range(mm, pgd, addr, next,
1359 				pfn + (addr >> PAGE_SHIFT), prot);
1360 		if (err)
1361 			break;
1362 	} while (pgd++, addr = next, addr != end);
1363 	return err;
1364 }
1365 EXPORT_SYMBOL(remap_pfn_range);
1366 
1367 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1368 				     unsigned long addr, unsigned long end,
1369 				     pte_fn_t fn, void *data)
1370 {
1371 	pte_t *pte;
1372 	int err;
1373 	struct page *pmd_page;
1374 	spinlock_t *uninitialized_var(ptl);
1375 
1376 	pte = (mm == &init_mm) ?
1377 		pte_alloc_kernel(pmd, addr) :
1378 		pte_alloc_map_lock(mm, pmd, addr, &ptl);
1379 	if (!pte)
1380 		return -ENOMEM;
1381 
1382 	BUG_ON(pmd_huge(*pmd));
1383 
1384 	pmd_page = pmd_page(*pmd);
1385 
1386 	do {
1387 		err = fn(pte, pmd_page, addr, data);
1388 		if (err)
1389 			break;
1390 	} while (pte++, addr += PAGE_SIZE, addr != end);
1391 
1392 	if (mm != &init_mm)
1393 		pte_unmap_unlock(pte-1, ptl);
1394 	return err;
1395 }
1396 
1397 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1398 				     unsigned long addr, unsigned long end,
1399 				     pte_fn_t fn, void *data)
1400 {
1401 	pmd_t *pmd;
1402 	unsigned long next;
1403 	int err;
1404 
1405 	pmd = pmd_alloc(mm, pud, addr);
1406 	if (!pmd)
1407 		return -ENOMEM;
1408 	do {
1409 		next = pmd_addr_end(addr, end);
1410 		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1411 		if (err)
1412 			break;
1413 	} while (pmd++, addr = next, addr != end);
1414 	return err;
1415 }
1416 
1417 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1418 				     unsigned long addr, unsigned long end,
1419 				     pte_fn_t fn, void *data)
1420 {
1421 	pud_t *pud;
1422 	unsigned long next;
1423 	int err;
1424 
1425 	pud = pud_alloc(mm, pgd, addr);
1426 	if (!pud)
1427 		return -ENOMEM;
1428 	do {
1429 		next = pud_addr_end(addr, end);
1430 		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1431 		if (err)
1432 			break;
1433 	} while (pud++, addr = next, addr != end);
1434 	return err;
1435 }
1436 
1437 /*
1438  * Scan a region of virtual memory, filling in page tables as necessary
1439  * and calling a provided function on each leaf page table.
1440  */
1441 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1442 			unsigned long size, pte_fn_t fn, void *data)
1443 {
1444 	pgd_t *pgd;
1445 	unsigned long next;
1446 	unsigned long end = addr + size;
1447 	int err;
1448 
1449 	BUG_ON(addr >= end);
1450 	pgd = pgd_offset(mm, addr);
1451 	do {
1452 		next = pgd_addr_end(addr, end);
1453 		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1454 		if (err)
1455 			break;
1456 	} while (pgd++, addr = next, addr != end);
1457 	return err;
1458 }
1459 EXPORT_SYMBOL_GPL(apply_to_page_range);
1460 
1461 /*
1462  * handle_pte_fault chooses page fault handler according to an entry
1463  * which was read non-atomically.  Before making any commitment, on
1464  * those architectures or configurations (e.g. i386 with PAE) which
1465  * might give a mix of unmatched parts, do_swap_page and do_file_page
1466  * must check under lock before unmapping the pte and proceeding
1467  * (but do_wp_page is only called after already making such a check;
1468  * and do_anonymous_page and do_no_page can safely check later on).
1469  */
1470 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1471 				pte_t *page_table, pte_t orig_pte)
1472 {
1473 	int same = 1;
1474 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1475 	if (sizeof(pte_t) > sizeof(unsigned long)) {
1476 		spinlock_t *ptl = pte_lockptr(mm, pmd);
1477 		spin_lock(ptl);
1478 		same = pte_same(*page_table, orig_pte);
1479 		spin_unlock(ptl);
1480 	}
1481 #endif
1482 	pte_unmap(page_table);
1483 	return same;
1484 }
1485 
1486 /*
1487  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
1488  * servicing faults for write access.  In the normal case, do always want
1489  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
1490  * that do not have writing enabled, when used by access_process_vm.
1491  */
1492 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1493 {
1494 	if (likely(vma->vm_flags & VM_WRITE))
1495 		pte = pte_mkwrite(pte);
1496 	return pte;
1497 }
1498 
1499 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1500 {
1501 	/*
1502 	 * If the source page was a PFN mapping, we don't have
1503 	 * a "struct page" for it. We do a best-effort copy by
1504 	 * just copying from the original user address. If that
1505 	 * fails, we just zero-fill it. Live with it.
1506 	 */
1507 	if (unlikely(!src)) {
1508 		void *kaddr = kmap_atomic(dst, KM_USER0);
1509 		void __user *uaddr = (void __user *)(va & PAGE_MASK);
1510 
1511 		/*
1512 		 * This really shouldn't fail, because the page is there
1513 		 * in the page tables. But it might just be unreadable,
1514 		 * in which case we just give up and fill the result with
1515 		 * zeroes.
1516 		 */
1517 		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1518 			memset(kaddr, 0, PAGE_SIZE);
1519 		kunmap_atomic(kaddr, KM_USER0);
1520 		flush_dcache_page(dst);
1521 		return;
1522 
1523 	}
1524 	copy_user_highpage(dst, src, va, vma);
1525 }
1526 
1527 /*
1528  * This routine handles present pages, when users try to write
1529  * to a shared page. It is done by copying the page to a new address
1530  * and decrementing the shared-page counter for the old page.
1531  *
1532  * Note that this routine assumes that the protection checks have been
1533  * done by the caller (the low-level page fault routine in most cases).
1534  * Thus we can safely just mark it writable once we've done any necessary
1535  * COW.
1536  *
1537  * We also mark the page dirty at this point even though the page will
1538  * change only once the write actually happens. This avoids a few races,
1539  * and potentially makes it more efficient.
1540  *
1541  * We enter with non-exclusive mmap_sem (to exclude vma changes,
1542  * but allow concurrent faults), with pte both mapped and locked.
1543  * We return with mmap_sem still held, but pte unmapped and unlocked.
1544  */
1545 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1546 		unsigned long address, pte_t *page_table, pmd_t *pmd,
1547 		spinlock_t *ptl, pte_t orig_pte)
1548 {
1549 	struct page *old_page, *new_page;
1550 	pte_t entry;
1551 	int reuse = 0, ret = 0;
1552 	int page_mkwrite = 0;
1553 	struct page *dirty_page = NULL;
1554 
1555 	old_page = vm_normal_page(vma, address, orig_pte);
1556 	if (!old_page)
1557 		goto gotten;
1558 
1559 	/*
1560 	 * Take out anonymous pages first, anonymous shared vmas are
1561 	 * not dirty accountable.
1562 	 */
1563 	if (PageAnon(old_page)) {
1564 		if (!TestSetPageLocked(old_page)) {
1565 			reuse = can_share_swap_page(old_page);
1566 			unlock_page(old_page);
1567 		}
1568 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1569 					(VM_WRITE|VM_SHARED))) {
1570 		/*
1571 		 * Only catch write-faults on shared writable pages,
1572 		 * read-only shared pages can get COWed by
1573 		 * get_user_pages(.write=1, .force=1).
1574 		 */
1575 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1576 			/*
1577 			 * Notify the address space that the page is about to
1578 			 * become writable so that it can prohibit this or wait
1579 			 * for the page to get into an appropriate state.
1580 			 *
1581 			 * We do this without the lock held, so that it can
1582 			 * sleep if it needs to.
1583 			 */
1584 			page_cache_get(old_page);
1585 			pte_unmap_unlock(page_table, ptl);
1586 
1587 			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1588 				goto unwritable_page;
1589 
1590 			/*
1591 			 * Since we dropped the lock we need to revalidate
1592 			 * the PTE as someone else may have changed it.  If
1593 			 * they did, we just return, as we can count on the
1594 			 * MMU to tell us if they didn't also make it writable.
1595 			 */
1596 			page_table = pte_offset_map_lock(mm, pmd, address,
1597 							 &ptl);
1598 			page_cache_release(old_page);
1599 			if (!pte_same(*page_table, orig_pte))
1600 				goto unlock;
1601 
1602 			page_mkwrite = 1;
1603 		}
1604 		dirty_page = old_page;
1605 		get_page(dirty_page);
1606 		reuse = 1;
1607 	}
1608 
1609 	if (reuse) {
1610 		flush_cache_page(vma, address, pte_pfn(orig_pte));
1611 		entry = pte_mkyoung(orig_pte);
1612 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1613 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
1614 			update_mmu_cache(vma, address, entry);
1615 		ret |= VM_FAULT_WRITE;
1616 		goto unlock;
1617 	}
1618 
1619 	/*
1620 	 * Ok, we need to copy. Oh, well..
1621 	 */
1622 	page_cache_get(old_page);
1623 gotten:
1624 	pte_unmap_unlock(page_table, ptl);
1625 
1626 	if (unlikely(anon_vma_prepare(vma)))
1627 		goto oom;
1628 	VM_BUG_ON(old_page == ZERO_PAGE(0));
1629 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1630 	if (!new_page)
1631 		goto oom;
1632 	cow_user_page(new_page, old_page, address, vma);
1633 
1634 	/*
1635 	 * Re-check the pte - we dropped the lock
1636 	 */
1637 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1638 	if (likely(pte_same(*page_table, orig_pte))) {
1639 		if (old_page) {
1640 			page_remove_rmap(old_page, vma);
1641 			if (!PageAnon(old_page)) {
1642 				dec_mm_counter(mm, file_rss);
1643 				inc_mm_counter(mm, anon_rss);
1644 			}
1645 		} else
1646 			inc_mm_counter(mm, anon_rss);
1647 		flush_cache_page(vma, address, pte_pfn(orig_pte));
1648 		entry = mk_pte(new_page, vma->vm_page_prot);
1649 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1650 		/*
1651 		 * Clear the pte entry and flush it first, before updating the
1652 		 * pte with the new entry. This will avoid a race condition
1653 		 * seen in the presence of one thread doing SMC and another
1654 		 * thread doing COW.
1655 		 */
1656 		ptep_clear_flush(vma, address, page_table);
1657 		set_pte_at(mm, address, page_table, entry);
1658 		update_mmu_cache(vma, address, entry);
1659 		lru_cache_add_active(new_page);
1660 		page_add_new_anon_rmap(new_page, vma, address);
1661 
1662 		/* Free the old page.. */
1663 		new_page = old_page;
1664 		ret |= VM_FAULT_WRITE;
1665 	}
1666 	if (new_page)
1667 		page_cache_release(new_page);
1668 	if (old_page)
1669 		page_cache_release(old_page);
1670 unlock:
1671 	pte_unmap_unlock(page_table, ptl);
1672 	if (dirty_page) {
1673 		if (vma->vm_file)
1674 			file_update_time(vma->vm_file);
1675 
1676 		/*
1677 		 * Yes, Virginia, this is actually required to prevent a race
1678 		 * with clear_page_dirty_for_io() from clearing the page dirty
1679 		 * bit after it clear all dirty ptes, but before a racing
1680 		 * do_wp_page installs a dirty pte.
1681 		 *
1682 		 * do_no_page is protected similarly.
1683 		 */
1684 		wait_on_page_locked(dirty_page);
1685 		set_page_dirty_balance(dirty_page, page_mkwrite);
1686 		put_page(dirty_page);
1687 	}
1688 	return ret;
1689 oom:
1690 	if (old_page)
1691 		page_cache_release(old_page);
1692 	return VM_FAULT_OOM;
1693 
1694 unwritable_page:
1695 	page_cache_release(old_page);
1696 	return VM_FAULT_SIGBUS;
1697 }
1698 
1699 /*
1700  * Helper functions for unmap_mapping_range().
1701  *
1702  * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1703  *
1704  * We have to restart searching the prio_tree whenever we drop the lock,
1705  * since the iterator is only valid while the lock is held, and anyway
1706  * a later vma might be split and reinserted earlier while lock dropped.
1707  *
1708  * The list of nonlinear vmas could be handled more efficiently, using
1709  * a placeholder, but handle it in the same way until a need is shown.
1710  * It is important to search the prio_tree before nonlinear list: a vma
1711  * may become nonlinear and be shifted from prio_tree to nonlinear list
1712  * while the lock is dropped; but never shifted from list to prio_tree.
1713  *
1714  * In order to make forward progress despite restarting the search,
1715  * vm_truncate_count is used to mark a vma as now dealt with, so we can
1716  * quickly skip it next time around.  Since the prio_tree search only
1717  * shows us those vmas affected by unmapping the range in question, we
1718  * can't efficiently keep all vmas in step with mapping->truncate_count:
1719  * so instead reset them all whenever it wraps back to 0 (then go to 1).
1720  * mapping->truncate_count and vma->vm_truncate_count are protected by
1721  * i_mmap_lock.
1722  *
1723  * In order to make forward progress despite repeatedly restarting some
1724  * large vma, note the restart_addr from unmap_vmas when it breaks out:
1725  * and restart from that address when we reach that vma again.  It might
1726  * have been split or merged, shrunk or extended, but never shifted: so
1727  * restart_addr remains valid so long as it remains in the vma's range.
1728  * unmap_mapping_range forces truncate_count to leap over page-aligned
1729  * values so we can save vma's restart_addr in its truncate_count field.
1730  */
/* Non-zero when truncate_count has no bits set below the page boundary. */
#define is_restart_addr(truncate_count) (((truncate_count) & ~PAGE_MASK) == 0)
1732 
1733 static void reset_vma_truncate_counts(struct address_space *mapping)
1734 {
1735 	struct vm_area_struct *vma;
1736 	struct prio_tree_iter iter;
1737 
1738 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1739 		vma->vm_truncate_count = 0;
1740 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1741 		vma->vm_truncate_count = 0;
1742 }
1743 
1744 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1745 		unsigned long start_addr, unsigned long end_addr,
1746 		struct zap_details *details)
1747 {
1748 	unsigned long restart_addr;
1749 	int need_break;
1750 
1751 	/*
1752 	 * files that support invalidating or truncating portions of the
1753 	 * file from under mmaped areas must have their ->fault function
1754 	 * return a locked page (and set VM_FAULT_LOCKED in the return).
1755 	 * This provides synchronisation against concurrent unmapping here.
1756 	 */
1757 
1758 again:
1759 	restart_addr = vma->vm_truncate_count;
1760 	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1761 		start_addr = restart_addr;
1762 		if (start_addr >= end_addr) {
1763 			/* Top of vma has been split off since last time */
1764 			vma->vm_truncate_count = details->truncate_count;
1765 			return 0;
1766 		}
1767 	}
1768 
1769 	restart_addr = zap_page_range(vma, start_addr,
1770 					end_addr - start_addr, details);
1771 	need_break = need_resched() ||
1772 			need_lockbreak(details->i_mmap_lock);
1773 
1774 	if (restart_addr >= end_addr) {
1775 		/* We have now completed this vma: mark it so */
1776 		vma->vm_truncate_count = details->truncate_count;
1777 		if (!need_break)
1778 			return 0;
1779 	} else {
1780 		/* Note restart_addr in vma's truncate_count field */
1781 		vma->vm_truncate_count = restart_addr;
1782 		if (!need_break)
1783 			goto again;
1784 	}
1785 
1786 	spin_unlock(details->i_mmap_lock);
1787 	cond_resched();
1788 	spin_lock(details->i_mmap_lock);
1789 	return -EINTR;
1790 }
1791 
1792 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1793 					    struct zap_details *details)
1794 {
1795 	struct vm_area_struct *vma;
1796 	struct prio_tree_iter iter;
1797 	pgoff_t vba, vea, zba, zea;
1798 
1799 restart:
1800 	vma_prio_tree_foreach(vma, &iter, root,
1801 			details->first_index, details->last_index) {
1802 		/* Skip quickly over those we have already dealt with */
1803 		if (vma->vm_truncate_count == details->truncate_count)
1804 			continue;
1805 
1806 		vba = vma->vm_pgoff;
1807 		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1808 		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1809 		zba = details->first_index;
1810 		if (zba < vba)
1811 			zba = vba;
1812 		zea = details->last_index;
1813 		if (zea > vea)
1814 			zea = vea;
1815 
1816 		if (unmap_mapping_range_vma(vma,
1817 			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1818 			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1819 				details) < 0)
1820 			goto restart;
1821 	}
1822 }
1823 
1824 static inline void unmap_mapping_range_list(struct list_head *head,
1825 					    struct zap_details *details)
1826 {
1827 	struct vm_area_struct *vma;
1828 
1829 	/*
1830 	 * In nonlinear VMAs there is no correspondence between virtual address
1831 	 * offset and file offset.  So we must perform an exhaustive search
1832 	 * across *all* the pages in each nonlinear VMA, not just the pages
1833 	 * whose virtual address lies outside the file truncation point.
1834 	 */
1835 restart:
1836 	list_for_each_entry(vma, head, shared.vm_set.list) {
1837 		/* Skip quickly over those we have already dealt with */
1838 		if (vma->vm_truncate_count == details->truncate_count)
1839 			continue;
1840 		details->nonlinear_vma = vma;
1841 		if (unmap_mapping_range_vma(vma, vma->vm_start,
1842 					vma->vm_end, details) < 0)
1843 			goto restart;
1844 	}
1845 }
1846 
1847 /**
1848  * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
1849  * @mapping: the address space containing mmaps to be unmapped.
1850  * @holebegin: byte in first page to unmap, relative to the start of
1851  * the underlying file.  This will be rounded down to a PAGE_SIZE
1852  * boundary.  Note that this is different from vmtruncate(), which
1853  * must keep the partial page.  In contrast, we must get rid of
1854  * partial pages.
1855  * @holelen: size of prospective hole in bytes.  This will be rounded
1856  * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
1857  * end of the file.
1858  * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1859  * but 0 when invalidating pagecache, don't throw away private data.
1860  */
1861 void unmap_mapping_range(struct address_space *mapping,
1862 		loff_t const holebegin, loff_t const holelen, int even_cows)
1863 {
1864 	struct zap_details details;
1865 	pgoff_t hba = holebegin >> PAGE_SHIFT;
1866 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1867 
1868 	/* Check for overflow. */
1869 	if (sizeof(holelen) > sizeof(hlen)) {
1870 		long long holeend =
1871 			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1872 		if (holeend & ~(long long)ULONG_MAX)
1873 			hlen = ULONG_MAX - hba + 1;
1874 	}
1875 
1876 	details.check_mapping = even_cows? NULL: mapping;
1877 	details.nonlinear_vma = NULL;
1878 	details.first_index = hba;
1879 	details.last_index = hba + hlen - 1;
1880 	if (details.last_index < details.first_index)
1881 		details.last_index = ULONG_MAX;
1882 	details.i_mmap_lock = &mapping->i_mmap_lock;
1883 
1884 	spin_lock(&mapping->i_mmap_lock);
1885 
1886 	/* Protect against endless unmapping loops */
1887 	mapping->truncate_count++;
1888 	if (unlikely(is_restart_addr(mapping->truncate_count))) {
1889 		if (mapping->truncate_count == 0)
1890 			reset_vma_truncate_counts(mapping);
1891 		mapping->truncate_count++;
1892 	}
1893 	details.truncate_count = mapping->truncate_count;
1894 
1895 	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1896 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
1897 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1898 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1899 	spin_unlock(&mapping->i_mmap_lock);
1900 }
1901 EXPORT_SYMBOL(unmap_mapping_range);
1902 
1903 /**
1904  * vmtruncate - unmap mappings "freed" by truncate() syscall
1905  * @inode: inode of the file used
1906  * @offset: file offset to start truncating
1907  *
1908  * NOTE! We have to be ready to update the memory sharing
1909  * between the file and the memory map for a potential last
1910  * incomplete page.  Ugly, but necessary.
1911  */
1912 int vmtruncate(struct inode * inode, loff_t offset)
1913 {
1914 	struct address_space *mapping = inode->i_mapping;
1915 	unsigned long limit;
1916 
1917 	if (inode->i_size < offset)
1918 		goto do_expand;
1919 	/*
1920 	 * truncation of in-use swapfiles is disallowed - it would cause
1921 	 * subsequent swapout to scribble on the now-freed blocks.
1922 	 */
1923 	if (IS_SWAPFILE(inode))
1924 		goto out_busy;
1925 	i_size_write(inode, offset);
1926 
1927 	/*
1928 	 * unmap_mapping_range is called twice, first simply for efficiency
1929 	 * so that truncate_inode_pages does fewer single-page unmaps. However
1930 	 * after this first call, and before truncate_inode_pages finishes,
1931 	 * it is possible for private pages to be COWed, which remain after
1932 	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1933 	 * call must be made for correctness.
1934 	 */
1935 	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1936 	truncate_inode_pages(mapping, offset);
1937 	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1938 	goto out_truncate;
1939 
1940 do_expand:
1941 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1942 	if (limit != RLIM_INFINITY && offset > limit)
1943 		goto out_sig;
1944 	if (offset > inode->i_sb->s_maxbytes)
1945 		goto out_big;
1946 	i_size_write(inode, offset);
1947 
1948 out_truncate:
1949 	if (inode->i_op && inode->i_op->truncate)
1950 		inode->i_op->truncate(inode);
1951 	return 0;
1952 out_sig:
1953 	send_sig(SIGXFSZ, current, 0);
1954 out_big:
1955 	return -EFBIG;
1956 out_busy:
1957 	return -ETXTBSY;
1958 }
1959 EXPORT_SYMBOL(vmtruncate);
1960 
1961 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1962 {
1963 	struct address_space *mapping = inode->i_mapping;
1964 
1965 	/*
1966 	 * If the underlying filesystem is not going to provide
1967 	 * a way to truncate a range of blocks (punch a hole) -
1968 	 * we should return failure right now.
1969 	 */
1970 	if (!inode->i_op || !inode->i_op->truncate_range)
1971 		return -ENOSYS;
1972 
1973 	mutex_lock(&inode->i_mutex);
1974 	down_write(&inode->i_alloc_sem);
1975 	unmap_mapping_range(mapping, offset, (end - offset), 1);
1976 	truncate_inode_pages_range(mapping, offset, end);
1977 	unmap_mapping_range(mapping, offset, (end - offset), 1);
1978 	inode->i_op->truncate_range(inode, offset, end);
1979 	up_write(&inode->i_alloc_sem);
1980 	mutex_unlock(&inode->i_mutex);
1981 
1982 	return 0;
1983 }
1984 
1985 /**
1986  * swapin_readahead - swap in pages in hope we need them soon
1987  * @entry: swap entry of this memory
1988  * @addr: address to start
1989  * @vma: user vma this addresses belong to
1990  *
1991  * Primitive swap readahead code. We simply read an aligned block of
1992  * (1 << page_cluster) entries in the swap area. This method is chosen
1993  * because it doesn't cost us any seek time.  We also make sure to queue
1994  * the 'original' request together with the readahead ones...
1995  *
1996  * This has been extended to use the NUMA policies from the mm triggering
1997  * the readahead.
1998  *
1999  * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
2000  */
2001 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2002 {
2003 #ifdef CONFIG_NUMA
2004 	struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2005 #endif
2006 	int i, num;
2007 	struct page *new_page;
2008 	unsigned long offset;
2009 
2010 	/*
2011 	 * Get the number of handles we should do readahead io to.
2012 	 */
2013 	num = valid_swaphandles(entry, &offset);
2014 	for (i = 0; i < num; offset++, i++) {
2015 		/* Ok, do the async read-ahead now */
2016 		new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2017 							   offset), vma, addr);
2018 		if (!new_page)
2019 			break;
2020 		page_cache_release(new_page);
2021 #ifdef CONFIG_NUMA
2022 		/*
2023 		 * Find the next applicable VMA for the NUMA policy.
2024 		 */
2025 		addr += PAGE_SIZE;
2026 		if (addr == 0)
2027 			vma = NULL;
2028 		if (vma) {
2029 			if (addr >= vma->vm_end) {
2030 				vma = next_vma;
2031 				next_vma = vma ? vma->vm_next : NULL;
2032 			}
2033 			if (vma && addr < vma->vm_start)
2034 				vma = NULL;
2035 		} else {
2036 			if (next_vma && addr >= next_vma->vm_start) {
2037 				vma = next_vma;
2038 				next_vma = vma->vm_next;
2039 			}
2040 		}
2041 #endif
2042 	}
2043 	lru_add_drain();	/* Push any new pages onto the LRU now */
2044 }
2045 
2046 /*
2047  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2048  * but allow concurrent faults), and pte mapped but not yet locked.
2049  * We return with mmap_sem still held, but pte unmapped and unlocked.
2050  */
2051 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2052 		unsigned long address, pte_t *page_table, pmd_t *pmd,
2053 		int write_access, pte_t orig_pte)
2054 {
2055 	spinlock_t *ptl;
2056 	struct page *page;
2057 	swp_entry_t entry;
2058 	pte_t pte;
2059 	int ret = 0;
2060 
2061 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2062 		goto out;
2063 
2064 	entry = pte_to_swp_entry(orig_pte);
2065 	if (is_migration_entry(entry)) {
2066 		migration_entry_wait(mm, pmd, address);
2067 		goto out;
2068 	}
2069 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2070 	page = lookup_swap_cache(entry);
2071 	if (!page) {
2072 		grab_swap_token(); /* Contend for token _before_ read-in */
2073  		swapin_readahead(entry, address, vma);
2074  		page = read_swap_cache_async(entry, vma, address);
2075 		if (!page) {
2076 			/*
2077 			 * Back out if somebody else faulted in this pte
2078 			 * while we released the pte lock.
2079 			 */
2080 			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2081 			if (likely(pte_same(*page_table, orig_pte)))
2082 				ret = VM_FAULT_OOM;
2083 			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2084 			goto unlock;
2085 		}
2086 
2087 		/* Had to read the page from swap area: Major fault */
2088 		ret = VM_FAULT_MAJOR;
2089 		count_vm_event(PGMAJFAULT);
2090 	}
2091 
2092 	mark_page_accessed(page);
2093 	lock_page(page);
2094 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2095 
2096 	/*
2097 	 * Back out if somebody else already faulted in this pte.
2098 	 */
2099 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2100 	if (unlikely(!pte_same(*page_table, orig_pte)))
2101 		goto out_nomap;
2102 
2103 	if (unlikely(!PageUptodate(page))) {
2104 		ret = VM_FAULT_SIGBUS;
2105 		goto out_nomap;
2106 	}
2107 
2108 	/* The page isn't present yet, go ahead with the fault. */
2109 
2110 	inc_mm_counter(mm, anon_rss);
2111 	pte = mk_pte(page, vma->vm_page_prot);
2112 	if (write_access && can_share_swap_page(page)) {
2113 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2114 		write_access = 0;
2115 	}
2116 
2117 	flush_icache_page(vma, page);
2118 	set_pte_at(mm, address, page_table, pte);
2119 	page_add_anon_rmap(page, vma, address);
2120 
2121 	swap_free(entry);
2122 	if (vm_swap_full())
2123 		remove_exclusive_swap_page(page);
2124 	unlock_page(page);
2125 
2126 	if (write_access) {
2127 		/* XXX: We could OR the do_wp_page code with this one? */
2128 		if (do_wp_page(mm, vma, address,
2129 				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
2130 			ret = VM_FAULT_OOM;
2131 		goto out;
2132 	}
2133 
2134 	/* No need to invalidate - it was non-present before */
2135 	update_mmu_cache(vma, address, pte);
2136 unlock:
2137 	pte_unmap_unlock(page_table, ptl);
2138 out:
2139 	return ret;
2140 out_nomap:
2141 	pte_unmap_unlock(page_table, ptl);
2142 	unlock_page(page);
2143 	page_cache_release(page);
2144 	return ret;
2145 }
2146 
2147 /*
2148  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2149  * but allow concurrent faults), and pte mapped but not yet locked.
2150  * We return with mmap_sem still held, but pte unmapped and unlocked.
2151  */
2152 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2153 		unsigned long address, pte_t *page_table, pmd_t *pmd,
2154 		int write_access)
2155 {
2156 	struct page *page;
2157 	spinlock_t *ptl;
2158 	pte_t entry;
2159 
2160 	/* Allocate our own private page. */
2161 	pte_unmap(page_table);
2162 
2163 	if (unlikely(anon_vma_prepare(vma)))
2164 		goto oom;
2165 	page = alloc_zeroed_user_highpage_movable(vma, address);
2166 	if (!page)
2167 		goto oom;
2168 
2169 	entry = mk_pte(page, vma->vm_page_prot);
2170 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2171 
2172 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2173 	if (!pte_none(*page_table))
2174 		goto release;
2175 	inc_mm_counter(mm, anon_rss);
2176 	lru_cache_add_active(page);
2177 	page_add_new_anon_rmap(page, vma, address);
2178 	set_pte_at(mm, address, page_table, entry);
2179 
2180 	/* No need to invalidate - it was non-present before */
2181 	update_mmu_cache(vma, address, entry);
2182 unlock:
2183 	pte_unmap_unlock(page_table, ptl);
2184 	return 0;
2185 release:
2186 	page_cache_release(page);
2187 	goto unlock;
2188 oom:
2189 	return VM_FAULT_OOM;
2190 }
2191 
2192 /*
2193  * __do_fault() tries to create a new page mapping. It aggressively
2194  * tries to share with existing pages, but makes a separate copy if
2195  * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
2196  * the next page fault.
2197  *
2198  * As this is called only for pages that do not currently exist, we
2199  * do not need to flush old virtual caches or the TLB.
2200  *
2201  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2202  * but allow concurrent faults), and pte neither mapped nor locked.
2203  * We return with mmap_sem still held, but pte unmapped and unlocked.
2204  */
2205 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2206 		unsigned long address, pmd_t *pmd,
2207 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2208 {
2209 	pte_t *page_table;
2210 	spinlock_t *ptl;
2211 	struct page *page;
2212 	pte_t entry;
2213 	int anon = 0;
2214 	struct page *dirty_page = NULL;
2215 	struct vm_fault vmf;
2216 	int ret;
2217 	int page_mkwrite = 0;
2218 
2219 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2220 	vmf.pgoff = pgoff;
2221 	vmf.flags = flags;
2222 	vmf.page = NULL;
2223 
2224 	BUG_ON(vma->vm_flags & VM_PFNMAP);
2225 
2226 	if (likely(vma->vm_ops->fault)) {
2227 		ret = vma->vm_ops->fault(vma, &vmf);
2228 		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2229 			return ret;
2230 	} else {
2231 		/* Legacy ->nopage path */
2232 		ret = 0;
2233 		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2234 		/* no page was available -- either SIGBUS or OOM */
2235 		if (unlikely(vmf.page == NOPAGE_SIGBUS))
2236 			return VM_FAULT_SIGBUS;
2237 		else if (unlikely(vmf.page == NOPAGE_OOM))
2238 			return VM_FAULT_OOM;
2239 	}
2240 
2241 	/*
2242 	 * For consistency in subsequent calls, make the faulted page always
2243 	 * locked.
2244 	 */
2245 	if (unlikely(!(ret & VM_FAULT_LOCKED)))
2246 		lock_page(vmf.page);
2247 	else
2248 		VM_BUG_ON(!PageLocked(vmf.page));
2249 
2250 	/*
2251 	 * Should we do an early C-O-W break?
2252 	 */
2253 	page = vmf.page;
2254 	if (flags & FAULT_FLAG_WRITE) {
2255 		if (!(vma->vm_flags & VM_SHARED)) {
2256 			anon = 1;
2257 			if (unlikely(anon_vma_prepare(vma))) {
2258 				ret = VM_FAULT_OOM;
2259 				goto out;
2260 			}
2261 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
2262 						vma, address);
2263 			if (!page) {
2264 				ret = VM_FAULT_OOM;
2265 				goto out;
2266 			}
2267 			copy_user_highpage(page, vmf.page, address, vma);
2268 		} else {
2269 			/*
2270 			 * If the page will be shareable, see if the backing
2271 			 * address space wants to know that the page is about
2272 			 * to become writable
2273 			 */
2274 			if (vma->vm_ops->page_mkwrite) {
2275 				unlock_page(page);
2276 				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
2277 					ret = VM_FAULT_SIGBUS;
2278 					anon = 1; /* no anon but release vmf.page */
2279 					goto out_unlocked;
2280 				}
2281 				lock_page(page);
2282 				/*
2283 				 * XXX: this is not quite right (racy vs
2284 				 * invalidate) to unlock and relock the page
2285 				 * like this, however a better fix requires
2286 				 * reworking page_mkwrite locking API, which
2287 				 * is better done later.
2288 				 */
2289 				if (!page->mapping) {
2290 					ret = 0;
2291 					anon = 1; /* no anon but release vmf.page */
2292 					goto out;
2293 				}
2294 				page_mkwrite = 1;
2295 			}
2296 		}
2297 
2298 	}
2299 
2300 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2301 
2302 	/*
2303 	 * This silly early PAGE_DIRTY setting removes a race
2304 	 * due to the bad i386 page protection. But it's valid
2305 	 * for other architectures too.
2306 	 *
2307 	 * Note that if write_access is true, we either now have
2308 	 * an exclusive copy of the page, or this is a shared mapping,
2309 	 * so we can make it writable and dirty to avoid having to
2310 	 * handle that later.
2311 	 */
2312 	/* Only go through if we didn't race with anybody else... */
2313 	if (likely(pte_same(*page_table, orig_pte))) {
2314 		flush_icache_page(vma, page);
2315 		entry = mk_pte(page, vma->vm_page_prot);
2316 		if (flags & FAULT_FLAG_WRITE)
2317 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2318 		set_pte_at(mm, address, page_table, entry);
2319 		if (anon) {
2320                         inc_mm_counter(mm, anon_rss);
2321                         lru_cache_add_active(page);
2322                         page_add_new_anon_rmap(page, vma, address);
2323 		} else {
2324 			inc_mm_counter(mm, file_rss);
2325 			page_add_file_rmap(page);
2326 			if (flags & FAULT_FLAG_WRITE) {
2327 				dirty_page = page;
2328 				get_page(dirty_page);
2329 			}
2330 		}
2331 
2332 		/* no need to invalidate: a not-present page won't be cached */
2333 		update_mmu_cache(vma, address, entry);
2334 	} else {
2335 		if (anon)
2336 			page_cache_release(page);
2337 		else
2338 			anon = 1; /* no anon but release faulted_page */
2339 	}
2340 
2341 	pte_unmap_unlock(page_table, ptl);
2342 
2343 out:
2344 	unlock_page(vmf.page);
2345 out_unlocked:
2346 	if (anon)
2347 		page_cache_release(vmf.page);
2348 	else if (dirty_page) {
2349 		if (vma->vm_file)
2350 			file_update_time(vma->vm_file);
2351 
2352 		set_page_dirty_balance(dirty_page, page_mkwrite);
2353 		put_page(dirty_page);
2354 	}
2355 
2356 	return ret;
2357 }
2358 
2359 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2360 		unsigned long address, pte_t *page_table, pmd_t *pmd,
2361 		int write_access, pte_t orig_pte)
2362 {
2363 	pgoff_t pgoff = (((address & PAGE_MASK)
2364 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2365 	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2366 
2367 	pte_unmap(page_table);
2368 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2369 }
2370 
2371 
2372 /*
2373  * do_no_pfn() tries to create a new page mapping for a page without
2374  * a struct_page backing it
2375  *
2376  * As this is called only for pages that do not currently exist, we
2377  * do not need to flush old virtual caches or the TLB.
2378  *
2379  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2380  * but allow concurrent faults), and pte mapped but not yet locked.
2381  * We return with mmap_sem still held, but pte unmapped and unlocked.
2382  *
2383  * It is expected that the ->nopfn handler always returns the same pfn
2384  * for a given virtual mapping.
2385  *
2386  * Mark this `noinline' to prevent it from bloating the main pagefault code.
2387  */
2388 static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2389 		     unsigned long address, pte_t *page_table, pmd_t *pmd,
2390 		     int write_access)
2391 {
2392 	spinlock_t *ptl;
2393 	pte_t entry;
2394 	unsigned long pfn;
2395 
2396 	pte_unmap(page_table);
2397 	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2398 	BUG_ON(is_cow_mapping(vma->vm_flags));
2399 
2400 	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2401 	if (unlikely(pfn == NOPFN_OOM))
2402 		return VM_FAULT_OOM;
2403 	else if (unlikely(pfn == NOPFN_SIGBUS))
2404 		return VM_FAULT_SIGBUS;
2405 	else if (unlikely(pfn == NOPFN_REFAULT))
2406 		return 0;
2407 
2408 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2409 
2410 	/* Only go through if we didn't race with anybody else... */
2411 	if (pte_none(*page_table)) {
2412 		entry = pfn_pte(pfn, vma->vm_page_prot);
2413 		if (write_access)
2414 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2415 		set_pte_at(mm, address, page_table, entry);
2416 	}
2417 	pte_unmap_unlock(page_table, ptl);
2418 	return 0;
2419 }
2420 
2421 /*
2422  * Fault of a previously existing named mapping. Repopulate the pte
2423  * from the encoded file_pte if possible. This enables swappable
2424  * nonlinear vmas.
2425  *
2426  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2427  * but allow concurrent faults), and pte mapped but not yet locked.
2428  * We return with mmap_sem still held, but pte unmapped and unlocked.
2429  */
2430 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2431 		unsigned long address, pte_t *page_table, pmd_t *pmd,
2432 		int write_access, pte_t orig_pte)
2433 {
2434 	unsigned int flags = FAULT_FLAG_NONLINEAR |
2435 				(write_access ? FAULT_FLAG_WRITE : 0);
2436 	pgoff_t pgoff;
2437 
2438 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2439 		return 0;
2440 
2441 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
2442 			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
2443 		/*
2444 		 * Page table corrupted: show pte and kill process.
2445 		 */
2446 		print_bad_pte(vma, orig_pte, address);
2447 		return VM_FAULT_OOM;
2448 	}
2449 
2450 	pgoff = pte_to_pgoff(orig_pte);
2451 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2452 }
2453 
2454 /*
2455  * These routines also need to handle stuff like marking pages dirty
2456  * and/or accessed for architectures that don't do it in hardware (most
2457  * RISC architectures).  The early dirtying is also good on the i386.
2458  *
2459  * There is also a hook called "update_mmu_cache()" that architectures
2460  * with external mmu caches can use to update those (ie the Sparc or
2461  * PowerPC hashed page tables that act as extended TLBs).
2462  *
2463  * We enter with non-exclusive mmap_sem (to exclude vma changes,
2464  * but allow concurrent faults), and pte mapped but not yet locked.
2465  * We return with mmap_sem still held, but pte unmapped and unlocked.
2466  */
2467 static inline int handle_pte_fault(struct mm_struct *mm,
2468 		struct vm_area_struct *vma, unsigned long address,
2469 		pte_t *pte, pmd_t *pmd, int write_access)
2470 {
2471 	pte_t entry;
2472 	spinlock_t *ptl;
2473 
2474 	entry = *pte;
2475 	if (!pte_present(entry)) {
2476 		if (pte_none(entry)) {
2477 			if (vma->vm_ops) {
2478 				if (vma->vm_ops->fault || vma->vm_ops->nopage)
2479 					return do_linear_fault(mm, vma, address,
2480 						pte, pmd, write_access, entry);
2481 				if (unlikely(vma->vm_ops->nopfn))
2482 					return do_no_pfn(mm, vma, address, pte,
2483 							 pmd, write_access);
2484 			}
2485 			return do_anonymous_page(mm, vma, address,
2486 						 pte, pmd, write_access);
2487 		}
2488 		if (pte_file(entry))
2489 			return do_nonlinear_fault(mm, vma, address,
2490 					pte, pmd, write_access, entry);
2491 		return do_swap_page(mm, vma, address,
2492 					pte, pmd, write_access, entry);
2493 	}
2494 
2495 	ptl = pte_lockptr(mm, pmd);
2496 	spin_lock(ptl);
2497 	if (unlikely(!pte_same(*pte, entry)))
2498 		goto unlock;
2499 	if (write_access) {
2500 		if (!pte_write(entry))
2501 			return do_wp_page(mm, vma, address,
2502 					pte, pmd, ptl, entry);
2503 		entry = pte_mkdirty(entry);
2504 	}
2505 	entry = pte_mkyoung(entry);
2506 	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2507 		update_mmu_cache(vma, address, entry);
2508 	} else {
2509 		/*
2510 		 * This is needed only for protection faults but the arch code
2511 		 * is not yet telling us if this is a protection fault or not.
2512 		 * This still avoids useless tlb flushes for .text page faults
2513 		 * with threads.
2514 		 */
2515 		if (write_access)
2516 			flush_tlb_page(vma, address);
2517 	}
2518 unlock:
2519 	pte_unmap_unlock(pte, ptl);
2520 	return 0;
2521 }
2522 
2523 /*
2524  * By the time we get here, we already hold the mm semaphore
2525  */
2526 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2527 		unsigned long address, int write_access)
2528 {
2529 	pgd_t *pgd;
2530 	pud_t *pud;
2531 	pmd_t *pmd;
2532 	pte_t *pte;
2533 
2534 	__set_current_state(TASK_RUNNING);
2535 
2536 	count_vm_event(PGFAULT);
2537 
2538 	if (unlikely(is_vm_hugetlb_page(vma)))
2539 		return hugetlb_fault(mm, vma, address, write_access);
2540 
2541 	pgd = pgd_offset(mm, address);
2542 	pud = pud_alloc(mm, pgd, address);
2543 	if (!pud)
2544 		return VM_FAULT_OOM;
2545 	pmd = pmd_alloc(mm, pud, address);
2546 	if (!pmd)
2547 		return VM_FAULT_OOM;
2548 	pte = pte_alloc_map(mm, pmd, address);
2549 	if (!pte)
2550 		return VM_FAULT_OOM;
2551 
2552 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2553 }
2554 
2555 #ifndef __PAGETABLE_PUD_FOLDED
2556 /*
2557  * Allocate page upper directory.
2558  * We've already handled the fast-path in-line.
2559  */
2560 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2561 {
2562 	pud_t *new = pud_alloc_one(mm, address);
2563 	if (!new)
2564 		return -ENOMEM;
2565 
2566 	spin_lock(&mm->page_table_lock);
2567 	if (pgd_present(*pgd))		/* Another has populated it */
2568 		pud_free(new);
2569 	else
2570 		pgd_populate(mm, pgd, new);
2571 	spin_unlock(&mm->page_table_lock);
2572 	return 0;
2573 }
2574 #endif /* __PAGETABLE_PUD_FOLDED */
2575 
2576 #ifndef __PAGETABLE_PMD_FOLDED
2577 /*
2578  * Allocate page middle directory.
2579  * We've already handled the fast-path in-line.
2580  */
2581 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2582 {
2583 	pmd_t *new = pmd_alloc_one(mm, address);
2584 	if (!new)
2585 		return -ENOMEM;
2586 
2587 	spin_lock(&mm->page_table_lock);
2588 #ifndef __ARCH_HAS_4LEVEL_HACK
2589 	if (pud_present(*pud))		/* Another has populated it */
2590 		pmd_free(new);
2591 	else
2592 		pud_populate(mm, pud, new);
2593 #else
2594 	if (pgd_present(*pud))		/* Another has populated it */
2595 		pmd_free(new);
2596 	else
2597 		pgd_populate(mm, pud, new);
2598 #endif /* __ARCH_HAS_4LEVEL_HACK */
2599 	spin_unlock(&mm->page_table_lock);
2600 	return 0;
2601 }
2602 #endif /* __PAGETABLE_PMD_FOLDED */
2603 
2604 int make_pages_present(unsigned long addr, unsigned long end)
2605 {
2606 	int ret, len, write;
2607 	struct vm_area_struct * vma;
2608 
2609 	vma = find_vma(current->mm, addr);
2610 	if (!vma)
2611 		return -1;
2612 	write = (vma->vm_flags & VM_WRITE) != 0;
2613 	BUG_ON(addr >= end);
2614 	BUG_ON(end > vma->vm_end);
2615 	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2616 	ret = get_user_pages(current, current->mm, addr,
2617 			len, write, 0, NULL, NULL);
2618 	if (ret < 0)
2619 		return ret;
2620 	return ret == len ? 0 : -1;
2621 }
2622 
2623 /*
2624  * Map a vmalloc()-space virtual address to the physical page.
2625  */
2626 struct page * vmalloc_to_page(void * vmalloc_addr)
2627 {
2628 	unsigned long addr = (unsigned long) vmalloc_addr;
2629 	struct page *page = NULL;
2630 	pgd_t *pgd = pgd_offset_k(addr);
2631 	pud_t *pud;
2632 	pmd_t *pmd;
2633 	pte_t *ptep, pte;
2634 
2635 	if (!pgd_none(*pgd)) {
2636 		pud = pud_offset(pgd, addr);
2637 		if (!pud_none(*pud)) {
2638 			pmd = pmd_offset(pud, addr);
2639 			if (!pmd_none(*pmd)) {
2640 				ptep = pte_offset_map(pmd, addr);
2641 				pte = *ptep;
2642 				if (pte_present(pte))
2643 					page = pte_page(pte);
2644 				pte_unmap(ptep);
2645 			}
2646 		}
2647 	}
2648 	return page;
2649 }
2650 
2651 EXPORT_SYMBOL(vmalloc_to_page);
2652 
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The result of vmalloc_to_page() is fed to page_to_pfn() without a NULL
 * check.
 */
unsigned long vmalloc_to_pfn(void * vmalloc_addr)
{
	struct page *page = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(page);
}

EXPORT_SYMBOL(vmalloc_to_pfn);
2662 
2663 #if !defined(__HAVE_ARCH_GATE_AREA)
2664 
2665 #if defined(AT_SYSINFO_EHDR)
2666 static struct vm_area_struct gate_vma;
2667 
2668 static int __init gate_vma_init(void)
2669 {
2670 	gate_vma.vm_mm = NULL;
2671 	gate_vma.vm_start = FIXADDR_USER_START;
2672 	gate_vma.vm_end = FIXADDR_USER_END;
2673 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
2674 	gate_vma.vm_page_prot = __P101;
2675 	/*
2676 	 * Make sure the vDSO gets into every core dump.
2677 	 * Dumping its contents makes post-mortem fully interpretable later
2678 	 * without matching up the same kernel and hardware config to see
2679 	 * what PC values meant.
2680 	 */
2681 	gate_vma.vm_flags |= VM_ALWAYSDUMP;
2682 	return 0;
2683 }
2684 __initcall(gate_vma_init);
2685 #endif
2686 
2687 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2688 {
2689 #ifdef AT_SYSINFO_EHDR
2690 	return &gate_vma;
2691 #else
2692 	return NULL;
2693 #endif
2694 }
2695 
/*
 * Return 1 when @addr lies in [FIXADDR_USER_START, FIXADDR_USER_END) and
 * AT_SYSINFO_EHDR is defined; return 0 in every other case.
 */
int in_gate_area_no_task(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
	return addr >= FIXADDR_USER_START && addr < FIXADDR_USER_END;
#else
	return 0;
#endif
}
2704 
2705 #endif	/* __HAVE_ARCH_GATE_AREA */
2706 
2707 /*
2708  * Access another process' address space.
2709  * Source/target buffer must be kernel space,
2710  * Do not walk the page table directly, use get_user_pages
2711  */
2712 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2713 {
2714 	struct mm_struct *mm;
2715 	struct vm_area_struct *vma;
2716 	struct page *page;
2717 	void *old_buf = buf;
2718 
2719 	mm = get_task_mm(tsk);
2720 	if (!mm)
2721 		return 0;
2722 
2723 	down_read(&mm->mmap_sem);
2724 	/* ignore errors, just check how much was successfully transferred */
2725 	while (len) {
2726 		int bytes, ret, offset;
2727 		void *maddr;
2728 
2729 		ret = get_user_pages(tsk, mm, addr, 1,
2730 				write, 1, &page, &vma);
2731 		if (ret <= 0)
2732 			break;
2733 
2734 		bytes = len;
2735 		offset = addr & (PAGE_SIZE-1);
2736 		if (bytes > PAGE_SIZE-offset)
2737 			bytes = PAGE_SIZE-offset;
2738 
2739 		maddr = kmap(page);
2740 		if (write) {
2741 			copy_to_user_page(vma, page, addr,
2742 					  maddr + offset, buf, bytes);
2743 			set_page_dirty_lock(page);
2744 		} else {
2745 			copy_from_user_page(vma, page, addr,
2746 					    buf, maddr + offset, bytes);
2747 		}
2748 		kunmap(page);
2749 		page_cache_release(page);
2750 		len -= bytes;
2751 		buf += bytes;
2752 		addr += bytes;
2753 	}
2754 	up_read(&mm->mmap_sem);
2755 	mmput(mm);
2756 
2757 	return buf - old_buf;
2758 }
2759