xref: /linux/mm/pagewalk.c (revision 78ddc53473419073ffb2e91178001e87bc513524)
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;

	pte = pte_offset_map(pmd, addr);
	for (;;) {
		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap(pte);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler needs to know
		 * about pmd_trans_huge() pmds (see the illustrative handler
		 * sketched after this function).
		 */
		if (walk->pmd_entry)
			err = walk->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to.
		 */
		if (!walk->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
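/*
 * Illustrative only: a minimal ->pmd_entry() handler that is aware of
 * transparent huge pmds, as the comment in walk_pmd_range() above requires.
 * The handler name and the counter passed via walk->private are assumptions
 * made for this sketch, not part of this file; a real handler would also
 * take the appropriate pmd lock before inspecting a huge pmd.  Returning 0
 * continues the walk, and the walker only splits the pmd afterwards when
 * ->pte_entry is also set.
 *
 *	static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				     unsigned long end, struct mm_walk *walk)
 *	{
 *		unsigned long *nr_thp = walk->private;
 *
 *		if (pmd_trans_huge(*pmd))
 *			(*nr_thp)++;
 *		return 0;
 *	}
 */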

static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pud_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	pte_t *pte;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask);
		if (pte && walk->hugetlb_entry)
			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip it. A negative return value means
 * an error occurred, in which case we abort the current walk.
 * (An illustrative ->test_walk() callback is sketched after this function.)
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	if (walk->test_walk)
		return walk->test_walk(start, end, walk);

	/*
	 * A VM_PFNMAP vma has no valid struct pages behind its range, so we
	 * don't walk over it as we do for normal vmas. However, some callers
	 * are interested in handling hole ranges and don't want any address
	 * range to be silently ignored. Such users are expected to define a
	 * ->pte_hole() callback, so delegate handling of VM_PFNMAP vmas to it.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (walk->pte_hole)
			err = walk->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}
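/*
 * Illustrative only: a ->test_walk() callback that skips vmas the caller is
 * not interested in, following the return convention described above
 * (0 = walk the vma, 1 = skip it, negative = abort the walk).  The callback
 * name and the choice to skip mlocked vmas are assumptions made for this
 * sketch, not part of this file.
 *
 *	static int example_test_walk(unsigned long start, unsigned long end,
 *				     struct mm_walk *walk)
 *	{
 *		if (walk->vma->vm_flags & VM_LOCKED)
 *			return 1;
 *		return 0;
 *	}
 */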

static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 *
 * Recursively walk the page table tree of the process represented by @walk->mm
 * within the virtual address range [@start, @end). During walking, we can do
 * caller-specific work for each entry by setting up pmd_entry(), pte_entry(),
 * and/or hugetlb_entry(). If some of these callbacks are not set up, the
 * associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *  - 0  : the current entry was handled successfully; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with this caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller with this
 *         error code.
 *
 * Before starting to walk the page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass
 * caller-specific data to the callbacks, @walk->private should be helpful.
 * (An illustrative caller is sketched after this function.)
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access vma data.
 */
int walk_page_range(unsigned long start, unsigned long end,
		    struct mm_walk *walk)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;

	if (start >= end)
		return -EINVAL;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

	vma = find_vma(walk->mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk->vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk->vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk->vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, walk);
			if (err > 0) {
				/*
				 * Positive return values are purely for
				 * controlling the pagewalk, so they should
				 * never be passed back to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk->vma || walk->pte_hole)
			err = __walk_page_range(start, next, walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
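/*
 * Illustrative only: a minimal caller of walk_page_range().  The callback
 * name, the counter and the wrapper function are assumptions made for this
 * sketch, not part of this file; it only shows how struct mm_walk is
 * typically filled in and how mmap_sem must be held around the walk.
 *
 *	static int count_pte(pte_t *pte, unsigned long addr,
 *			     unsigned long end, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static unsigned long count_present_ptes(struct mm_struct *mm,
 *						unsigned long start,
 *						unsigned long end)
 *	{
 *		unsigned long count = 0;
 *		struct mm_walk count_walk = {
 *			.pte_entry	= count_pte,
 *			.mm		= mm,
 *			.private	= &count,
 *		};
 *
 *		down_read(&mm->mmap_sem);
 *		walk_page_range(start, end, &count_walk);
 *		up_read(&mm->mmap_sem);
 *		return count;
 *	}
 */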

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
	int err;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
	VM_BUG_ON(!vma);
	walk->vma = vma;
	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}