/* mm/pagewalk.c (revision c2febafc67734a62196c1b9dfba926412d4077ba) */
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;

	pte = pte_offset_map(pmd, addr);
	for (;;) {
		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap(pte);
	return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || !walk->vma) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		/*
		 * This implies that each ->pmd_entry() handler needs to know
		 * about pmd_trans_huge() pmds (see the illustrative sketch
		 * after this function).
		 */
		if (walk->pmd_entry)
			err = walk->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if (!walk->pte_entry)
			continue;

		split_huge_pmd(walk->vma, pmd, addr);
		if (pmd_trans_unstable(pmd))
			goto again;
		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
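
/*
 * Illustrative only, not part of this file: a minimal ->pmd_entry() handler
 * of the kind the comment in walk_pmd_range() above asks for.  It has to
 * cope with transparent huge pmds itself, because walk_pmd_range() invokes
 * it before any splitting happens.  The name example_pmd_entry is a made-up
 * placeholder, which is why the sketch is kept under #if 0.
 */
#if 0	/* hypothetical sketch */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	spinlock_t *ptl = pmd_trans_huge_lock(pmd, walk->vma);

	if (ptl) {
		/* a single huge pmd maps the whole [addr, next) range */
		spin_unlock(ptl);
		return 0;
	}
	/* normal pmd: let ->pte_entry() (if any) handle the ptes below it */
	return 0;
}
#endif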

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
 again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || !walk->vma) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		if (walk->pud_entry) {
			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

			if (ptl) {
				err = walk->pud_entry(pud, addr, next, walk);
				spin_unlock(ptl);
				if (err)
					break;
				continue;
			}
		}

		split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (walk->pte_hole)
				err = walk->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (walk->pmd_entry || walk->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	pte_t *pte;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask);
		if (pte && walk->hugetlb_entry)
			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it.  Return 0 to walk over the current vma, 1 to skip it, or a
 * negative error code to abort the current walk.  (An illustrative
 * ->test_walk() callback follows this function.)
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	if (walk->test_walk)
		return walk->test_walk(start, end, walk);

	/*
	 * A VM_PFNMAP vma has no valid struct pages behind its range, so we
	 * don't walk over it as we do for normal vmas.  However, some callers
	 * want to handle hole ranges rather than have any address range
	 * silently ignored.  Such callers define a ->pte_hole() callback, so
	 * delegate VM_PFNMAP vmas to it.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (walk->pte_hole)
			err = walk->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}
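
/*
 * Illustrative only, not part of this file: a ->test_walk() callback using
 * the return convention documented above walk_page_test() -- 0 walks the
 * vma, 1 skips it, a negative value aborts the walk.  This hypothetical
 * example skips mlocked vmas; the function name is made up, hence #if 0.
 */
#if 0	/* hypothetical sketch */
static int skip_mlocked_test_walk(unsigned long start, unsigned long end,
				  struct mm_walk *walk)
{
	if (walk->vma->vm_flags & VM_LOCKED)
		return 1;	/* skip this vma, keep walking the rest */
	return 0;		/* walk this vma */
}
#endif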

static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;

	if (vma && is_vm_hugetlb_page(vma)) {
		if (walk->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller-specific callbacks
 *
 * Recursively walk the page table tree of the process represented by @walk->mm
 * within the virtual address range [@start, @end).  During the walk, caller-
 * specific work can be done for each entry by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry().  Entries whose callbacks are not set
 * up are simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *  - 0  : the current entry was handled; if the end address has not been
 *         reached yet, continue the walk.
 *  - >0 : the current entry was handled; stop the walk and return this
 *         caller-specific value to the caller.
 *  - <0 : failed to handle the current entry; stop the walk and return this
 *         error code to the caller.
 *
 * Before starting to walk page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags.  walk_page_test() and @walk->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks.  If you want to pass some
 * caller-specific data to callbacks, @walk->private should be helpful.
 * (A hypothetical usage sketch follows the function body.)
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold
 *   @walk->mm->mmap_sem, because these functions traverse the vma list
 *   and/or access vma data.
 */
int walk_page_range(unsigned long start, unsigned long end,
		    struct mm_walk *walk)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;

	if (start >= end)
		return -EINVAL;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

	vma = find_vma(walk->mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk->vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk->vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk->vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk->vma || walk->pte_hole)
			err = __walk_page_range(start, next, walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
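
/*
 * Illustrative only, not part of this file: a hypothetical caller of
 * walk_page_range() that counts present ptes in a range.  The function and
 * callback names are made up; only the pattern (fill in struct mm_walk,
 * take mmap_sem for reading, then walk) follows the locking rule documented
 * above, which is why the sketch is kept under #if 0.
 */
#if 0	/* hypothetical sketch */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(*pte))
		(*nr_present)++;
	return 0;	/* keep walking */
}

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long nr_present = 0;
	struct mm_walk count_walk = {
		.pte_entry	= count_pte_entry,
		.mm		= mm,
		.private	= &nr_present,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &count_walk);
	up_read(&mm->mmap_sem);

	return nr_present;
}
#endif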

int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
{
	int err;

	if (!walk->mm)
		return -EINVAL;

	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
	VM_BUG_ON(!vma);
	walk->vma = vma;
	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
}
349