/* xref: /linux/mm/pagewalk.c (revision 9ec08f30f86d70b8891c25642df7d1f16647fde4) */
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

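/*
 * Invoke the ->pte_entry() callback for every PTE in [addr, end), one
 * PAGE_SIZE step at a time. The caller is responsible for mapping (and,
 * if required, locking) the PTE page; stop early and propagate a non-zero
 * return value from the callback.
 */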
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

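/*
 * Map the PTE page backing [addr, end) and hand it to
 * walk_pte_range_inner(). For a walk with a VMA the PTE page table lock
 * is taken via pte_offset_map_lock(); for a no_vma walk the PTEs are
 * mapped but not locked (see walk_page_range_novma()).
 */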
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

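/*
 * On architectures that select CONFIG_ARCH_HAS_HUGEPD, a page-table entry
 * above the PTE level can point to a hugepd directory of huge PTEs. Walk
 * such a directory by calling ->pte_entry() once per huge page, taking the
 * mm's page_table_lock around each callback. Without
 * CONFIG_ARCH_HAS_HUGEPD this is a no-op stub.
 */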
#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	int shift = hugepd_shift(*phpd);
	int page_size = 1 << shift;

	if (!ops->pte_entry)
		return 0;

	if (addr & (page_size - 1))
		return 0;

	for (;;) {
		pte_t *pte;

		spin_lock(&walk->mm->page_table_lock);
		pte = hugepte_offset(*phpd, addr, pdshift);
		err = ops->pte_entry(pte, addr, addr + page_size, walk);
		spin_unlock(&walk->mm->page_table_lock);

		if (err)
			break;
		if (addr >= end - page_size)
			break;
		addr += page_size;
	}
	return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	return 0;
}
#endif

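/*
 * Walk the PMD entries covering [addr, end) under @pud: report holes via
 * ->pte_hole(), call ->pmd_entry() on populated entries, and descend to
 * the PTE level unless the callback asked to skip the subtree
 * (ACTION_CONTINUE) or no ->pte_entry() is registered. Transparent huge
 * PMDs are split before descending when walking a VMA.
 */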
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
		else
			err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

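/*
 * Walk the PUD entries covering [addr, end) under @p4d, mirroring
 * walk_pmd_range(): report holes, call ->pud_entry(), honour the
 * ACTION_* protocol, split huge PUDs when walking a VMA and then descend
 * to the PMD level if a lower-level callback is installed.
 */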
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

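/*
 * Walk the P4D entries covering [addr, end) under @pgd: report holes,
 * call ->p4d_entry() if registered, and descend to the PUD level when a
 * lower-level callback exists. On architectures where the p4d level is
 * folded, real_depth() reports holes at the pgd depth instead.
 */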
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

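/*
 * Top of the page-table walk: iterate over the PGD entries covering
 * [addr, end), starting from walk->pgd if the caller supplied one
 * (walk_page_range_novma()) or from walk->mm's pgd otherwise, and recurse
 * down the levels for which callbacks are installed.
 */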
static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(pgd_val(*pgd))))
			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

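/*
 * Walk a hugetlb VMA one huge page at a time, calling ->hugetlb_entry()
 * for each huge PTE that can be looked up and ->pte_hole() where no
 * page-table entry exists. Compiled out to a no-op stub when
 * CONFIG_HUGETLB_PAGE is not set.
 */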
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * mapped range, so we don't walk over it as we do for normal vmas.
	 * However, some callers are interested in handling holes and don't
	 * want any address range to be silently skipped. Such users are
	 * expected to define a ->pte_hole() callback, so let's delegate
	 * handling of vma(VM_PFNMAP) to it.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

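/*
 * Walk one contiguous range that lies inside a single vma, or outside any
 * vma when walk->vma is NULL: run ->pre_vma() and ->post_vma() around the
 * walk when a vma is present, and dispatch to walk_hugetlb_range() for
 * hugetlb vmas (if ->hugetlb_entry is set) or to walk_pgd_range()
 * otherwise.
 */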
static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operations to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded in handling the current entry; if the end address has
 *         not been reached yet, continue the walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

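/*
 * Illustrative usage (not part of this file): a minimal sketch of how a
 * caller might count present PTEs in a range with walk_page_range(). The
 * function and variable names below are made up for the example; only
 * struct mm_walk_ops, walk_page_range() and the mmap_lock helpers are
 * real interfaces.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte_entry,
 *	};
 *
 *	static unsigned long count_present_ptes(struct mm_struct *mm,
 *						unsigned long start,
 *						unsigned long end)
 *	{
 *		unsigned long count = 0;
 *
 *		mmap_read_lock(mm);
 *		walk_page_range(mm, start, end, &count_ops, &count);
 *		mmap_read_unlock(mm);
 *		return count;
 *	}
 */
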
/**
 * walk_page_range_novma - walk a range of page tables not backed by a vma
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operations to call during the walk
 * @pgd:	pgd to walk if different from mm->pgd
 * @private:	private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	return __walk_page_range(start, end, &walk);
}

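/*
 * Walk the page tables of a single vma with the given callbacks. The
 * caller must hold the mmap_lock of the vma's mm; walk_page_test() is
 * consulted first, so the vma may be skipped without an error being
 * returned.
 */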
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops:	operations to call during the walk
 * @private:	private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}