xref: /linux/mm/pagewalk.c (revision dd361e5033cf36c51acab996ea17748b81cedb38)
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded
 * then a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	int shift = hugepd_shift(*phpd);
	int page_size = 1 << shift;

	if (!ops->pte_entry)
		return 0;

	if (addr & (page_size - 1))
		return 0;

	for (;;) {
		pte_t *pte;

		spin_lock(&walk->mm->page_table_lock);
		pte = hugepte_offset(*phpd, addr, pdshift);
		err = ops->pte_entry(pte, addr, addr + page_size, walk);
		spin_unlock(&walk->mm->page_table_lock);

		if (err)
			break;
		if (addr >= end - page_size)
			break;
		addr += page_size;
	}
	return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	return 0;
}
#endif

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
		else
			err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
 again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(pgd_val(*pgd))))
			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

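/*
 * Example (illustrative sketch, not part of this file): a minimal
 * ->hugetlb_entry() callback matching the invocation in
 * walk_hugetlb_range() above.  The callback name and the counting via
 * walk->private are hypothetical; only the callback signature used above
 * and huge_ptep_get()/pte_present() are assumed.
 *
 *	static int count_hugetlb_entry(pte_t *pte, unsigned long hmask,
 *				       unsigned long addr, unsigned long next,
 *				       struct mm_walk *walk)
 *	{
 *		unsigned long *nr_present = walk->private;
 *
 *		if (pte_present(huge_ptep_get(pte)))
 *			(*nr_present)++;
 *		return 0;
 *	}
 */
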
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * A vma with VM_PFNMAP has no valid struct pages behind it, so we
	 * don't walk over it as we do for normal vmas. However, some callers
	 * are interested in handling holes and don't want any address range
	 * to be silently skipped. Such users define a ->pte_hole() callback,
	 * so delegate handling of vma(VM_PFNMAP) to it.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

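/*
 * Example (illustrative sketch, not part of this file): a caller-supplied
 * ->test_walk() following the convention documented above (0 = walk the
 * vma, 1 = skip it, <0 = abort).  The callback name is hypothetical; only
 * the ->test_walk() signature used above is assumed.
 *
 *	static int skip_pfnmap_test_walk(unsigned long start, unsigned long end,
 *					 struct mm_walk *walk)
 *	{
 *		if (walk->vma->vm_flags & VM_PFNMAP)
 *			return 1;
 *		return 0;
 *	}
 */
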
static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : the current entry was handled successfully; continue the walk if
 *         the end address has not been reached yet.
 *  - >0 : the current entry was handled successfully; return to the caller
 *         with a caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller with the
 *         error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from the callbacks. If you want to pass some
 * caller-specific data to the callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

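/*
 * Example (illustrative sketch, not part of this file): a caller-side
 * pattern for walk_page_range().  The callback, ops and counter names are
 * hypothetical; only walk_page_range() and the mm_walk_ops callback
 * signatures used in this file are assumed.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *nr_present = walk->private;
 *
 *		if (pte_present(*pte))
 *			(*nr_present)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte_entry,
 *	};
 *
 *	unsigned long nr_present = 0;
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_ops, &nr_present);
 *	mmap_read_unlock(mm);
 */
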
/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @pgd:	pgd to walk if different from mm->pgd
 * @private:	private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_write_locked(walk.mm);

	return walk_pgd_range(start, end, &walk);
}

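/*
 * Example (illustrative sketch, not part of this file): walking a range
 * of kernel page tables with walk_page_range_novma().  The callback and
 * ops names are hypothetical; only the function and callback signatures
 * used in this file, pmd_leaf() and init_mm are assumed.
 *
 *	static int dump_pmd_entry(pmd_t *pmd, unsigned long addr,
 *				  unsigned long next, struct mm_walk *walk)
 *	{
 *		if (pmd_leaf(*pmd))
 *			pr_info("pmd leaf mapping at %lx\n", addr);
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops dump_ops = {
 *		.pmd_entry	= dump_pmd_entry,
 *	};
 *
 *	mmap_write_lock(&init_mm);
 *	err = walk_page_range_novma(&init_mm, start, end, &dump_ops,
 *				    NULL, NULL);
 *	mmap_write_unlock(&init_mm);
 */
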
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (start >= end || !walk.mm)
		return -EINVAL;
	if (start < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	mmap_assert_locked(walk.mm);
	return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
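
/*
 * Example (illustrative sketch, not part of this file): a caller-side
 * pattern for walk_page_mapping().  The ops structure is hypothetical and
 * would typically provide a pte_entry() callback; only walk_page_mapping()
 * and i_mmap_lock_read()/i_mmap_unlock_read() are assumed.
 *
 *	i_mmap_lock_read(mapping);
 *	err = walk_page_mapping(mapping, first_index, nr, &my_ops, NULL);
 *	i_mmap_unlock_read(mapping);
 */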