xref: /linux/mm/pagewalk.c (revision 55f1b540d893da740a81200450014c45a8103f54)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/highmem.h>
4 #include <linux/sched.h>
5 #include <linux/hugetlb.h>
6 #include <linux/swap.h>
7 #include <linux/swapops.h>
8 
9 /*
10  * We want to know the real level where a entry is located ignoring any
11  * folding of levels which may be happening. For example if p4d is folded then
12  * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
13  */
14 static int real_depth(int depth)
15 {
16 	if (depth == 3 && PTRS_PER_PMD == 1)
17 		depth = 2;
18 	if (depth == 2 && PTRS_PER_PUD == 1)
19 		depth = 1;
20 	if (depth == 1 && PTRS_PER_P4D == 1)
21 		depth = 0;
22 	return depth;
23 }
24 
25 static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
26 				unsigned long end, struct mm_walk *walk)
27 {
28 	const struct mm_walk_ops *ops = walk->ops;
29 	int err = 0;
30 
31 	for (;;) {
32 		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
33 		if (err)
34 		       break;
35 		if (addr >= end - PAGE_SIZE)
36 			break;
37 		addr += PAGE_SIZE;
38 		pte++;
39 	}
40 	return err;
41 }
42 
43 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
44 			  struct mm_walk *walk)
45 {
46 	pte_t *pte;
47 	int err = 0;
48 	spinlock_t *ptl;
49 
50 	if (walk->no_vma) {
51 		/*
52 		 * pte_offset_map() might apply user-specific validation.
53 		 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
54 		 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
55 		 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
56 		 */
57 		if (walk->mm == &init_mm || addr >= TASK_SIZE)
58 			pte = pte_offset_kernel(pmd, addr);
59 		else
60 			pte = pte_offset_map(pmd, addr);
61 		if (pte) {
62 			err = walk_pte_range_inner(pte, addr, end, walk);
63 			if (walk->mm != &init_mm && addr < TASK_SIZE)
64 				pte_unmap(pte);
65 		}
66 	} else {
67 		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
68 		if (pte) {
69 			err = walk_pte_range_inner(pte, addr, end, walk);
70 			pte_unmap_unlock(pte, ptl);
71 		}
72 	}
73 	if (!pte)
74 		walk->action = ACTION_AGAIN;
75 	return err;
76 }
77 
78 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
79 			  struct mm_walk *walk)
80 {
81 	pmd_t *pmd;
82 	unsigned long next;
83 	const struct mm_walk_ops *ops = walk->ops;
84 	int err = 0;
85 	int depth = real_depth(3);
86 
87 	pmd = pmd_offset(pud, addr);
88 	do {
89 again:
90 		next = pmd_addr_end(addr, end);
91 		if (pmd_none(*pmd)) {
92 			if (ops->pte_hole)
93 				err = ops->pte_hole(addr, next, depth, walk);
94 			if (err)
95 				break;
96 			continue;
97 		}
98 
99 		walk->action = ACTION_SUBTREE;
100 
101 		/*
102 		 * This implies that each ->pmd_entry() handler
103 		 * needs to know about pmd_trans_huge() pmds
104 		 */
105 		if (ops->pmd_entry)
106 			err = ops->pmd_entry(pmd, addr, next, walk);
107 		if (err)
108 			break;
109 
110 		if (walk->action == ACTION_AGAIN)
111 			goto again;
112 
113 		/*
114 		 * Check this here so we only break down trans_huge
115 		 * pages when we _need_ to
116 		 */
117 		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
118 		    walk->action == ACTION_CONTINUE ||
119 		    !(ops->pte_entry))
120 			continue;
121 
122 		if (walk->vma)
123 			split_huge_pmd(walk->vma, pmd, addr);
124 
125 		err = walk_pte_range(pmd, addr, next, walk);
126 		if (err)
127 			break;
128 
129 		if (walk->action == ACTION_AGAIN)
130 			goto again;
131 
132 	} while (pmd++, addr = next, addr != end);
133 
134 	return err;
135 }
136 
137 static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
138 			  struct mm_walk *walk)
139 {
140 	pud_t *pud;
141 	unsigned long next;
142 	const struct mm_walk_ops *ops = walk->ops;
143 	int err = 0;
144 	int depth = real_depth(2);
145 
146 	pud = pud_offset(p4d, addr);
147 	do {
148  again:
149 		next = pud_addr_end(addr, end);
150 		if (pud_none(*pud)) {
151 			if (ops->pte_hole)
152 				err = ops->pte_hole(addr, next, depth, walk);
153 			if (err)
154 				break;
155 			continue;
156 		}
157 
158 		walk->action = ACTION_SUBTREE;
159 
160 		if (ops->pud_entry)
161 			err = ops->pud_entry(pud, addr, next, walk);
162 		if (err)
163 			break;
164 
165 		if (walk->action == ACTION_AGAIN)
166 			goto again;
167 
168 		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
169 		    walk->action == ACTION_CONTINUE ||
170 		    !(ops->pmd_entry || ops->pte_entry))
171 			continue;
172 
173 		if (walk->vma)
174 			split_huge_pud(walk->vma, pud, addr);
175 		if (pud_none(*pud))
176 			goto again;
177 
178 		err = walk_pmd_range(pud, addr, next, walk);
179 		if (err)
180 			break;
181 	} while (pud++, addr = next, addr != end);
182 
183 	return err;
184 }
185 
186 static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
187 			  struct mm_walk *walk)
188 {
189 	p4d_t *p4d;
190 	unsigned long next;
191 	const struct mm_walk_ops *ops = walk->ops;
192 	int err = 0;
193 	int depth = real_depth(1);
194 
195 	p4d = p4d_offset(pgd, addr);
196 	do {
197 		next = p4d_addr_end(addr, end);
198 		if (p4d_none_or_clear_bad(p4d)) {
199 			if (ops->pte_hole)
200 				err = ops->pte_hole(addr, next, depth, walk);
201 			if (err)
202 				break;
203 			continue;
204 		}
205 		if (ops->p4d_entry) {
206 			err = ops->p4d_entry(p4d, addr, next, walk);
207 			if (err)
208 				break;
209 		}
210 		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
211 			err = walk_pud_range(p4d, addr, next, walk);
212 		if (err)
213 			break;
214 	} while (p4d++, addr = next, addr != end);
215 
216 	return err;
217 }
218 
219 static int walk_pgd_range(unsigned long addr, unsigned long end,
220 			  struct mm_walk *walk)
221 {
222 	pgd_t *pgd;
223 	unsigned long next;
224 	const struct mm_walk_ops *ops = walk->ops;
225 	int err = 0;
226 
227 	if (walk->pgd)
228 		pgd = walk->pgd + pgd_index(addr);
229 	else
230 		pgd = pgd_offset(walk->mm, addr);
231 	do {
232 		next = pgd_addr_end(addr, end);
233 		if (pgd_none_or_clear_bad(pgd)) {
234 			if (ops->pte_hole)
235 				err = ops->pte_hole(addr, next, 0, walk);
236 			if (err)
237 				break;
238 			continue;
239 		}
240 		if (ops->pgd_entry) {
241 			err = ops->pgd_entry(pgd, addr, next, walk);
242 			if (err)
243 				break;
244 		}
245 		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
246 			err = walk_p4d_range(pgd, addr, next, walk);
247 		if (err)
248 			break;
249 	} while (pgd++, addr = next, addr != end);
250 
251 	return err;
252 }
253 
254 #ifdef CONFIG_HUGETLB_PAGE
/*
 * Return the end of the huge page (of hstate @h) that contains @addr, clamped
 * so that it never exceeds @end.
 */
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long page_end = (addr & huge_page_mask(h)) + huge_page_size(h);

	if (page_end < end)
		return page_end;
	return end;
}
261 
262 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
263 			      struct mm_walk *walk)
264 {
265 	struct vm_area_struct *vma = walk->vma;
266 	struct hstate *h = hstate_vma(vma);
267 	unsigned long next;
268 	unsigned long hmask = huge_page_mask(h);
269 	unsigned long sz = huge_page_size(h);
270 	pte_t *pte;
271 	const struct mm_walk_ops *ops = walk->ops;
272 	int err = 0;
273 
274 	hugetlb_vma_lock_read(vma);
275 	do {
276 		next = hugetlb_entry_end(h, addr, end);
277 		pte = hugetlb_walk(vma, addr & hmask, sz);
278 		if (pte)
279 			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
280 		else if (ops->pte_hole)
281 			err = ops->pte_hole(addr, next, -1, walk);
282 		if (err)
283 			break;
284 	} while (addr = next, addr != end);
285 	hugetlb_vma_unlock_read(vma);
286 
287 	return err;
288 }
289 
290 #else /* CONFIG_HUGETLB_PAGE */
/*
 * !CONFIG_HUGETLB_PAGE stand-in: there is nothing to walk, so report success
 * without invoking any callback.
 */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}
296 
297 #endif /* CONFIG_HUGETLB_PAGE */
298 
299 /*
300  * Decide whether we really walk over the current vma on [@start, @end)
301  * or skip it via the returned value. Return 0 if we do walk over the
302  * current vma, and return 1 if we skip the vma. Negative values means
303  * error, where we abort the current walk.
304  */
305 static int walk_page_test(unsigned long start, unsigned long end,
306 			struct mm_walk *walk)
307 {
308 	struct vm_area_struct *vma = walk->vma;
309 	const struct mm_walk_ops *ops = walk->ops;
310 
311 	if (ops->test_walk)
312 		return ops->test_walk(start, end, walk);
313 
314 	/*
315 	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
316 	 * range, so we don't walk over it as we do for normal vmas. However,
317 	 * Some callers are interested in handling hole range and they don't
318 	 * want to just ignore any single address range. Such users certainly
319 	 * define their ->pte_hole() callbacks, so let's delegate them to handle
320 	 * vma(VM_PFNMAP).
321 	 */
322 	if (vma->vm_flags & VM_PFNMAP) {
323 		int err = 1;
324 		if (ops->pte_hole)
325 			err = ops->pte_hole(start, end, -1, walk);
326 		return err ? err : 1;
327 	}
328 	return 0;
329 }
330 
331 static int __walk_page_range(unsigned long start, unsigned long end,
332 			struct mm_walk *walk)
333 {
334 	int err = 0;
335 	struct vm_area_struct *vma = walk->vma;
336 	const struct mm_walk_ops *ops = walk->ops;
337 
338 	if (ops->pre_vma) {
339 		err = ops->pre_vma(start, end, walk);
340 		if (err)
341 			return err;
342 	}
343 
344 	if (is_vm_hugetlb_page(vma)) {
345 		if (ops->hugetlb_entry)
346 			err = walk_hugetlb_range(start, end, walk);
347 	} else
348 		err = walk_pgd_range(start, end, walk);
349 
350 	if (ops->post_vma)
351 		ops->post_vma(walk);
352 
353 	return err;
354 }
355 
356 static inline void process_mm_walk_lock(struct mm_struct *mm,
357 					enum page_walk_lock walk_lock)
358 {
359 	if (walk_lock == PGWALK_RDLOCK)
360 		mmap_assert_locked(mm);
361 	else
362 		mmap_assert_write_locked(mm);
363 }
364 
365 static inline void process_vma_walk_lock(struct vm_area_struct *vma,
366 					 enum page_walk_lock walk_lock)
367 {
368 #ifdef CONFIG_PER_VMA_LOCK
369 	switch (walk_lock) {
370 	case PGWALK_WRLOCK:
371 		vma_start_write(vma);
372 		break;
373 	case PGWALK_WRLOCK_VERIFY:
374 		vma_assert_write_locked(vma);
375 		break;
376 	case PGWALK_RDLOCK:
377 		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
378 		break;
379 	}
380 #endif
381 }
382 
383 /**
384  * walk_page_range - walk page table with caller specific callbacks
385  * @mm:		mm_struct representing the target process of page table walk
386  * @start:	start address of the virtual address range
387  * @end:	end address of the virtual address range
388  * @ops:	operation to call during the walk
389  * @private:	private data for callbacks' usage
390  *
391  * Recursively walk the page table tree of the process represented by @mm
392  * within the virtual address range [@start, @end). During walking, we can do
393  * some caller-specific works for each entry, by setting up pmd_entry(),
394  * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
395  * callbacks, the associated entries/pages are just ignored.
396  * The return values of these callbacks are commonly defined like below:
397  *
398  *  - 0  : succeeded to handle the current entry, and if you don't reach the
399  *         end address yet, continue to walk.
400  *  - >0 : succeeded to handle the current entry, and return to the caller
401  *         with caller specific value.
402  *  - <0 : failed to handle the current entry, and return to the caller
403  *         with error code.
404  *
405  * Before starting to walk page table, some callers want to check whether
406  * they really want to walk over the current vma, typically by checking
407  * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
408  * purpose.
409  *
410  * If operations need to be staged before and committed after a vma is walked,
411  * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
412  * since it is intended to handle commit-type operations, can't return any
413  * errors.
414  *
415  * struct mm_walk keeps current values of some common data like vma and pmd,
416  * which are useful for the access from callbacks. If you want to pass some
417  * caller-specific data to callbacks, @private should be helpful.
418  *
419  * Locking:
420  *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
421  *   because these function traverse vma list and/or access to vma's data.
422  */
423 int walk_page_range(struct mm_struct *mm, unsigned long start,
424 		unsigned long end, const struct mm_walk_ops *ops,
425 		void *private)
426 {
427 	int err = 0;
428 	unsigned long next;
429 	struct vm_area_struct *vma;
430 	struct mm_walk walk = {
431 		.ops		= ops,
432 		.mm		= mm,
433 		.private	= private,
434 	};
435 
436 	if (start >= end)
437 		return -EINVAL;
438 
439 	if (!walk.mm)
440 		return -EINVAL;
441 
442 	process_mm_walk_lock(walk.mm, ops->walk_lock);
443 
444 	vma = find_vma(walk.mm, start);
445 	do {
446 		if (!vma) { /* after the last vma */
447 			walk.vma = NULL;
448 			next = end;
449 			if (ops->pte_hole)
450 				err = ops->pte_hole(start, next, -1, &walk);
451 		} else if (start < vma->vm_start) { /* outside vma */
452 			walk.vma = NULL;
453 			next = min(end, vma->vm_start);
454 			if (ops->pte_hole)
455 				err = ops->pte_hole(start, next, -1, &walk);
456 		} else { /* inside vma */
457 			process_vma_walk_lock(vma, ops->walk_lock);
458 			walk.vma = vma;
459 			next = min(end, vma->vm_end);
460 			vma = find_vma(mm, vma->vm_end);
461 
462 			err = walk_page_test(start, next, &walk);
463 			if (err > 0) {
464 				/*
465 				 * positive return values are purely for
466 				 * controlling the pagewalk, so should never
467 				 * be passed to the callers.
468 				 */
469 				err = 0;
470 				continue;
471 			}
472 			if (err < 0)
473 				break;
474 			err = __walk_page_range(start, next, &walk);
475 		}
476 		if (err)
477 			break;
478 	} while (start = next, start < end);
479 	return err;
480 }
481 
482 /**
483  * walk_page_range_novma - walk a range of pagetables not backed by a vma
484  * @mm:		mm_struct representing the target process of page table walk
485  * @start:	start address of the virtual address range
486  * @end:	end address of the virtual address range
487  * @ops:	operation to call during the walk
488  * @pgd:	pgd to walk if different from mm->pgd
489  * @private:	private data for callbacks' usage
490  *
491  * Similar to walk_page_range() but can walk any page tables even if they are
492  * not backed by VMAs. Because 'unusual' entries may be walked this function
493  * will also not lock the PTEs for the pte_entry() callback. This is useful for
494  * walking the kernel pages tables or page tables for firmware.
495  *
496  * Note: Be careful to walk the kernel pages tables, the caller may be need to
497  * take other effective approache (mmap lock may be insufficient) to prevent
498  * the intermediate kernel page tables belonging to the specified address range
499  * from being freed (e.g. memory hot-remove).
500  */
501 int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
502 			  unsigned long end, const struct mm_walk_ops *ops,
503 			  pgd_t *pgd,
504 			  void *private)
505 {
506 	struct mm_walk walk = {
507 		.ops		= ops,
508 		.mm		= mm,
509 		.pgd		= pgd,
510 		.private	= private,
511 		.no_vma		= true
512 	};
513 
514 	if (start >= end || !walk.mm)
515 		return -EINVAL;
516 
517 	/*
518 	 * 1) For walking the user virtual address space:
519 	 *
520 	 * The mmap lock protects the page walker from changes to the page
521 	 * tables during the walk.  However a read lock is insufficient to
522 	 * protect those areas which don't have a VMA as munmap() detaches
523 	 * the VMAs before downgrading to a read lock and actually tearing
524 	 * down PTEs/page tables. In which case, the mmap write lock should
525 	 * be hold.
526 	 *
527 	 * 2) For walking the kernel virtual address space:
528 	 *
529 	 * The kernel intermediate page tables usually do not be freed, so
530 	 * the mmap map read lock is sufficient. But there are some exceptions.
531 	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
532 	 * to prevent the intermediate kernel pages tables belonging to the
533 	 * specified address range from being freed. The caller should take
534 	 * other actions to prevent this race.
535 	 */
536 	if (mm == &init_mm)
537 		mmap_assert_locked(walk.mm);
538 	else
539 		mmap_assert_write_locked(walk.mm);
540 
541 	return walk_pgd_range(start, end, &walk);
542 }
543 
544 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
545 			unsigned long end, const struct mm_walk_ops *ops,
546 			void *private)
547 {
548 	struct mm_walk walk = {
549 		.ops		= ops,
550 		.mm		= vma->vm_mm,
551 		.vma		= vma,
552 		.private	= private,
553 	};
554 
555 	if (start >= end || !walk.mm)
556 		return -EINVAL;
557 	if (start < vma->vm_start || end > vma->vm_end)
558 		return -EINVAL;
559 
560 	process_mm_walk_lock(walk.mm, ops->walk_lock);
561 	process_vma_walk_lock(vma, ops->walk_lock);
562 	return __walk_page_range(start, end, &walk);
563 }
564 
565 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
566 		void *private)
567 {
568 	struct mm_walk walk = {
569 		.ops		= ops,
570 		.mm		= vma->vm_mm,
571 		.vma		= vma,
572 		.private	= private,
573 	};
574 
575 	if (!walk.mm)
576 		return -EINVAL;
577 
578 	process_mm_walk_lock(walk.mm, ops->walk_lock);
579 	process_vma_walk_lock(vma, ops->walk_lock);
580 	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
581 }
582 
583 /**
584  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
585  * @mapping: Pointer to the struct address_space
586  * @first_index: First page offset in the address_space
587  * @nr: Number of incremental page offsets to cover
588  * @ops:	operation to call during the walk
589  * @private:	private data for callbacks' usage
590  *
591  * This function walks all memory areas mapped into a struct address_space.
592  * The walk is limited to only the given page-size index range, but if
593  * the index boundaries cross a huge page-table entry, that entry will be
594  * included.
595  *
596  * Also see walk_page_range() for additional information.
597  *
598  * Locking:
599  *   This function can't require that the struct mm_struct::mmap_lock is held,
600  *   since @mapping may be mapped by multiple processes. Instead
601  *   @mapping->i_mmap_rwsem must be held. This might have implications in the
602  *   callbacks, and it's up tho the caller to ensure that the
603  *   struct mm_struct::mmap_lock is not needed.
604  *
605  *   Also this means that a caller can't rely on the struct
606  *   vm_area_struct::vm_flags to be constant across a call,
607  *   except for immutable flags. Callers requiring this shouldn't use
608  *   this function.
609  *
610  * Return: 0 on success, negative error code on failure, positive number on
611  * caller defined premature termination.
612  */
613 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
614 		      pgoff_t nr, const struct mm_walk_ops *ops,
615 		      void *private)
616 {
617 	struct mm_walk walk = {
618 		.ops		= ops,
619 		.private	= private,
620 	};
621 	struct vm_area_struct *vma;
622 	pgoff_t vba, vea, cba, cea;
623 	unsigned long start_addr, end_addr;
624 	int err = 0;
625 
626 	lockdep_assert_held(&mapping->i_mmap_rwsem);
627 	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
628 				  first_index + nr - 1) {
629 		/* Clip to the vma */
630 		vba = vma->vm_pgoff;
631 		vea = vba + vma_pages(vma);
632 		cba = first_index;
633 		cba = max(cba, vba);
634 		cea = first_index + nr;
635 		cea = min(cea, vea);
636 
637 		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
638 		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
639 		if (start_addr >= end_addr)
640 			continue;
641 
642 		walk.vma = vma;
643 		walk.mm = vma->vm_mm;
644 
645 		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
646 		if (err > 0) {
647 			err = 0;
648 			break;
649 		} else if (err < 0)
650 			break;
651 
652 		err = __walk_page_range(start_addr, end_addr, &walk);
653 		if (err)
654 			break;
655 	}
656 
657 	return err;
658 }
659 
660 /**
661  * folio_walk_start - walk the page tables to a folio
662  * @fw: filled with information on success.
663  * @vma: the VMA.
664  * @addr: the virtual address to use for the page table walk.
665  * @flags: flags modifying which folios to walk to.
666  *
667  * Walk the page tables using @addr in a given @vma to a mapped folio and
668  * return the folio, making sure that the page table entry referenced by
669  * @addr cannot change until folio_walk_end() was called.
670  *
671  * As default, this function returns only folios that are not special (e.g., not
672  * the zeropage) and never returns folios that are supposed to be ignored by the
673  * VM as documented by vm_normal_page(). If requested, zeropages will be
674  * returned as well.
675  *
676  * As default, this function only considers present page table entries.
677  * If requested, it will also consider migration entries.
678  *
679  * If this function returns NULL it might either indicate "there is nothing" or
680  * "there is nothing suitable".
681  *
682  * On success, @fw is filled and the function returns the folio while the PTL
683  * is still held and folio_walk_end() must be called to clean up,
684  * releasing any held locks. The returned folio must *not* be used after the
685  * call to folio_walk_end(), unless a short-term folio reference is taken before
686  * that call.
687  *
688  * @fw->page will correspond to the page that is effectively referenced by
689  * @addr. However, for migration entries and shared zeropages @fw->page is
690  * set to NULL. Note that large folios might be mapped by multiple page table
691  * entries, and this function will always only lookup a single entry as
692  * specified by @addr, which might or might not cover more than a single page of
693  * the returned folio.
694  *
695  * This function must *not* be used as a naive replacement for
696  * get_user_pages() / pin_user_pages(), especially not to perform DMA or
697  * to carelessly modify page content. This function may *only* be used to grab
698  * short-term folio references, never to grab long-term folio references.
699  *
700  * Using the page table entry pointers in @fw for reading or modifying the
701  * entry should be avoided where possible: however, there might be valid
702  * use cases.
703  *
704  * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
705  * For example, PMD page table sharing might require prior unsharing. Also,
706  * logical hugetlb entries might span multiple physical page table entries,
707  * which *must* be modified in a single operation (set_huge_pte_at(),
708  * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
709  * not correspond to the first physical entry of a logical hugetlb entry.
710  *
711  * The mmap lock must be held in read mode.
712  *
713  * Return: folio pointer on success, otherwise NULL.
714  */
715 struct folio *folio_walk_start(struct folio_walk *fw,
716 		struct vm_area_struct *vma, unsigned long addr,
717 		folio_walk_flags_t flags)
718 {
719 	unsigned long entry_size;
720 	bool expose_page = true;
721 	struct page *page;
722 	pud_t *pudp, pud;
723 	pmd_t *pmdp, pmd;
724 	pte_t *ptep, pte;
725 	spinlock_t *ptl;
726 	pgd_t *pgdp;
727 	p4d_t *p4dp;
728 
729 	mmap_assert_locked(vma->vm_mm);
730 	vma_pgtable_walk_begin(vma);
731 
732 	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
733 		goto not_found;
734 
735 	pgdp = pgd_offset(vma->vm_mm, addr);
736 	if (pgd_none_or_clear_bad(pgdp))
737 		goto not_found;
738 
739 	p4dp = p4d_offset(pgdp, addr);
740 	if (p4d_none_or_clear_bad(p4dp))
741 		goto not_found;
742 
743 	pudp = pud_offset(p4dp, addr);
744 	pud = pudp_get(pudp);
745 	if (pud_none(pud))
746 		goto not_found;
747 	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
748 		ptl = pud_lock(vma->vm_mm, pudp);
749 		pud = pudp_get(pudp);
750 
751 		entry_size = PUD_SIZE;
752 		fw->level = FW_LEVEL_PUD;
753 		fw->pudp = pudp;
754 		fw->pud = pud;
755 
756 		if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
757 			spin_unlock(ptl);
758 			goto not_found;
759 		} else if (!pud_leaf(pud)) {
760 			spin_unlock(ptl);
761 			goto pmd_table;
762 		}
763 		/*
764 		 * TODO: vm_normal_page_pud() will be handy once we want to
765 		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
766 		 */
767 		page = pud_page(pud);
768 		goto found;
769 	}
770 
771 pmd_table:
772 	VM_WARN_ON_ONCE(pud_leaf(*pudp));
773 	pmdp = pmd_offset(pudp, addr);
774 	pmd = pmdp_get_lockless(pmdp);
775 	if (pmd_none(pmd))
776 		goto not_found;
777 	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
778 		ptl = pmd_lock(vma->vm_mm, pmdp);
779 		pmd = pmdp_get(pmdp);
780 
781 		entry_size = PMD_SIZE;
782 		fw->level = FW_LEVEL_PMD;
783 		fw->pmdp = pmdp;
784 		fw->pmd = pmd;
785 
786 		if (pmd_none(pmd)) {
787 			spin_unlock(ptl);
788 			goto not_found;
789 		} else if (!pmd_leaf(pmd)) {
790 			spin_unlock(ptl);
791 			goto pte_table;
792 		} else if (pmd_present(pmd)) {
793 			page = vm_normal_page_pmd(vma, addr, pmd);
794 			if (page) {
795 				goto found;
796 			} else if ((flags & FW_ZEROPAGE) &&
797 				    is_huge_zero_pmd(pmd)) {
798 				page = pfn_to_page(pmd_pfn(pmd));
799 				expose_page = false;
800 				goto found;
801 			}
802 		} else if ((flags & FW_MIGRATION) &&
803 			   is_pmd_migration_entry(pmd)) {
804 			swp_entry_t entry = pmd_to_swp_entry(pmd);
805 
806 			page = pfn_swap_entry_to_page(entry);
807 			expose_page = false;
808 			goto found;
809 		}
810 		spin_unlock(ptl);
811 		goto not_found;
812 	}
813 
814 pte_table:
815 	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
816 	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
817 	if (!ptep)
818 		goto not_found;
819 	pte = ptep_get(ptep);
820 
821 	entry_size = PAGE_SIZE;
822 	fw->level = FW_LEVEL_PTE;
823 	fw->ptep = ptep;
824 	fw->pte = pte;
825 
826 	if (pte_present(pte)) {
827 		page = vm_normal_page(vma, addr, pte);
828 		if (page)
829 			goto found;
830 		if ((flags & FW_ZEROPAGE) &&
831 		    is_zero_pfn(pte_pfn(pte))) {
832 			page = pfn_to_page(pte_pfn(pte));
833 			expose_page = false;
834 			goto found;
835 		}
836 	} else if (!pte_none(pte)) {
837 		swp_entry_t entry = pte_to_swp_entry(pte);
838 
839 		if ((flags & FW_MIGRATION) &&
840 		    is_migration_entry(entry)) {
841 			page = pfn_swap_entry_to_page(entry);
842 			expose_page = false;
843 			goto found;
844 		}
845 	}
846 	pte_unmap_unlock(ptep, ptl);
847 not_found:
848 	vma_pgtable_walk_end(vma);
849 	return NULL;
850 found:
851 	if (expose_page)
852 		/* Note: Offset from the mapped page, not the folio start. */
853 		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
854 	else
855 		fw->page = NULL;
856 	fw->ptl = ptl;
857 	return page_folio(page);
858 }
859