xref: /linux/fs/proc/task_mmu.c (revision ceb32d6aa93ce3282a724f3a0828b4b84d5035f1)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/mm_inline.h>
4 #include <linux/hugetlb.h>
5 #include <linux/huge_mm.h>
6 #include <linux/mount.h>
7 #include <linux/ksm.h>
8 #include <linux/seq_file.h>
9 #include <linux/highmem.h>
10 #include <linux/ptrace.h>
11 #include <linux/slab.h>
12 #include <linux/pagemap.h>
13 #include <linux/mempolicy.h>
14 #include <linux/rmap.h>
15 #include <linux/swap.h>
16 #include <linux/sched/mm.h>
17 #include <linux/swapops.h>
18 #include <linux/mmu_notifier.h>
19 #include <linux/page_idle.h>
20 #include <linux/shmem_fs.h>
21 #include <linux/uaccess.h>
22 #include <linux/pkeys.h>
23 #include <linux/minmax.h>
24 #include <linux/overflow.h>
25 
26 #include <asm/elf.h>
27 #include <asm/tlb.h>
28 #include <asm/tlbflush.h>
29 #include "internal.h"
30 
31 #define SEQ_PUT_DEC(str, val) \
32 		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
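/*
 * The mm counters printed below are in units of pages;
 * (val) << (PAGE_SHIFT - 10) converts pages to kB.  For example, with
 * 4 KiB pages PAGE_SHIFT - 10 == 2, so a counter of 300 pages prints
 * as 1200 kB.
 */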
33 void task_mem(struct seq_file *m, struct mm_struct *mm)
34 {
35 	unsigned long text, lib, swap, anon, file, shmem;
36 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
37 
38 	anon = get_mm_counter(mm, MM_ANONPAGES);
39 	file = get_mm_counter(mm, MM_FILEPAGES);
40 	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
41 
42 	/*
43 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
44 	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
45 	 * collector of these hiwater stats must therefore get total_vm
46 	 * and rss too, which will usually be the higher.  Barriers? not
47 	 * worth the effort, such snapshots can always be inconsistent.
48 	 */
49 	hiwater_vm = total_vm = mm->total_vm;
50 	if (hiwater_vm < mm->hiwater_vm)
51 		hiwater_vm = mm->hiwater_vm;
52 	hiwater_rss = total_rss = anon + file + shmem;
53 	if (hiwater_rss < mm->hiwater_rss)
54 		hiwater_rss = mm->hiwater_rss;
55 
56 	/* split executable areas between text and lib */
57 	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
58 	text = min(text, mm->exec_vm << PAGE_SHIFT);
59 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
60 
61 	swap = get_mm_counter(mm, MM_SWAPENTS);
62 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
63 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
64 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
65 	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
66 	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
67 	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
68 	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
69 	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
70 	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
71 	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
72 	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
73 	seq_put_decimal_ull_width(m,
74 		    " kB\nVmExe:\t", text >> 10, 8);
75 	seq_put_decimal_ull_width(m,
76 		    " kB\nVmLib:\t", lib >> 10, 8);
77 	seq_put_decimal_ull_width(m,
78 		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
79 	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
80 	seq_puts(m, " kB\n");
81 	hugetlb_report_usage(m, mm);
82 }
83 #undef SEQ_PUT_DEC
84 
85 unsigned long task_vsize(struct mm_struct *mm)
86 {
87 	return PAGE_SIZE * mm->total_vm;
88 }
89 
90 unsigned long task_statm(struct mm_struct *mm,
91 			 unsigned long *shared, unsigned long *text,
92 			 unsigned long *data, unsigned long *resident)
93 {
94 	*shared = get_mm_counter(mm, MM_FILEPAGES) +
95 			get_mm_counter(mm, MM_SHMEMPAGES);
96 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
97 								>> PAGE_SHIFT;
98 	*data = mm->data_vm + mm->stack_vm;
99 	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
100 	return mm->total_vm;
101 }
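/*
 * These values back /proc/<pid>/statm, which reports, in pages:
 * size resident shared text lib data dt -- where lib and dt are
 * reported as 0 on current kernels.
 */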
102 
103 #ifdef CONFIG_NUMA
104 /*
105  * Save get_task_policy() for show_numa_map().
106  */
107 static void hold_task_mempolicy(struct proc_maps_private *priv)
108 {
109 	struct task_struct *task = priv->task;
110 
111 	task_lock(task);
112 	priv->task_mempolicy = get_task_policy(task);
113 	mpol_get(priv->task_mempolicy);
114 	task_unlock(task);
115 }
116 static void release_task_mempolicy(struct proc_maps_private *priv)
117 {
118 	mpol_put(priv->task_mempolicy);
119 }
120 #else
121 static void hold_task_mempolicy(struct proc_maps_private *priv)
122 {
123 }
124 static void release_task_mempolicy(struct proc_maps_private *priv)
125 {
126 }
127 #endif
128 
129 static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
130 						loff_t *ppos)
131 {
132 	struct vm_area_struct *vma = vma_next(&priv->iter);
133 
134 	if (vma) {
135 		*ppos = vma->vm_start;
136 	} else {
137 		*ppos = -2UL;
138 		vma = get_gate_vma(priv->mm);
139 	}
140 
141 	return vma;
142 }
143 
144 static void *m_start(struct seq_file *m, loff_t *ppos)
145 {
146 	struct proc_maps_private *priv = m->private;
147 	unsigned long last_addr = *ppos;
148 	struct mm_struct *mm;
149 
150 	/* See m_next(). Zero at the start or after lseek. */
151 	if (last_addr == -1UL)
152 		return NULL;
153 
154 	priv->task = get_proc_task(priv->inode);
155 	if (!priv->task)
156 		return ERR_PTR(-ESRCH);
157 
158 	mm = priv->mm;
159 	if (!mm || !mmget_not_zero(mm)) {
160 		put_task_struct(priv->task);
161 		priv->task = NULL;
162 		return NULL;
163 	}
164 
165 	if (mmap_read_lock_killable(mm)) {
166 		mmput(mm);
167 		put_task_struct(priv->task);
168 		priv->task = NULL;
169 		return ERR_PTR(-EINTR);
170 	}
171 
172 	vma_iter_init(&priv->iter, mm, last_addr);
173 	hold_task_mempolicy(priv);
174 	if (last_addr == -2UL)
175 		return get_gate_vma(mm);
176 
177 	return proc_get_vma(priv, ppos);
178 }
179 
180 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
181 {
182 	if (*ppos == -2UL) {
183 		*ppos = -1UL;
184 		return NULL;
185 	}
186 	return proc_get_vma(m->private, ppos);
187 }
188 
189 static void m_stop(struct seq_file *m, void *v)
190 {
191 	struct proc_maps_private *priv = m->private;
192 	struct mm_struct *mm = priv->mm;
193 
194 	if (!priv->task)
195 		return;
196 
197 	release_task_mempolicy(priv);
198 	mmap_read_unlock(mm);
199 	mmput(mm);
200 	put_task_struct(priv->task);
201 	priv->task = NULL;
202 }
203 
204 static int proc_maps_open(struct inode *inode, struct file *file,
205 			const struct seq_operations *ops, int psize)
206 {
207 	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
208 
209 	if (!priv)
210 		return -ENOMEM;
211 
212 	priv->inode = inode;
213 	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
214 	if (IS_ERR(priv->mm)) {
215 		int err = PTR_ERR(priv->mm);
216 
217 		seq_release_private(inode, file);
218 		return err;
219 	}
220 
221 	return 0;
222 }
223 
224 static int proc_map_release(struct inode *inode, struct file *file)
225 {
226 	struct seq_file *seq = file->private_data;
227 	struct proc_maps_private *priv = seq->private;
228 
229 	if (priv->mm)
230 		mmdrop(priv->mm);
231 
232 	return seq_release_private(inode, file);
233 }
234 
235 static int do_maps_open(struct inode *inode, struct file *file,
236 			const struct seq_operations *ops)
237 {
238 	return proc_maps_open(inode, file, ops,
239 				sizeof(struct proc_maps_private));
240 }
241 
242 static void show_vma_header_prefix(struct seq_file *m,
243 				   unsigned long start, unsigned long end,
244 				   vm_flags_t flags, unsigned long long pgoff,
245 				   dev_t dev, unsigned long ino)
246 {
247 	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
248 	seq_put_hex_ll(m, NULL, start, 8);
249 	seq_put_hex_ll(m, "-", end, 8);
250 	seq_putc(m, ' ');
251 	seq_putc(m, flags & VM_READ ? 'r' : '-');
252 	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
253 	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
254 	seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
255 	seq_put_hex_ll(m, " ", pgoff, 8);
256 	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
257 	seq_put_hex_ll(m, ":", MINOR(dev), 2);
258 	seq_put_decimal_ull(m, " ", ino);
259 	seq_putc(m, ' ');
260 }
261 
262 static void
263 show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
264 {
265 	struct anon_vma_name *anon_name = NULL;
266 	struct mm_struct *mm = vma->vm_mm;
267 	struct file *file = vma->vm_file;
268 	vm_flags_t flags = vma->vm_flags;
269 	unsigned long ino = 0;
270 	unsigned long long pgoff = 0;
271 	unsigned long start, end;
272 	dev_t dev = 0;
273 	const char *name = NULL;
274 
275 	if (file) {
276 		const struct inode *inode = file_user_inode(vma->vm_file);
277 
278 		dev = inode->i_sb->s_dev;
279 		ino = inode->i_ino;
280 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
281 	}
282 
283 	start = vma->vm_start;
284 	end = vma->vm_end;
285 	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
286 	if (mm)
287 		anon_name = anon_vma_name(vma);
288 
289 	/*
290 	 * Print the dentry name for named mappings, and a
291 	 * special [heap] marker for the heap:
292 	 */
293 	if (file) {
294 		seq_pad(m, ' ');
295 		/*
296 		 * If the user named this anonymous shared memory via
297 		 * prctl(PR_SET_VMA, ...), use the provided name.
298 		 */
299 		if (anon_name)
300 			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
301 		else
302 			seq_path(m, file_user_path(file), "\n");
303 		goto done;
304 	}
305 
306 	if (vma->vm_ops && vma->vm_ops->name) {
307 		name = vma->vm_ops->name(vma);
308 		if (name)
309 			goto done;
310 	}
311 
312 	name = arch_vma_name(vma);
313 	if (!name) {
314 		if (!mm) {
315 			name = "[vdso]";
316 			goto done;
317 		}
318 
319 		if (vma_is_initial_heap(vma)) {
320 			name = "[heap]";
321 			goto done;
322 		}
323 
324 		if (vma_is_initial_stack(vma)) {
325 			name = "[stack]";
326 			goto done;
327 		}
328 
329 		if (anon_name) {
330 			seq_pad(m, ' ');
331 			seq_printf(m, "[anon:%s]", anon_name->name);
332 		}
333 	}
334 
335 done:
336 	if (name) {
337 		seq_pad(m, ' ');
338 		seq_puts(m, name);
339 	}
340 	seq_putc(m, '\n');
341 }
342 
343 static int show_map(struct seq_file *m, void *v)
344 {
345 	show_map_vma(m, v);
346 	return 0;
347 }
348 
349 static const struct seq_operations proc_pid_maps_op = {
350 	.start	= m_start,
351 	.next	= m_next,
352 	.stop	= m_stop,
353 	.show	= show_map
354 };
355 
356 static int pid_maps_open(struct inode *inode, struct file *file)
357 {
358 	return do_maps_open(inode, file, &proc_pid_maps_op);
359 }
360 
361 const struct file_operations proc_pid_maps_operations = {
362 	.open		= pid_maps_open,
363 	.read		= seq_read,
364 	.llseek		= seq_lseek,
365 	.release	= proc_map_release,
366 };
367 
368 /*
369  * Proportional Set Size (PSS): my share of RSS.
370  *
371  * PSS of a process is the count of pages it has in memory, where each
372  * page is divided by the number of processes sharing it.  So if a
373  * process has 1000 pages all to itself, and 1000 shared with one other
374  * process, its PSS will be 1500.
375  *
376  * To keep accumulated division errors low, we use a 64-bit
377  * fixed-point pss counter, so (pss >> PSS_SHIFT) is the real
378  * byte count.
379  *
380  * A shift of 12 before division means (assuming 4K page size):
381  * 	- 1M pages each shared by 3 users add up to at most 8KB of error;
382  * 	- supports mapcount up to 2^24, or 16M;
383  * 	- supports PSS up to 2^52 bytes, or 4PB.
384  */
385 #define PSS_SHIFT 12
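
/*
 * A minimal sketch of the fixed-point arithmetic above (illustrative
 * only, excluded from the build): it reproduces the 1000 private +
 * 1000 once-shared pages example, which yields a PSS of 1500 pages'
 * worth of bytes.
 */
#if 0
static u64 pss_example(void)
{
	u64 pss = 0;
	int i;

	/* 1000 pages mapped only here: each contributes its full size. */
	for (i = 0; i < 1000; i++)
		pss += (u64)PAGE_SIZE << PSS_SHIFT;
	/* 1000 pages shared with one other process: half size each. */
	for (i = 0; i < 1000; i++)
		pss += ((u64)PAGE_SIZE << PSS_SHIFT) / 2;
	/* Shift out the fixed point to recover bytes: 1500 * PAGE_SIZE. */
	return pss >> PSS_SHIFT;
}
#endif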
386 
387 #ifdef CONFIG_PROC_PAGE_MONITOR
388 struct mem_size_stats {
389 	unsigned long resident;
390 	unsigned long shared_clean;
391 	unsigned long shared_dirty;
392 	unsigned long private_clean;
393 	unsigned long private_dirty;
394 	unsigned long referenced;
395 	unsigned long anonymous;
396 	unsigned long lazyfree;
397 	unsigned long anonymous_thp;
398 	unsigned long shmem_thp;
399 	unsigned long file_thp;
400 	unsigned long swap;
401 	unsigned long shared_hugetlb;
402 	unsigned long private_hugetlb;
403 	unsigned long ksm;
404 	u64 pss;
405 	u64 pss_anon;
406 	u64 pss_file;
407 	u64 pss_shmem;
408 	u64 pss_dirty;
409 	u64 pss_locked;
410 	u64 swap_pss;
411 };
412 
413 static void smaps_page_accumulate(struct mem_size_stats *mss,
414 		struct folio *folio, unsigned long size, unsigned long pss,
415 		bool dirty, bool locked, bool private)
416 {
417 	mss->pss += pss;
418 
419 	if (folio_test_anon(folio))
420 		mss->pss_anon += pss;
421 	else if (folio_test_swapbacked(folio))
422 		mss->pss_shmem += pss;
423 	else
424 		mss->pss_file += pss;
425 
426 	if (locked)
427 		mss->pss_locked += pss;
428 
429 	if (dirty || folio_test_dirty(folio)) {
430 		mss->pss_dirty += pss;
431 		if (private)
432 			mss->private_dirty += size;
433 		else
434 			mss->shared_dirty += size;
435 	} else {
436 		if (private)
437 			mss->private_clean += size;
438 		else
439 			mss->shared_clean += size;
440 	}
441 }
442 
443 static void smaps_account(struct mem_size_stats *mss, struct page *page,
444 		bool compound, bool young, bool dirty, bool locked,
445 		bool present)
446 {
447 	struct folio *folio = page_folio(page);
448 	int i, nr = compound ? compound_nr(page) : 1;
449 	unsigned long size = nr * PAGE_SIZE;
450 
451 	/*
452 	 * First accumulate quantities that depend only on |size| and the type
453 	 * of the compound page.
454 	 */
455 	if (folio_test_anon(folio)) {
456 		mss->anonymous += size;
457 		if (!folio_test_swapbacked(folio) && !dirty &&
458 		    !folio_test_dirty(folio))
459 			mss->lazyfree += size;
460 	}
461 
462 	if (folio_test_ksm(folio))
463 		mss->ksm += size;
464 
465 	mss->resident += size;
466 	/* Accumulate the size in pages that have been accessed. */
467 	if (young || folio_test_young(folio) || folio_test_referenced(folio))
468 		mss->referenced += size;
469 
470 	/*
471 	 * Then accumulate quantities that may depend on sharing, or that may
472 	 * differ page-by-page.
473 	 *
474 	 * refcount == 1 for present entries guarantees that the folio is mapped
475 	 * exactly once. For large folios this implies that exactly one
476 	 * PTE/PMD/... maps (a part of) this folio.
477 	 *
478 	 * Treat all non-present entries (where relying on the mapcount and
479 	 * refcount doesn't make sense) as "maybe shared, but not sure how
480 	 * often". We treat device private entries as being fake-present.
481 	 *
482 	 * Note that it would not be safe to read the mapcount especially for
483 	 * pages referenced by migration entries, even with the PTL held.
484 	 */
485 	if (folio_ref_count(folio) == 1 || !present) {
486 		smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
487 				      dirty, locked, present);
488 		return;
489 	}
490 	/*
491 	 * We obtain a snapshot of the mapcount. Without holding the folio lock
492 	 * this snapshot can be slightly wrong as we cannot always read the
493 	 * mapcount atomically.
494 	 */
495 	for (i = 0; i < nr; i++, page++) {
496 		int mapcount = folio_precise_page_mapcount(folio, page);
497 		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
498 		if (mapcount >= 2)
499 			pss /= mapcount;
500 		smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
501 				dirty, locked, mapcount < 2);
502 	}
503 }
504 
505 #ifdef CONFIG_SHMEM
506 static int smaps_pte_hole(unsigned long addr, unsigned long end,
507 			  __always_unused int depth, struct mm_walk *walk)
508 {
509 	struct mem_size_stats *mss = walk->private;
510 	struct vm_area_struct *vma = walk->vma;
511 
512 	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
513 					      linear_page_index(vma, addr),
514 					      linear_page_index(vma, end));
515 
516 	return 0;
517 }
518 #else
519 #define smaps_pte_hole		NULL
520 #endif /* CONFIG_SHMEM */
521 
522 static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
523 {
524 #ifdef CONFIG_SHMEM
525 	if (walk->ops->pte_hole) {
526 		/* depth is not used */
527 		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
528 	}
529 #endif
530 }
531 
532 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
533 		struct mm_walk *walk)
534 {
535 	struct mem_size_stats *mss = walk->private;
536 	struct vm_area_struct *vma = walk->vma;
537 	bool locked = !!(vma->vm_flags & VM_LOCKED);
538 	struct page *page = NULL;
539 	bool present = false, young = false, dirty = false;
540 	pte_t ptent = ptep_get(pte);
541 
542 	if (pte_present(ptent)) {
543 		page = vm_normal_page(vma, addr, ptent);
544 		young = pte_young(ptent);
545 		dirty = pte_dirty(ptent);
546 		present = true;
547 	} else if (is_swap_pte(ptent)) {
548 		swp_entry_t swpent = pte_to_swp_entry(ptent);
549 
550 		if (!non_swap_entry(swpent)) {
551 			int mapcount;
552 
553 			mss->swap += PAGE_SIZE;
554 			mapcount = swp_swapcount(swpent);
555 			if (mapcount >= 2) {
556 				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
557 
558 				do_div(pss_delta, mapcount);
559 				mss->swap_pss += pss_delta;
560 			} else {
561 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
562 			}
563 		} else if (is_pfn_swap_entry(swpent)) {
564 			if (is_device_private_entry(swpent))
565 				present = true;
566 			page = pfn_swap_entry_to_page(swpent);
567 		}
568 	} else {
569 		smaps_pte_hole_lookup(addr, walk);
570 		return;
571 	}
572 
573 	if (!page)
574 		return;
575 
576 	smaps_account(mss, page, false, young, dirty, locked, present);
577 }
578 
579 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
580 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
581 		struct mm_walk *walk)
582 {
583 	struct mem_size_stats *mss = walk->private;
584 	struct vm_area_struct *vma = walk->vma;
585 	bool locked = !!(vma->vm_flags & VM_LOCKED);
586 	struct page *page = NULL;
587 	bool present = false;
588 	struct folio *folio;
589 
590 	if (pmd_present(*pmd)) {
591 		page = vm_normal_page_pmd(vma, addr, *pmd);
592 		present = true;
593 	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
594 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
595 
596 		if (is_pfn_swap_entry(entry))
597 			page = pfn_swap_entry_to_page(entry);
598 	}
599 	if (IS_ERR_OR_NULL(page))
600 		return;
601 	folio = page_folio(page);
602 	if (folio_test_anon(folio))
603 		mss->anonymous_thp += HPAGE_PMD_SIZE;
604 	else if (folio_test_swapbacked(folio))
605 		mss->shmem_thp += HPAGE_PMD_SIZE;
606 	else if (folio_is_zone_device(folio))
607 		/* pass */;
608 	else
609 		mss->file_thp += HPAGE_PMD_SIZE;
610 
611 	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
612 		      locked, present);
613 }
614 #else
615 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
616 		struct mm_walk *walk)
617 {
618 }
619 #endif
620 
621 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
622 			   struct mm_walk *walk)
623 {
624 	struct vm_area_struct *vma = walk->vma;
625 	pte_t *pte;
626 	spinlock_t *ptl;
627 
628 	ptl = pmd_trans_huge_lock(pmd, vma);
629 	if (ptl) {
630 		smaps_pmd_entry(pmd, addr, walk);
631 		spin_unlock(ptl);
632 		goto out;
633 	}
634 
635 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
636 	if (!pte) {
637 		walk->action = ACTION_AGAIN;
638 		return 0;
639 	}
640 	for (; addr != end; pte++, addr += PAGE_SIZE)
641 		smaps_pte_entry(pte, addr, walk);
642 	pte_unmap_unlock(pte - 1, ptl);
643 out:
644 	cond_resched();
645 	return 0;
646 }
647 
648 static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
649 {
650 	/*
651 	 * Don't forget to update Documentation/ on changes.
652 	 */
653 	static const char mnemonics[BITS_PER_LONG][2] = {
654 		/*
655 		 * In case we encounter a flag we don't know about.
656 		 */
657 		[0 ... (BITS_PER_LONG-1)] = "??",
658 
659 		[ilog2(VM_READ)]	= "rd",
660 		[ilog2(VM_WRITE)]	= "wr",
661 		[ilog2(VM_EXEC)]	= "ex",
662 		[ilog2(VM_SHARED)]	= "sh",
663 		[ilog2(VM_MAYREAD)]	= "mr",
664 		[ilog2(VM_MAYWRITE)]	= "mw",
665 		[ilog2(VM_MAYEXEC)]	= "me",
666 		[ilog2(VM_MAYSHARE)]	= "ms",
667 		[ilog2(VM_GROWSDOWN)]	= "gd",
668 		[ilog2(VM_PFNMAP)]	= "pf",
669 		[ilog2(VM_LOCKED)]	= "lo",
670 		[ilog2(VM_IO)]		= "io",
671 		[ilog2(VM_SEQ_READ)]	= "sr",
672 		[ilog2(VM_RAND_READ)]	= "rr",
673 		[ilog2(VM_DONTCOPY)]	= "dc",
674 		[ilog2(VM_DONTEXPAND)]	= "de",
675 		[ilog2(VM_LOCKONFAULT)]	= "lf",
676 		[ilog2(VM_ACCOUNT)]	= "ac",
677 		[ilog2(VM_NORESERVE)]	= "nr",
678 		[ilog2(VM_HUGETLB)]	= "ht",
679 		[ilog2(VM_SYNC)]	= "sf",
680 		[ilog2(VM_ARCH_1)]	= "ar",
681 		[ilog2(VM_WIPEONFORK)]	= "wf",
682 		[ilog2(VM_DONTDUMP)]	= "dd",
683 #ifdef CONFIG_ARM64_BTI
684 		[ilog2(VM_ARM64_BTI)]	= "bt",
685 #endif
686 #ifdef CONFIG_MEM_SOFT_DIRTY
687 		[ilog2(VM_SOFTDIRTY)]	= "sd",
688 #endif
689 		[ilog2(VM_MIXEDMAP)]	= "mm",
690 		[ilog2(VM_HUGEPAGE)]	= "hg",
691 		[ilog2(VM_NOHUGEPAGE)]	= "nh",
692 		[ilog2(VM_MERGEABLE)]	= "mg",
693 		[ilog2(VM_UFFD_MISSING)]= "um",
694 		[ilog2(VM_UFFD_WP)]	= "uw",
695 #ifdef CONFIG_ARM64_MTE
696 		[ilog2(VM_MTE)]		= "mt",
697 		[ilog2(VM_MTE_ALLOWED)]	= "",
698 #endif
699 #ifdef CONFIG_ARCH_HAS_PKEYS
700 		/* These come out via ProtectionKey: */
701 		[ilog2(VM_PKEY_BIT0)]	= "",
702 		[ilog2(VM_PKEY_BIT1)]	= "",
703 		[ilog2(VM_PKEY_BIT2)]	= "",
704 		[ilog2(VM_PKEY_BIT3)]	= "",
705 #if VM_PKEY_BIT4
706 		[ilog2(VM_PKEY_BIT4)]	= "",
707 #endif
708 #endif /* CONFIG_ARCH_HAS_PKEYS */
709 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
710 		[ilog2(VM_UFFD_MINOR)]	= "ui",
711 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
712 #ifdef CONFIG_X86_USER_SHADOW_STACK
713 		[ilog2(VM_SHADOW_STACK)] = "ss",
714 #endif
715 #ifdef CONFIG_64BIT
716 		[ilog2(VM_SEALED)] = "sl",
717 #endif
718 	};
719 	size_t i;
720 
721 	seq_puts(m, "VmFlags: ");
722 	for (i = 0; i < BITS_PER_LONG; i++) {
723 		if (!mnemonics[i][0])
724 			continue;
725 		if (vma->vm_flags & (1UL << i)) {
726 			seq_putc(m, mnemonics[i][0]);
727 			seq_putc(m, mnemonics[i][1]);
728 			seq_putc(m, ' ');
729 		}
730 	}
731 	seq_putc(m, '\n');
732 }
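
/*
 * For a typical private, readable/writable anonymous mapping the loop
 * above emits something like (the exact set depends on the VMA and
 * kernel config):
 *
 *	VmFlags: rd wr mr mw me ac
 */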
733 
734 #ifdef CONFIG_HUGETLB_PAGE
735 static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
736 				 unsigned long addr, unsigned long end,
737 				 struct mm_walk *walk)
738 {
739 	struct mem_size_stats *mss = walk->private;
740 	struct vm_area_struct *vma = walk->vma;
741 	pte_t ptent = huge_ptep_get(pte);
742 	struct folio *folio = NULL;
743 	bool present = false;
744 
745 	if (pte_present(ptent)) {
746 		folio = page_folio(pte_page(ptent));
747 		present = true;
748 	} else if (is_swap_pte(ptent)) {
749 		swp_entry_t swpent = pte_to_swp_entry(ptent);
750 
751 		if (is_pfn_swap_entry(swpent))
752 			folio = pfn_swap_entry_folio(swpent);
753 	}
754 
755 	if (folio) {
756 		/* We treat non-present entries as "maybe shared". */
757 		if (!present || folio_likely_mapped_shared(folio) ||
758 		    hugetlb_pmd_shared(pte))
759 			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
760 		else
761 			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
762 	}
763 	return 0;
764 }
765 #else
766 #define smaps_hugetlb_range	NULL
767 #endif /* HUGETLB_PAGE */
768 
769 static const struct mm_walk_ops smaps_walk_ops = {
770 	.pmd_entry		= smaps_pte_range,
771 	.hugetlb_entry		= smaps_hugetlb_range,
772 	.walk_lock		= PGWALK_RDLOCK,
773 };
774 
775 static const struct mm_walk_ops smaps_shmem_walk_ops = {
776 	.pmd_entry		= smaps_pte_range,
777 	.hugetlb_entry		= smaps_hugetlb_range,
778 	.pte_hole		= smaps_pte_hole,
779 	.walk_lock		= PGWALK_RDLOCK,
780 };
781 
782 /*
783  * Gather mem stats from @vma with the indicated beginning
784  * address @start, and keep them in @mss.
785  *
786  * Use vm_start of @vma as the beginning address if @start is 0.
787  */
788 static void smap_gather_stats(struct vm_area_struct *vma,
789 		struct mem_size_stats *mss, unsigned long start)
790 {
791 	const struct mm_walk_ops *ops = &smaps_walk_ops;
792 
793 	/* Invalid start */
794 	if (start >= vma->vm_end)
795 		return;
796 
797 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
798 		/*
799 		 * For shared or readonly shmem mappings we know that all
800 		 * swapped out pages belong to the shmem object, and we can
801 		 * obtain the swap value much more efficiently. For private
802 		 * writable mappings, we might have COW pages that are
803 		 * not affected by the parent swapped out pages of the shmem
804 		 * object, so we have to distinguish them during the page walk,
805 		 * unless we know that the shmem object (or the part mapped by
806 		 * our VMA) has no swapped out pages at all.
807 		 */
808 		unsigned long shmem_swapped = shmem_swap_usage(vma);
809 
810 		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
811 					!(vma->vm_flags & VM_WRITE))) {
812 			mss->swap += shmem_swapped;
813 		} else {
814 			ops = &smaps_shmem_walk_ops;
815 		}
816 	}
817 
818 	/* mmap_lock is held in m_start */
819 	if (!start)
820 		walk_page_vma(vma, ops, mss);
821 	else
822 		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
823 }
824 
825 #define SEQ_PUT_DEC(str, val) \
826 		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
827 
828 /* Show the contents common for smaps and smaps_rollup */
829 static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
830 	bool rollup_mode)
831 {
832 	SEQ_PUT_DEC("Rss:            ", mss->resident);
833 	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
834 	SEQ_PUT_DEC(" kB\nPss_Dirty:      ", mss->pss_dirty >> PSS_SHIFT);
835 	if (rollup_mode) {
836 		/*
837 		 * These are meaningful only for smaps_rollup, otherwise two of
838 		 * them are zero, and the other one is the same as Pss.
839 		 */
840 		SEQ_PUT_DEC(" kB\nPss_Anon:       ",
841 			mss->pss_anon >> PSS_SHIFT);
842 		SEQ_PUT_DEC(" kB\nPss_File:       ",
843 			mss->pss_file >> PSS_SHIFT);
844 		SEQ_PUT_DEC(" kB\nPss_Shmem:      ",
845 			mss->pss_shmem >> PSS_SHIFT);
846 	}
847 	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
848 	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
849 	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
850 	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
851 	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
852 	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
853 	SEQ_PUT_DEC(" kB\nKSM:            ", mss->ksm);
854 	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
855 	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
856 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
857 	SEQ_PUT_DEC(" kB\nFilePmdMapped:  ", mss->file_thp);
858 	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
859 	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
860 				  mss->private_hugetlb >> 10, 7);
861 	SEQ_PUT_DEC(" kB\nSwap:           ", mss->swap);
862 	SEQ_PUT_DEC(" kB\nSwapPss:        ",
863 					mss->swap_pss >> PSS_SHIFT);
864 	SEQ_PUT_DEC(" kB\nLocked:         ",
865 					mss->pss_locked >> PSS_SHIFT);
866 	seq_puts(m, " kB\n");
867 }
868 
869 static int show_smap(struct seq_file *m, void *v)
870 {
871 	struct vm_area_struct *vma = v;
872 	struct mem_size_stats mss = {};
873 
874 	smap_gather_stats(vma, &mss, 0);
875 
876 	show_map_vma(m, vma);
877 
878 	SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
879 	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
880 	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
881 	seq_puts(m, " kB\n");
882 
883 	__show_smap(m, &mss, false);
884 
885 	seq_printf(m, "THPeligible:    %8u\n",
886 		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
887 			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
888 
889 	if (arch_pkeys_enabled())
890 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
891 	show_smap_vma_flags(m, vma);
892 
893 	return 0;
894 }
895 
896 static int show_smaps_rollup(struct seq_file *m, void *v)
897 {
898 	struct proc_maps_private *priv = m->private;
899 	struct mem_size_stats mss = {};
900 	struct mm_struct *mm = priv->mm;
901 	struct vm_area_struct *vma;
902 	unsigned long vma_start = 0, last_vma_end = 0;
903 	int ret = 0;
904 	VMA_ITERATOR(vmi, mm, 0);
905 
906 	priv->task = get_proc_task(priv->inode);
907 	if (!priv->task)
908 		return -ESRCH;
909 
910 	if (!mm || !mmget_not_zero(mm)) {
911 		ret = -ESRCH;
912 		goto out_put_task;
913 	}
914 
915 	ret = mmap_read_lock_killable(mm);
916 	if (ret)
917 		goto out_put_mm;
918 
919 	hold_task_mempolicy(priv);
920 	vma = vma_next(&vmi);
921 
922 	if (unlikely(!vma))
923 		goto empty_set;
924 
925 	vma_start = vma->vm_start;
926 	do {
927 		smap_gather_stats(vma, &mss, 0);
928 		last_vma_end = vma->vm_end;
929 
930 		/*
931 		 * Release mmap_lock temporarily if someone is waiting to
932 		 * take it for write.
933 		 */
934 		if (mmap_lock_is_contended(mm)) {
935 			vma_iter_invalidate(&vmi);
936 			mmap_read_unlock(mm);
937 			ret = mmap_read_lock_killable(mm);
938 			if (ret) {
939 				release_task_mempolicy(priv);
940 				goto out_put_mm;
941 			}
942 
943 			/*
944 			 * After dropping the lock, there are four cases to
945 			 * consider. See the following example for explanation.
946 			 *
947 			 *   +------+------+-----------+
948 			 *   | VMA1 | VMA2 | VMA3      |
949 			 *   +------+------+-----------+
950 			 *   |      |      |           |
951 			 *  4k     8k     16k         400k
952 			 *
953 			 * Suppose we drop the lock after reading VMA2 due to
954 			 * contention, then we get:
955 			 *
956 			 *	last_vma_end = 16k
957 			 *
958 			 * 1) VMA2 is freed, but VMA3 exists:
959 			 *
960 			 *    vma_next(vmi) will return VMA3.
961 			 *    In this case, just continue from VMA3.
962 			 *
963 			 * 2) VMA2 still exists:
964 			 *
965 			 *    vma_next(vmi) will return VMA3.
966 			 *    In this case, just continue from VMA3.
967 			 *
968 			 * 3) No more VMAs can be found:
969 			 *
970 			 *    vma_next(vmi) will return NULL.
971 			 *    No more things to do, just break.
972 			 *
973 			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
974 			 *
975 			 *    vma_next(vmi) will return VMA' whose range
976 			 *    contains last_vma_end.
977 			 *    Iterate VMA' from last_vma_end.
978 			 */
979 			vma = vma_next(&vmi);
980 			/* Case 3 above */
981 			if (!vma)
982 				break;
983 
984 			/* Case 1 and 2 above */
985 			if (vma->vm_start >= last_vma_end) {
986 				smap_gather_stats(vma, &mss, 0);
987 				last_vma_end = vma->vm_end;
988 				continue;
989 			}
990 
991 			/* Case 4 above */
992 			if (vma->vm_end > last_vma_end) {
993 				smap_gather_stats(vma, &mss, last_vma_end);
994 				last_vma_end = vma->vm_end;
995 			}
996 		}
997 	} for_each_vma(vmi, vma);
998 
999 empty_set:
1000 	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
1001 	seq_pad(m, ' ');
1002 	seq_puts(m, "[rollup]\n");
1003 
1004 	__show_smap(m, &mss, true);
1005 
1006 	release_task_mempolicy(priv);
1007 	mmap_read_unlock(mm);
1008 
1009 out_put_mm:
1010 	mmput(mm);
1011 out_put_task:
1012 	put_task_struct(priv->task);
1013 	priv->task = NULL;
1014 
1015 	return ret;
1016 }
1017 #undef SEQ_PUT_DEC
1018 
1019 static const struct seq_operations proc_pid_smaps_op = {
1020 	.start	= m_start,
1021 	.next	= m_next,
1022 	.stop	= m_stop,
1023 	.show	= show_smap
1024 };
1025 
1026 static int pid_smaps_open(struct inode *inode, struct file *file)
1027 {
1028 	return do_maps_open(inode, file, &proc_pid_smaps_op);
1029 }
1030 
1031 static int smaps_rollup_open(struct inode *inode, struct file *file)
1032 {
1033 	int ret;
1034 	struct proc_maps_private *priv;
1035 
1036 	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1037 	if (!priv)
1038 		return -ENOMEM;
1039 
1040 	ret = single_open(file, show_smaps_rollup, priv);
1041 	if (ret)
1042 		goto out_free;
1043 
1044 	priv->inode = inode;
1045 	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
1046 	if (IS_ERR(priv->mm)) {
1047 		ret = PTR_ERR(priv->mm);
1048 
1049 		single_release(inode, file);
1050 		goto out_free;
1051 	}
1052 
1053 	return 0;
1054 
1055 out_free:
1056 	kfree(priv);
1057 	return ret;
1058 }
1059 
1060 static int smaps_rollup_release(struct inode *inode, struct file *file)
1061 {
1062 	struct seq_file *seq = file->private_data;
1063 	struct proc_maps_private *priv = seq->private;
1064 
1065 	if (priv->mm)
1066 		mmdrop(priv->mm);
1067 
1068 	kfree(priv);
1069 	return single_release(inode, file);
1070 }
1071 
1072 const struct file_operations proc_pid_smaps_operations = {
1073 	.open		= pid_smaps_open,
1074 	.read		= seq_read,
1075 	.llseek		= seq_lseek,
1076 	.release	= proc_map_release,
1077 };
1078 
1079 const struct file_operations proc_pid_smaps_rollup_operations = {
1080 	.open		= smaps_rollup_open,
1081 	.read		= seq_read,
1082 	.llseek		= seq_lseek,
1083 	.release	= smaps_rollup_release,
1084 };
1085 
1086 enum clear_refs_types {
1087 	CLEAR_REFS_ALL = 1,
1088 	CLEAR_REFS_ANON,
1089 	CLEAR_REFS_MAPPED,
1090 	CLEAR_REFS_SOFT_DIRTY,
1091 	CLEAR_REFS_MM_HIWATER_RSS,
1092 	CLEAR_REFS_LAST,
1093 };
1094 
1095 struct clear_refs_private {
1096 	enum clear_refs_types type;
1097 };
1098 
1099 #ifdef CONFIG_MEM_SOFT_DIRTY
1100 
1101 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1102 {
1103 	struct page *page;
1104 
1105 	if (!pte_write(pte))
1106 		return false;
1107 	if (!is_cow_mapping(vma->vm_flags))
1108 		return false;
1109 	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
1110 		return false;
1111 	page = vm_normal_page(vma, addr, pte);
1112 	if (!page)
1113 		return false;
1114 	return page_maybe_dma_pinned(page);
1115 }
1116 
1117 static inline void clear_soft_dirty(struct vm_area_struct *vma,
1118 		unsigned long addr, pte_t *pte)
1119 {
1120 	/*
1121 	 * The soft-dirty tracker uses #PF-s to catch writes
1122 	 * to pages, so write-protect the pte as well. See
1123 	 * Documentation/admin-guide/mm/soft-dirty.rst for a full
1124 	 * description of how soft-dirty works.
1125 	 */
1126 	pte_t ptent = ptep_get(pte);
1127 
1128 	if (pte_present(ptent)) {
1129 		pte_t old_pte;
1130 
1131 		if (pte_is_pinned(vma, addr, ptent))
1132 			return;
1133 		old_pte = ptep_modify_prot_start(vma, addr, pte);
1134 		ptent = pte_wrprotect(old_pte);
1135 		ptent = pte_clear_soft_dirty(ptent);
1136 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1137 	} else if (is_swap_pte(ptent)) {
1138 		ptent = pte_swp_clear_soft_dirty(ptent);
1139 		set_pte_at(vma->vm_mm, addr, pte, ptent);
1140 	}
1141 }
1142 #else
1143 static inline void clear_soft_dirty(struct vm_area_struct *vma,
1144 		unsigned long addr, pte_t *pte)
1145 {
1146 }
1147 #endif
1148 
1149 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1150 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1151 		unsigned long addr, pmd_t *pmdp)
1152 {
1153 	pmd_t old, pmd = *pmdp;
1154 
1155 	if (pmd_present(pmd)) {
1156 		/* See comment in change_huge_pmd() */
1157 		old = pmdp_invalidate(vma, addr, pmdp);
1158 		if (pmd_dirty(old))
1159 			pmd = pmd_mkdirty(pmd);
1160 		if (pmd_young(old))
1161 			pmd = pmd_mkyoung(pmd);
1162 
1163 		pmd = pmd_wrprotect(pmd);
1164 		pmd = pmd_clear_soft_dirty(pmd);
1165 
1166 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1167 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1168 		pmd = pmd_swp_clear_soft_dirty(pmd);
1169 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1170 	}
1171 }
1172 #else
1173 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1174 		unsigned long addr, pmd_t *pmdp)
1175 {
1176 }
1177 #endif
1178 
1179 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
1180 				unsigned long end, struct mm_walk *walk)
1181 {
1182 	struct clear_refs_private *cp = walk->private;
1183 	struct vm_area_struct *vma = walk->vma;
1184 	pte_t *pte, ptent;
1185 	spinlock_t *ptl;
1186 	struct folio *folio;
1187 
1188 	ptl = pmd_trans_huge_lock(pmd, vma);
1189 	if (ptl) {
1190 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1191 			clear_soft_dirty_pmd(vma, addr, pmd);
1192 			goto out;
1193 		}
1194 
1195 		if (!pmd_present(*pmd))
1196 			goto out;
1197 
1198 		folio = pmd_folio(*pmd);
1199 
1200 		/* Clear accessed and referenced bits. */
1201 		pmdp_test_and_clear_young(vma, addr, pmd);
1202 		folio_test_clear_young(folio);
1203 		folio_clear_referenced(folio);
1204 out:
1205 		spin_unlock(ptl);
1206 		return 0;
1207 	}
1208 
1209 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1210 	if (!pte) {
1211 		walk->action = ACTION_AGAIN;
1212 		return 0;
1213 	}
1214 	for (; addr != end; pte++, addr += PAGE_SIZE) {
1215 		ptent = ptep_get(pte);
1216 
1217 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1218 			clear_soft_dirty(vma, addr, pte);
1219 			continue;
1220 		}
1221 
1222 		if (!pte_present(ptent))
1223 			continue;
1224 
1225 		folio = vm_normal_folio(vma, addr, ptent);
1226 		if (!folio)
1227 			continue;
1228 
1229 		/* Clear accessed and referenced bits. */
1230 		ptep_test_and_clear_young(vma, addr, pte);
1231 		folio_test_clear_young(folio);
1232 		folio_clear_referenced(folio);
1233 	}
1234 	pte_unmap_unlock(pte - 1, ptl);
1235 	cond_resched();
1236 	return 0;
1237 }
1238 
1239 static int clear_refs_test_walk(unsigned long start, unsigned long end,
1240 				struct mm_walk *walk)
1241 {
1242 	struct clear_refs_private *cp = walk->private;
1243 	struct vm_area_struct *vma = walk->vma;
1244 
1245 	if (vma->vm_flags & VM_PFNMAP)
1246 		return 1;
1247 
1248 	/*
1249 	 * Writing 1 to /proc/pid/clear_refs affects all pages.
1250 	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
1251 	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
1252 	 * Writing 4 to /proc/pid/clear_refs affects all pages.
1253 	 */
1254 	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
1255 		return 1;
1256 	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
1257 		return 1;
1258 	return 0;
1259 }
1260 
1261 static const struct mm_walk_ops clear_refs_walk_ops = {
1262 	.pmd_entry		= clear_refs_pte_range,
1263 	.test_walk		= clear_refs_test_walk,
1264 	.walk_lock		= PGWALK_WRLOCK,
1265 };
1266 
1267 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1268 				size_t count, loff_t *ppos)
1269 {
1270 	struct task_struct *task;
1271 	char buffer[PROC_NUMBUF] = {};
1272 	struct mm_struct *mm;
1273 	struct vm_area_struct *vma;
1274 	enum clear_refs_types type;
1275 	int itype;
1276 	int rv;
1277 
1278 	if (count > sizeof(buffer) - 1)
1279 		count = sizeof(buffer) - 1;
1280 	if (copy_from_user(buffer, buf, count))
1281 		return -EFAULT;
1282 	rv = kstrtoint(strstrip(buffer), 10, &itype);
1283 	if (rv < 0)
1284 		return rv;
1285 	type = (enum clear_refs_types)itype;
1286 	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
1287 		return -EINVAL;
1288 
1289 	task = get_proc_task(file_inode(file));
1290 	if (!task)
1291 		return -ESRCH;
1292 	mm = get_task_mm(task);
1293 	if (mm) {
1294 		VMA_ITERATOR(vmi, mm, 0);
1295 		struct mmu_notifier_range range;
1296 		struct clear_refs_private cp = {
1297 			.type = type,
1298 		};
1299 
1300 		if (mmap_write_lock_killable(mm)) {
1301 			count = -EINTR;
1302 			goto out_mm;
1303 		}
1304 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1305 			/*
1306 			 * Writing 5 to /proc/pid/clear_refs resets the peak
1307 			 * resident set size to this mm's current rss value.
1308 			 */
1309 			reset_mm_hiwater_rss(mm);
1310 			goto out_unlock;
1311 		}
1312 
1313 		if (type == CLEAR_REFS_SOFT_DIRTY) {
1314 			for_each_vma(vmi, vma) {
1315 				if (!(vma->vm_flags & VM_SOFTDIRTY))
1316 					continue;
1317 				vm_flags_clear(vma, VM_SOFTDIRTY);
1318 				vma_set_page_prot(vma);
1319 			}
1320 
1321 			inc_tlb_flush_pending(mm);
1322 			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
1323 						0, mm, 0, -1UL);
1324 			mmu_notifier_invalidate_range_start(&range);
1325 		}
1326 		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
1327 		if (type == CLEAR_REFS_SOFT_DIRTY) {
1328 			mmu_notifier_invalidate_range_end(&range);
1329 			flush_tlb_mm(mm);
1330 			dec_tlb_flush_pending(mm);
1331 		}
1332 out_unlock:
1333 		mmap_write_unlock(mm);
1334 out_mm:
1335 		mmput(mm);
1336 	}
1337 	put_task_struct(task);
1338 
1339 	return count;
1340 }
1341 
1342 const struct file_operations proc_clear_refs_operations = {
1343 	.write		= clear_refs_write,
1344 	.llseek		= noop_llseek,
1345 };
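
/*
 * A hedged userspace sketch of the soft-dirty cycle driven through
 * this file (illustrative only, excluded from the build): write "4"
 * to clear_refs, let the task run, then test PM_SOFT_DIRTY (bit 55)
 * in the corresponding pagemap entries.
 */
#if 0
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int clear_soft_dirty_self(void)
{
	int fd = open("/proc/self/clear_refs", O_WRONLY);
	int ret = -1;

	if (fd >= 0) {
		ret = (write(fd, "4", 1) == 1) ? 0 : -1;
		close(fd);
	}
	return ret;
}

static int page_soft_dirty(int pagemap_fd, unsigned long vaddr)
{
	uint64_t ent;
	/* One 8-byte entry per page; assumes 4 KiB pages. */
	off_t off = (off_t)(vaddr / 4096) * sizeof(ent);

	if (pread(pagemap_fd, &ent, sizeof(ent), off) != sizeof(ent))
		return -1;
	return (int)((ent >> 55) & 1);	/* PM_SOFT_DIRTY */
}
#endif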
1346 
1347 typedef struct {
1348 	u64 pme;
1349 } pagemap_entry_t;
1350 
1351 struct pagemapread {
1352 	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
1353 	pagemap_entry_t *buffer;
1354 	bool show_pfn;
1355 };
1356 
1357 #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
1358 #define PAGEMAP_WALK_MASK	(PMD_MASK)
1359 
1360 #define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
1361 #define PM_PFRAME_BITS		55
1362 #define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1363 #define PM_SOFT_DIRTY		BIT_ULL(55)
1364 #define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
1365 #define PM_UFFD_WP		BIT_ULL(57)
1366 #define PM_FILE			BIT_ULL(61)
1367 #define PM_SWAP			BIT_ULL(62)
1368 #define PM_PRESENT		BIT_ULL(63)
1369 
1370 #define PM_END_OF_BUFFER    1
1371 
1372 static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
1373 {
1374 	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1375 }
1376 
1377 static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
1378 {
1379 	pm->buffer[pm->pos++] = *pme;
1380 	if (pm->pos >= pm->len)
1381 		return PM_END_OF_BUFFER;
1382 	return 0;
1383 }
1384 
1385 static int pagemap_pte_hole(unsigned long start, unsigned long end,
1386 			    __always_unused int depth, struct mm_walk *walk)
1387 {
1388 	struct pagemapread *pm = walk->private;
1389 	unsigned long addr = start;
1390 	int err = 0;
1391 
1392 	while (addr < end) {
1393 		struct vm_area_struct *vma = find_vma(walk->mm, addr);
1394 		pagemap_entry_t pme = make_pme(0, 0);
1395 		/* End of address space hole, which we mark as non-present. */
1396 		unsigned long hole_end;
1397 
1398 		if (vma)
1399 			hole_end = min(end, vma->vm_start);
1400 		else
1401 			hole_end = end;
1402 
1403 		for (; addr < hole_end; addr += PAGE_SIZE) {
1404 			err = add_to_pagemap(&pme, pm);
1405 			if (err)
1406 				goto out;
1407 		}
1408 
1409 		if (!vma)
1410 			break;
1411 
1412 		/* Addresses in the VMA. */
1413 		if (vma->vm_flags & VM_SOFTDIRTY)
1414 			pme = make_pme(0, PM_SOFT_DIRTY);
1415 		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1416 			err = add_to_pagemap(&pme, pm);
1417 			if (err)
1418 				goto out;
1419 		}
1420 	}
1421 out:
1422 	return err;
1423 }
1424 
1425 static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1426 		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1427 {
1428 	u64 frame = 0, flags = 0;
1429 	struct page *page = NULL;
1430 	struct folio *folio;
1431 
1432 	if (pte_present(pte)) {
1433 		if (pm->show_pfn)
1434 			frame = pte_pfn(pte);
1435 		flags |= PM_PRESENT;
1436 		page = vm_normal_page(vma, addr, pte);
1437 		if (pte_soft_dirty(pte))
1438 			flags |= PM_SOFT_DIRTY;
1439 		if (pte_uffd_wp(pte))
1440 			flags |= PM_UFFD_WP;
1441 	} else if (is_swap_pte(pte)) {
1442 		swp_entry_t entry;
1443 		if (pte_swp_soft_dirty(pte))
1444 			flags |= PM_SOFT_DIRTY;
1445 		if (pte_swp_uffd_wp(pte))
1446 			flags |= PM_UFFD_WP;
1447 		entry = pte_to_swp_entry(pte);
1448 		if (pm->show_pfn) {
1449 			pgoff_t offset;
1450 			/*
1451 			 * For PFN swap entries, keep the offset field as the
1452 			 * bare PFN, to stay compatible with the old smaps.
1453 			 */
1454 			if (is_pfn_swap_entry(entry))
1455 				offset = swp_offset_pfn(entry);
1456 			else
1457 				offset = swp_offset(entry);
1458 			frame = swp_type(entry) |
1459 			    (offset << MAX_SWAPFILES_SHIFT);
1460 		}
1461 		flags |= PM_SWAP;
1462 		if (is_pfn_swap_entry(entry))
1463 			page = pfn_swap_entry_to_page(entry);
1464 		if (pte_marker_entry_uffd_wp(entry))
1465 			flags |= PM_UFFD_WP;
1466 	}
1467 
1468 	if (page) {
1469 		folio = page_folio(page);
1470 		if (!folio_test_anon(folio))
1471 			flags |= PM_FILE;
1472 		if ((flags & PM_PRESENT) &&
1473 		    folio_precise_page_mapcount(folio, page) == 1)
1474 			flags |= PM_MMAP_EXCLUSIVE;
1475 	}
1476 	if (vma->vm_flags & VM_SOFTDIRTY)
1477 		flags |= PM_SOFT_DIRTY;
1478 
1479 	return make_pme(frame, flags);
1480 }
1481 
1482 static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1483 			     struct mm_walk *walk)
1484 {
1485 	struct vm_area_struct *vma = walk->vma;
1486 	struct pagemapread *pm = walk->private;
1487 	spinlock_t *ptl;
1488 	pte_t *pte, *orig_pte;
1489 	int err = 0;
1490 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1491 
1492 	ptl = pmd_trans_huge_lock(pmdp, vma);
1493 	if (ptl) {
1494 		unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
1495 		u64 flags = 0, frame = 0;
1496 		pmd_t pmd = *pmdp;
1497 		struct page *page = NULL;
1498 		struct folio *folio = NULL;
1499 
1500 		if (vma->vm_flags & VM_SOFTDIRTY)
1501 			flags |= PM_SOFT_DIRTY;
1502 
1503 		if (pmd_present(pmd)) {
1504 			page = pmd_page(pmd);
1505 
1506 			flags |= PM_PRESENT;
1507 			if (pmd_soft_dirty(pmd))
1508 				flags |= PM_SOFT_DIRTY;
1509 			if (pmd_uffd_wp(pmd))
1510 				flags |= PM_UFFD_WP;
1511 			if (pm->show_pfn)
1512 				frame = pmd_pfn(pmd) + idx;
1513 		}
1514 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1515 		else if (is_swap_pmd(pmd)) {
1516 			swp_entry_t entry = pmd_to_swp_entry(pmd);
1517 			unsigned long offset;
1518 
1519 			if (pm->show_pfn) {
1520 				if (is_pfn_swap_entry(entry))
1521 					offset = swp_offset_pfn(entry) + idx;
1522 				else
1523 					offset = swp_offset(entry) + idx;
1524 				frame = swp_type(entry) |
1525 					(offset << MAX_SWAPFILES_SHIFT);
1526 			}
1527 			flags |= PM_SWAP;
1528 			if (pmd_swp_soft_dirty(pmd))
1529 				flags |= PM_SOFT_DIRTY;
1530 			if (pmd_swp_uffd_wp(pmd))
1531 				flags |= PM_UFFD_WP;
1532 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
1533 			page = pfn_swap_entry_to_page(entry);
1534 		}
1535 #endif
1536 
1537 		if (page) {
1538 			folio = page_folio(page);
1539 			if (!folio_test_anon(folio))
1540 				flags |= PM_FILE;
1541 		}
1542 
1543 		for (; addr != end; addr += PAGE_SIZE, idx++) {
1544 			unsigned long cur_flags = flags;
1545 			pagemap_entry_t pme;
1546 
1547 			if (folio && (flags & PM_PRESENT) &&
1548 			    folio_precise_page_mapcount(folio, page + idx) == 1)
1549 				cur_flags |= PM_MMAP_EXCLUSIVE;
1550 
1551 			pme = make_pme(frame, cur_flags);
1552 			err = add_to_pagemap(&pme, pm);
1553 			if (err)
1554 				break;
1555 			if (pm->show_pfn) {
1556 				if (flags & PM_PRESENT)
1557 					frame++;
1558 				else if (flags & PM_SWAP)
1559 					frame += (1 << MAX_SWAPFILES_SHIFT);
1560 			}
1561 		}
1562 		spin_unlock(ptl);
1563 		return err;
1564 	}
1565 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1566 
1567 	/*
1568 	 * We can assume that @vma always points to a valid VMA and that
1569 	 * @end never goes beyond vma->vm_end.
1570 	 */
1571 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1572 	if (!pte) {
1573 		walk->action = ACTION_AGAIN;
1574 		return err;
1575 	}
1576 	for (; addr < end; pte++, addr += PAGE_SIZE) {
1577 		pagemap_entry_t pme;
1578 
1579 		pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
1580 		err = add_to_pagemap(&pme, pm);
1581 		if (err)
1582 			break;
1583 	}
1584 	pte_unmap_unlock(orig_pte, ptl);
1585 
1586 	cond_resched();
1587 
1588 	return err;
1589 }
1590 
1591 #ifdef CONFIG_HUGETLB_PAGE
1592 /* This function walks within one hugetlb entry in a single call */
1593 static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1594 				 unsigned long addr, unsigned long end,
1595 				 struct mm_walk *walk)
1596 {
1597 	struct pagemapread *pm = walk->private;
1598 	struct vm_area_struct *vma = walk->vma;
1599 	u64 flags = 0, frame = 0;
1600 	int err = 0;
1601 	pte_t pte;
1602 
1603 	if (vma->vm_flags & VM_SOFTDIRTY)
1604 		flags |= PM_SOFT_DIRTY;
1605 
1606 	pte = huge_ptep_get(ptep);
1607 	if (pte_present(pte)) {
1608 		struct folio *folio = page_folio(pte_page(pte));
1609 
1610 		if (!folio_test_anon(folio))
1611 			flags |= PM_FILE;
1612 
1613 		if (!folio_likely_mapped_shared(folio) &&
1614 		    !hugetlb_pmd_shared(ptep))
1615 			flags |= PM_MMAP_EXCLUSIVE;
1616 
1617 		if (huge_pte_uffd_wp(pte))
1618 			flags |= PM_UFFD_WP;
1619 
1620 		flags |= PM_PRESENT;
1621 		if (pm->show_pfn)
1622 			frame = pte_pfn(pte) +
1623 				((addr & ~hmask) >> PAGE_SHIFT);
1624 	} else if (pte_swp_uffd_wp_any(pte)) {
1625 		flags |= PM_UFFD_WP;
1626 	}
1627 
1628 	for (; addr != end; addr += PAGE_SIZE) {
1629 		pagemap_entry_t pme = make_pme(frame, flags);
1630 
1631 		err = add_to_pagemap(&pme, pm);
1632 		if (err)
1633 			return err;
1634 		if (pm->show_pfn && (flags & PM_PRESENT))
1635 			frame++;
1636 	}
1637 
1638 	cond_resched();
1639 
1640 	return err;
1641 }
1642 #else
1643 #define pagemap_hugetlb_range	NULL
1644 #endif /* HUGETLB_PAGE */
1645 
1646 static const struct mm_walk_ops pagemap_ops = {
1647 	.pmd_entry	= pagemap_pmd_range,
1648 	.pte_hole	= pagemap_pte_hole,
1649 	.hugetlb_entry	= pagemap_hugetlb_range,
1650 	.walk_lock	= PGWALK_RDLOCK,
1651 };
1652 
1653 /*
1654  * /proc/pid/pagemap - an array mapping virtual pages to pfns
1655  *
1656  * For each page in the address space, this file contains one 64-bit entry
1657  * consisting of the following:
1658  *
1659  * Bits 0-54  page frame number (PFN) if present
1660  * Bits 0-4   swap type if swapped
1661  * Bits 5-54  swap offset if swapped
1662  * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
1663  * Bit  56    page exclusively mapped
1664  * Bit  57    pte is uffd-wp write-protected
1665  * Bits 58-60 zero
1666  * Bit  61    page is file-page or shared-anon
1667  * Bit  62    page swapped
1668  * Bit  63    page present
1669  *
1670  * If the page is not present but in swap, then the PFN contains an
1671  * encoding of the swap file number and the page's offset into the
1672  * swap. Unmapped pages return a null PFN. This allows determining
1673  * precisely which pages are mapped (or in swap) and comparing mapped
1674  * pages between processes.
1675  *
1676  * Efficient users of this interface will use /proc/pid/maps to
1677  * determine which areas of memory are actually mapped and llseek to
1678  * skip over unmapped regions.
1679  */
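
/*
 * A minimal userspace decoder for the entry layout documented above
 * (illustrative only, excluded from the build):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void decode_pagemap_entry(uint64_t ent)
{
	if (ent >> 63) {			/* PM_PRESENT */
		printf("present pfn=%llu soft-dirty=%d exclusive=%d file=%d\n",
		       (unsigned long long)(ent & ((1ULL << 55) - 1)),
		       (int)((ent >> 55) & 1),	/* PM_SOFT_DIRTY */
		       (int)((ent >> 56) & 1),	/* PM_MMAP_EXCLUSIVE */
		       (int)((ent >> 61) & 1));	/* PM_FILE */
	} else if ((ent >> 62) & 1) {		/* PM_SWAP */
		printf("swapped type=%llu offset=%llu\n",
		       (unsigned long long)(ent & 0x1f),
		       (unsigned long long)((ent >> 5) & ((1ULL << 50) - 1)));
	} else {
		printf("not present\n");
	}
}
#endif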
1680 static ssize_t pagemap_read(struct file *file, char __user *buf,
1681 			    size_t count, loff_t *ppos)
1682 {
1683 	struct mm_struct *mm = file->private_data;
1684 	struct pagemapread pm;
1685 	unsigned long src;
1686 	unsigned long svpfn;
1687 	unsigned long start_vaddr;
1688 	unsigned long end_vaddr;
1689 	int ret = 0, copied = 0;
1690 
1691 	if (!mm || !mmget_not_zero(mm))
1692 		goto out;
1693 
1694 	ret = -EINVAL;
1695 	/* file position must be aligned */
1696 	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1697 		goto out_mm;
1698 
1699 	ret = 0;
1700 	if (!count)
1701 		goto out_mm;
1702 
1703 	/* do not disclose physical addresses: attack vector */
1704 	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1705 
1706 	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1707 	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
1708 	ret = -ENOMEM;
1709 	if (!pm.buffer)
1710 		goto out_mm;
1711 
1712 	src = *ppos;
1713 	svpfn = src / PM_ENTRY_BYTES;
1714 	end_vaddr = mm->task_size;
1715 
1716 	/* watch out for wraparound */
1717 	start_vaddr = end_vaddr;
1718 	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
1719 		unsigned long end;
1720 
1721 		ret = mmap_read_lock_killable(mm);
1722 		if (ret)
1723 			goto out_free;
1724 		start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
1725 		mmap_read_unlock(mm);
1726 
1727 		end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
1728 		if (end >= start_vaddr && end < mm->task_size)
1729 			end_vaddr = end;
1730 	}
1731 
1732 	/* Ensure the address is inside the task */
1733 	if (start_vaddr > mm->task_size)
1734 		start_vaddr = end_vaddr;
1735 
1736 	ret = 0;
1737 	while (count && (start_vaddr < end_vaddr)) {
1738 		int len;
1739 		unsigned long end;
1740 
1741 		pm.pos = 0;
1742 		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1743 		/* overflow? */
1744 		if (end < start_vaddr || end > end_vaddr)
1745 			end = end_vaddr;
1746 		ret = mmap_read_lock_killable(mm);
1747 		if (ret)
1748 			goto out_free;
1749 		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
1750 		mmap_read_unlock(mm);
1751 		start_vaddr = end;
1752 
1753 		len = min(count, PM_ENTRY_BYTES * pm.pos);
1754 		if (copy_to_user(buf, pm.buffer, len)) {
1755 			ret = -EFAULT;
1756 			goto out_free;
1757 		}
1758 		copied += len;
1759 		buf += len;
1760 		count -= len;
1761 	}
1762 	*ppos += copied;
1763 	if (!ret || ret == PM_END_OF_BUFFER)
1764 		ret = copied;
1765 
1766 out_free:
1767 	kfree(pm.buffer);
1768 out_mm:
1769 	mmput(mm);
1770 out:
1771 	return ret;
1772 }
1773 
1774 static int pagemap_open(struct inode *inode, struct file *file)
1775 {
1776 	struct mm_struct *mm;
1777 
1778 	mm = proc_mem_open(inode, PTRACE_MODE_READ);
1779 	if (IS_ERR(mm))
1780 		return PTR_ERR(mm);
1781 	file->private_data = mm;
1782 	return 0;
1783 }
1784 
1785 static int pagemap_release(struct inode *inode, struct file *file)
1786 {
1787 	struct mm_struct *mm = file->private_data;
1788 
1789 	if (mm)
1790 		mmdrop(mm);
1791 	return 0;
1792 }
1793 
1794 #define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |	\
1795 				 PAGE_IS_FILE |	PAGE_IS_PRESENT |	\
1796 				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |	\
1797 				 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
1798 #define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
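
/*
 * A hedged sketch of calling the PAGEMAP_SCAN ioctl implemented below
 * (illustrative only, excluded from the build; struct pm_scan_arg,
 * struct page_region and the PAGE_IS_* categories come from the
 * <linux/fs.h> UAPI): collect the written regions of a range without
 * write-protecting anything.
 */
#if 0
#include <linux/fs.h>
#include <string.h>
#include <sys/ioctl.h>

static long scan_written_regions(int pagemap_fd, unsigned long start,
				 unsigned long end, struct page_region *vec,
				 unsigned long vec_len)
{
	struct pm_scan_arg arg;

	memset(&arg, 0, sizeof(arg));
	arg.size = sizeof(arg);
	arg.start = start;
	arg.end = end;
	arg.vec = (unsigned long)vec;
	arg.vec_len = vec_len;
	arg.category_mask = PAGE_IS_WRITTEN;	/* only written pages */
	arg.return_mask = PAGE_IS_WRITTEN;

	/* On success the number of filled page_region entries is returned. */
	return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
}
#endif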
1799 
1800 struct pagemap_scan_private {
1801 	struct pm_scan_arg arg;
1802 	unsigned long masks_of_interest, cur_vma_category;
1803 	struct page_region *vec_buf;
1804 	unsigned long vec_buf_len, vec_buf_index, found_pages;
1805 	struct page_region __user *vec_out;
1806 };
1807 
1808 static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
1809 					   struct vm_area_struct *vma,
1810 					   unsigned long addr, pte_t pte)
1811 {
1812 	unsigned long categories = 0;
1813 
1814 	if (pte_present(pte)) {
1815 		struct page *page;
1816 
1817 		categories |= PAGE_IS_PRESENT;
1818 		if (!pte_uffd_wp(pte))
1819 			categories |= PAGE_IS_WRITTEN;
1820 
1821 		if (p->masks_of_interest & PAGE_IS_FILE) {
1822 			page = vm_normal_page(vma, addr, pte);
1823 			if (page && !PageAnon(page))
1824 				categories |= PAGE_IS_FILE;
1825 		}
1826 
1827 		if (is_zero_pfn(pte_pfn(pte)))
1828 			categories |= PAGE_IS_PFNZERO;
1829 		if (pte_soft_dirty(pte))
1830 			categories |= PAGE_IS_SOFT_DIRTY;
1831 	} else if (is_swap_pte(pte)) {
1832 		swp_entry_t swp;
1833 
1834 		categories |= PAGE_IS_SWAPPED;
1835 		if (!pte_swp_uffd_wp_any(pte))
1836 			categories |= PAGE_IS_WRITTEN;
1837 
1838 		if (p->masks_of_interest & PAGE_IS_FILE) {
1839 			swp = pte_to_swp_entry(pte);
1840 			if (is_pfn_swap_entry(swp) &&
1841 			    !folio_test_anon(pfn_swap_entry_folio(swp)))
1842 				categories |= PAGE_IS_FILE;
1843 		}
1844 		if (pte_swp_soft_dirty(pte))
1845 			categories |= PAGE_IS_SOFT_DIRTY;
1846 	}
1847 
1848 	return categories;
1849 }
1850 
1851 static void make_uffd_wp_pte(struct vm_area_struct *vma,
1852 			     unsigned long addr, pte_t *pte, pte_t ptent)
1853 {
1854 	if (pte_present(ptent)) {
1855 		pte_t old_pte;
1856 
1857 		old_pte = ptep_modify_prot_start(vma, addr, pte);
1858 		ptent = pte_mkuffd_wp(old_pte);
1859 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1860 	} else if (is_swap_pte(ptent)) {
1861 		ptent = pte_swp_mkuffd_wp(ptent);
1862 		set_pte_at(vma->vm_mm, addr, pte, ptent);
1863 	} else {
1864 		set_pte_at(vma->vm_mm, addr, pte,
1865 			   make_pte_marker(PTE_MARKER_UFFD_WP));
1866 	}
1867 }
1868 
1869 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1870 static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
1871 					  struct vm_area_struct *vma,
1872 					  unsigned long addr, pmd_t pmd)
1873 {
1874 	unsigned long categories = PAGE_IS_HUGE;
1875 
1876 	if (pmd_present(pmd)) {
1877 		struct page *page;
1878 
1879 		categories |= PAGE_IS_PRESENT;
1880 		if (!pmd_uffd_wp(pmd))
1881 			categories |= PAGE_IS_WRITTEN;
1882 
1883 		if (p->masks_of_interest & PAGE_IS_FILE) {
1884 			page = vm_normal_page_pmd(vma, addr, pmd);
1885 			if (page && !PageAnon(page))
1886 				categories |= PAGE_IS_FILE;
1887 		}
1888 
1889 		if (is_zero_pfn(pmd_pfn(pmd)))
1890 			categories |= PAGE_IS_PFNZERO;
1891 		if (pmd_soft_dirty(pmd))
1892 			categories |= PAGE_IS_SOFT_DIRTY;
1893 	} else if (is_swap_pmd(pmd)) {
1894 		swp_entry_t swp;
1895 
1896 		categories |= PAGE_IS_SWAPPED;
1897 		if (!pmd_swp_uffd_wp(pmd))
1898 			categories |= PAGE_IS_WRITTEN;
1899 		if (pmd_swp_soft_dirty(pmd))
1900 			categories |= PAGE_IS_SOFT_DIRTY;
1901 
1902 		if (p->masks_of_interest & PAGE_IS_FILE) {
1903 			swp = pmd_to_swp_entry(pmd);
1904 			if (is_pfn_swap_entry(swp) &&
1905 			    !folio_test_anon(pfn_swap_entry_folio(swp)))
1906 				categories |= PAGE_IS_FILE;
1907 		}
1908 	}
1909 
1910 	return categories;
1911 }
1912 
1913 static void make_uffd_wp_pmd(struct vm_area_struct *vma,
1914 			     unsigned long addr, pmd_t *pmdp)
1915 {
1916 	pmd_t old, pmd = *pmdp;
1917 
1918 	if (pmd_present(pmd)) {
1919 		old = pmdp_invalidate_ad(vma, addr, pmdp);
1920 		pmd = pmd_mkuffd_wp(old);
1921 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1922 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1923 		pmd = pmd_swp_mkuffd_wp(pmd);
1924 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1925 	}
1926 }
1927 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1928 
1929 #ifdef CONFIG_HUGETLB_PAGE
1930 static unsigned long pagemap_hugetlb_category(pte_t pte)
1931 {
1932 	unsigned long categories = PAGE_IS_HUGE;
1933 
	/*
	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB
	 * page cannot be swapped out, so PAGE_IS_FILE is not checked
	 * for swapped pages.
	 */
1939 	if (pte_present(pte)) {
1940 		categories |= PAGE_IS_PRESENT;
1941 		if (!huge_pte_uffd_wp(pte))
1942 			categories |= PAGE_IS_WRITTEN;
1943 		if (!PageAnon(pte_page(pte)))
1944 			categories |= PAGE_IS_FILE;
1945 		if (is_zero_pfn(pte_pfn(pte)))
1946 			categories |= PAGE_IS_PFNZERO;
1947 		if (pte_soft_dirty(pte))
1948 			categories |= PAGE_IS_SOFT_DIRTY;
1949 	} else if (is_swap_pte(pte)) {
1950 		categories |= PAGE_IS_SWAPPED;
1951 		if (!pte_swp_uffd_wp_any(pte))
1952 			categories |= PAGE_IS_WRITTEN;
1953 		if (pte_swp_soft_dirty(pte))
1954 			categories |= PAGE_IS_SOFT_DIRTY;
1955 	}
1956 
1957 	return categories;
1958 }
1959 
1960 static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
1961 				  unsigned long addr, pte_t *ptep,
1962 				  pte_t ptent)
1963 {
1964 	unsigned long psize;
1965 
1966 	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
1967 		return;
1968 
1969 	psize = huge_page_size(hstate_vma(vma));
1970 
1971 	if (is_hugetlb_entry_migration(ptent))
1972 		set_huge_pte_at(vma->vm_mm, addr, ptep,
1973 				pte_swp_mkuffd_wp(ptent), psize);
1974 	else if (!huge_pte_none(ptent))
1975 		huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
1976 					     huge_pte_mkuffd_wp(ptent));
1977 	else
1978 		set_huge_pte_at(vma->vm_mm, addr, ptep,
1979 				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
1980 }
1981 #endif /* CONFIG_HUGETLB_PAGE */
1982 
1983 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
1984 static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
1985 				       unsigned long addr, unsigned long end)
1986 {
1987 	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
1988 
1989 	if (cur_buf->start != addr)
1990 		cur_buf->end = addr;
1991 	else
1992 		cur_buf->start = cur_buf->end = 0;
1993 
1994 	p->found_pages -= (end - addr) / PAGE_SIZE;
1995 }
1996 #endif
1997 
1998 static bool pagemap_scan_is_interesting_page(unsigned long categories,
1999 					     const struct pagemap_scan_private *p)
2000 {
2001 	categories ^= p->arg.category_inverted;
2002 	if ((categories & p->arg.category_mask) != p->arg.category_mask)
2003 		return false;
2004 	if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
2005 		return false;
2006 
2007 	return true;
2008 }
2009 
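/*
 * Filtering example (illustrative): to select pages that are present but
 * have NOT been written, userspace can set
 *
 *	category_mask     = PAGE_IS_PRESENT | PAGE_IS_WRITTEN;
 *	category_inverted = PAGE_IS_WRITTEN;
 *
 * The XOR flips PAGE_IS_WRITTEN before matching, so only pages whose real
 * categories contain PAGE_IS_PRESENT and lack PAGE_IS_WRITTEN satisfy the
 * "all required bits" test; category_anyof_mask additionally demands at
 * least one of its bits be set after the inversion.
 */
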
2010 static bool pagemap_scan_is_interesting_vma(unsigned long categories,
2011 					    const struct pagemap_scan_private *p)
2012 {
2013 	unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
2014 
2015 	categories ^= p->arg.category_inverted;
2016 	if ((categories & required) != required)
2017 		return false;
2018 
2019 	return true;
2020 }
2021 
2022 static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
2023 				  struct mm_walk *walk)
2024 {
2025 	struct pagemap_scan_private *p = walk->private;
2026 	struct vm_area_struct *vma = walk->vma;
2027 	unsigned long vma_category = 0;
2028 	bool wp_allowed = userfaultfd_wp_async(vma) &&
2029 	    userfaultfd_wp_use_markers(vma);
2030 
2031 	if (!wp_allowed) {
		/* User requested an explicit failure if wp-async is unavailable */
2033 		if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
2034 			return -EPERM;
		/*
		 * The user requested wr-protect and allows unsupported
		 * vmas to be silently skipped.
		 */
2039 		if (p->arg.flags & PM_SCAN_WP_MATCHING)
2040 			return 1;
		/*
		 * Otherwise the request doesn't involve wr-protect at
		 * all; fall through to the remaining checks and allow
		 * the vma walk.
		 */
2045 	}
2046 
2047 	if (vma->vm_flags & VM_PFNMAP)
2048 		return 1;
2049 
2050 	if (wp_allowed)
2051 		vma_category |= PAGE_IS_WPALLOWED;
2052 
2053 	if (vma->vm_flags & VM_SOFTDIRTY)
2054 		vma_category |= PAGE_IS_SOFT_DIRTY;
2055 
2056 	if (!pagemap_scan_is_interesting_vma(vma_category, p))
2057 		return 1;
2058 
2059 	p->cur_vma_category = vma_category;
2060 
2061 	return 0;
2062 }
2063 
2064 static bool pagemap_scan_push_range(unsigned long categories,
2065 				    struct pagemap_scan_private *p,
2066 				    unsigned long addr, unsigned long end)
2067 {
2068 	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
2069 
	/*
	 * When no output buffer is provided at all, the sentinel values
	 * won't match here: `cur_buf->end` can only be non-zero when the
	 * current buffer entry is non-empty.
	 */
2075 	if (addr == cur_buf->end && categories == cur_buf->categories) {
2076 		cur_buf->end = end;
2077 		return true;
2078 	}
2079 
2080 	if (cur_buf->end) {
2081 		if (p->vec_buf_index >= p->vec_buf_len - 1)
2082 			return false;
2083 
2084 		cur_buf = &p->vec_buf[++p->vec_buf_index];
2085 	}
2086 
2087 	cur_buf->start = addr;
2088 	cur_buf->end = end;
2089 	cur_buf->categories = categories;
2090 
2091 	return true;
2092 }
2093 
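/*
 * Coalescing example (illustrative): if the current buffer entry is
 * { start = 0x1000, end = 0x3000, categories = PAGE_IS_PRESENT } and the
 * page at 0x3000 reports the same categories, the entry is extended to
 * end = 0x4000 rather than opening a new struct page_region.  A new
 * entry is only started when the address or categories differ, and the
 * push fails once the last bounce-buffer slot is already in use.
 */
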
2094 static int pagemap_scan_output(unsigned long categories,
2095 			       struct pagemap_scan_private *p,
2096 			       unsigned long addr, unsigned long *end)
2097 {
2098 	unsigned long n_pages, total_pages;
2099 	int ret = 0;
2100 
2101 	if (!p->vec_buf)
2102 		return 0;
2103 
2104 	categories &= p->arg.return_mask;
2105 
2106 	n_pages = (*end - addr) / PAGE_SIZE;
2107 	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
2108 	    total_pages > p->arg.max_pages) {
2109 		size_t n_too_much = total_pages - p->arg.max_pages;
2110 		*end -= n_too_much * PAGE_SIZE;
2111 		n_pages -= n_too_much;
2112 		ret = -ENOSPC;
2113 	}
2114 
2115 	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
2116 		*end = addr;
2117 		n_pages = 0;
2118 		ret = -ENOSPC;
2119 	}
2120 
2121 	p->found_pages += n_pages;
2122 	if (ret)
2123 		p->arg.walk_end = *end;
2124 
2125 	return ret;
2126 }
2127 
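/*
 * Clamping example (illustrative): with max_pages = 100, found_pages = 96
 * and an 8-page range, total_pages would be 104, so n_too_much = 4 pages
 * are trimmed from *end, only 4 pages are recorded, and -ENOSPC tells the
 * caller to stop the walk at the adjusted walk_end.
 */
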
2128 static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
2129 				  unsigned long end, struct mm_walk *walk)
2130 {
2131 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2132 	struct pagemap_scan_private *p = walk->private;
2133 	struct vm_area_struct *vma = walk->vma;
2134 	unsigned long categories;
2135 	spinlock_t *ptl;
2136 	int ret = 0;
2137 
2138 	ptl = pmd_trans_huge_lock(pmd, vma);
2139 	if (!ptl)
2140 		return -ENOENT;
2141 
2142 	categories = p->cur_vma_category |
2143 		     pagemap_thp_category(p, vma, start, *pmd);
2144 
2145 	if (!pagemap_scan_is_interesting_page(categories, p))
2146 		goto out_unlock;
2147 
2148 	ret = pagemap_scan_output(categories, p, start, &end);
2149 	if (start == end)
2150 		goto out_unlock;
2151 
2152 	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2153 		goto out_unlock;
2154 	if (~categories & PAGE_IS_WRITTEN)
2155 		goto out_unlock;
2156 
	/*
	 * Split the huge page into small pages if the WP operation
	 * needs to be performed only on a portion of it.
	 */
2161 	if (end != start + HPAGE_SIZE) {
2162 		spin_unlock(ptl);
2163 		split_huge_pmd(vma, pmd, start);
2164 		pagemap_scan_backout_range(p, start, end);
2165 		/* Report as if there was no THP */
2166 		return -ENOENT;
2167 	}
2168 
2169 	make_uffd_wp_pmd(vma, start, pmd);
2170 	flush_tlb_range(vma, start, end);
2171 out_unlock:
2172 	spin_unlock(ptl);
2173 	return ret;
2174 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
2175 	return -ENOENT;
2176 #endif
2177 }
2178 
2179 static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
2180 				  unsigned long end, struct mm_walk *walk)
2181 {
2182 	struct pagemap_scan_private *p = walk->private;
2183 	struct vm_area_struct *vma = walk->vma;
2184 	unsigned long addr, flush_end = 0;
2185 	pte_t *pte, *start_pte;
2186 	spinlock_t *ptl;
2187 	int ret;
2188 
	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
	if (ret != -ENOENT)
		return ret;

	ret = 0;
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	/*
	 * Only enter lazy MMU mode once the page table lock is held:
	 * some architectures require the PTL (and hence no preemption)
	 * for the whole duration of a lazy MMU section.
	 */
	arch_enter_lazy_mmu_mode();
2204 
2205 	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
2206 		/* Fast path for performing exclusive WP */
2207 		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2208 			pte_t ptent = ptep_get(pte);
2209 
2210 			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
2211 			    pte_swp_uffd_wp_any(ptent))
2212 				continue;
2213 			make_uffd_wp_pte(vma, addr, pte, ptent);
2214 			if (!flush_end)
2215 				start = addr;
2216 			flush_end = addr + PAGE_SIZE;
2217 		}
2218 		goto flush_and_return;
2219 	}
2220 
2221 	if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
2222 	    p->arg.category_mask == PAGE_IS_WRITTEN &&
2223 	    p->arg.return_mask == PAGE_IS_WRITTEN) {
2224 		for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
2225 			unsigned long next = addr + PAGE_SIZE;
2226 			pte_t ptent = ptep_get(pte);
2227 
2228 			if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
2229 			    pte_swp_uffd_wp_any(ptent))
2230 				continue;
2231 			ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
2232 						  p, addr, &next);
2233 			if (next == addr)
2234 				break;
2235 			if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2236 				continue;
2237 			make_uffd_wp_pte(vma, addr, pte, ptent);
2238 			if (!flush_end)
2239 				start = addr;
2240 			flush_end = next;
2241 		}
2242 		goto flush_and_return;
2243 	}
2244 
2245 	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2246 		pte_t ptent = ptep_get(pte);
2247 		unsigned long categories = p->cur_vma_category |
2248 					   pagemap_page_category(p, vma, addr, ptent);
2249 		unsigned long next = addr + PAGE_SIZE;
2250 
2251 		if (!pagemap_scan_is_interesting_page(categories, p))
2252 			continue;
2253 
2254 		ret = pagemap_scan_output(categories, p, addr, &next);
2255 		if (next == addr)
2256 			break;
2257 
2258 		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2259 			continue;
2260 		if (~categories & PAGE_IS_WRITTEN)
2261 			continue;
2262 
2263 		make_uffd_wp_pte(vma, addr, pte, ptent);
2264 		if (!flush_end)
2265 			start = addr;
2266 		flush_end = next;
2267 	}
2268 
2269 flush_and_return:
2270 	if (flush_end)
2271 		flush_tlb_range(vma, start, addr);
2272 
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);
2275 
2276 	cond_resched();
2277 	return ret;
2278 }
2279 
2280 #ifdef CONFIG_HUGETLB_PAGE
2281 static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
2282 				      unsigned long start, unsigned long end,
2283 				      struct mm_walk *walk)
2284 {
2285 	struct pagemap_scan_private *p = walk->private;
2286 	struct vm_area_struct *vma = walk->vma;
2287 	unsigned long categories;
2288 	spinlock_t *ptl;
2289 	int ret = 0;
2290 	pte_t pte;
2291 
2292 	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
2293 		/* Go the short route when not write-protecting pages. */
2294 
2295 		pte = huge_ptep_get(ptep);
2296 		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2297 
2298 		if (!pagemap_scan_is_interesting_page(categories, p))
2299 			return 0;
2300 
2301 		return pagemap_scan_output(categories, p, start, &end);
2302 	}
2303 
2304 	i_mmap_lock_write(vma->vm_file->f_mapping);
2305 	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
2306 
2307 	pte = huge_ptep_get(ptep);
2308 	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2309 
2310 	if (!pagemap_scan_is_interesting_page(categories, p))
2311 		goto out_unlock;
2312 
2313 	ret = pagemap_scan_output(categories, p, start, &end);
2314 	if (start == end)
2315 		goto out_unlock;
2316 
2317 	if (~categories & PAGE_IS_WRITTEN)
2318 		goto out_unlock;
2319 
2320 	if (end != start + HPAGE_SIZE) {
2321 		/* Partial HugeTLB page WP isn't possible. */
2322 		pagemap_scan_backout_range(p, start, end);
2323 		p->arg.walk_end = start;
2324 		ret = 0;
2325 		goto out_unlock;
2326 	}
2327 
2328 	make_uffd_wp_huge_pte(vma, start, ptep, pte);
2329 	flush_hugetlb_tlb_range(vma, start, end);
2330 
2331 out_unlock:
2332 	spin_unlock(ptl);
2333 	i_mmap_unlock_write(vma->vm_file->f_mapping);
2334 
2335 	return ret;
2336 }
2337 #else
2338 #define pagemap_scan_hugetlb_entry NULL
2339 #endif
2340 
2341 static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
2342 				 int depth, struct mm_walk *walk)
2343 {
2344 	struct pagemap_scan_private *p = walk->private;
2345 	struct vm_area_struct *vma = walk->vma;
2346 	int ret, err;
2347 
2348 	if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
2349 		return 0;
2350 
2351 	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
2352 	if (addr == end)
2353 		return ret;
2354 
2355 	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2356 		return ret;
2357 
2358 	err = uffd_wp_range(vma, addr, end - addr, true);
2359 	if (err < 0)
2360 		ret = err;
2361 
2362 	return ret;
2363 }
2364 
2365 static const struct mm_walk_ops pagemap_scan_ops = {
2366 	.test_walk = pagemap_scan_test_walk,
2367 	.pmd_entry = pagemap_scan_pmd_entry,
2368 	.pte_hole = pagemap_scan_pte_hole,
2369 	.hugetlb_entry = pagemap_scan_hugetlb_entry,
2370 };
2371 
2372 static int pagemap_scan_get_args(struct pm_scan_arg *arg,
2373 				 unsigned long uarg)
2374 {
2375 	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
2376 		return -EFAULT;
2377 
2378 	if (arg->size != sizeof(struct pm_scan_arg))
2379 		return -EINVAL;
2380 
2381 	/* Validate requested features */
2382 	if (arg->flags & ~PM_SCAN_FLAGS)
2383 		return -EINVAL;
2384 	if ((arg->category_inverted | arg->category_mask |
2385 	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
2386 		return -EINVAL;
2387 
2388 	arg->start = untagged_addr((unsigned long)arg->start);
2389 	arg->end = untagged_addr((unsigned long)arg->end);
2390 	arg->vec = untagged_addr((unsigned long)arg->vec);
2391 
2392 	/* Validate memory pointers */
2393 	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
2394 		return -EINVAL;
2395 	if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
2396 		return -EFAULT;
2397 	if (!arg->vec && arg->vec_len)
2398 		return -EINVAL;
2399 	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
2400 			      arg->vec_len * sizeof(struct page_region)))
2401 		return -EFAULT;
2402 
2403 	/* Fixup default values */
2404 	arg->end = ALIGN(arg->end, PAGE_SIZE);
2405 	arg->walk_end = 0;
2406 	if (!arg->max_pages)
2407 		arg->max_pages = ULONG_MAX;
2408 
2409 	return 0;
2410 }
2411 
2412 static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
2413 				       unsigned long uargl)
2414 {
	struct pm_scan_arg __user *uarg = (void __user *)uargl;
2416 
2417 	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
2418 		return -EFAULT;
2419 
2420 	return 0;
2421 }
2422 
2423 static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
2424 {
2425 	if (!p->arg.vec_len)
2426 		return 0;
2427 
2428 	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
2429 			       p->arg.vec_len);
2430 	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
2431 				   GFP_KERNEL);
2432 	if (!p->vec_buf)
2433 		return -ENOMEM;
2434 
2435 	p->vec_buf->start = p->vec_buf->end = 0;
2436 	p->vec_out = (struct page_region __user *)(long)p->arg.vec;
2437 
2438 	return 0;
2439 }
2440 
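/*
 * The bounce buffer borrows pagemap_read()'s chunk size: it holds at most
 * PAGEMAP_WALK_SIZE >> PAGE_SHIFT entries (512 with 4 KiB pages and 2 MiB
 * PMDs).  When it fills up, the walk stops with -ENOSPC, the buffer is
 * flushed to userspace, and do_pagemap_scan() resumes from walk_end.
 */
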
2441 static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
2442 {
2443 	const struct page_region *buf = p->vec_buf;
2444 	long n = p->vec_buf_index;
2445 
2446 	if (!p->vec_buf)
2447 		return 0;
2448 
2449 	if (buf[n].end != buf[n].start)
2450 		n++;
2451 
2452 	if (!n)
2453 		return 0;
2454 
2455 	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
2456 		return -EFAULT;
2457 
2458 	p->arg.vec_len -= n;
2459 	p->vec_out += n;
2460 
2461 	p->vec_buf_index = 0;
2462 	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
2463 	p->vec_buf->start = p->vec_buf->end = 0;
2464 
2465 	return n;
2466 }
2467 
2468 static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
2469 {
2470 	struct pagemap_scan_private p = {0};
2471 	unsigned long walk_start;
2472 	size_t n_ranges_out = 0;
2473 	int ret;
2474 
2475 	ret = pagemap_scan_get_args(&p.arg, uarg);
2476 	if (ret)
2477 		return ret;
2478 
2479 	p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
2480 			      p.arg.return_mask;
2481 	ret = pagemap_scan_init_bounce_buffer(&p);
2482 	if (ret)
2483 		return ret;
2484 
2485 	for (walk_start = p.arg.start; walk_start < p.arg.end;
2486 			walk_start = p.arg.walk_end) {
2487 		struct mmu_notifier_range range;
2488 		long n_out;
2489 
2490 		if (fatal_signal_pending(current)) {
2491 			ret = -EINTR;
2492 			break;
2493 		}
2494 
2495 		ret = mmap_read_lock_killable(mm);
2496 		if (ret)
2497 			break;
2498 
2499 		/* Protection change for the range is going to happen. */
2500 		if (p.arg.flags & PM_SCAN_WP_MATCHING) {
2501 			mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
2502 						mm, walk_start, p.arg.end);
2503 			mmu_notifier_invalidate_range_start(&range);
2504 		}
2505 
2506 		ret = walk_page_range(mm, walk_start, p.arg.end,
2507 				      &pagemap_scan_ops, &p);
2508 
2509 		if (p.arg.flags & PM_SCAN_WP_MATCHING)
2510 			mmu_notifier_invalidate_range_end(&range);
2511 
2512 		mmap_read_unlock(mm);
2513 
2514 		n_out = pagemap_scan_flush_buffer(&p);
2515 		if (n_out < 0)
2516 			ret = n_out;
2517 		else
2518 			n_ranges_out += n_out;
2519 
2520 		if (ret != -ENOSPC)
2521 			break;
2522 
2523 		if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
2524 			break;
2525 	}
2526 
2527 	/* ENOSPC signifies early stop (buffer full) from the walk. */
2528 	if (!ret || ret == -ENOSPC)
2529 		ret = n_ranges_out;
2530 
	/* walk_end is only set on an early stop; default to the requested end */
2532 	if (!p.arg.walk_end)
2533 		p.arg.walk_end = p.arg.end;
2534 	if (pagemap_scan_writeback_args(&p.arg, uarg))
2535 		ret = -EFAULT;
2536 
2537 	kfree(p.vec_buf);
2538 	return ret;
2539 }
2540 
2541 static long do_pagemap_cmd(struct file *file, unsigned int cmd,
2542 			   unsigned long arg)
2543 {
2544 	struct mm_struct *mm = file->private_data;
2545 
2546 	switch (cmd) {
2547 	case PAGEMAP_SCAN:
2548 		return do_pagemap_scan(mm, arg);
2549 
2550 	default:
2551 		return -EINVAL;
2552 	}
2553 }
2554 
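/*
 * Minimal userspace sketch of driving PAGEMAP_SCAN (illustrative only,
 * not compiled here; error handling trimmed).  It reports the written
 * ranges in [start, start + len) and atomically re-arms write-protection
 * on them, assuming the region was registered with UFFD_FEATURE_WP_ASYNC:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>	// struct pm_scan_arg, PAGEMAP_SCAN
 *
 *	int scan_written(unsigned long start, unsigned long len)
 *	{
 *		struct page_region regions[32];
 *		struct pm_scan_arg arg = {
 *			.size = sizeof(arg),
 *			.start = start,
 *			.end = start + len,
 *			.vec = (unsigned long)regions,
 *			.vec_len = 32,
 *			.category_mask = PAGE_IS_WRITTEN,
 *			.return_mask = PAGE_IS_WRITTEN,
 *			.flags = PM_SCAN_WP_MATCHING,
 *		};
 *		int fd = open("/proc/self/pagemap", O_RDONLY);
 *		long n = ioctl(fd, PAGEMAP_SCAN, &arg);
 *
 *		for (long i = 0; i < n; i++)
 *			printf("written: %llx-%llx\n",
 *			       (unsigned long long)regions[i].start,
 *			       (unsigned long long)regions[i].end);
 *		close(fd);
 *		return n < 0 ? -1 : 0;
 *	}
 */
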
2555 const struct file_operations proc_pagemap_operations = {
2556 	.llseek		= mem_lseek, /* borrow this */
2557 	.read		= pagemap_read,
2558 	.open		= pagemap_open,
2559 	.release	= pagemap_release,
2560 	.unlocked_ioctl = do_pagemap_cmd,
2561 	.compat_ioctl	= do_pagemap_cmd,
2562 };
2563 #endif /* CONFIG_PROC_PAGE_MONITOR */
2564 
2565 #ifdef CONFIG_NUMA
2566 
2567 struct numa_maps {
2568 	unsigned long pages;
2569 	unsigned long anon;
2570 	unsigned long active;
2571 	unsigned long writeback;
2572 	unsigned long mapcount_max;
2573 	unsigned long dirty;
2574 	unsigned long swapcache;
2575 	unsigned long node[MAX_NUMNODES];
2576 };
2577 
2578 struct numa_maps_private {
2579 	struct proc_maps_private proc_maps;
2580 	struct numa_maps md;
2581 };
2582 
2583 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
2584 			unsigned long nr_pages)
2585 {
2586 	struct folio *folio = page_folio(page);
2587 	int count = folio_precise_page_mapcount(folio, page);
2588 
2589 	md->pages += nr_pages;
2590 	if (pte_dirty || folio_test_dirty(folio))
2591 		md->dirty += nr_pages;
2592 
2593 	if (folio_test_swapcache(folio))
2594 		md->swapcache += nr_pages;
2595 
2596 	if (folio_test_active(folio) || folio_test_unevictable(folio))
2597 		md->active += nr_pages;
2598 
2599 	if (folio_test_writeback(folio))
2600 		md->writeback += nr_pages;
2601 
2602 	if (folio_test_anon(folio))
2603 		md->anon += nr_pages;
2604 
2605 	if (count > md->mapcount_max)
2606 		md->mapcount_max = count;
2607 
2608 	md->node[folio_nid(folio)] += nr_pages;
2609 }
2610 
2611 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
2612 		unsigned long addr)
2613 {
2614 	struct page *page;
2615 	int nid;
2616 
2617 	if (!pte_present(pte))
2618 		return NULL;
2619 
2620 	page = vm_normal_page(vma, addr, pte);
2621 	if (!page || is_zone_device_page(page))
2622 		return NULL;
2623 
2624 	if (PageReserved(page))
2625 		return NULL;
2626 
2627 	nid = page_to_nid(page);
2628 	if (!node_isset(nid, node_states[N_MEMORY]))
2629 		return NULL;
2630 
2631 	return page;
2632 }
2633 
2634 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2635 static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
2636 					      struct vm_area_struct *vma,
2637 					      unsigned long addr)
2638 {
2639 	struct page *page;
2640 	int nid;
2641 
2642 	if (!pmd_present(pmd))
2643 		return NULL;
2644 
2645 	page = vm_normal_page_pmd(vma, addr, pmd);
2646 	if (!page)
2647 		return NULL;
2648 
2649 	if (PageReserved(page))
2650 		return NULL;
2651 
2652 	nid = page_to_nid(page);
2653 	if (!node_isset(nid, node_states[N_MEMORY]))
2654 		return NULL;
2655 
2656 	return page;
2657 }
2658 #endif
2659 
2660 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
2661 		unsigned long end, struct mm_walk *walk)
2662 {
2663 	struct numa_maps *md = walk->private;
2664 	struct vm_area_struct *vma = walk->vma;
2665 	spinlock_t *ptl;
2666 	pte_t *orig_pte;
2667 	pte_t *pte;
2668 
2669 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2670 	ptl = pmd_trans_huge_lock(pmd, vma);
2671 	if (ptl) {
2672 		struct page *page;
2673 
2674 		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
2675 		if (page)
2676 			gather_stats(page, md, pmd_dirty(*pmd),
2677 				     HPAGE_PMD_SIZE/PAGE_SIZE);
2678 		spin_unlock(ptl);
2679 		return 0;
2680 	}
2681 #endif
2682 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
2683 	if (!pte) {
2684 		walk->action = ACTION_AGAIN;
2685 		return 0;
2686 	}
2687 	do {
2688 		pte_t ptent = ptep_get(pte);
2689 		struct page *page = can_gather_numa_stats(ptent, vma, addr);
2690 		if (!page)
2691 			continue;
2692 		gather_stats(page, md, pte_dirty(ptent), 1);
2693 
2694 	} while (pte++, addr += PAGE_SIZE, addr != end);
2695 	pte_unmap_unlock(orig_pte, ptl);
2696 	cond_resched();
2697 	return 0;
2698 }
2699 #ifdef CONFIG_HUGETLB_PAGE
2700 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
2701 		unsigned long addr, unsigned long end, struct mm_walk *walk)
2702 {
2703 	pte_t huge_pte = huge_ptep_get(pte);
2704 	struct numa_maps *md;
2705 	struct page *page;
2706 
2707 	if (!pte_present(huge_pte))
2708 		return 0;
2709 
2710 	page = pte_page(huge_pte);
2711 
2712 	md = walk->private;
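	/* The whole HugeTLB page is accounted as a single page here. */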
2713 	gather_stats(page, md, pte_dirty(huge_pte), 1);
2714 	return 0;
2715 }
2716 
2717 #else
2718 static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
2719 		unsigned long addr, unsigned long end, struct mm_walk *walk)
2720 {
2721 	return 0;
2722 }
2723 #endif
2724 
2725 static const struct mm_walk_ops show_numa_ops = {
2726 	.hugetlb_entry = gather_hugetlb_stats,
2727 	.pmd_entry = gather_pte_stats,
2728 	.walk_lock = PGWALK_RDLOCK,
2729 };
2730 
2731 /*
2732  * Display pages allocated per node and memory policy via /proc.
2733  */
2734 static int show_numa_map(struct seq_file *m, void *v)
2735 {
2736 	struct numa_maps_private *numa_priv = m->private;
2737 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
2738 	struct vm_area_struct *vma = v;
2739 	struct numa_maps *md = &numa_priv->md;
2740 	struct file *file = vma->vm_file;
2741 	struct mm_struct *mm = vma->vm_mm;
2742 	char buffer[64];
2743 	struct mempolicy *pol;
2744 	pgoff_t ilx;
2745 	int nid;
2746 
2747 	if (!mm)
2748 		return 0;
2749 
2750 	/* Ensure we start with an empty set of numa_maps statistics. */
2751 	memset(md, 0, sizeof(*md));
2752 
2753 	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
2754 	if (pol) {
2755 		mpol_to_str(buffer, sizeof(buffer), pol);
2756 		mpol_cond_put(pol);
2757 	} else {
2758 		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
2759 	}
2760 
2761 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2762 
2763 	if (file) {
2764 		seq_puts(m, " file=");
2765 		seq_path(m, file_user_path(file), "\n\t= ");
2766 	} else if (vma_is_initial_heap(vma)) {
2767 		seq_puts(m, " heap");
2768 	} else if (vma_is_initial_stack(vma)) {
2769 		seq_puts(m, " stack");
2770 	}
2771 
2772 	if (is_vm_hugetlb_page(vma))
2773 		seq_puts(m, " huge");
2774 
2775 	/* mmap_lock is held by m_start */
2776 	walk_page_vma(vma, &show_numa_ops, md);
2777 
2778 	if (!md->pages)
2779 		goto out;
2780 
2781 	if (md->anon)
2782 		seq_printf(m, " anon=%lu", md->anon);
2783 
2784 	if (md->dirty)
2785 		seq_printf(m, " dirty=%lu", md->dirty);
2786 
2787 	if (md->pages != md->anon && md->pages != md->dirty)
2788 		seq_printf(m, " mapped=%lu", md->pages);
2789 
2790 	if (md->mapcount_max > 1)
2791 		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2792 
2793 	if (md->swapcache)
2794 		seq_printf(m, " swapcache=%lu", md->swapcache);
2795 
2796 	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2797 		seq_printf(m, " active=%lu", md->active);
2798 
2799 	if (md->writeback)
2800 		seq_printf(m, " writeback=%lu", md->writeback);
2801 
2802 	for_each_node_state(nid, N_MEMORY)
2803 		if (md->node[nid])
2804 			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
2805 
2806 	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
2807 out:
2808 	seq_putc(m, '\n');
2809 	return 0;
2810 }
2811 
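/*
 * Example of a resulting /proc/<pid>/numa_maps line (illustrative; shown
 * wrapped here, the real output is one line per vma):
 *
 *	7f2a00000000 default file=/usr/lib/libc.so.6 mapped=42 mapmax=3
 *		N0=30 N1=12 kernelpagesize_kB=4
 */
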
2812 static const struct seq_operations proc_pid_numa_maps_op = {
2813 	.start  = m_start,
2814 	.next   = m_next,
2815 	.stop   = m_stop,
2816 	.show   = show_numa_map,
2817 };
2818 
2819 static int pid_numa_maps_open(struct inode *inode, struct file *file)
2820 {
2821 	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
2822 				sizeof(struct numa_maps_private));
2823 }
2824 
2825 const struct file_operations proc_pid_numa_maps_operations = {
2826 	.open		= pid_numa_maps_open,
2827 	.read		= seq_read,
2828 	.llseek		= seq_lseek,
2829 	.release	= proc_map_release,
2830 };
2831 
2832 #endif /* CONFIG_NUMA */
2833