xref: /linux/mm/rmap.c (revision 3a64d5b82eccc0dc629d43cde791a2c19bd67dfc)
1 /*
2  * mm/rmap.c - physical to virtual reverse mappings
3  *
4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5  * Released under the General Public License (GPL).
6  *
7  * Simple, low overhead reverse mapping scheme.
8  * Please try to keep this thing as modular as possible.
9  *
10  * Provides methods for unmapping each kind of mapped page:
11  * the anon methods track anonymous pages, and
12  * the file methods track pages belonging to an inode.
13  *
14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17  * Contributions by Hugh Dickins 2003, 2004
18  */
19 
20 /*
21  * Lock ordering in mm:
22  *
23  * inode->i_rwsem	(while writing or truncating, not reading or faulting)
24  *   mm->mmap_lock
25  *     mapping->invalidate_lock (in filemap_fault)
26  *       folio_lock
27  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
28  *           vma_start_write
29  *             mapping->i_mmap_rwsem
30  *               anon_vma->rwsem
31  *                 mm->page_table_lock or pte_lock
32  *                   swap_lock (in swap_duplicate, swap_info_get)
33  *                     mmlist_lock (in mmput, drain_mmlist and others)
34  *                     mapping->private_lock (in block_dirty_folio)
35  *                         i_pages lock (widely used)
36  *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
37  *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
38  *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
39  *                       sb_lock (within inode_lock in fs/fs-writeback.c)
40  *                       i_pages lock (widely used, in set_page_dirty,
41  *                                 in arch-dependent flush_dcache_mmap_lock,
42  *                                 within bdi.wb->list_lock in __sync_single_inode)
43  *
44  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
45  *   ->tasklist_lock
46  *     pte map lock
47  *
48  * hugetlbfs PageHuge() take locks in this order:
49  *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
50  *     vma_lock (hugetlb specific lock for pmd_sharing)
51  *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
52  *         folio_lock
53  */
54 
55 #include <linux/mm.h>
56 #include <linux/sched/mm.h>
57 #include <linux/sched/task.h>
58 #include <linux/pagemap.h>
59 #include <linux/swap.h>
60 #include <linux/leafops.h>
61 #include <linux/slab.h>
62 #include <linux/init.h>
63 #include <linux/ksm.h>
64 #include <linux/rmap.h>
65 #include <linux/rcupdate.h>
66 #include <linux/export.h>
67 #include <linux/memcontrol.h>
68 #include <linux/mmu_notifier.h>
69 #include <linux/migrate.h>
70 #include <linux/hugetlb.h>
71 #include <linux/huge_mm.h>
72 #include <linux/backing-dev.h>
73 #include <linux/page_idle.h>
74 #include <linux/memremap.h>
75 #include <linux/userfaultfd_k.h>
76 #include <linux/mm_inline.h>
77 #include <linux/oom.h>
78 
79 #include <asm/tlb.h>
80 
81 #define CREATE_TRACE_POINTS
82 #include <trace/events/migrate.h>
83 
84 #include "internal.h"
85 
/* Slab cache that struct anon_vma objects come from; set up in anon_vma_init(). */
static struct kmem_cache *anon_vma_cachep;
/* Slab cache that struct anon_vma_chain objects come from; set up in anon_vma_init(). */
static struct kmem_cache *anon_vma_chain_cachep;
88 
89 static inline struct anon_vma *anon_vma_alloc(void)
90 {
91 	struct anon_vma *anon_vma;
92 
93 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
94 	if (anon_vma) {
95 		atomic_set(&anon_vma->refcount, 1);
96 		anon_vma->num_children = 0;
97 		anon_vma->num_active_vmas = 0;
98 		anon_vma->parent = anon_vma;
99 		/*
100 		 * Initialise the anon_vma root to point to itself. If called
101 		 * from fork, the root will be reset to the parents anon_vma.
102 		 */
103 		anon_vma->root = anon_vma;
104 	}
105 
106 	return anon_vma;
107 }
108 
109 static inline void anon_vma_free(struct anon_vma *anon_vma)
110 {
111 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
112 
113 	/*
114 	 * Synchronize against folio_lock_anon_vma_read() such that
115 	 * we can safely hold the lock without the anon_vma getting
116 	 * freed.
117 	 *
118 	 * Relies on the full mb implied by the atomic_dec_and_test() from
119 	 * put_anon_vma() against the acquire barrier implied by
120 	 * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
121 	 *
122 	 * folio_lock_anon_vma_read()	VS	put_anon_vma()
123 	 *   down_read_trylock()		  atomic_dec_and_test()
124 	 *   LOCK				  MB
125 	 *   atomic_read()			  rwsem_is_locked()
126 	 *
127 	 * LOCK should suffice since the actual taking of the lock must
128 	 * happen _before_ what follows.
129 	 */
130 	might_sleep();
131 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
132 		anon_vma_lock_write(anon_vma);
133 		anon_vma_unlock_write(anon_vma);
134 	}
135 
136 	kmem_cache_free(anon_vma_cachep, anon_vma);
137 }
138 
139 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
140 {
141 	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
142 }
143 
144 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
145 {
146 	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
147 }
148 
149 static void anon_vma_chain_assign(struct vm_area_struct *vma,
150 				  struct anon_vma_chain *avc,
151 				  struct anon_vma *anon_vma)
152 {
153 	avc->vma = vma;
154 	avc->anon_vma = anon_vma;
155 	list_add(&avc->same_vma, &vma->anon_vma_chain);
156 }
157 
158 /**
159  * __anon_vma_prepare - attach an anon_vma to a memory region
160  * @vma: the memory region in question
161  *
162  * This makes sure the memory mapping described by 'vma' has
163  * an 'anon_vma' attached to it, so that we can associate the
164  * anonymous pages mapped into it with that anon_vma.
165  *
166  * The common case will be that we already have one, which
167  * is handled inline by anon_vma_prepare(). But if
168  * not we either need to find an adjacent mapping that we
169  * can re-use the anon_vma from (very common when the only
170  * reason for splitting a vma has been mprotect()), or we
171  * allocate a new one.
172  *
173  * Anon-vma allocations are very subtle, because we may have
174  * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
175  * and that may actually touch the rwsem even in the newly
176  * allocated vma (it depends on RCU to make sure that the
177  * anon_vma isn't actually destroyed).
178  *
179  * As a result, we need to do proper anon_vma locking even
180  * for the new allocation. At the same time, we do not want
181  * to do any locking for the common case of already having
182  * an anon_vma.
183  */
184 int __anon_vma_prepare(struct vm_area_struct *vma)
185 {
186 	struct mm_struct *mm = vma->vm_mm;
187 	struct anon_vma *anon_vma, *allocated;
188 	struct anon_vma_chain *avc;
189 
190 	mmap_assert_locked(mm);
191 	might_sleep();
192 
193 	avc = anon_vma_chain_alloc(GFP_KERNEL);
194 	if (!avc)
195 		goto out_enomem;
196 
197 	anon_vma = find_mergeable_anon_vma(vma);
198 	allocated = NULL;
199 	if (!anon_vma) {
200 		anon_vma = anon_vma_alloc();
201 		if (unlikely(!anon_vma))
202 			goto out_enomem_free_avc;
203 		anon_vma->num_children++; /* self-parent link for new root */
204 		allocated = anon_vma;
205 	}
206 
207 	anon_vma_lock_write(anon_vma);
208 	/* page_table_lock to protect against threads */
209 	spin_lock(&mm->page_table_lock);
210 	if (likely(!vma->anon_vma)) {
211 		vma->anon_vma = anon_vma;
212 		anon_vma_chain_assign(vma, avc, anon_vma);
213 		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
214 		anon_vma->num_active_vmas++;
215 		allocated = NULL;
216 		avc = NULL;
217 	}
218 	spin_unlock(&mm->page_table_lock);
219 	anon_vma_unlock_write(anon_vma);
220 
221 	if (unlikely(allocated))
222 		put_anon_vma(allocated);
223 	if (unlikely(avc))
224 		anon_vma_chain_free(avc);
225 
226 	return 0;
227 
228  out_enomem_free_avc:
229 	anon_vma_chain_free(avc);
230  out_enomem:
231 	return -ENOMEM;
232 }
233 
234 static void check_anon_vma_clone(struct vm_area_struct *dst,
235 				 struct vm_area_struct *src,
236 				 enum vma_operation operation)
237 {
238 	/* The write lock must be held. */
239 	mmap_assert_write_locked(src->vm_mm);
240 	/* If not a fork then must be on same mm. */
241 	VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
242 
243 	/* If we have anything to do src->anon_vma must be provided. */
244 	VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
245 	VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
246 	/* We are establishing a new anon_vma_chain. */
247 	VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
248 	/*
249 	 * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
250 	 * must be the same across dst and src.
251 	 */
252 	VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
253 	/*
254 	 * Essentially equivalent to above - if not a no-op, we should expect
255 	 * dst->anon_vma to be set for everything except a fork.
256 	 */
257 	VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
258 			!dst->anon_vma);
259 	/* For the anon_vma to be compatible, it can only be singular. */
260 	VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
261 			!list_is_singular(&src->anon_vma_chain));
262 #ifdef CONFIG_PER_VMA_LOCK
263 	/* Only merging an unfaulted VMA leaves the destination attached. */
264 	VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
265 			vma_is_attached(dst));
266 #endif
267 }
268 
269 static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
270 		struct anon_vma *anon_vma)
271 {
272 	/* If already populated, nothing to do.*/
273 	if (dst->anon_vma)
274 		return;
275 
276 	/*
277 	 * We reuse an anon_vma if any linking VMAs were unmapped and it has
278 	 * only a single child at most.
279 	 */
280 	if (anon_vma->num_active_vmas > 0)
281 		return;
282 	if (anon_vma->num_children > 1)
283 		return;
284 
285 	dst->anon_vma = anon_vma;
286 	anon_vma->num_active_vmas++;
287 }
288 
289 static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
290 
291 /**
292  * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
293  * all of the anon_vma objects contained within @src anon_vma_chain's.
294  * @dst: The destination VMA with an empty anon_vma_chain.
295  * @src: The source VMA we wish to duplicate.
296  * @operation: The type of operation which resulted in the clone.
297  *
298  * This is the heart of the VMA side of the anon_vma implementation - we invoke
299  * this function whenever we need to set up a new VMA's anon_vma state.
300  *
301  * This is invoked for:
302  *
303  * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
304  *   clone @src into @dst.
305  * - VMA split.
306  * - VMA (m)remap.
307  * - Fork of faulted VMA.
308  *
309  * In all cases other than fork this is simply a duplication. Fork additionally
310  * adds a new active anon_vma.
311  *
312  * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
313  * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
314  * but do have a single child. This is to avoid waste of memory when repeatedly
315  * forking.
316  *
317  * Returns: 0 on success, -ENOMEM on failure.
318  */
319 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
320 		   enum vma_operation operation)
321 {
322 	struct anon_vma_chain *avc, *pavc;
323 	struct anon_vma *active_anon_vma = src->anon_vma;
324 
325 	check_anon_vma_clone(dst, src, operation);
326 
327 	if (!active_anon_vma)
328 		return 0;
329 
330 	/*
331 	 * Allocate AVCs. We don't need an anon_vma lock for this as we
332 	 * are not updating the anon_vma rbtree nor are we changing
333 	 * anon_vma statistics.
334 	 *
335 	 * Either src, dst have the same mm for which we hold an exclusive mmap
336 	 * write lock, or we are forking and we hold it on src->vm_mm and dst is
337 	 * not yet accessible to other threads so there's no possibliity of the
338 	 * unlinked AVC's being observed yet.
339 	 */
340 	list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
341 		avc = anon_vma_chain_alloc(GFP_KERNEL);
342 		if (!avc)
343 			goto enomem_failure;
344 
345 		anon_vma_chain_assign(dst, avc, pavc->anon_vma);
346 	}
347 
348 	/*
349 	 * Now link the anon_vma's back to the newly inserted AVCs.
350 	 * Note that all anon_vma's share the same root.
351 	 */
352 	anon_vma_lock_write(src->anon_vma);
353 	list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
354 		struct anon_vma *anon_vma = avc->anon_vma;
355 
356 		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
357 		if (operation == VMA_OP_FORK)
358 			maybe_reuse_anon_vma(dst, anon_vma);
359 	}
360 
361 	if (operation != VMA_OP_FORK)
362 		dst->anon_vma->num_active_vmas++;
363 
364 	anon_vma_unlock_write(active_anon_vma);
365 	return 0;
366 
367  enomem_failure:
368 	cleanup_partial_anon_vmas(dst);
369 	return -ENOMEM;
370 }
371 
372 /*
373  * Attach vma to its own anon_vma, as well as to the anon_vmas that
374  * the corresponding VMA in the parent process is attached to.
375  * Returns 0 on success, non-zero on failure.
376  */
377 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
378 {
379 	struct anon_vma_chain *avc;
380 	struct anon_vma *anon_vma;
381 	int rc;
382 
383 	/* Don't bother if the parent process has no anon_vma here. */
384 	if (!pvma->anon_vma)
385 		return 0;
386 
387 	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
388 	vma->anon_vma = NULL;
389 
390 	anon_vma = anon_vma_alloc();
391 	if (!anon_vma)
392 		return -ENOMEM;
393 	avc = anon_vma_chain_alloc(GFP_KERNEL);
394 	if (!avc) {
395 		put_anon_vma(anon_vma);
396 		return -ENOMEM;
397 	}
398 
399 	/*
400 	 * First, attach the new VMA to the parent VMA's anon_vmas,
401 	 * so rmap can find non-COWed pages in child processes.
402 	 */
403 	rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
404 	/* An error arose or an existing anon_vma was reused, all done then. */
405 	if (rc || vma->anon_vma) {
406 		put_anon_vma(anon_vma);
407 		anon_vma_chain_free(avc);
408 		return rc;
409 	}
410 
411 	/*
412 	 * OK no reuse, so add our own anon_vma.
413 	 *
414 	 * Since it is not linked anywhere we can safely manipulate anon_vma
415 	 * fields without a lock.
416 	 */
417 
418 	anon_vma->num_active_vmas = 1;
419 	/*
420 	 * The root anon_vma's rwsem is the lock actually used when we
421 	 * lock any of the anon_vmas in this anon_vma tree.
422 	 */
423 	anon_vma->root = pvma->anon_vma->root;
424 	anon_vma->parent = pvma->anon_vma;
425 	/*
426 	 * With refcounts, an anon_vma can stay around longer than the
427 	 * process it belongs to. The root anon_vma needs to be pinned until
428 	 * this anon_vma is freed, because the lock lives in the root.
429 	 */
430 	get_anon_vma(anon_vma->root);
431 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
432 	vma->anon_vma = anon_vma;
433 	anon_vma_chain_assign(vma, avc, anon_vma);
434 	/* Now let rmap see it. */
435 	anon_vma_lock_write(anon_vma);
436 	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
437 	anon_vma->parent->num_children++;
438 	anon_vma_unlock_write(anon_vma);
439 
440 	return 0;
441 }
442 
443 /*
444  * In the unfortunate case of anon_vma_clone() failing to allocate memory we
445  * have to clean things up.
446  *
447  * Since we allocate anon_vma_chain's before we insert them into the interval
448  * trees, we simply have to free up the AVC's and remove the entries from the
449  * VMA's anon_vma_chain.
450  */
451 static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
452 {
453 	struct anon_vma_chain *avc, *next;
454 
455 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
456 		list_del(&avc->same_vma);
457 		anon_vma_chain_free(avc);
458 	}
459 }
460 
461 /**
462  * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
463  * anon_vma_chain objects.
464  * @vma: The VMA whose links to anon_vma objects is to be severed.
465  *
466  * As part of the process anon_vma_chain's are freed,
467  * anon_vma->num_children,num_active_vmas is updated as required and, if the
468  * relevant anon_vma references no further VMAs, its reference count is
469  * decremented.
470  */
471 void unlink_anon_vmas(struct vm_area_struct *vma)
472 {
473 	struct anon_vma_chain *avc, *next;
474 	struct anon_vma *active_anon_vma = vma->anon_vma;
475 
476 	/* Always hold mmap lock, read-lock on unmap possibly. */
477 	mmap_assert_locked(vma->vm_mm);
478 
479 	/* Unfaulted is a no-op. */
480 	if (!active_anon_vma) {
481 		VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain));
482 		return;
483 	}
484 
485 	anon_vma_lock_write(active_anon_vma);
486 
487 	/*
488 	 * Unlink each anon_vma chained to the VMA.  This list is ordered
489 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
490 	 */
491 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
492 		struct anon_vma *anon_vma = avc->anon_vma;
493 
494 		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
495 
496 		/*
497 		 * Leave empty anon_vmas on the list - we'll need
498 		 * to free them outside the lock.
499 		 */
500 		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
501 			anon_vma->parent->num_children--;
502 			continue;
503 		}
504 
505 		list_del(&avc->same_vma);
506 		anon_vma_chain_free(avc);
507 	}
508 
509 	active_anon_vma->num_active_vmas--;
510 	/*
511 	 * vma would still be needed after unlink, and anon_vma will be prepared
512 	 * when handle fault.
513 	 */
514 	vma->anon_vma = NULL;
515 	anon_vma_unlock_write(active_anon_vma);
516 
517 
518 	/*
519 	 * Iterate the list once more, it now only contains empty and unlinked
520 	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
521 	 * needing to write-acquire the anon_vma->root->rwsem.
522 	 */
523 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
524 		struct anon_vma *anon_vma = avc->anon_vma;
525 
526 		VM_WARN_ON(anon_vma->num_children);
527 		VM_WARN_ON(anon_vma->num_active_vmas);
528 		put_anon_vma(anon_vma);
529 
530 		list_del(&avc->same_vma);
531 		anon_vma_chain_free(avc);
532 	}
533 }
534 
535 static void anon_vma_ctor(void *data)
536 {
537 	struct anon_vma *anon_vma = data;
538 
539 	init_rwsem(&anon_vma->rwsem);
540 	atomic_set(&anon_vma->refcount, 0);
541 	anon_vma->rb_root = RB_ROOT_CACHED;
542 }
543 
544 void __init anon_vma_init(void)
545 {
546 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
547 			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
548 			anon_vma_ctor);
549 	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
550 			SLAB_PANIC|SLAB_ACCOUNT);
551 }
552 
553 /*
554  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
555  *
556  * Since there is no serialization what so ever against folio_remove_rmap_*()
557  * the best this function can do is return a refcount increased anon_vma
558  * that might have been relevant to this page.
559  *
560  * The page might have been remapped to a different anon_vma or the anon_vma
561  * returned may already be freed (and even reused).
562  *
563  * In case it was remapped to a different anon_vma, the new anon_vma will be a
564  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
565  * ensure that any anon_vma obtained from the page will still be valid for as
566  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
567  *
568  * All users of this function must be very careful when walking the anon_vma
569  * chain and verify that the page in question is indeed mapped in it
570  * [ something equivalent to page_mapped_in_vma() ].
571  *
572  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
573  * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
574  * if there is a mapcount, we can dereference the anon_vma after observing
575  * those.
576  *
577  * NOTE: the caller should hold folio lock when calling this.
578  */
579 struct anon_vma *folio_get_anon_vma(const struct folio *folio)
580 {
581 	struct anon_vma *anon_vma = NULL;
582 	unsigned long anon_mapping;
583 
584 	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
585 
586 	rcu_read_lock();
587 	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
588 	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
589 		goto out;
590 	if (!folio_mapped(folio))
591 		goto out;
592 
593 	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
594 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
595 		anon_vma = NULL;
596 		goto out;
597 	}
598 
599 	/*
600 	 * If this folio is still mapped, then its anon_vma cannot have been
601 	 * freed.  But if it has been unmapped, we have no security against the
602 	 * anon_vma structure being freed and reused (for another anon_vma:
603 	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
604 	 * above cannot corrupt).
605 	 */
606 	if (!folio_mapped(folio)) {
607 		rcu_read_unlock();
608 		put_anon_vma(anon_vma);
609 		return NULL;
610 	}
611 out:
612 	rcu_read_unlock();
613 
614 	return anon_vma;
615 }
616 
617 /*
618  * Similar to folio_get_anon_vma() except it locks the anon_vma.
619  *
620  * Its a little more complex as it tries to keep the fast path to a single
621  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
622  * reference like with folio_get_anon_vma() and then block on the mutex
623  * on !rwc->try_lock case.
624  */
625 struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
626 					  struct rmap_walk_control *rwc)
627 {
628 	struct anon_vma *anon_vma = NULL;
629 	struct anon_vma *root_anon_vma;
630 	unsigned long anon_mapping;
631 
632 	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
633 
634 	rcu_read_lock();
635 	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
636 	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
637 		goto out;
638 	if (!folio_mapped(folio))
639 		goto out;
640 
641 	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
642 	root_anon_vma = READ_ONCE(anon_vma->root);
643 	if (down_read_trylock(&root_anon_vma->rwsem)) {
644 		/*
645 		 * If the folio is still mapped, then this anon_vma is still
646 		 * its anon_vma, and holding the mutex ensures that it will
647 		 * not go away, see anon_vma_free().
648 		 */
649 		if (!folio_mapped(folio)) {
650 			up_read(&root_anon_vma->rwsem);
651 			anon_vma = NULL;
652 		}
653 		goto out;
654 	}
655 
656 	if (rwc && rwc->try_lock) {
657 		anon_vma = NULL;
658 		rwc->contended = true;
659 		goto out;
660 	}
661 
662 	/* trylock failed, we got to sleep */
663 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
664 		anon_vma = NULL;
665 		goto out;
666 	}
667 
668 	if (!folio_mapped(folio)) {
669 		rcu_read_unlock();
670 		put_anon_vma(anon_vma);
671 		return NULL;
672 	}
673 
674 	/* we pinned the anon_vma, its safe to sleep */
675 	rcu_read_unlock();
676 	anon_vma_lock_read(anon_vma);
677 
678 	if (atomic_dec_and_test(&anon_vma->refcount)) {
679 		/*
680 		 * Oops, we held the last refcount, release the lock
681 		 * and bail -- can't simply use put_anon_vma() because
682 		 * we'll deadlock on the anon_vma_lock_write() recursion.
683 		 */
684 		anon_vma_unlock_read(anon_vma);
685 		__put_anon_vma(anon_vma);
686 		anon_vma = NULL;
687 	}
688 
689 	return anon_vma;
690 
691 out:
692 	rcu_read_unlock();
693 	return anon_vma;
694 }
695 
696 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
697 /*
698  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
699  * important if a PTE was dirty when it was unmapped that it's flushed
700  * before any IO is initiated on the page to prevent lost writes. Similarly,
701  * it must be flushed before freeing to prevent data leakage.
702  */
703 void try_to_unmap_flush(void)
704 {
705 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
706 
707 	if (!tlb_ubc->flush_required)
708 		return;
709 
710 	arch_tlbbatch_flush(&tlb_ubc->arch);
711 	tlb_ubc->flush_required = false;
712 	tlb_ubc->writable = false;
713 }
714 
715 /* Flush iff there are potentially writable TLB entries that can race with IO */
716 void try_to_unmap_flush_dirty(void)
717 {
718 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
719 
720 	if (tlb_ubc->writable)
721 		try_to_unmap_flush();
722 }
723 
/*
 * Layout of mm->tlb_flush_batched:
 *   bits  0-14 record pending generations,
 *   bits 16-30 record flushed generations.
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT	16
/* Low 15 bits: selects the pending count. */
#define TLB_FLUSH_BATCH_PENDING_MASK			\
	((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
/* Half of the pending range; above this the counters get reset. */
#define TLB_FLUSH_BATCH_PENDING_LARGE			\
	(TLB_FLUSH_BATCH_PENDING_MASK >> 1)
733 
734 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
735 		unsigned long start, unsigned long end)
736 {
737 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
738 	int batch;
739 	bool writable = pte_dirty(pteval);
740 
741 	if (!pte_accessible(mm, pteval))
742 		return;
743 
744 	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
745 	tlb_ubc->flush_required = true;
746 
747 	/*
748 	 * Ensure compiler does not re-order the setting of tlb_flush_batched
749 	 * before the PTE is cleared.
750 	 */
751 	barrier();
752 	batch = atomic_read(&mm->tlb_flush_batched);
753 retry:
754 	if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
755 		/*
756 		 * Prevent `pending' from catching up with `flushed' because of
757 		 * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
758 		 * `pending' becomes large.
759 		 */
760 		if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
761 			goto retry;
762 	} else {
763 		atomic_inc(&mm->tlb_flush_batched);
764 	}
765 
766 	/*
767 	 * If the PTE was dirty then it's best to assume it's writable. The
768 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
769 	 * before the page is queued for IO.
770 	 */
771 	if (writable)
772 		tlb_ubc->writable = true;
773 }
774 
775 /*
776  * Returns true if the TLB flush should be deferred to the end of a batch of
777  * unmap operations to reduce IPIs.
778  */
779 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
780 {
781 	if (!(flags & TTU_BATCH_FLUSH))
782 		return false;
783 
784 	return arch_tlbbatch_should_defer(mm);
785 }
786 
787 /*
788  * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
789  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
790  * operation such as mprotect or munmap to race between reclaim unmapping
791  * the page and flushing the page. If this race occurs, it potentially allows
792  * access to data via a stale TLB entry. Tracking all mm's that have TLB
793  * batching in flight would be expensive during reclaim so instead track
794  * whether TLB batching occurred in the past and if so then do a flush here
795  * if required. This will cost one additional flush per reclaim cycle paid
796  * by the first operation at risk such as mprotect and mumap.
797  *
798  * This must be called under the PTL so that an access to tlb_flush_batched
799  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
800  * via the PTL.
801  */
802 void flush_tlb_batched_pending(struct mm_struct *mm)
803 {
804 	int batch = atomic_read(&mm->tlb_flush_batched);
805 	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
806 	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
807 
808 	if (pending != flushed) {
809 		flush_tlb_mm(mm);
810 		/*
811 		 * If the new TLB flushing is pending during flushing, leave
812 		 * mm->tlb_flush_batched as is, to avoid losing flushing.
813 		 */
814 		atomic_cmpxchg(&mm->tlb_flush_batched, batch,
815 			       pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
816 	}
817 }
818 #else
/*
 * Empty stand-in compiled when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is
 * not defined: there is no batch to record anything in.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
		unsigned long start, unsigned long end)
{
}
823 
/*
 * Stand-in compiled when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not
 * defined: TLB flushes are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
828 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
829 
830 /**
831  * page_address_in_vma - The virtual address of a page in this VMA.
832  * @folio: The folio containing the page.
833  * @page: The page within the folio.
834  * @vma: The VMA we need to know the address in.
835  *
836  * Calculates the user virtual address of this page in the specified VMA.
837  * It is the caller's responsibility to check the page is actually
838  * within the VMA.  There may not currently be a PTE pointing at this
839  * page, but if a page fault occurs at this address, this is the page
840  * which will be accessed.
841  *
842  * Context: Caller should hold a reference to the folio.  Caller should
843  * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
844  * VMA from being altered.
845  *
846  * Return: The virtual address corresponding to this page in the VMA.
847  */
848 unsigned long page_address_in_vma(const struct folio *folio,
849 		const struct page *page, const struct vm_area_struct *vma)
850 {
851 	if (folio_test_anon(folio)) {
852 		struct anon_vma *anon_vma = folio_anon_vma(folio);
853 		/*
854 		 * Note: swapoff's unuse_vma() is more efficient with this
855 		 * check, and needs it to match anon_vma when KSM is active.
856 		 */
857 		if (!vma->anon_vma || !anon_vma ||
858 		    vma->anon_vma->root != anon_vma->root)
859 			return -EFAULT;
860 	} else if (!vma->vm_file) {
861 		return -EFAULT;
862 	} else if (vma->vm_file->f_mapping != folio->mapping) {
863 		return -EFAULT;
864 	}
865 
866 	/* KSM folios don't reach here because of the !anon_vma check */
867 	return vma_address(vma, page_pgoff(folio, page), 1);
868 }
869 
870 /*
871  * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
872  * NULL if it doesn't exist.  No guarantees / checks on what the pmd_t*
873  * represents.
874  */
875 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
876 {
877 	pgd_t *pgd;
878 	p4d_t *p4d;
879 	pud_t *pud;
880 	pmd_t *pmd = NULL;
881 
882 	pgd = pgd_offset(mm, address);
883 	if (!pgd_present(*pgd))
884 		goto out;
885 
886 	p4d = p4d_offset(pgd, address);
887 	if (!p4d_present(*p4d))
888 		goto out;
889 
890 	pud = pud_offset(p4d, address);
891 	if (!pud_present(*pud))
892 		goto out;
893 
894 	pmd = pmd_offset(pud, address);
895 out:
896 	return pmd;
897 }
898 
/* State passed to folio_referenced_one() through its void *arg parameter. */
struct folio_referenced_arg {
	/* Decremented by folio_referenced_one() for each mapping it visits. */
	int mapcount;
	/*
	 * Incremented for each VMA in which the folio was found referenced;
	 * set to -1 when the folio is skipped because its mm is going away.
	 */
	int referenced;
	/* vm_flags gathered from referencing VMAs, or VM_LOCKED on mlock. */
	vm_flags_t vm_flags;
	/* Read by invalid_folio_referenced_vma(). */
	struct mem_cgroup *memcg;
};
905 
/*
 * rmap_one callback of folio_referenced(): test and clear the accessed
 * state of every mapping of @folio found in @vma.
 *
 * arg: folio_referenced_arg will be passed
 *
 * Returns false to terminate the rmap walk early, true to continue with the
 * next VMA.
 */
static bool folio_referenced_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	struct folio_referenced_arg *pra = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	int ptes = 0, referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			ptes++;
			pra->mapcount--;

			/* Only mlock fully mapped pages */
			if (pvmw.pte && ptes != pvmw.nr_pages)
				continue;

			/*
			 * All PTEs must be protected by the page table lock
			 * in order to mlock the page.
			 *
			 * If a page table boundary has been crossed, the
			 * current ptl only protects part of the PTEs.
			 */
			if (pvmw.flags & PVMW_PGTABLE_CROSSED)
				continue;

			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma);
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}

		/*
		 * Skip the non-shared swapbacked folio mapped solely by
		 * the exiting or OOM-reaped process. This avoids redundant
		 * swap-out followed by an immediate unmap.
		 *
		 * The -1 stored here is what folio_referenced() ends up
		 * returning when the rmap lock was not contended.
		 */
		if ((!atomic_read(&vma->vm_mm->mm_users) ||
		    check_stable_address_space(vma->vm_mm)) &&
		    folio_test_anon(folio) && folio_test_swapbacked(folio) &&
		    !folio_maybe_mapped_shared(folio)) {
			pra->referenced = -1;
			page_vma_mapped_walk_done(&pvmw);
			return false;
		}

		/* Test and clear the young state at the level the walk found */
		if (lru_gen_enabled() && pvmw.pte) {
			if (lru_gen_look_around(&pvmw))
				referenced++;
		} else if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte))
				referenced++;
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped folio? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		folio_clear_idle(folio);
	if (folio_test_clear_young(folio))
		referenced++;

	/* Count this VMA once, and never report VM_LOCKED from this path */
	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
	}

	if (!pra->mapcount)
		return false; /* To break the loop */

	return true;
}
992 
993 static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
994 {
995 	struct folio_referenced_arg *pra = arg;
996 	struct mem_cgroup *memcg = pra->memcg;
997 
998 	/*
999 	 * Ignore references from this mapping if it has no recency. If the
1000 	 * folio has been used in another mapping, we will catch it; if this
1001 	 * other mapping is already gone, the unmap path will have set the
1002 	 * referenced flag or activated the folio in zap_pte_range().
1003 	 */
1004 	if (!vma_has_recency(vma))
1005 		return true;
1006 
1007 	/*
1008 	 * If we are reclaiming on behalf of a cgroup, skip counting on behalf
1009 	 * of references from different cgroups.
1010 	 */
1011 	if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
1012 		return true;
1013 
1014 	return false;
1015 }
1016 
1017 /**
1018  * folio_referenced() - Test if the folio was referenced.
1019  * @folio: The folio to test.
1020  * @is_locked: Caller holds lock on the folio.
1021  * @memcg: target memory cgroup
1022  * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
1023  *
1024  * Quick test_and_clear_referenced for all mappings of a folio,
1025  *
1026  * Return: The number of mappings which referenced the folio. Return -1 if
1027  * the function bailed out due to rmap lock contention.
1028  */
1029 int folio_referenced(struct folio *folio, int is_locked,
1030 		     struct mem_cgroup *memcg, vm_flags_t *vm_flags)
1031 {
1032 	bool we_locked = false;
1033 	struct folio_referenced_arg pra = {
1034 		.mapcount = folio_mapcount(folio),
1035 		.memcg = memcg,
1036 	};
1037 	struct rmap_walk_control rwc = {
1038 		.rmap_one = folio_referenced_one,
1039 		.arg = (void *)&pra,
1040 		.anon_lock = folio_lock_anon_vma_read,
1041 		.try_lock = true,
1042 		.invalid_vma = invalid_folio_referenced_vma,
1043 	};
1044 
1045 	*vm_flags = 0;
1046 	if (!pra.mapcount)
1047 		return 0;
1048 
1049 	if (!folio_raw_mapping(folio))
1050 		return 0;
1051 
1052 	if (!is_locked) {
1053 		we_locked = folio_trylock(folio);
1054 		if (!we_locked)
1055 			return 1;
1056 	}
1057 
1058 	rmap_walk(folio, &rwc);
1059 	*vm_flags = pra.vm_flags;
1060 
1061 	if (we_locked)
1062 		folio_unlock(folio);
1063 
1064 	return rwc.contended ? -1 : pra.referenced;
1065 }
1066 
/*
 * Clean and write-protect every present PTE/PMD found by walking @pvmw.
 * The walk is bracketed by an MMU notifier invalidation of the range from
 * pvmw->address to vma_address_end(pvmw).
 *
 * Returns the number of entries that were dirty or writable and have been
 * made clean and read-only.
 */
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
	int cleaned = 0;
	struct vm_area_struct *vma = pvmw->vma;
	struct mmu_notifier_range range;
	unsigned long address = pvmw->address;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the folio can not be freed from this function.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
				vma->vm_mm, address, vma_address_end(pvmw));
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(pvmw)) {
		int ret = 0;

		address = pvmw->address;
		if (pvmw->pte) {
			pte_t *pte = pvmw->pte;
			pte_t entry = ptep_get(pte);

			/*
			 * PFN swap PTEs, such as device-exclusive ones, that
			 * actually map pages are clean and not writable from a
			 * CPU perspective. The MMU notifier takes care of any
			 * device aspects.
			 */
			if (!pte_present(entry))
				continue;
			/* Already clean and read-only: nothing to do */
			if (!pte_dirty(entry) && !pte_write(entry))
				continue;

			/* Clear the PTE, then reinstall it clean and read-only */
			flush_cache_page(vma, address, pte_pfn(entry));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			pmd_t *pmd = pvmw->pmd;
			pmd_t entry = pmdp_get(pmd);

			/*
			 * Please see the comment above (!pte_present).
			 * A non present PMD is not writable from a CPU
			 * perspective.
			 */
			if (!pmd_present(entry))
				continue;
			/* Already clean and read-only: nothing to do */
			if (!pmd_dirty(entry) && !pmd_write(entry))
				continue;

			/* Invalidate the PMD, then reinstall it clean and read-only */
			flush_cache_range(vma, address,
					  address + HPAGE_PMD_SIZE);
			entry = pmdp_invalidate(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			ret = 1;
#else
			/* unexpected pmd-mapped folio? */
			WARN_ON_ONCE(1);
#endif
		}

		if (ret)
			cleaned++;
	}

	mmu_notifier_invalidate_range_end(&range);

	return cleaned;
}
1143 
1144 static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
1145 			     unsigned long address, void *arg)
1146 {
1147 	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
1148 	int *cleaned = arg;
1149 
1150 	*cleaned += page_vma_mkclean_one(&pvmw);
1151 
1152 	return true;
1153 }
1154 
1155 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
1156 {
1157 	if (vma->vm_flags & VM_SHARED)
1158 		return false;
1159 
1160 	return true;
1161 }
1162 
1163 int folio_mkclean(struct folio *folio)
1164 {
1165 	int cleaned = 0;
1166 	struct address_space *mapping;
1167 	struct rmap_walk_control rwc = {
1168 		.arg = (void *)&cleaned,
1169 		.rmap_one = page_mkclean_one,
1170 		.invalid_vma = invalid_mkclean_vma,
1171 	};
1172 
1173 	BUG_ON(!folio_test_locked(folio));
1174 
1175 	if (!folio_mapped(folio))
1176 		return 0;
1177 
1178 	mapping = folio_mapping(folio);
1179 	if (!mapping)
1180 		return 0;
1181 
1182 	rmap_walk(folio, &rwc);
1183 
1184 	return cleaned;
1185 }
1186 EXPORT_SYMBOL_GPL(folio_mkclean);
1187 
/*
 * Walk state of mapping_wrprotect_range(), handed to
 * mapping_wrprotect_range_one() through rmap_walk_control.arg.
 */
struct wrprotect_file_state {
	/* Running total of entries cleaned by page_vma_mkclean_one() */
	int cleaned;
	/* Page offset within the mapping at which @pfn is mapped */
	pgoff_t pgoff;
	/* First PFN of the range to write-protect */
	unsigned long pfn;
	/* Number of pages in the range */
	unsigned long nr_pages;
};
1194 
1195 static bool mapping_wrprotect_range_one(struct folio *folio,
1196 		struct vm_area_struct *vma, unsigned long address, void *arg)
1197 {
1198 	struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
1199 	struct page_vma_mapped_walk pvmw = {
1200 		.pfn		= state->pfn,
1201 		.nr_pages	= state->nr_pages,
1202 		.pgoff		= state->pgoff,
1203 		.vma		= vma,
1204 		.address	= address,
1205 		.flags		= PVMW_SYNC,
1206 	};
1207 
1208 	state->cleaned += page_vma_mkclean_one(&pvmw);
1209 
1210 	return true;
1211 }
1212 
/*
 * Forward declaration: mapping_wrprotect_range() below calls this before its
 * definition appears.
 */
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
			     pgoff_t pgoff_start, unsigned long nr_pages,
			     struct rmap_walk_control *rwc, bool locked);
1216 
1217 /**
1218  * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
1219  *
1220  * @mapping:	The mapping whose reverse mapping should be traversed.
1221  * @pgoff:	The page offset at which @pfn is mapped within @mapping.
1222  * @pfn:	The PFN of the page mapped in @mapping at @pgoff.
1223  * @nr_pages:	The number of physically contiguous base pages spanned.
1224  *
1225  * Traverses the reverse mapping, finding all VMAs which contain a shared
1226  * mapping of the pages in the specified range in @mapping, and write-protects
1227  * them (that is, updates the page tables to mark the mappings read-only such
1228  * that a write protection fault arises when the mappings are written to).
1229  *
1230  * The @pfn value need not refer to a folio, but rather can reference a kernel
1231  * allocation which is mapped into userland. We therefore do not require that
1232  * the page maps to a folio with a valid mapping or index field, rather the
1233  * caller specifies these in @mapping and @pgoff.
1234  *
1235  * Return: the number of write-protected PTEs, or an error.
1236  */
1237 int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
1238 		unsigned long pfn, unsigned long nr_pages)
1239 {
1240 	struct wrprotect_file_state state = {
1241 		.cleaned = 0,
1242 		.pgoff = pgoff,
1243 		.pfn = pfn,
1244 		.nr_pages = nr_pages,
1245 	};
1246 	struct rmap_walk_control rwc = {
1247 		.arg = (void *)&state,
1248 		.rmap_one = mapping_wrprotect_range_one,
1249 		.invalid_vma = invalid_mkclean_vma,
1250 	};
1251 
1252 	if (!mapping)
1253 		return 0;
1254 
1255 	__rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
1256 			 /* locked = */false);
1257 
1258 	return state.cleaned;
1259 }
1260 EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
1261 
1262 /**
1263  * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
1264  *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
1265  *                     within the @vma of shared mappings. And since clean PTEs
1266  *                     should also be readonly, write protects them too.
1267  * @pfn: start pfn.
1268  * @nr_pages: number of physically contiguous pages srarting with @pfn.
1269  * @pgoff: page offset that the @pfn mapped with.
1270  * @vma: vma that @pfn mapped within.
1271  *
1272  * Returns the number of cleaned PTEs (including PMDs).
1273  */
1274 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
1275 		      struct vm_area_struct *vma)
1276 {
1277 	struct page_vma_mapped_walk pvmw = {
1278 		.pfn		= pfn,
1279 		.nr_pages	= nr_pages,
1280 		.pgoff		= pgoff,
1281 		.vma		= vma,
1282 		.flags		= PVMW_SYNC,
1283 	};
1284 
1285 	if (invalid_mkclean_vma(vma, NULL))
1286 		return 0;
1287 
1288 	pvmw.address = vma_address(vma, pgoff, nr_pages);
1289 	VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
1290 
1291 	return page_vma_mkclean_one(&pvmw);
1292 }
1293 
1294 static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
1295 {
1296 	int idx;
1297 
1298 	if (nr) {
1299 		idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
1300 		lruvec_stat_mod_folio(folio, idx, nr);
1301 	}
1302 	if (nr_pmdmapped) {
1303 		if (folio_test_anon(folio)) {
1304 			idx = NR_ANON_THPS;
1305 			lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
1306 		} else {
1307 			/* NR_*_PMDMAPPED are not maintained per-memcg */
1308 			idx = folio_test_swapbacked(folio) ?
1309 				NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
1310 			__mod_node_page_state(folio_pgdat(folio), idx,
1311 					      nr_pmdmapped);
1312 		}
1313 	}
1314 }
1315 
/*
 * Account new mappings of @folio, covering @nr_pages pages starting at
 * @page, that are installed in @vma at page table level @level.
 *
 * Updates the mapcount fields that apply to the folio size, the level and
 * the CONFIG_NO_PAGE_MAPCOUNT setting, then passes the resulting deltas
 * (nr and nr_pmdmapped) to __folio_mod_stat().
 */
static __always_inline void __folio_add_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		enum pgtable_level level)
{
	atomic_t *mapped = &folio->_nr_pages_mapped;
	const int orig_nr_pages = nr_pages;
	int first = 0, nr = 0, nr_pmdmapped = 0;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		/* Small folio: only the folio's own _mapcount is involved */
		if (!folio_test_large(folio)) {
			nr = atomic_inc_and_test(&folio->_mapcount);
			break;
		}

		/* Without per-page mapcounts only the large mapcount is used */
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
			if (nr == orig_nr_pages)
				/* Was completely unmapped. */
				nr = folio_large_nr_pages(folio);
			else
				nr = 0;
			break;
		}

		/* Count the pages for which atomic_inc_and_test() succeeded */
		do {
			first += atomic_inc_and_test(&page->_mapcount);
		} while (page++, --nr_pages > 0);

		if (first &&
		    atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
			nr = first;

		folio_add_large_mapcount(folio, orig_nr_pages, vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		first = atomic_inc_and_test(&folio->_entire_mapcount);
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			if (level == PGTABLE_LEVEL_PMD && first)
				nr_pmdmapped = folio_large_nr_pages(folio);
			nr = folio_inc_return_large_mapcount(folio, vma);
			if (nr == 1)
				/* Was completely unmapped. */
				nr = folio_large_nr_pages(folio);
			else
				nr = 0;
			break;
		}

		if (first) {
			nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
			if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
				nr_pages = folio_large_nr_pages(folio);
				/*
				 * We only track PMD mappings of PMD-sized
				 * folios separately.
				 */
				if (level == PGTABLE_LEVEL_PMD)
					nr_pmdmapped = nr_pages;
				/* Exclude the pages already counted in *mapped */
				nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
				/* Raced ahead of a remove and another add? */
				if (unlikely(nr < 0))
					nr = 0;
			} else {
				/* Raced ahead of a remove of ENTIRELY_MAPPED */
				nr = 0;
			}
		}
		folio_inc_large_mapcount(folio, vma);
		break;
	default:
		BUILD_BUG();
	}
	__folio_mod_stat(folio, nr, nr_pmdmapped);
}
1394 
1395 /**
1396  * folio_move_anon_rmap - move a folio to our anon_vma
1397  * @folio:	The folio to move to our anon_vma
1398  * @vma:	The vma the folio belongs to
1399  *
1400  * When a folio belongs exclusively to one process after a COW event,
1401  * that folio can be moved into the anon_vma that belongs to just that
1402  * process, so the rmap code will not search the parent or sibling processes.
1403  */
1404 void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
1405 {
1406 	void *anon_vma = vma->anon_vma;
1407 
1408 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1409 	VM_BUG_ON_VMA(!anon_vma, vma);
1410 
1411 	anon_vma += FOLIO_MAPPING_ANON;
1412 	/*
1413 	 * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
1414 	 * simultaneously, so a concurrent reader (eg folio_referenced()'s
1415 	 * folio_test_anon()) will not see one without the other.
1416 	 */
1417 	WRITE_ONCE(folio->mapping, anon_vma);
1418 }
1419 
1420 /**
1421  * __folio_set_anon - set up a new anonymous rmap for a folio
1422  * @folio:	The folio to set up the new anonymous rmap for.
1423  * @vma:	VM area to add the folio to.
1424  * @address:	User virtual address of the mapping
1425  * @exclusive:	Whether the folio is exclusive to the process.
1426  */
1427 static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
1428 			     unsigned long address, bool exclusive)
1429 {
1430 	struct anon_vma *anon_vma = vma->anon_vma;
1431 
1432 	BUG_ON(!anon_vma);
1433 
1434 	/*
1435 	 * If the folio isn't exclusive to this vma, we must use the _oldest_
1436 	 * possible anon_vma for the folio mapping!
1437 	 */
1438 	if (!exclusive)
1439 		anon_vma = anon_vma->root;
1440 
1441 	/*
1442 	 * page_idle does a lockless/optimistic rmap scan on folio->mapping.
1443 	 * Make sure the compiler doesn't split the stores of anon_vma and
1444 	 * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
1445 	 * could mistake the mapping for a struct address_space and crash.
1446 	 */
1447 	anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
1448 	WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
1449 	folio->index = linear_page_index(vma, address);
1450 }
1451 
/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @folio:	The folio containing @page.
 * @page:	the page to check the mapping of
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
 * Checks that the folio's anon_vma shares its root with the anon_vma of
 * @vma, and that the page offset of @page matches the linear page index of
 * @address in @vma.  Both checks are VM_BUG_ON_*() assertions.
 */
static void __page_check_anon_rmap(const struct folio *folio,
		const struct page *page, struct vm_area_struct *vma,
		unsigned long address)
{
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against folio_add_anon_rmap_*() because the caller
	 * always holds the page locked.
	 *
	 * We have exclusion against folio_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to folio_add_new_anon_rmap.
	 */
	VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
			folio);
	VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
		       page);
}
1479 
/*
 * Common implementation of folio_add_anon_rmap_ptes() and
 * folio_add_anon_rmap_pmd(): account the new mappings via
 * __folio_add_rmap(), sanity check the anon rmap details of non-KSM folios,
 * mark the pages anon-exclusive when RMAP_EXCLUSIVE is set in @flags, and
 * mlock the folio if all of it is being mapped.
 */
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		unsigned long address, rmap_t flags, enum pgtable_level level)
{
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	__folio_add_rmap(folio, page, nr_pages, vma, level);

	if (likely(!folio_test_ksm(folio)))
		__page_check_anon_rmap(folio, page, vma, address);

	if (flags & RMAP_EXCLUSIVE) {
		switch (level) {
		case PGTABLE_LEVEL_PTE:
			/* PTE level: the flag is set on each mapped page */
			for (i = 0; i < nr_pages; i++)
				SetPageAnonExclusive(page + i);
			break;
		case PGTABLE_LEVEL_PMD:
			/* PMD level: the flag is set on @page only */
			SetPageAnonExclusive(page);
			break;
		case PGTABLE_LEVEL_PUD:
			/*
			 * Keep the compiler happy, we don't support anonymous
			 * PUD mappings.
			 */
			WARN_ON_ONCE(1);
			break;
		default:
			BUILD_BUG();
		}
	}

	/* Debug checks: an anon-exclusive page must not be mapped more than once */
	VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
			 atomic_read(&folio->_mapcount) > 0, folio);
	for (i = 0; i < nr_pages; i++) {
		struct page *cur_page = page + i;

		VM_WARN_ON_FOLIO(folio_test_large(folio) &&
				 folio_entire_mapcount(folio) > 1 &&
				 PageAnonExclusive(cur_page), folio);
		/* The per-page mapcount check below does not apply here */
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
			continue;

		/*
		 * While PTE-mapping a THP we have a PMD and a PTE
		 * mapping.
		 */
		VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
				 PageAnonExclusive(cur_page), folio);
	}

	/*
	 * Only mlock it if the folio is fully mapped to the VMA.
	 *
	 * Partially mapped folios can be split on reclaim and part outside
	 * of mlocked VMA can be evicted or freed.
	 */
	if (folio_nr_pages(folio) == nr_pages)
		mlock_vma_folio(folio, vma);
}
1542 
/**
 * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
 * @folio:	The folio to add the mappings to
 * @page:	The first page to add
 * @nr_pages:	The number of pages which will be mapped
 * @vma:	The vm area in which the mappings are added
 * @address:	The user virtual address of the first page to map
 * @flags:	The rmap flags
 *
 * The page range of folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that an anon folio is not being upgraded racily to a KSM folio
 * (but KSM folios are never downgraded).
 */
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma, unsigned long address,
		rmap_t flags)
{
	__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
			      PGTABLE_LEVEL_PTE);
}
1566 
/**
 * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
 * @folio:	The folio to add the mapping to
 * @page:	The first page to add
 * @vma:	The vm area in which the mapping is added
 * @address:	The user virtual address of the first page to map
 * @flags:	The rmap flags
 *
 * The page range of folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this function only emits a one-time
 * warning.
 */
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
			      PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}
1590 
/**
 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
 * @folio:	The folio to add the mapping to.
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @flags:	The rmap flags
 *
 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
 * This means the inc-and-test can be bypassed.
 * The folio doesn't necessarily need to be locked while it's exclusive
 * unless two threads map it concurrently. However, the folio must be
 * locked if it's shared.
 *
 * If the folio is pmd-mappable, it is accounted as a THP.
 */
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
		unsigned long address, rmap_t flags)
{
	const bool exclusive = flags & RMAP_EXCLUSIVE;
	/* Deltas handed to __folio_mod_stat() at the end */
	int nr = 1, nr_pmdmapped = 0;

	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);

	/*
	 * VM_DROPPABLE mappings don't swap; instead they're just dropped when
	 * under memory pressure.
	 */
	if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
		__folio_set_swapbacked(folio);
	__folio_set_anon(folio, vma, address, exclusive);

	if (likely(!folio_test_large(folio))) {
		/* Small folio */
		/* increment count (starts at -1) */
		atomic_set(&folio->_mapcount, 0);
		if (exclusive)
			SetPageAnonExclusive(&folio->page);
	} else if (!folio_test_pmd_mappable(folio)) {
		/* Large folio that is not PMD-mappable: set up every page */
		int i;

		nr = folio_large_nr_pages(folio);
		for (i = 0; i < nr; i++) {
			struct page *page = folio_page(folio, i);

			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				/* increment count (starts at -1) */
				atomic_set(&page->_mapcount, 0);
			if (exclusive)
				SetPageAnonExclusive(page);
		}

		folio_set_large_mapcount(folio, nr, vma);
		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
			atomic_set(&folio->_nr_pages_mapped, nr);
	} else {
		/* PMD-mappable folio: accounted as a single entire mapping */
		nr = folio_large_nr_pages(folio);
		/* increment count (starts at -1) */
		atomic_set(&folio->_entire_mapcount, 0);
		folio_set_large_mapcount(folio, 1, vma);
		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
			atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
		if (exclusive)
			SetPageAnonExclusive(&folio->page);
		nr_pmdmapped = nr;
	}

	/* The whole folio is expected to fit inside @vma */
	VM_WARN_ON_ONCE(address < vma->vm_start ||
			address + (nr << PAGE_SHIFT) > vma->vm_end);

	__folio_mod_stat(folio, nr, nr_pmdmapped);
	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
1663 
1664 static __always_inline void __folio_add_file_rmap(struct folio *folio,
1665 		struct page *page, int nr_pages, struct vm_area_struct *vma,
1666 		enum pgtable_level level)
1667 {
1668 	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
1669 
1670 	__folio_add_rmap(folio, page, nr_pages, vma, level);
1671 
1672 	/*
1673 	 * Only mlock it if the folio is fully mapped to the VMA.
1674 	 *
1675 	 * Partially mapped folios can be split on reclaim and part outside
1676 	 * of mlocked VMA can be evicted or freed.
1677 	 */
1678 	if (folio_nr_pages(folio) == nr_pages)
1679 		mlock_vma_folio(folio, vma);
1680 }
1681 
/**
 * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
 * @folio:	The folio to add the mappings to
 * @page:	The first page to add
 * @nr_pages:	The number of pages that will be mapped using PTEs
 * @vma:	The vm area in which the mappings are added
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 *
 * The folio must not be anonymous.
 */
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma)
{
	__folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
1698 
/**
 * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
 * @folio:	The folio to add the mapping to
 * @page:	The first page to add
 * @vma:	The vm area in which the mapping is added
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this function only emits a one-time
 * warning.
 */
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}
1718 
/**
 * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
 * @folio:	The folio to add the mapping to
 * @page:	The first page to add
 * @vma:	The vm area in which the mapping is added
 *
 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * Unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are set, this function only
 * emits a one-time warning.
 */
void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
#else
	WARN_ON_ONCE(true);
#endif
}
1739 
/*
 * Common implementation of the folio_remove_rmap_*() helpers: drop the
 * mappings of @nr_pages pages of @folio, starting at @page, that were
 * installed in @vma at page table level @level.
 *
 * Updates the mapcount fields that apply to the folio size, the level and
 * the CONFIG_NO_PAGE_MAPCOUNT setting, queues a partially mapped anon large
 * folio for deferred splitting, adjusts the mapping statistics and finally
 * calls munlock_vma_folio().
 */
static __always_inline void __folio_remove_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *vma,
		enum pgtable_level level)
{
	atomic_t *mapped = &folio->_nr_pages_mapped;
	int last = 0, nr = 0, nr_pmdmapped = 0;
	bool partially_mapped = false;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		/* Small folio: only the folio's own _mapcount is involved */
		if (!folio_test_large(folio)) {
			nr = atomic_add_negative(-1, &folio->_mapcount);
			break;
		}

		/* Without per-page mapcounts only the large mapcount is used */
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
			if (!nr) {
				/* Now completely unmapped. */
				nr = folio_large_nr_pages(folio);
			} else {
				partially_mapped = nr < folio_large_nr_pages(folio) &&
						   !folio_entire_mapcount(folio);
				nr = 0;
			}
			break;
		}

		folio_sub_large_mapcount(folio, nr_pages, vma);
		/* Count the pages whose _mapcount went negative */
		do {
			last += atomic_add_negative(-1, &page->_mapcount);
		} while (page++, --nr_pages > 0);

		if (last &&
		    atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
			nr = last;

		partially_mapped = nr && atomic_read(mapped);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
			last = atomic_add_negative(-1, &folio->_entire_mapcount);
			if (level == PGTABLE_LEVEL_PMD && last)
				nr_pmdmapped = folio_large_nr_pages(folio);
			nr = folio_dec_return_large_mapcount(folio, vma);
			if (!nr) {
				/* Now completely unmapped. */
				nr = folio_large_nr_pages(folio);
			} else {
				partially_mapped = last &&
						   nr < folio_large_nr_pages(folio);
				nr = 0;
			}
			break;
		}

		folio_dec_large_mapcount(folio, vma);
		last = atomic_add_negative(-1, &folio->_entire_mapcount);
		if (last) {
			nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
			if (likely(nr < ENTIRELY_MAPPED)) {
				nr_pages = folio_large_nr_pages(folio);
				if (level == PGTABLE_LEVEL_PMD)
					nr_pmdmapped = nr_pages;
				/* Exclude the pages still counted in *mapped */
				nr = nr_pages - nr;
				/* Raced ahead of another remove and an add? */
				if (unlikely(nr < 0))
					nr = 0;
			} else {
				/* An add of ENTIRELY_MAPPED raced ahead */
				nr = 0;
			}
		}

		partially_mapped = nr && nr < nr_pmdmapped;
		break;
	default:
		BUILD_BUG();
	}

	/*
	 * Queue anon large folio for deferred split if at least one page of
	 * the folio is unmapped and at least one page is still mapped.
	 *
	 * Check partially_mapped first to ensure it is a large folio.
	 *
	 * Device private folios do not support deferred splitting and
	 * shrinker based scanning of the folios to free.
	 */
	if (partially_mapped && folio_test_anon(folio) &&
	    !folio_test_partially_mapped(folio) &&
	    !folio_is_device_private(folio))
		deferred_split_folio(folio, true);

	__folio_mod_stat(folio, -nr, -nr_pmdmapped);

	/*
	 * It would be tidy to reset folio_test_anon mapping when fully
	 * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
	 * which increments mapcount after us but sets mapping before us:
	 * so leave the reset to free_pages_prepare, and remember that
	 * it's only reliable while mapped.
	 */

	munlock_vma_folio(folio, vma);
}
1849 
/**
 * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
 * @folio:	The folio to remove the mappings from
 * @page:	The first page to remove
 * @nr_pages:	The number of pages that will be removed from the mapping
 * @vma:	The vm area from which the mappings are removed
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 *
 * Thin wrapper around __folio_remove_rmap() at PGTABLE_LEVEL_PTE.
 */
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
		int nr_pages, struct vm_area_struct *vma)
{
	__folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE);
}
1866 
/**
 * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
 * @folio:	The folio to remove the mapping from
 * @page:	The first page to remove
 * @vma:	The vm area from which the mapping is removed
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this function only emits a one-time
 * warning.
 */
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}
1886 
1887 /**
1888  * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
1889  * @folio:	The folio to remove the mapping from
1890  * @page:	The first page to remove
1891  * @vma:	The vm area from which the mapping is removed
1892  *
1893  * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1894  *
1895  * The caller needs to hold the page table lock.
1896  */
1897 void folio_remove_rmap_pud(struct folio *folio, struct page *page,
1898 		struct vm_area_struct *vma)
1899 {
1900 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1901 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1902 	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
1903 #else
1904 	WARN_ON_ONCE(true);
1905 #endif
1906 }
1907 
1908 static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
1909 			struct page_vma_mapped_walk *pvmw,
1910 			enum ttu_flags flags, pte_t pte)
1911 {
1912 	unsigned long end_addr, addr = pvmw->address;
1913 	struct vm_area_struct *vma = pvmw->vma;
1914 	unsigned int max_nr;
1915 
1916 	if (flags & TTU_HWPOISON)
1917 		return 1;
1918 	if (!folio_test_large(folio))
1919 		return 1;
1920 
1921 	/* We may only batch within a single VMA and a single page table. */
1922 	end_addr = pmd_addr_end(addr, vma->vm_end);
1923 	max_nr = (end_addr - addr) >> PAGE_SHIFT;
1924 
1925 	/* We only support lazyfree batching for now ... */
1926 	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
1927 		return 1;
1928 	if (pte_unused(pte))
1929 		return 1;
1930 
1931 	return folio_pte_batch(folio, pvmw->pte, pte, max_nr);
1932 }
1933 
/*
 * try_to_unmap_one - rmap_one callback used by try_to_unmap(): remove the
 * mappings of @folio found in the page tables of a single VMA.
 * @folio: the folio being unmapped
 * @vma: the VMA whose mappings of @folio are removed
 * @address: the address in @vma the page table walk is set up with
 * @arg: enum ttu_flags will be passed to this argument
 *
 * Returns false if the folio is in a VM_LOCKED vma (and TTU_IGNORE_MLOCK was
 * not given) or if the walk had to be aborted; returns true otherwise.
 */
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	bool anon_exclusive, ret = true;
	pte_t pteval;
	struct page *subpage;
	struct mmu_notifier_range range;
	enum ttu_flags flags = (enum ttu_flags)(long)arg;
	/* nr_pages only becomes > 1 when folio_unmap_pte_batch() batches PTEs */
	unsigned long nr_pages = 1, end_addr;
	unsigned long pfn;
	unsigned long hsz = 0;
	/* number of walk steps seen so far in the mlock path below */
	int ptes = 0;

	/*
	 * When racing against e.g. zap_pte_range() on another cpu,
	 * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
	 * try_to_unmap() may return before page_mapped() has become false,
	 * if page table locking is skipped: use TTU_SYNC to wait for that.
	 */
	if (flags & TTU_SYNC)
		pvmw.flags = PVMW_SYNC;

	/*
	 * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
	 * For hugetlb, it could be much worse if we need to do pud
	 * invalidation in the case of pmd sharing.
	 *
	 * Note that the folio cannot be freed in this function as the caller
	 * of try_to_unmap() must hold a reference on the folio.
	 */
	range.end = vma_address_end(&pvmw);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address, range.end);
	if (folio_test_hugetlb(folio)) {
		/*
		 * If sharing is possible, start and end will be adjusted
		 * accordingly.
		 */
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);

		/* We need the huge page size for set_huge_pte_at() */
		hsz = huge_page_size(hstate_vma(vma));
	}
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(&pvmw)) {
		/*
		 * If the folio is in an mlock()d vma, we must not swap it out.
		 */
		if (!(flags & TTU_IGNORE_MLOCK) &&
		    (vma->vm_flags & VM_LOCKED)) {
			ptes++;

			/*
			 * Set 'ret' to indicate the page cannot be unmapped.
			 *
			 * Do not jump to walk_abort immediately as additional
			 * iterations might be required to detect a fully
			 * mapped folio and mlock it.
			 */
			ret = false;

			/* Only mlock fully mapped pages */
			if (pvmw.pte && ptes != pvmw.nr_pages)
				continue;

			/*
			 * All PTEs must be protected by the page table lock in
			 * order to mlock the page.
			 *
			 * If a page table boundary has been crossed, the
			 * current ptl only protects part of the PTEs.
			 */
			if (pvmw.flags & PVMW_PGTABLE_CROSSED)
				goto walk_done;

			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma);
			goto walk_done;
		}

		/* No PTE: the walk stopped at a PMD-level mapping. */
		if (!pvmw.pte) {
			if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
				if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
					goto walk_done;
				/*
				 * unmap_huge_pmd_locked has either already marked
				 * the folio as swap-backed or decided to retain it
				 * due to GUP or speculative references.
				 */
				goto walk_abort;
			}

			if (flags & TTU_SPLIT_HUGE_PMD) {
				/*
				 * We temporarily have to drop the PTL and
				 * restart so we can process the PTE-mapped THP.
				 */
				split_huge_pmd_locked(vma, pvmw.address,
						      pvmw.pmd, false);
				/* Split at most once per call. */
				flags &= ~TTU_SPLIT_HUGE_PMD;
				page_vma_mapped_walk_restart(&pvmw);
				continue;
			}
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_FOLIO(!pvmw.pte, folio);

		/*
		 * Handle PFN swap PTEs, such as device-exclusive ones, that
		 * actually map pages.
		 */
		pteval = ptep_get(pvmw.pte);
		if (likely(pte_present(pteval))) {
			pfn = pte_pfn(pteval);
		} else {
			const softleaf_t entry = softleaf_from_pte(pteval);

			pfn = softleaf_to_pfn(entry);
			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
		}

		/* The page of the folio that this entry refers to. */
		subpage = folio_page(folio, pfn - folio_pfn(folio));
		address = pvmw.address;
		anon_exclusive = folio_test_anon(folio) &&
				 PageAnonExclusive(subpage);

		if (folio_test_hugetlb(folio)) {
			bool anon = folio_test_anon(folio);

			/*
			 * The try_to_unmap() is only passed a hugetlb page
			 * in the case where the hugetlb page is poisoned.
			 */
			VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
			/*
			 * huge_pmd_unshare may unmap an entire PMD page.
			 * There is no way of knowing exactly which PMDs may
			 * be cached for this mm, so we must flush them all.
			 * start/end were already adjusted above to cover this
			 * range.
			 */
			flush_cache_range(vma, range.start, range.end);

			/*
			 * To call huge_pmd_unshare, i_mmap_rwsem must be
			 * held in write mode.  Caller needs to explicitly
			 * do this outside rmap routines.
			 *
			 * We also must hold hugetlb vma_lock in write mode.
			 * Lock order dictates acquiring vma_lock BEFORE
			 * i_mmap_rwsem.  We can only try lock here and fail
			 * if unsuccessful.
			 */
			if (!anon) {
				struct mmu_gather tlb;

				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
				if (!hugetlb_vma_trylock_write(vma))
					goto walk_abort;

				tlb_gather_mmu_vma(&tlb, vma);
				if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
					hugetlb_vma_unlock_write(vma);
					huge_pmd_unshare_flush(&tlb, vma);
					tlb_finish_mmu(&tlb);
					/*
					 * The PMD table was unmapped,
					 * consequently unmapping the folio.
					 */
					goto walk_done;
				}
				hugetlb_vma_unlock_write(vma);
				tlb_finish_mmu(&tlb);
			}
			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
		} else if (likely(pte_present(pteval))) {
			/* May cover several consecutive PTEs of the folio. */
			nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
			end_addr = address + nr_pages * PAGE_SIZE;
			flush_cache_range(vma, address, end_addr);

			/* Nuke the page table entry. */
			pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
			/*
			 * We clear the PTE but do not flush so potentially
			 * a remote CPU could still be writing to the folio.
			 * If the entry was previously clean then the
			 * architecture must guarantee that a clear->dirty
			 * transition on a cached TLB entry is written through
			 * and traps if the PTE is unmapped.
			 */
			if (should_defer_flush(mm, flags))
				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
			else
				flush_tlb_range(vma, address, end_addr);
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
		} else {
			/* Non-present (PFN swap) PTE: just clear it. */
			pte_clear(mm, address, pvmw.pte);
		}

		/*
		 * Now the pte is cleared. If this pte was uffd-wp armed,
		 * we may want to replace a none pte with a marker pte if
		 * it's file-backed, so we don't lose the tracking info.
		 */
		pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
			/* Replace the mapping with a hwpoison entry. */
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (folio_test_hugetlb(folio)) {
				hugetlb_count_sub(folio_nr_pages(folio), mm);
				set_huge_pte_at(mm, address, pvmw.pte, pteval,
						hsz);
			} else {
				dec_mm_counter(mm, mm_counter(folio));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}
		} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
			   !userfaultfd_armed(vma)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 * A future reference will then fault in a new zero
			 * page. When userfaultfd is active, we must not drop
			 * this page though, as its main user (postcopy
			 * migration) will not expect userfaults on already
			 * copied pages.
			 */
			dec_mm_counter(mm, mm_counter(folio));
		} else if (folio_test_anon(folio)) {
			swp_entry_t entry = page_swap_entry(subpage);
			pte_t swp_pte;
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (unlikely(folio_test_swapbacked(folio) !=
					folio_test_swapcache(folio))) {
				WARN_ON_ONCE(1);
				goto walk_abort;
			}

			/* MADV_FREE page check */
			if (!folio_test_swapbacked(folio)) {
				int ref_count, map_count;

				/*
				 * Synchronize with gup_pte_range():
				 * - clear PTE; barrier; read refcount
				 * - inc refcount; barrier; read PTE
				 */
				smp_mb();

				ref_count = folio_ref_count(folio);
				map_count = folio_mapcount(folio);

				/*
				 * Order reads for page refcount and dirty flag
				 * (see comments in __remove_mapping()).
				 */
				smp_rmb();

				if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
					/*
					 * redirtied either using the page table or a previously
					 * obtained GUP reference.
					 */
					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
					folio_set_swapbacked(folio);
					goto walk_abort;
				} else if (ref_count != 1 + map_count) {
					/*
					 * Additional reference. Could be a GUP reference or any
					 * speculative reference. GUP users must mark the folio
					 * dirty if there was a modification. This folio cannot be
					 * reclaimed right now either way, so act just like nothing
					 * happened.
					 * We'll come back here later and detect if the folio was
					 * dirtied when the additional reference is gone.
					 */
					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
					goto walk_abort;
				}
				/* Clean lazyfree folio: drop it without a swap entry. */
				add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
				goto discard;
			}

			/* On any failure below, put the original PTE back. */
			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				goto walk_abort;
			}

			/*
			 * arch_unmap_one() is expected to be a NOP on
			 * architectures where we could have PFN swap PTEs,
			 * so we'll not check/care.
			 */
			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				swap_free(entry);
				set_pte_at(mm, address, pvmw.pte, pteval);
				goto walk_abort;
			}

			/* See folio_try_share_anon_rmap(): clear PTE first. */
			if (anon_exclusive &&
			    folio_try_share_anon_rmap_pte(folio, subpage)) {
				swap_free(entry);
				set_pte_at(mm, address, pvmw.pte, pteval);
				goto walk_abort;
			}
			/* Unlocked check first; recheck under mmlist_lock. */
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (anon_exclusive)
				swp_pte = pte_swp_mkexclusive(swp_pte);
			/* Carry soft-dirty and uffd-wp over to the swap PTE. */
			if (likely(pte_present(pteval))) {
				if (pte_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				if (pte_swp_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			set_pte_at(mm, address, pvmw.pte, swp_pte);
		} else {
			/*
			 * This is a locked file-backed folio,
			 * so it cannot be removed from the page
			 * cache and replaced by a new folio before
			 * mmu_notifier_invalidate_range_end, so no
			 * concurrent thread might update its page table
			 * to point at a new folio while a device is
			 * still using this folio.
			 *
			 * See Documentation/mm/mmu_notifier.rst
			 */
			dec_mm_counter(mm, mm_counter_file(folio));
		}
discard:
		/* Remove the rmap for what was unmapped and drop nr_pages refs. */
		if (unlikely(folio_test_hugetlb(folio))) {
			hugetlb_remove_rmap(folio);
		} else {
			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
		}
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put_refs(folio, nr_pages);

		/*
		 * If we are sure that we batched the entire folio and cleared
		 * all PTEs, we can just optimize and stop right here.
		 */
		if (nr_pages == folio_nr_pages(folio))
			goto walk_done;
		continue;
walk_abort:
		ret = false;
walk_done:
		page_vma_mapped_walk_done(&pvmw);
		break;
	}

	mmu_notifier_invalidate_range_end(&range);

	return ret;
}
2323 
/*
 * invalid_vma callback installed by try_to_migrate() for anonymous, non-KSM
 * folios: reports whether @vma is a temporary stack VMA. @arg is unused.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return vma_is_temporary_stack(vma);
}
2328 
/*
 * "done" callback for the rmap walks set up by try_to_unmap() and
 * try_to_migrate(): returns 1 once @folio is no longer mapped, 0 otherwise.
 */
static int folio_not_mapped(struct folio *folio)
{
	return folio_mapped(folio) ? 0 : 1;
}
2333 
2334 /**
2335  * try_to_unmap - Try to remove all page table mappings to a folio.
2336  * @folio: The folio to unmap.
2337  * @flags: action and flags
2338  *
2339  * Tries to remove all the page table entries which are mapping this
2340  * folio.  It is the caller's responsibility to check if the folio is
2341  * still mapped if needed (use TTU_SYNC to prevent accounting races).
2342  *
2343  * Context: Caller must hold the folio lock.
2344  */
2345 void try_to_unmap(struct folio *folio, enum ttu_flags flags)
2346 {
2347 	struct rmap_walk_control rwc = {
2348 		.rmap_one = try_to_unmap_one,
2349 		.arg = (void *)flags,
2350 		.done = folio_not_mapped,
2351 		.anon_lock = folio_lock_anon_vma_read,
2352 	};
2353 
2354 	if (flags & TTU_RMAP_LOCKED)
2355 		rmap_walk_locked(folio, &rwc);
2356 	else
2357 		rmap_walk(folio, &rwc);
2358 }
2359 
/*
 * try_to_migrate_one - rmap_one callback used by try_to_migrate(): replace
 * the mappings of @folio found in the page tables of a single VMA.
 * @folio: the folio whose mappings are replaced
 * @vma: the VMA whose mappings of @folio are processed
 * @address: the address in @vma the page table walk is set up with
 * @arg: enum ttu_flags will be passed to this argument.
 *
 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
 * containing migration entries.
 *
 * Returns false if the walk had to be stopped early because a mapping could
 * not be converted (or a PMD was split); returns true otherwise.
 */
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	bool anon_exclusive, writable, ret = true;
	pte_t pteval;
	struct page *subpage;
	struct mmu_notifier_range range;
	enum ttu_flags flags = (enum ttu_flags)(long)arg;
	unsigned long pfn;
	unsigned long hsz = 0;

	/*
	 * When racing against e.g. zap_pte_range() on another cpu,
	 * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
	 * try_to_migrate() may return before page_mapped() has become false,
	 * if page table locking is skipped: use TTU_SYNC to wait for that.
	 */
	if (flags & TTU_SYNC)
		pvmw.flags = PVMW_SYNC;

	/*
	 * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
	 * For hugetlb, it could be much worse if we need to do pud
	 * invalidation in the case of pmd sharing.
	 *
	 * Note that the folio cannot be freed in this function as the caller
	 * of try_to_migrate() must hold a reference on the folio.
	 */
	range.end = vma_address_end(&pvmw);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address, range.end);
	if (folio_test_hugetlb(folio)) {
		/*
		 * If sharing is possible, start and end will be adjusted
		 * accordingly.
		 */
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);

		/* We need the huge page size for set_huge_pte_at() */
		hsz = huge_page_size(hstate_vma(vma));
	}
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(&pvmw)) {
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte) {
			/* NOTE: this 'pfn' shadows the function-scope 'pfn'. */
			__maybe_unused unsigned long pfn;
			__maybe_unused pmd_t pmdval;

			if (flags & TTU_SPLIT_HUGE_PMD) {
				split_huge_pmd_locked(vma, pvmw.address,
						      pvmw.pmd, true);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
			pmdval = pmdp_get(pvmw.pmd);
			if (likely(pmd_present(pmdval)))
				pfn = pmd_pfn(pmdval);
			else
				pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));

			subpage = folio_page(folio, pfn - folio_pfn(folio));

			VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
					!folio_test_pmd_mappable(folio), folio);

			if (set_pmd_migration_entry(&pvmw, subpage)) {
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			continue;
#endif
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_FOLIO(!pvmw.pte, folio);

		/*
		 * Handle PFN swap PTEs, such as device-exclusive ones, that
		 * actually map pages.
		 */
		pteval = ptep_get(pvmw.pte);
		if (likely(pte_present(pteval))) {
			pfn = pte_pfn(pteval);
		} else {
			const softleaf_t entry = softleaf_from_pte(pteval);

			pfn = softleaf_to_pfn(entry);
			VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
		}

		/* The page of the folio that this entry refers to. */
		subpage = folio_page(folio, pfn - folio_pfn(folio));
		address = pvmw.address;
		anon_exclusive = folio_test_anon(folio) &&
				 PageAnonExclusive(subpage);

		if (folio_test_hugetlb(folio)) {
			bool anon = folio_test_anon(folio);

			/*
			 * huge_pmd_unshare may unmap an entire PMD page.
			 * There is no way of knowing exactly which PMDs may
			 * be cached for this mm, so we must flush them all.
			 * start/end were already adjusted above to cover this
			 * range.
			 */
			flush_cache_range(vma, range.start, range.end);

			/*
			 * To call huge_pmd_unshare, i_mmap_rwsem must be
			 * held in write mode.  Caller needs to explicitly
			 * do this outside rmap routines.
			 *
			 * We also must hold hugetlb vma_lock in write mode.
			 * Lock order dictates acquiring vma_lock BEFORE
			 * i_mmap_rwsem.  We can only try lock here and
			 * fail if unsuccessful.
			 */
			if (!anon) {
				struct mmu_gather tlb;

				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
				if (!hugetlb_vma_trylock_write(vma)) {
					page_vma_mapped_walk_done(&pvmw);
					ret = false;
					break;
				}

				tlb_gather_mmu_vma(&tlb, vma);
				if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
					hugetlb_vma_unlock_write(vma);
					huge_pmd_unshare_flush(&tlb, vma);
					tlb_finish_mmu(&tlb);
					/*
					 * The PMD table was unmapped,
					 * consequently unmapping the folio.
					 */
					page_vma_mapped_walk_done(&pvmw);
					break;
				}
				hugetlb_vma_unlock_write(vma);
				tlb_finish_mmu(&tlb);
			}
			/* Nuke the hugetlb page table entry */
			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
			writable = pte_write(pteval);
		} else if (likely(pte_present(pteval))) {
			flush_cache_page(vma, address, pfn);
			/* Nuke the page table entry. */
			if (should_defer_flush(mm, flags)) {
				/*
				 * We clear the PTE but do not flush so potentially
				 * a remote CPU could still be writing to the folio.
				 * If the entry was previously clean then the
				 * architecture must guarantee that a clear->dirty
				 * transition on a cached TLB entry is written through
				 * and traps if the PTE is unmapped.
				 */
				pteval = ptep_get_and_clear(mm, address, pvmw.pte);

				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
			} else {
				pteval = ptep_clear_flush(vma, address, pvmw.pte);
			}
			if (pte_dirty(pteval))
				folio_mark_dirty(folio);
			writable = pte_write(pteval);
		} else {
			/* Non-present (PFN swap) PTE: just clear it. */
			const softleaf_t entry = softleaf_from_pte(pteval);

			pte_clear(mm, address, pvmw.pte);

			writable = softleaf_is_device_private_write(entry);
		}

		/* A writable mapping of an anon page is expected to be exclusive. */
		VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
				!anon_exclusive, folio);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(subpage)) {
			VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);

			/* Replace the mapping with a hwpoison entry. */
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (folio_test_hugetlb(folio)) {
				hugetlb_count_sub(folio_nr_pages(folio), mm);
				set_huge_pte_at(mm, address, pvmw.pte, pteval,
						hsz);
			} else {
				dec_mm_counter(mm, mm_counter(folio));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}
		} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
			   !userfaultfd_armed(vma)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 * A future reference will then fault in a new zero
			 * page. When userfaultfd is active, we must not drop
			 * this page though, as its main user (postcopy
			 * migration) will not expect userfaults on already
			 * copied pages.
			 */
			dec_mm_counter(mm, mm_counter(folio));
		} else {
			swp_entry_t entry;
			pte_t swp_pte;

			/*
			 * arch_unmap_one() is expected to be a NOP on
			 * architectures where we could have PFN swap PTEs,
			 * so we'll not check/care.
			 */
			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				/* Put the original entry back and give up. */
				if (folio_test_hugetlb(folio))
					set_huge_pte_at(mm, address, pvmw.pte,
							pteval, hsz);
				else
					set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
			if (folio_test_hugetlb(folio)) {
				if (anon_exclusive &&
				    hugetlb_try_share_anon_rmap(folio)) {
					set_huge_pte_at(mm, address, pvmw.pte,
							pteval, hsz);
					ret = false;
					page_vma_mapped_walk_done(&pvmw);
					break;
				}
			} else if (anon_exclusive &&
				   folio_try_share_anon_rmap_pte(folio, subpage)) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			if (writable)
				entry = make_writable_migration_entry(
							page_to_pfn(subpage));
			else if (anon_exclusive)
				entry = make_readable_exclusive_migration_entry(
							page_to_pfn(subpage));
			else
				entry = make_readable_migration_entry(
							page_to_pfn(subpage));
			/* Carry young/dirty/soft-dirty/uffd-wp over to the new PTE. */
			if (likely(pte_present(pteval))) {
				if (pte_young(pteval))
					entry = make_migration_entry_young(entry);
				if (pte_dirty(pteval))
					entry = make_migration_entry_dirty(entry);
				swp_pte = swp_entry_to_pte(entry);
				if (pte_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			} else {
				swp_pte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(pteval))
					swp_pte = pte_swp_mksoft_dirty(swp_pte);
				if (pte_swp_uffd_wp(pteval))
					swp_pte = pte_swp_mkuffd_wp(swp_pte);
			}
			if (folio_test_hugetlb(folio))
				set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
						hsz);
			else
				set_pte_at(mm, address, pvmw.pte, swp_pte);
			trace_set_migration_pte(address, pte_val(swp_pte),
						folio_order(folio));
			/*
			 * No need to invalidate here: it will synchronize
			 * against the special swap migration pte.
			 */
		}

		/* Remove the rmap for this mapping and drop one folio reference. */
		if (unlikely(folio_test_hugetlb(folio)))
			hugetlb_remove_rmap(folio);
		else
			folio_remove_rmap_pte(folio, subpage, vma);
		if (vma->vm_flags & VM_LOCKED)
			mlock_drain_local();
		folio_put(folio);
	}

	mmu_notifier_invalidate_range_end(&range);

	return ret;
}
2675 
2676 /**
2677  * try_to_migrate - try to replace all page table mappings with swap entries
2678  * @folio: the folio to replace page table entries for
2679  * @flags: action and flags
2680  *
2681  * Tries to remove all the page table entries which are mapping this folio and
2682  * replace them with special swap entries. Caller must hold the folio lock.
2683  */
2684 void try_to_migrate(struct folio *folio, enum ttu_flags flags)
2685 {
2686 	struct rmap_walk_control rwc = {
2687 		.rmap_one = try_to_migrate_one,
2688 		.arg = (void *)flags,
2689 		.done = folio_not_mapped,
2690 		.anon_lock = folio_lock_anon_vma_read,
2691 	};
2692 
2693 	/*
2694 	 * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
2695 	 * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
2696 	 */
2697 	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
2698 					TTU_SYNC | TTU_BATCH_FLUSH)))
2699 		return;
2700 
2701 	if (folio_is_zone_device(folio) &&
2702 	    (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
2703 		return;
2704 
2705 	/*
2706 	 * During exec, a temporary VMA is setup and later moved.
2707 	 * The VMA is moved under the anon_vma lock but not the
2708 	 * page tables leading to a race where migration cannot
2709 	 * find the migration ptes. Rather than increasing the
2710 	 * locking requirements of exec(), migration skips
2711 	 * temporary VMAs until after exec() completes.
2712 	 */
2713 	if (!folio_test_ksm(folio) && folio_test_anon(folio))
2714 		rwc.invalid_vma = invalid_migration_vma;
2715 
2716 	if (flags & TTU_RMAP_LOCKED)
2717 		rmap_walk_locked(folio, &rwc);
2718 	else
2719 		rmap_walk(folio, &rwc);
2720 }
2721 
2722 #ifdef CONFIG_DEVICE_PRIVATE
2723 /**
2724  * make_device_exclusive() - Mark a page for exclusive use by a device
2725  * @mm: mm_struct of associated target process
2726  * @addr: the virtual address to mark for exclusive device access
2727  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
2728  * @foliop: folio pointer will be stored here on success.
2729  *
2730  * This function looks up the page mapped at the given address, grabs a
2731  * folio reference, locks the folio and replaces the PTE with special
2732  * device-exclusive PFN swap entry, preventing access through the process
2733  * page tables. The function will return with the folio locked and referenced.
2734  *
2735  * On fault, the device-exclusive entries are replaced with the original PTE
2736  * under folio lock, after calling MMU notifiers.
2737  *
2738  * Only anonymous non-hugetlb folios are supported and the VMA must have
2739  * write permissions such that we can fault in the anonymous page writable
2740  * in order to mark it exclusive. The caller must hold the mmap_lock in read
2741  * mode.
2742  *
2743  * A driver using this to program access from a device must use a mmu notifier
2744  * critical section to hold a device specific lock during programming. Once
2745  * programming is complete it should drop the folio lock and reference after
2746  * which point CPU access to the page will revoke the exclusive access.
2747  *
2748  * Notes:
2749  *   #. This function always operates on individual PTEs mapping individual
2750  *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
2751  *      the conversion happens on a single PTE corresponding to @addr.
2752  *   #. While concurrent access through the process page tables is prevented,
2753  *      concurrent access through other page references (e.g., earlier GUP
2754  *      invocation) is not handled and not supported.
2755  *   #. device-exclusive entries are considered "clean" and "old" by core-mm.
2756  *      Device drivers must update the folio state when informed by MMU
2757  *      notifiers.
2758  *
2759  * Returns: pointer to mapped page on success, otherwise a negative error.
2760  */
2761 struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
2762 		void *owner, struct folio **foliop)
2763 {
2764 	struct mmu_notifier_range range;
2765 	struct folio *folio, *fw_folio;
2766 	struct vm_area_struct *vma;
2767 	struct folio_walk fw;
2768 	struct page *page;
2769 	swp_entry_t entry;
2770 	pte_t swp_pte;
2771 	int ret;
2772 
2773 	mmap_assert_locked(mm);
2774 	addr = PAGE_ALIGN_DOWN(addr);
2775 
2776 	/*
2777 	 * Fault in the page writable and try to lock it; note that if the
2778 	 * address would already be marked for exclusive use by a device,
2779 	 * the GUP call would undo that first by triggering a fault.
2780 	 *
2781 	 * If any other device would already map this page exclusively, the
2782 	 * fault will trigger a conversion to an ordinary
2783 	 * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
2784 	 */
2785 retry:
2786 	page = get_user_page_vma_remote(mm, addr,
2787 					FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
2788 					&vma);
2789 	if (IS_ERR(page))
2790 		return page;
2791 	folio = page_folio(page);
2792 
2793 	if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
2794 		folio_put(folio);
2795 		return ERR_PTR(-EOPNOTSUPP);
2796 	}
2797 
2798 	ret = folio_lock_killable(folio);
2799 	if (ret) {
2800 		folio_put(folio);
2801 		return ERR_PTR(ret);
2802 	}
2803 
2804 	/*
2805 	 * Inform secondary MMUs that we are going to convert this PTE to
2806 	 * device-exclusive, such that they unmap it now. Note that the
2807 	 * caller must filter this event out to prevent livelocks.
2808 	 */
2809 	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
2810 				      mm, addr, addr + PAGE_SIZE, owner);
2811 	mmu_notifier_invalidate_range_start(&range);
2812 
2813 	/*
2814 	 * Let's do a second walk and make sure we still find the same page
2815 	 * mapped writable. Note that any page of an anonymous folio can
2816 	 * only be mapped writable using exactly one PTE ("exclusive"), so
2817 	 * there cannot be other mappings.
2818 	 */
2819 	fw_folio = folio_walk_start(&fw, vma, addr, 0);
2820 	if (fw_folio != folio || fw.page != page ||
2821 	    fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
2822 		if (fw_folio)
2823 			folio_walk_end(&fw, vma);
2824 		mmu_notifier_invalidate_range_end(&range);
2825 		folio_unlock(folio);
2826 		folio_put(folio);
2827 		goto retry;
2828 	}
2829 
2830 	/* Nuke the page table entry so we get the uptodate dirty bit. */
2831 	flush_cache_page(vma, addr, page_to_pfn(page));
2832 	fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
2833 
2834 	/* Set the dirty flag on the folio now the PTE is gone. */
2835 	if (pte_dirty(fw.pte))
2836 		folio_mark_dirty(folio);
2837 
2838 	/*
2839 	 * Store the pfn of the page in a special device-exclusive PFN swap PTE.
2840 	 * do_swap_page() will trigger the conversion back while holding the
2841 	 * folio lock.
2842 	 */
2843 	entry = make_device_exclusive_entry(page_to_pfn(page));
2844 	swp_pte = swp_entry_to_pte(entry);
2845 	if (pte_soft_dirty(fw.pte))
2846 		swp_pte = pte_swp_mksoft_dirty(swp_pte);
2847 	/* The pte is writable, uffd-wp does not apply. */
2848 	set_pte_at(mm, addr, fw.ptep, swp_pte);
2849 
2850 	folio_walk_end(&fw, vma);
2851 	mmu_notifier_invalidate_range_end(&range);
2852 	*foliop = folio;
2853 	return page;
2854 }
2855 EXPORT_SYMBOL_GPL(make_device_exclusive);
2856 #endif
2857 
2858 void __put_anon_vma(struct anon_vma *anon_vma)
2859 {
2860 	struct anon_vma *root = anon_vma->root;
2861 
2862 	anon_vma_free(anon_vma);
2863 	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
2864 		anon_vma_free(root);
2865 }
2866 
2867 static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
2868 					    struct rmap_walk_control *rwc)
2869 {
2870 	struct anon_vma *anon_vma;
2871 
2872 	if (rwc->anon_lock)
2873 		return rwc->anon_lock(folio, rwc);
2874 
2875 	/*
2876 	 * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
2877 	 * because that depends on page_mapped(); but not all its usages
2878 	 * are holding mmap_lock. Users without mmap_lock are required to
2879 	 * take a reference count to prevent the anon_vma disappearing
2880 	 */
2881 	anon_vma = folio_anon_vma(folio);
2882 	if (!anon_vma)
2883 		return NULL;
2884 
2885 	if (anon_vma_trylock_read(anon_vma))
2886 		goto out;
2887 
2888 	if (rwc->try_lock) {
2889 		anon_vma = NULL;
2890 		rwc->contended = true;
2891 		goto out;
2892 	}
2893 
2894 	anon_vma_lock_read(anon_vma);
2895 out:
2896 	return anon_vma;
2897 }
2898 
2899 /*
2900  * rmap_walk_anon - do something to anonymous page using the object-based
2901  * rmap method
2902  * @folio: the folio to be handled
2903  * @rwc: control variable according to each walk type
2904  * @locked: caller holds relevant rmap lock
2905  *
2906  * Find all the mappings of a folio using the mapping pointer and the vma
2907  * chains contained in the anon_vma struct it points to.
2908  */
2909 static void rmap_walk_anon(struct folio *folio,
2910 		struct rmap_walk_control *rwc, bool locked)
2911 {
2912 	struct anon_vma *anon_vma;
2913 	pgoff_t pgoff_start, pgoff_end;
2914 	struct anon_vma_chain *avc;
2915 
2916 	/*
2917 	 * The folio lock ensures that folio->mapping can't be changed under us
2918 	 * to an anon_vma with different root.
2919 	 */
2920 	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
2921 
2922 	if (locked) {
2923 		anon_vma = folio_anon_vma(folio);
2924 		/* anon_vma disappear under us? */
2925 		VM_BUG_ON_FOLIO(!anon_vma, folio);
2926 	} else {
2927 		anon_vma = rmap_walk_anon_lock(folio, rwc);
2928 	}
2929 	if (!anon_vma)
2930 		return;
2931 
2932 	pgoff_start = folio_pgoff(folio);
2933 	pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
2934 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2935 			pgoff_start, pgoff_end) {
2936 		struct vm_area_struct *vma = avc->vma;
2937 		unsigned long address = vma_address(vma, pgoff_start,
2938 				folio_nr_pages(folio));
2939 
2940 		VM_BUG_ON_VMA(address == -EFAULT, vma);
2941 		cond_resched();
2942 
2943 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2944 			continue;
2945 
2946 		if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2947 			break;
2948 		if (rwc->done && rwc->done(folio))
2949 			break;
2950 	}
2951 
2952 	if (!locked)
2953 		anon_vma_unlock_read(anon_vma);
2954 }
2955 
2956 /**
2957  * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
2958  * of a page mapped within a specified page cache object at a specified offset.
2959  *
2960  * @folio: 		Either the folio whose mappings to traverse, or if NULL,
2961  * 			the callbacks specified in @rwc will be configured such
2962  * 			as to be able to look up mappings correctly.
2963  * @mapping: 		The page cache object whose mapping VMAs we intend to
2964  * 			traverse. If @folio is non-NULL, this should be equal to
2965  *			folio_mapping(folio).
2966  * @pgoff_start:	The offset within @mapping of the page which we are
2967  * 			looking up. If @folio is non-NULL, this should be equal
2968  * 			to folio_pgoff(folio).
2969  * @nr_pages:		The number of pages mapped by the mapping. If @folio is
2970  *			non-NULL, this should be equal to folio_nr_pages(folio).
2971  * @rwc:		The reverse mapping walk control object describing how
2972  *			the traversal should proceed.
2973  * @locked:		Is the @mapping already locked? If not, we acquire the
2974  *			lock.
2975  */
2976 static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
2977 			     pgoff_t pgoff_start, unsigned long nr_pages,
2978 			     struct rmap_walk_control *rwc, bool locked)
2979 {
2980 	pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
2981 	struct vm_area_struct *vma;
2982 
2983 	VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
2984 	VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
2985 	VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
2986 
2987 	if (!locked) {
2988 		if (i_mmap_trylock_read(mapping))
2989 			goto lookup;
2990 
2991 		if (rwc->try_lock) {
2992 			rwc->contended = true;
2993 			return;
2994 		}
2995 
2996 		i_mmap_lock_read(mapping);
2997 	}
2998 lookup:
2999 	vma_interval_tree_foreach(vma, &mapping->i_mmap,
3000 			pgoff_start, pgoff_end) {
3001 		unsigned long address = vma_address(vma, pgoff_start, nr_pages);
3002 
3003 		VM_BUG_ON_VMA(address == -EFAULT, vma);
3004 		cond_resched();
3005 
3006 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
3007 			continue;
3008 
3009 		if (!rwc->rmap_one(folio, vma, address, rwc->arg))
3010 			goto done;
3011 		if (rwc->done && rwc->done(folio))
3012 			goto done;
3013 	}
3014 done:
3015 	if (!locked)
3016 		i_mmap_unlock_read(mapping);
3017 }
3018 
3019 /*
3020  * rmap_walk_file - do something to file page using the object-based rmap method
3021  * @folio: the folio to be handled
3022  * @rwc: control variable according to each walk type
3023  * @locked: caller holds relevant rmap lock
3024  *
3025  * Find all the mappings of a folio using the mapping pointer and the vma chains
3026  * contained in the address_space struct it points to.
3027  */
3028 static void rmap_walk_file(struct folio *folio,
3029 		struct rmap_walk_control *rwc, bool locked)
3030 {
3031 	/*
3032 	 * The folio lock not only makes sure that folio->mapping cannot
3033 	 * suddenly be NULLified by truncation, it makes sure that the structure
3034 	 * at mapping cannot be freed and reused yet, so we can safely take
3035 	 * mapping->i_mmap_rwsem.
3036 	 */
3037 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3038 
3039 	if (!folio->mapping)
3040 		return;
3041 
3042 	__rmap_walk_file(folio, folio->mapping, folio->index,
3043 			 folio_nr_pages(folio), rwc, locked);
3044 }
3045 
3046 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
3047 {
3048 	if (unlikely(folio_test_ksm(folio)))
3049 		rmap_walk_ksm(folio, rwc);
3050 	else if (folio_test_anon(folio))
3051 		rmap_walk_anon(folio, rwc, false);
3052 	else
3053 		rmap_walk_file(folio, rwc, false);
3054 }
3055 
3056 /* Like rmap_walk, but caller holds relevant rmap lock */
3057 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
3058 {
3059 	/* no ksm support for now */
3060 	VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
3061 	if (folio_test_anon(folio))
3062 		rmap_walk_anon(folio, rwc, true);
3063 	else
3064 		rmap_walk_file(folio, rwc, true);
3065 }
3066 
3067 #ifdef CONFIG_HUGETLB_PAGE
3068 /*
3069  * The following two functions are for anonymous (private mapped) hugepages.
3070  * Unlike common anonymous pages, anonymous hugepages have no accounting code
3071  * and no lru code, because we handle hugepages differently from common pages.
3072  */
3073 void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
3074 		unsigned long address, rmap_t flags)
3075 {
3076 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
3077 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3078 
3079 	atomic_inc(&folio->_entire_mapcount);
3080 	atomic_inc(&folio->_large_mapcount);
3081 	if (flags & RMAP_EXCLUSIVE)
3082 		SetPageAnonExclusive(&folio->page);
3083 	VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
3084 			 PageAnonExclusive(&folio->page), folio);
3085 }
3086 
3087 void hugetlb_add_new_anon_rmap(struct folio *folio,
3088 		struct vm_area_struct *vma, unsigned long address)
3089 {
3090 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
3091 
3092 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
3093 	/* increment count (starts at -1) */
3094 	atomic_set(&folio->_entire_mapcount, 0);
3095 	atomic_set(&folio->_large_mapcount, 0);
3096 	folio_clear_hugetlb_restore_reserve(folio);
3097 	__folio_set_anon(folio, vma, address, true);
3098 	SetPageAnonExclusive(&folio->page);
3099 }
3100 #endif /* CONFIG_HUGETLB_PAGE */
3101