xref: /linux/mm/rmap.c (revision 802a3a92ad7ac0b9be9df229dee530a1f0a8039b)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * mm/rmap.c - physical to virtual reverse mappings
31da177e4SLinus Torvalds  *
41da177e4SLinus Torvalds  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
51da177e4SLinus Torvalds  * Released under the General Public License (GPL).
61da177e4SLinus Torvalds  *
71da177e4SLinus Torvalds  * Simple, low overhead reverse mapping scheme.
81da177e4SLinus Torvalds  * Please try to keep this thing as modular as possible.
91da177e4SLinus Torvalds  *
101da177e4SLinus Torvalds  * Provides methods for unmapping each kind of mapped page:
111da177e4SLinus Torvalds  * the anon methods track anonymous pages, and
121da177e4SLinus Torvalds  * the file methods track pages belonging to an inode.
131da177e4SLinus Torvalds  *
141da177e4SLinus Torvalds  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
151da177e4SLinus Torvalds  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
161da177e4SLinus Torvalds  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
1798f32602SHugh Dickins  * Contributions by Hugh Dickins 2003, 2004
181da177e4SLinus Torvalds  */
191da177e4SLinus Torvalds 
201da177e4SLinus Torvalds /*
211da177e4SLinus Torvalds  * Lock ordering in mm:
221da177e4SLinus Torvalds  *
231b1dcc1bSJes Sorensen  * inode->i_mutex	(while writing or truncating, not reading or faulting)
241da177e4SLinus Torvalds  *   mm->mmap_sem
251da177e4SLinus Torvalds  *     page->flags PG_locked (lock_page)
2688f306b6SKirill A. Shutemov  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27c8c06efaSDavidlohr Bueso  *         mapping->i_mmap_rwsem
285a505085SIngo Molnar  *           anon_vma->rwsem
29b8072f09SHugh Dickins  *             mm->page_table_lock or pte_lock
30a52633d8SMel Gorman  *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
315d337b91SHugh Dickins  *               swap_lock (in swap_duplicate, swap_info_get)
321da177e4SLinus Torvalds  *                 mmlist_lock (in mmput, drain_mmlist and others)
331da177e4SLinus Torvalds  *                 mapping->private_lock (in __set_page_dirty_buffers)
34c4843a75SGreg Thelen  *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
35c4843a75SGreg Thelen  *                     mapping->tree_lock (widely used)
36250df6edSDave Chinner  *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
37f758eeabSChristoph Hellwig  *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
381da177e4SLinus Torvalds  *                   sb_lock (within inode_lock in fs/fs-writeback.c)
391da177e4SLinus Torvalds  *                   mapping->tree_lock (widely used, in set_page_dirty,
401da177e4SLinus Torvalds  *                             in arch-dependent flush_dcache_mmap_lock,
41f758eeabSChristoph Hellwig  *                             within bdi.wb->list_lock in __sync_single_inode)
426a46079cSAndi Kleen  *
435a505085SIngo Molnar  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
446a46079cSAndi Kleen  *   ->tasklist_lock
456a46079cSAndi Kleen  *     pte map lock
461da177e4SLinus Torvalds  */
471da177e4SLinus Torvalds 
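/*
 * A rough illustration of how the ordering above is obeyed in practice
 * (a simplified sketch of a write-fault style path, not copied from any
 * one function; exact helpers and error handling vary by caller):
 *
 *	down_read(&mm->mmap_sem);
 *	lock_page(page);				// page->flags PG_locked
 *	pte = pte_offset_map_lock(mm, pmd, address, &ptl);	// pte_lock
 *	...update the pte...
 *	pte_unmap_unlock(pte, ptl);
 *	unlock_page(page);
 *	up_read(&mm->mmap_sem);
 *
 * The i_mmap_rwsem and anon_vma->rwsem slot in between PG_locked and the
 * page table lock, which is why rmap walks may take them while the page
 * is locked but before any ptes are touched.
 */
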
481da177e4SLinus Torvalds #include <linux/mm.h>
496e84f315SIngo Molnar #include <linux/sched/mm.h>
5029930025SIngo Molnar #include <linux/sched/task.h>
511da177e4SLinus Torvalds #include <linux/pagemap.h>
521da177e4SLinus Torvalds #include <linux/swap.h>
531da177e4SLinus Torvalds #include <linux/swapops.h>
541da177e4SLinus Torvalds #include <linux/slab.h>
551da177e4SLinus Torvalds #include <linux/init.h>
565ad64688SHugh Dickins #include <linux/ksm.h>
571da177e4SLinus Torvalds #include <linux/rmap.h>
581da177e4SLinus Torvalds #include <linux/rcupdate.h>
59b95f1b31SPaul Gortmaker #include <linux/export.h>
608a9f3ccdSBalbir Singh #include <linux/memcontrol.h>
61cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
6264cdd548SKOSAKI Motohiro #include <linux/migrate.h>
630fe6e20bSNaoya Horiguchi #include <linux/hugetlb.h>
64ef5d437fSJan Kara #include <linux/backing-dev.h>
6533c3fc71SVladimir Davydov #include <linux/page_idle.h>
661da177e4SLinus Torvalds 
671da177e4SLinus Torvalds #include <asm/tlbflush.h>
681da177e4SLinus Torvalds 
6972b252aeSMel Gorman #include <trace/events/tlb.h>
7072b252aeSMel Gorman 
71b291f000SNick Piggin #include "internal.h"
72b291f000SNick Piggin 
73fdd2e5f8SAdrian Bunk static struct kmem_cache *anon_vma_cachep;
745beb4930SRik van Riel static struct kmem_cache *anon_vma_chain_cachep;
75fdd2e5f8SAdrian Bunk 
76fdd2e5f8SAdrian Bunk static inline struct anon_vma *anon_vma_alloc(void)
77fdd2e5f8SAdrian Bunk {
7801d8b20dSPeter Zijlstra 	struct anon_vma *anon_vma;
7901d8b20dSPeter Zijlstra 
8001d8b20dSPeter Zijlstra 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
8101d8b20dSPeter Zijlstra 	if (anon_vma) {
8201d8b20dSPeter Zijlstra 		atomic_set(&anon_vma->refcount, 1);
837a3ef208SKonstantin Khlebnikov 		anon_vma->degree = 1;	/* Reference for first vma */
847a3ef208SKonstantin Khlebnikov 		anon_vma->parent = anon_vma;
8501d8b20dSPeter Zijlstra 		/*
8601d8b20dSPeter Zijlstra 		 * Initialise the anon_vma root to point to itself. If called
8701d8b20dSPeter Zijlstra 		 * from fork, the root will be reset to the parent's anon_vma.
8801d8b20dSPeter Zijlstra 		 */
8901d8b20dSPeter Zijlstra 		anon_vma->root = anon_vma;
90fdd2e5f8SAdrian Bunk 	}
91fdd2e5f8SAdrian Bunk 
9201d8b20dSPeter Zijlstra 	return anon_vma;
9301d8b20dSPeter Zijlstra }
9401d8b20dSPeter Zijlstra 
9501d8b20dSPeter Zijlstra static inline void anon_vma_free(struct anon_vma *anon_vma)
96fdd2e5f8SAdrian Bunk {
9701d8b20dSPeter Zijlstra 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
9888c22088SPeter Zijlstra 
9988c22088SPeter Zijlstra 	/*
1004fc3f1d6SIngo Molnar 	 * Synchronize against page_lock_anon_vma_read() such that
10188c22088SPeter Zijlstra 	 * we can safely hold the lock without the anon_vma getting
10288c22088SPeter Zijlstra 	 * freed.
10388c22088SPeter Zijlstra 	 *
10488c22088SPeter Zijlstra 	 * Relies on the full mb implied by the atomic_dec_and_test() from
10588c22088SPeter Zijlstra 	 * put_anon_vma() against the acquire barrier implied by
1064fc3f1d6SIngo Molnar 	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
10788c22088SPeter Zijlstra 	 *
1084fc3f1d6SIngo Molnar 	 * page_lock_anon_vma_read()	VS	put_anon_vma()
1094fc3f1d6SIngo Molnar 	 *   down_read_trylock()		  atomic_dec_and_test()
11088c22088SPeter Zijlstra 	 *   LOCK				  MB
1114fc3f1d6SIngo Molnar 	 *   atomic_read()			  rwsem_is_locked()
11288c22088SPeter Zijlstra 	 *
11388c22088SPeter Zijlstra 	 * LOCK should suffice since the actual taking of the lock must
11488c22088SPeter Zijlstra 	 * happen _before_ what follows.
11588c22088SPeter Zijlstra 	 */
1167f39dda9SHugh Dickins 	might_sleep();
1175a505085SIngo Molnar 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
1184fc3f1d6SIngo Molnar 		anon_vma_lock_write(anon_vma);
11908b52706SKonstantin Khlebnikov 		anon_vma_unlock_write(anon_vma);
12088c22088SPeter Zijlstra 	}
12188c22088SPeter Zijlstra 
122fdd2e5f8SAdrian Bunk 	kmem_cache_free(anon_vma_cachep, anon_vma);
123fdd2e5f8SAdrian Bunk }
1241da177e4SLinus Torvalds 
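/*
 * A minimal sketch of the refcount discipline that eventually reaches
 * anon_vma_free(): the structure is only freed once the last reference is
 * dropped via put_anon_vma()/__put_anon_vma() (simplified; see the real
 * callers below for the locking subtleties):
 *
 *	struct anon_vma *anon_vma = page_get_anon_vma(page);	// takes a ref
 *	if (anon_vma) {
 *		...use anon_vma while the reference is held...
 *		put_anon_vma(anon_vma);		// last ref -> anon_vma_free()
 *	}
 */
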
125dd34739cSLinus Torvalds static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
1265beb4930SRik van Riel {
127dd34739cSLinus Torvalds 	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
1285beb4930SRik van Riel }
1295beb4930SRik van Riel 
130e574b5fdSNamhyung Kim static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
1315beb4930SRik van Riel {
1325beb4930SRik van Riel 	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
1335beb4930SRik van Riel }
1345beb4930SRik van Riel 
1356583a843SKautuk Consul static void anon_vma_chain_link(struct vm_area_struct *vma,
1366583a843SKautuk Consul 				struct anon_vma_chain *avc,
1376583a843SKautuk Consul 				struct anon_vma *anon_vma)
1386583a843SKautuk Consul {
1396583a843SKautuk Consul 	avc->vma = vma;
1406583a843SKautuk Consul 	avc->anon_vma = anon_vma;
1416583a843SKautuk Consul 	list_add(&avc->same_vma, &vma->anon_vma_chain);
142bf181b9fSMichel Lespinasse 	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
1436583a843SKautuk Consul }
1446583a843SKautuk Consul 
145d9d332e0SLinus Torvalds /**
146d5a187daSVlastimil Babka  * __anon_vma_prepare - attach an anon_vma to a memory region
147d9d332e0SLinus Torvalds  * @vma: the memory region in question
148d9d332e0SLinus Torvalds  *
149d9d332e0SLinus Torvalds  * This makes sure the memory mapping described by 'vma' has
150d9d332e0SLinus Torvalds  * an 'anon_vma' attached to it, so that we can associate the
151d9d332e0SLinus Torvalds  * anonymous pages mapped into it with that anon_vma.
152d9d332e0SLinus Torvalds  *
153d5a187daSVlastimil Babka  * The common case will be that we already have one, which
154d5a187daSVlastimil Babka  * is handled inline by anon_vma_prepare(). But if
15523a0790aSFigo.zhang  * not, we either need to find an adjacent mapping that we
156d9d332e0SLinus Torvalds  * can re-use the anon_vma from (very common when the only
157d9d332e0SLinus Torvalds  * reason for splitting a vma has been mprotect()), or we
158d9d332e0SLinus Torvalds  * allocate a new one.
159d9d332e0SLinus Torvalds  *
160d9d332e0SLinus Torvalds  * Anon-vma allocations are very subtle, because we may have
1614fc3f1d6SIngo Molnar  * optimistically looked up an anon_vma in page_lock_anon_vma_read()
162d9d332e0SLinus Torvalds  * and that may actually touch the spinlock even in the newly
163d9d332e0SLinus Torvalds  * allocated vma (it depends on RCU to make sure that the
164d9d332e0SLinus Torvalds  * anon_vma isn't actually destroyed).
165d9d332e0SLinus Torvalds  *
166d9d332e0SLinus Torvalds  * As a result, we need to do proper anon_vma locking even
167d9d332e0SLinus Torvalds  * for the new allocation. At the same time, we do not want
168d9d332e0SLinus Torvalds  * to do any locking for the common case of already having
169d9d332e0SLinus Torvalds  * an anon_vma.
170d9d332e0SLinus Torvalds  *
171d9d332e0SLinus Torvalds  * This must be called with the mmap_sem held for reading.
172d9d332e0SLinus Torvalds  */
173d5a187daSVlastimil Babka int __anon_vma_prepare(struct vm_area_struct *vma)
1741da177e4SLinus Torvalds {
175d5a187daSVlastimil Babka 	struct mm_struct *mm = vma->vm_mm;
176d5a187daSVlastimil Babka 	struct anon_vma *anon_vma, *allocated;
1775beb4930SRik van Riel 	struct anon_vma_chain *avc;
1781da177e4SLinus Torvalds 
1791da177e4SLinus Torvalds 	might_sleep();
1801da177e4SLinus Torvalds 
181dd34739cSLinus Torvalds 	avc = anon_vma_chain_alloc(GFP_KERNEL);
1825beb4930SRik van Riel 	if (!avc)
1835beb4930SRik van Riel 		goto out_enomem;
1845beb4930SRik van Riel 
1851da177e4SLinus Torvalds 	anon_vma = find_mergeable_anon_vma(vma);
1861da177e4SLinus Torvalds 	allocated = NULL;
187d9d332e0SLinus Torvalds 	if (!anon_vma) {
1881da177e4SLinus Torvalds 		anon_vma = anon_vma_alloc();
1891da177e4SLinus Torvalds 		if (unlikely(!anon_vma))
1905beb4930SRik van Riel 			goto out_enomem_free_avc;
1911da177e4SLinus Torvalds 		allocated = anon_vma;
1921da177e4SLinus Torvalds 	}
1931da177e4SLinus Torvalds 
1944fc3f1d6SIngo Molnar 	anon_vma_lock_write(anon_vma);
1951da177e4SLinus Torvalds 	/* page_table_lock to protect against threads */
1961da177e4SLinus Torvalds 	spin_lock(&mm->page_table_lock);
1971da177e4SLinus Torvalds 	if (likely(!vma->anon_vma)) {
1981da177e4SLinus Torvalds 		vma->anon_vma = anon_vma;
1996583a843SKautuk Consul 		anon_vma_chain_link(vma, avc, anon_vma);
2007a3ef208SKonstantin Khlebnikov 		/* vma reference or self-parent link for new root */
2017a3ef208SKonstantin Khlebnikov 		anon_vma->degree++;
2021da177e4SLinus Torvalds 		allocated = NULL;
20331f2b0ebSOleg Nesterov 		avc = NULL;
2041da177e4SLinus Torvalds 	}
2051da177e4SLinus Torvalds 	spin_unlock(&mm->page_table_lock);
20608b52706SKonstantin Khlebnikov 	anon_vma_unlock_write(anon_vma);
20731f2b0ebSOleg Nesterov 
20831f2b0ebSOleg Nesterov 	if (unlikely(allocated))
20901d8b20dSPeter Zijlstra 		put_anon_vma(allocated);
21031f2b0ebSOleg Nesterov 	if (unlikely(avc))
2115beb4930SRik van Riel 		anon_vma_chain_free(avc);
212d5a187daSVlastimil Babka 
2131da177e4SLinus Torvalds 	return 0;
2145beb4930SRik van Riel 
2155beb4930SRik van Riel  out_enomem_free_avc:
2165beb4930SRik van Riel 	anon_vma_chain_free(avc);
2175beb4930SRik van Riel  out_enomem:
2185beb4930SRik van Riel 	return -ENOMEM;
2191da177e4SLinus Torvalds }
2201da177e4SLinus Torvalds 
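/*
 * Typical call pattern, assuming the usual anonymous-fault sequence (a
 * hedged sketch in the style of do_anonymous_page(), not its exact code):
 *
 *	if (unlikely(anon_vma_prepare(vma)))	// inline wrapper; calls
 *		return VM_FAULT_OOM;		// __anon_vma_prepare() only when
 *						// vma->anon_vma is still NULL
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *	...
 *	page_add_new_anon_rmap(page, vma, address, false);
 */
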
221bb4aa396SLinus Torvalds /*
222bb4aa396SLinus Torvalds  * This is a useful helper function for locking the anon_vma root as
223bb4aa396SLinus Torvalds  * we traverse the vma->anon_vma_chain, looping over anon_vmas that
224bb4aa396SLinus Torvalds  * have the same vma.
225bb4aa396SLinus Torvalds  *
226bb4aa396SLinus Torvalds  * Such anon_vmas should have the same root, so you'd expect to see
227bb4aa396SLinus Torvalds  * just a single down_write of the root rwsem for the whole traversal.
228bb4aa396SLinus Torvalds  */
229bb4aa396SLinus Torvalds static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
230bb4aa396SLinus Torvalds {
231bb4aa396SLinus Torvalds 	struct anon_vma *new_root = anon_vma->root;
232bb4aa396SLinus Torvalds 	if (new_root != root) {
233bb4aa396SLinus Torvalds 		if (WARN_ON_ONCE(root))
2345a505085SIngo Molnar 			up_write(&root->rwsem);
235bb4aa396SLinus Torvalds 		root = new_root;
2365a505085SIngo Molnar 		down_write(&root->rwsem);
237bb4aa396SLinus Torvalds 	}
238bb4aa396SLinus Torvalds 	return root;
239bb4aa396SLinus Torvalds }
240bb4aa396SLinus Torvalds 
241bb4aa396SLinus Torvalds static inline void unlock_anon_vma_root(struct anon_vma *root)
242bb4aa396SLinus Torvalds {
243bb4aa396SLinus Torvalds 	if (root)
2445a505085SIngo Molnar 		up_write(&root->rwsem);
245bb4aa396SLinus Torvalds }
246bb4aa396SLinus Torvalds 
2475beb4930SRik van Riel /*
2485beb4930SRik van Riel  * Attach the anon_vmas from src to dst.
2495beb4930SRik van Riel  * Returns 0 on success, -ENOMEM on failure.
2507a3ef208SKonstantin Khlebnikov  *
2517a3ef208SKonstantin Khlebnikov  * If dst->anon_vma is NULL this function tries to find and reuse existing
2527a3ef208SKonstantin Khlebnikov  * anon_vma which has no vmas and only one child anon_vma. This prevents
2537a3ef208SKonstantin Khlebnikov  * degradation of the anon_vma hierarchy into an endless linear chain in
2547a3ef208SKonstantin Khlebnikov  * case of a constantly forking task. On the other hand, an anon_vma with
2557a3ef208SKonstantin Khlebnikov  * more than one child isn't reused even if there is no live vma, so the
2567a3ef208SKonstantin Khlebnikov  * rmap walker has a good chance of avoiding scanning the whole hierarchy
2577a3ef208SKonstantin Khlebnikov  * when it searches for where a page is mapped.
2585beb4930SRik van Riel  */
2595beb4930SRik van Riel int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
2605beb4930SRik van Riel {
2615beb4930SRik van Riel 	struct anon_vma_chain *avc, *pavc;
262bb4aa396SLinus Torvalds 	struct anon_vma *root = NULL;
2635beb4930SRik van Riel 
264646d87b4SLinus Torvalds 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
265bb4aa396SLinus Torvalds 		struct anon_vma *anon_vma;
266bb4aa396SLinus Torvalds 
267dd34739cSLinus Torvalds 		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
268dd34739cSLinus Torvalds 		if (unlikely(!avc)) {
269dd34739cSLinus Torvalds 			unlock_anon_vma_root(root);
270dd34739cSLinus Torvalds 			root = NULL;
271dd34739cSLinus Torvalds 			avc = anon_vma_chain_alloc(GFP_KERNEL);
2725beb4930SRik van Riel 			if (!avc)
2735beb4930SRik van Riel 				goto enomem_failure;
274dd34739cSLinus Torvalds 		}
275bb4aa396SLinus Torvalds 		anon_vma = pavc->anon_vma;
276bb4aa396SLinus Torvalds 		root = lock_anon_vma_root(root, anon_vma);
277bb4aa396SLinus Torvalds 		anon_vma_chain_link(dst, avc, anon_vma);
2787a3ef208SKonstantin Khlebnikov 
2797a3ef208SKonstantin Khlebnikov 		/*
2807a3ef208SKonstantin Khlebnikov 		 * Reuse an existing anon_vma if its degree is lower than two,
2817a3ef208SKonstantin Khlebnikov 		 * which means it has no vma and only one anon_vma child.
2827a3ef208SKonstantin Khlebnikov 		 *
2837a3ef208SKonstantin Khlebnikov 		 * Do not choose the parent anon_vma, otherwise the first child
2847a3ef208SKonstantin Khlebnikov 		 * will always reuse it. Root anon_vma is never reused:
2857a3ef208SKonstantin Khlebnikov 		 * it has self-parent reference and at least one child.
2867a3ef208SKonstantin Khlebnikov 		 */
2877a3ef208SKonstantin Khlebnikov 		if (!dst->anon_vma && anon_vma != src->anon_vma &&
2887a3ef208SKonstantin Khlebnikov 				anon_vma->degree < 2)
2897a3ef208SKonstantin Khlebnikov 			dst->anon_vma = anon_vma;
2905beb4930SRik van Riel 	}
2917a3ef208SKonstantin Khlebnikov 	if (dst->anon_vma)
2927a3ef208SKonstantin Khlebnikov 		dst->anon_vma->degree++;
293bb4aa396SLinus Torvalds 	unlock_anon_vma_root(root);
2945beb4930SRik van Riel 	return 0;
2955beb4930SRik van Riel 
2965beb4930SRik van Riel  enomem_failure:
2973fe89b3eSLeon Yu 	/*
2983fe89b3eSLeon Yu 	 * dst->anon_vma is dropped here, otherwise its degree can be incorrectly
2993fe89b3eSLeon Yu 	 * decremented in unlink_anon_vmas().
3003fe89b3eSLeon Yu 	 * We can safely do this because callers of anon_vma_clone() don't care
3013fe89b3eSLeon Yu 	 * about dst->anon_vma if anon_vma_clone() failed.
3023fe89b3eSLeon Yu 	 */
3033fe89b3eSLeon Yu 	dst->anon_vma = NULL;
3045beb4930SRik van Riel 	unlink_anon_vmas(dst);
3055beb4930SRik van Riel 	return -ENOMEM;
3061da177e4SLinus Torvalds }
3071da177e4SLinus Torvalds 
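/*
 * A hedged sketch of how vma splitting uses anon_vma_clone() (in the style
 * of __split_vma(); details differ in the real code):
 *
 *	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 *	*new = *vma;				// copy of the range being split off
 *	INIT_LIST_HEAD(&new->anon_vma_chain);
 *	err = anon_vma_clone(new, vma);		// attach new to all of vma's anon_vmas
 *	if (err)
 *		goto out_free;			// on -ENOMEM, new->anon_vma stays NULL
 */
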
3085beb4930SRik van Riel /*
3095beb4930SRik van Riel  * Attach vma to its own anon_vma, as well as to the anon_vmas that
3105beb4930SRik van Riel  * the corresponding VMA in the parent process is attached to.
3115beb4930SRik van Riel  * Returns 0 on success, non-zero on failure.
3125beb4930SRik van Riel  */
3135beb4930SRik van Riel int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
3141da177e4SLinus Torvalds {
3155beb4930SRik van Riel 	struct anon_vma_chain *avc;
3165beb4930SRik van Riel 	struct anon_vma *anon_vma;
317c4ea95d7SDaniel Forrest 	int error;
3185beb4930SRik van Riel 
3195beb4930SRik van Riel 	/* Don't bother if the parent process has no anon_vma here. */
3205beb4930SRik van Riel 	if (!pvma->anon_vma)
3215beb4930SRik van Riel 		return 0;
3225beb4930SRik van Riel 
3237a3ef208SKonstantin Khlebnikov 	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
3247a3ef208SKonstantin Khlebnikov 	vma->anon_vma = NULL;
3257a3ef208SKonstantin Khlebnikov 
3265beb4930SRik van Riel 	/*
3275beb4930SRik van Riel 	 * First, attach the new VMA to the parent VMA's anon_vmas,
3285beb4930SRik van Riel 	 * so rmap can find non-COWed pages in child processes.
3295beb4930SRik van Riel 	 */
330c4ea95d7SDaniel Forrest 	error = anon_vma_clone(vma, pvma);
331c4ea95d7SDaniel Forrest 	if (error)
332c4ea95d7SDaniel Forrest 		return error;
3335beb4930SRik van Riel 
3347a3ef208SKonstantin Khlebnikov 	/* An existing anon_vma has been reused, all done then. */
3357a3ef208SKonstantin Khlebnikov 	if (vma->anon_vma)
3367a3ef208SKonstantin Khlebnikov 		return 0;
3377a3ef208SKonstantin Khlebnikov 
3385beb4930SRik van Riel 	/* Then add our own anon_vma. */
3395beb4930SRik van Riel 	anon_vma = anon_vma_alloc();
3405beb4930SRik van Riel 	if (!anon_vma)
3415beb4930SRik van Riel 		goto out_error;
342dd34739cSLinus Torvalds 	avc = anon_vma_chain_alloc(GFP_KERNEL);
3435beb4930SRik van Riel 	if (!avc)
3445beb4930SRik van Riel 		goto out_error_free_anon_vma;
3455c341ee1SRik van Riel 
3465c341ee1SRik van Riel 	/*
3475c341ee1SRik van Riel 	 * The root anon_vma's spinlock is the lock actually used when we
3485c341ee1SRik van Riel 	 * lock any of the anon_vmas in this anon_vma tree.
3495c341ee1SRik van Riel 	 */
3505c341ee1SRik van Riel 	anon_vma->root = pvma->anon_vma->root;
3517a3ef208SKonstantin Khlebnikov 	anon_vma->parent = pvma->anon_vma;
35276545066SRik van Riel 	/*
35301d8b20dSPeter Zijlstra 	 * With refcounts, an anon_vma can stay around longer than the
35401d8b20dSPeter Zijlstra 	 * process it belongs to. The root anon_vma needs to be pinned until
35501d8b20dSPeter Zijlstra 	 * this anon_vma is freed, because the lock lives in the root.
35676545066SRik van Riel 	 */
35776545066SRik van Riel 	get_anon_vma(anon_vma->root);
3585beb4930SRik van Riel 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
3595beb4930SRik van Riel 	vma->anon_vma = anon_vma;
3604fc3f1d6SIngo Molnar 	anon_vma_lock_write(anon_vma);
3615c341ee1SRik van Riel 	anon_vma_chain_link(vma, avc, anon_vma);
3627a3ef208SKonstantin Khlebnikov 	anon_vma->parent->degree++;
36308b52706SKonstantin Khlebnikov 	anon_vma_unlock_write(anon_vma);
3645beb4930SRik van Riel 
3655beb4930SRik van Riel 	return 0;
3665beb4930SRik van Riel 
3675beb4930SRik van Riel  out_error_free_anon_vma:
36801d8b20dSPeter Zijlstra 	put_anon_vma(anon_vma);
3695beb4930SRik van Riel  out_error:
3704946d54cSRik van Riel 	unlink_anon_vmas(vma);
3715beb4930SRik van Riel 	return -ENOMEM;
3725beb4930SRik van Riel }
3735beb4930SRik van Riel 
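/*
 * The fork-time caller (dup_mmap() in kernel/fork.c) pairs this with the
 * vma copy; roughly (a simplified sketch, error paths omitted):
 *
 *	tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 *	*tmp = *mpnt;
 *	INIT_LIST_HEAD(&tmp->anon_vma_chain);
 *	if (anon_vma_fork(tmp, mpnt))		// clone parent chain + own anon_vma
 *		goto fail_nomem_anon_vma_fork;
 */
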
3745beb4930SRik van Riel void unlink_anon_vmas(struct vm_area_struct *vma)
3755beb4930SRik van Riel {
3765beb4930SRik van Riel 	struct anon_vma_chain *avc, *next;
377eee2acbaSPeter Zijlstra 	struct anon_vma *root = NULL;
3785beb4930SRik van Riel 
3795c341ee1SRik van Riel 	/*
3805c341ee1SRik van Riel 	 * Unlink each anon_vma chained to the VMA.  This list is ordered
3815c341ee1SRik van Riel 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
3825c341ee1SRik van Riel 	 */
3835beb4930SRik van Riel 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
384eee2acbaSPeter Zijlstra 		struct anon_vma *anon_vma = avc->anon_vma;
385eee2acbaSPeter Zijlstra 
386eee2acbaSPeter Zijlstra 		root = lock_anon_vma_root(root, anon_vma);
387bf181b9fSMichel Lespinasse 		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
388eee2acbaSPeter Zijlstra 
389eee2acbaSPeter Zijlstra 		/*
390eee2acbaSPeter Zijlstra 		 * Leave empty anon_vmas on the list - we'll need
391eee2acbaSPeter Zijlstra 		 * to free them outside the lock.
392eee2acbaSPeter Zijlstra 		 */
3937a3ef208SKonstantin Khlebnikov 		if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
3947a3ef208SKonstantin Khlebnikov 			anon_vma->parent->degree--;
395eee2acbaSPeter Zijlstra 			continue;
3967a3ef208SKonstantin Khlebnikov 		}
397eee2acbaSPeter Zijlstra 
398eee2acbaSPeter Zijlstra 		list_del(&avc->same_vma);
399eee2acbaSPeter Zijlstra 		anon_vma_chain_free(avc);
400eee2acbaSPeter Zijlstra 	}
4017a3ef208SKonstantin Khlebnikov 	if (vma->anon_vma)
4027a3ef208SKonstantin Khlebnikov 		vma->anon_vma->degree--;
403eee2acbaSPeter Zijlstra 	unlock_anon_vma_root(root);
404eee2acbaSPeter Zijlstra 
405eee2acbaSPeter Zijlstra 	/*
406eee2acbaSPeter Zijlstra 	 * Iterate the list once more, it now only contains empty and unlinked
407eee2acbaSPeter Zijlstra 	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
4085a505085SIngo Molnar 	 * needing to write-acquire the anon_vma->root->rwsem.
409eee2acbaSPeter Zijlstra 	 */
410eee2acbaSPeter Zijlstra 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
411eee2acbaSPeter Zijlstra 		struct anon_vma *anon_vma = avc->anon_vma;
412eee2acbaSPeter Zijlstra 
413e4c5800aSKonstantin Khlebnikov 		VM_WARN_ON(anon_vma->degree);
414eee2acbaSPeter Zijlstra 		put_anon_vma(anon_vma);
415eee2acbaSPeter Zijlstra 
4165beb4930SRik van Riel 		list_del(&avc->same_vma);
4175beb4930SRik van Riel 		anon_vma_chain_free(avc);
4185beb4930SRik van Riel 	}
4195beb4930SRik van Riel }
4205beb4930SRik van Riel 
42151cc5068SAlexey Dobriyan static void anon_vma_ctor(void *data)
4221da177e4SLinus Torvalds {
4231da177e4SLinus Torvalds 	struct anon_vma *anon_vma = data;
4241da177e4SLinus Torvalds 
4255a505085SIngo Molnar 	init_rwsem(&anon_vma->rwsem);
42683813267SPeter Zijlstra 	atomic_set(&anon_vma->refcount, 0);
427bf181b9fSMichel Lespinasse 	anon_vma->rb_root = RB_ROOT;
4281da177e4SLinus Torvalds }
4291da177e4SLinus Torvalds 
4301da177e4SLinus Torvalds void __init anon_vma_init(void)
4311da177e4SLinus Torvalds {
4321da177e4SLinus Torvalds 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
4335d097056SVladimir Davydov 			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
4345d097056SVladimir Davydov 			anon_vma_ctor);
4355d097056SVladimir Davydov 	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
4365d097056SVladimir Davydov 			SLAB_PANIC|SLAB_ACCOUNT);
4371da177e4SLinus Torvalds }
4381da177e4SLinus Torvalds 
4391da177e4SLinus Torvalds /*
4406111e4caSPeter Zijlstra  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
4416111e4caSPeter Zijlstra  *
4426111e4caSPeter Zijlstra  * Since there is no serialization whatsoever against page_remove_rmap()
4436111e4caSPeter Zijlstra  * the best this function can do is return a locked anon_vma that might
4446111e4caSPeter Zijlstra  * have been relevant to this page.
4456111e4caSPeter Zijlstra  *
4466111e4caSPeter Zijlstra  * The page might have been remapped to a different anon_vma or the anon_vma
4476111e4caSPeter Zijlstra  * returned may already be freed (and even reused).
4486111e4caSPeter Zijlstra  *
449bc658c96SPeter Zijlstra  * In case it was remapped to a different anon_vma, the new anon_vma will be a
450bc658c96SPeter Zijlstra  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
451bc658c96SPeter Zijlstra  * ensure that any anon_vma obtained from the page will still be valid for as
452bc658c96SPeter Zijlstra  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
453bc658c96SPeter Zijlstra  *
4546111e4caSPeter Zijlstra  * All users of this function must be very careful when walking the anon_vma
4556111e4caSPeter Zijlstra  * chain and verify that the page in question is indeed mapped in it
4566111e4caSPeter Zijlstra  * [ something equivalent to page_mapped_in_vma() ].
4576111e4caSPeter Zijlstra  *
4586111e4caSPeter Zijlstra  * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
4596111e4caSPeter Zijlstra  * that the anon_vma pointer from page->mapping is valid if there is a
4606111e4caSPeter Zijlstra  * mapcount, we can dereference the anon_vma after observing those.
4611da177e4SLinus Torvalds  */
462746b18d4SPeter Zijlstra struct anon_vma *page_get_anon_vma(struct page *page)
4631da177e4SLinus Torvalds {
464746b18d4SPeter Zijlstra 	struct anon_vma *anon_vma = NULL;
4651da177e4SLinus Torvalds 	unsigned long anon_mapping;
4661da177e4SLinus Torvalds 
4671da177e4SLinus Torvalds 	rcu_read_lock();
4684db0c3c2SJason Low 	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
4693ca7b3c5SHugh Dickins 	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
4701da177e4SLinus Torvalds 		goto out;
4711da177e4SLinus Torvalds 	if (!page_mapped(page))
4721da177e4SLinus Torvalds 		goto out;
4731da177e4SLinus Torvalds 
4741da177e4SLinus Torvalds 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
475746b18d4SPeter Zijlstra 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
476746b18d4SPeter Zijlstra 		anon_vma = NULL;
477746b18d4SPeter Zijlstra 		goto out;
478746b18d4SPeter Zijlstra 	}
479f1819427SHugh Dickins 
480f1819427SHugh Dickins 	/*
481f1819427SHugh Dickins 	 * If this page is still mapped, then its anon_vma cannot have been
482746b18d4SPeter Zijlstra 	 * freed.  But if it has been unmapped, we have no protection against the
483746b18d4SPeter Zijlstra 	 * anon_vma structure being freed and reused (for another anon_vma:
484746b18d4SPeter Zijlstra 	 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
485746b18d4SPeter Zijlstra 	 * above cannot corrupt).
486f1819427SHugh Dickins 	 */
487746b18d4SPeter Zijlstra 	if (!page_mapped(page)) {
4887f39dda9SHugh Dickins 		rcu_read_unlock();
489746b18d4SPeter Zijlstra 		put_anon_vma(anon_vma);
4907f39dda9SHugh Dickins 		return NULL;
491746b18d4SPeter Zijlstra 	}
4921da177e4SLinus Torvalds out:
4931da177e4SLinus Torvalds 	rcu_read_unlock();
494746b18d4SPeter Zijlstra 
495746b18d4SPeter Zijlstra 	return anon_vma;
496746b18d4SPeter Zijlstra }
497746b18d4SPeter Zijlstra 
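/*
 * Sketch of the intended usage (e.g. by the migration code): the returned
 * anon_vma is pinned but not locked, so it may no longer be the page's
 * anon_vma by the time it is used; callers must re-check page_mapped():
 *
 *	anon_vma = page_get_anon_vma(page);
 *	if (anon_vma) {
 *		...lock and/or walk it as required...
 *		put_anon_vma(anon_vma);
 *	}
 */
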
49888c22088SPeter Zijlstra /*
49988c22088SPeter Zijlstra  * Similar to page_get_anon_vma() except it locks the anon_vma.
50088c22088SPeter Zijlstra  *
50188c22088SPeter Zijlstra  * It's a little more complex as it tries to keep the fast path to a single
50288c22088SPeter Zijlstra  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
50388c22088SPeter Zijlstra  * reference like with page_get_anon_vma() and then block on the rwsem.
50488c22088SPeter Zijlstra  */
5054fc3f1d6SIngo Molnar struct anon_vma *page_lock_anon_vma_read(struct page *page)
506746b18d4SPeter Zijlstra {
50788c22088SPeter Zijlstra 	struct anon_vma *anon_vma = NULL;
508eee0f252SHugh Dickins 	struct anon_vma *root_anon_vma;
50988c22088SPeter Zijlstra 	unsigned long anon_mapping;
510746b18d4SPeter Zijlstra 
51188c22088SPeter Zijlstra 	rcu_read_lock();
5124db0c3c2SJason Low 	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
51388c22088SPeter Zijlstra 	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
51488c22088SPeter Zijlstra 		goto out;
51588c22088SPeter Zijlstra 	if (!page_mapped(page))
51688c22088SPeter Zijlstra 		goto out;
51788c22088SPeter Zijlstra 
51888c22088SPeter Zijlstra 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
5194db0c3c2SJason Low 	root_anon_vma = READ_ONCE(anon_vma->root);
5204fc3f1d6SIngo Molnar 	if (down_read_trylock(&root_anon_vma->rwsem)) {
52188c22088SPeter Zijlstra 		/*
522eee0f252SHugh Dickins 		 * If the page is still mapped, then this anon_vma is still
523eee0f252SHugh Dickins 		 * its anon_vma, and holding the rwsem ensures that it will
524bc658c96SPeter Zijlstra 		 * not go away, see anon_vma_free().
52588c22088SPeter Zijlstra 		 */
526eee0f252SHugh Dickins 		if (!page_mapped(page)) {
5274fc3f1d6SIngo Molnar 			up_read(&root_anon_vma->rwsem);
52888c22088SPeter Zijlstra 			anon_vma = NULL;
52988c22088SPeter Zijlstra 		}
53088c22088SPeter Zijlstra 		goto out;
53188c22088SPeter Zijlstra 	}
53288c22088SPeter Zijlstra 
53388c22088SPeter Zijlstra 	/* trylock failed, we have to sleep */
53488c22088SPeter Zijlstra 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
53588c22088SPeter Zijlstra 		anon_vma = NULL;
53688c22088SPeter Zijlstra 		goto out;
53788c22088SPeter Zijlstra 	}
53888c22088SPeter Zijlstra 
53988c22088SPeter Zijlstra 	if (!page_mapped(page)) {
5407f39dda9SHugh Dickins 		rcu_read_unlock();
54188c22088SPeter Zijlstra 		put_anon_vma(anon_vma);
5427f39dda9SHugh Dickins 		return NULL;
54388c22088SPeter Zijlstra 	}
54488c22088SPeter Zijlstra 
54588c22088SPeter Zijlstra 	/* we pinned the anon_vma, it's safe to sleep */
54688c22088SPeter Zijlstra 	rcu_read_unlock();
5474fc3f1d6SIngo Molnar 	anon_vma_lock_read(anon_vma);
548746b18d4SPeter Zijlstra 
54988c22088SPeter Zijlstra 	if (atomic_dec_and_test(&anon_vma->refcount)) {
55088c22088SPeter Zijlstra 		/*
55188c22088SPeter Zijlstra 		 * Oops, we held the last refcount, release the lock
55288c22088SPeter Zijlstra 		 * and bail -- can't simply use put_anon_vma() because
5534fc3f1d6SIngo Molnar 		 * we'll deadlock on the anon_vma_lock_write() recursion.
55488c22088SPeter Zijlstra 		 */
5554fc3f1d6SIngo Molnar 		anon_vma_unlock_read(anon_vma);
55688c22088SPeter Zijlstra 		__put_anon_vma(anon_vma);
55788c22088SPeter Zijlstra 		anon_vma = NULL;
55888c22088SPeter Zijlstra 	}
55988c22088SPeter Zijlstra 
56088c22088SPeter Zijlstra 	return anon_vma;
56188c22088SPeter Zijlstra 
56288c22088SPeter Zijlstra out:
56388c22088SPeter Zijlstra 	rcu_read_unlock();
564746b18d4SPeter Zijlstra 	return anon_vma;
56534bbd704SOleg Nesterov }
56634bbd704SOleg Nesterov 
5674fc3f1d6SIngo Molnar void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
56834bbd704SOleg Nesterov {
5694fc3f1d6SIngo Molnar 	anon_vma_unlock_read(anon_vma);
5701da177e4SLinus Torvalds }
5711da177e4SLinus Torvalds 
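/*
 * A minimal sketch of the read-lock pattern used by the rmap walkers
 * (compare rmap_walk_anon() later in this file):
 *
 *	anon_vma = page_lock_anon_vma_read(page);
 *	if (!anon_vma)
 *		return;		// page no longer anonymous or no longer mapped
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 *		struct vm_area_struct *vma = avc->vma;
 *		...visit this mapping of the page...
 *	}
 *	page_unlock_anon_vma_read(anon_vma);
 */
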
57272b252aeSMel Gorman #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
57372b252aeSMel Gorman /*
57472b252aeSMel Gorman  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
57572b252aeSMel Gorman  * important, if a PTE was dirty when it was unmapped, that it is flushed
57672b252aeSMel Gorman  * before any IO is initiated on the page to prevent lost writes. Similarly,
57772b252aeSMel Gorman  * it must be flushed before freeing to prevent data leakage.
57872b252aeSMel Gorman  */
57972b252aeSMel Gorman void try_to_unmap_flush(void)
58072b252aeSMel Gorman {
58172b252aeSMel Gorman 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
58272b252aeSMel Gorman 	int cpu;
58372b252aeSMel Gorman 
58472b252aeSMel Gorman 	if (!tlb_ubc->flush_required)
58572b252aeSMel Gorman 		return;
58672b252aeSMel Gorman 
58772b252aeSMel Gorman 	cpu = get_cpu();
58872b252aeSMel Gorman 
589858eaaa7SNadav Amit 	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
590858eaaa7SNadav Amit 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
591858eaaa7SNadav Amit 		local_flush_tlb();
592858eaaa7SNadav Amit 		trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
59372b252aeSMel Gorman 	}
594858eaaa7SNadav Amit 
595858eaaa7SNadav Amit 	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
596858eaaa7SNadav Amit 		flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
59772b252aeSMel Gorman 	cpumask_clear(&tlb_ubc->cpumask);
59872b252aeSMel Gorman 	tlb_ubc->flush_required = false;
599d950c947SMel Gorman 	tlb_ubc->writable = false;
60072b252aeSMel Gorman 	put_cpu();
60172b252aeSMel Gorman }
60272b252aeSMel Gorman 
603d950c947SMel Gorman /* Flush iff there are potentially writable TLB entries that can race with IO */
604d950c947SMel Gorman void try_to_unmap_flush_dirty(void)
605d950c947SMel Gorman {
606d950c947SMel Gorman 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
607d950c947SMel Gorman 
608d950c947SMel Gorman 	if (tlb_ubc->writable)
609d950c947SMel Gorman 		try_to_unmap_flush();
610d950c947SMel Gorman }
611d950c947SMel Gorman 
612c7ab0d2fSKirill A. Shutemov static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
61372b252aeSMel Gorman {
61472b252aeSMel Gorman 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
61572b252aeSMel Gorman 
61672b252aeSMel Gorman 	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
61772b252aeSMel Gorman 	tlb_ubc->flush_required = true;
618d950c947SMel Gorman 
619d950c947SMel Gorman 	/*
620d950c947SMel Gorman 	 * If the PTE was dirty then it's best to assume it's writable. The
621d950c947SMel Gorman 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
622d950c947SMel Gorman 	 * before the page is queued for IO.
623d950c947SMel Gorman 	 */
624d950c947SMel Gorman 	if (writable)
625d950c947SMel Gorman 		tlb_ubc->writable = true;
62672b252aeSMel Gorman }
62772b252aeSMel Gorman 
62872b252aeSMel Gorman /*
62972b252aeSMel Gorman  * Returns true if the TLB flush should be deferred to the end of a batch of
63072b252aeSMel Gorman  * unmap operations to reduce IPIs.
63172b252aeSMel Gorman  */
63272b252aeSMel Gorman static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
63372b252aeSMel Gorman {
63472b252aeSMel Gorman 	bool should_defer = false;
63572b252aeSMel Gorman 
63672b252aeSMel Gorman 	if (!(flags & TTU_BATCH_FLUSH))
63772b252aeSMel Gorman 		return false;
63872b252aeSMel Gorman 
63972b252aeSMel Gorman 	/* If remote CPUs need to be flushed then defer the flush by batching it */
64072b252aeSMel Gorman 	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
64172b252aeSMel Gorman 		should_defer = true;
64272b252aeSMel Gorman 	put_cpu();
64372b252aeSMel Gorman 
64472b252aeSMel Gorman 	return should_defer;
64572b252aeSMel Gorman }
64672b252aeSMel Gorman #else
647c7ab0d2fSKirill A. Shutemov static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
64872b252aeSMel Gorman {
64972b252aeSMel Gorman }
65072b252aeSMel Gorman 
65172b252aeSMel Gorman static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
65272b252aeSMel Gorman {
65372b252aeSMel Gorman 	return false;
65472b252aeSMel Gorman }
65572b252aeSMel Gorman #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
65672b252aeSMel Gorman 
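/*
 * How the batching above is meant to be driven, as a hedged sketch of the
 * reclaim usage (mm/vmscan.c passes TTU_BATCH_FLUSH; names simplified):
 *
 *	try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH);
 *			// may only queue the flush via set_tlb_ubc_flush_pending()
 *	...
 *	try_to_unmap_flush_dirty();	// before starting writeback on the page
 *	...
 *	try_to_unmap_flush();		// before the batch of pages is freed
 */
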
6571da177e4SLinus Torvalds /*
658bf89c8c8SHuang Shijie  * At what user virtual address is page expected in vma?
659ab941e0fSNaoya Horiguchi  * Caller should check the page is actually part of the vma.
6601da177e4SLinus Torvalds  */
6611da177e4SLinus Torvalds unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
6621da177e4SLinus Torvalds {
66386c2ad19SMichel Lespinasse 	unsigned long address;
66421d0d443SAndrea Arcangeli 	if (PageAnon(page)) {
6654829b906SHugh Dickins 		struct anon_vma *page__anon_vma = page_anon_vma(page);
6664829b906SHugh Dickins 		/*
6674829b906SHugh Dickins 		 * Note: swapoff's unuse_vma() is more efficient with this
6684829b906SHugh Dickins 		 * check, and needs it to match anon_vma when KSM is active.
6694829b906SHugh Dickins 		 */
6704829b906SHugh Dickins 		if (!vma->anon_vma || !page__anon_vma ||
6714829b906SHugh Dickins 		    vma->anon_vma->root != page__anon_vma->root)
67221d0d443SAndrea Arcangeli 			return -EFAULT;
67327ba0644SKirill A. Shutemov 	} else if (page->mapping) {
67427ba0644SKirill A. Shutemov 		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
6751da177e4SLinus Torvalds 			return -EFAULT;
6761da177e4SLinus Torvalds 	} else
6771da177e4SLinus Torvalds 		return -EFAULT;
67886c2ad19SMichel Lespinasse 	address = __vma_address(page, vma);
67986c2ad19SMichel Lespinasse 	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
68086c2ad19SMichel Lespinasse 		return -EFAULT;
68186c2ad19SMichel Lespinasse 	return address;
6821da177e4SLinus Torvalds }
6831da177e4SLinus Torvalds 
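/*
 * Small usage sketch: callers that must confirm the page<->vma relationship
 * use the -EFAULT return as the "not mapped here" signal:
 *
 *	unsigned long address = page_address_in_vma(page, vma);
 *	if (address == -EFAULT)
 *		return;			// page does not belong to this vma
 */
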
6846219049aSBob Liu pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
6856219049aSBob Liu {
6866219049aSBob Liu 	pgd_t *pgd;
687c2febafcSKirill A. Shutemov 	p4d_t *p4d;
6886219049aSBob Liu 	pud_t *pud;
6896219049aSBob Liu 	pmd_t *pmd = NULL;
690f72e7dcdSHugh Dickins 	pmd_t pmde;
6916219049aSBob Liu 
6926219049aSBob Liu 	pgd = pgd_offset(mm, address);
6936219049aSBob Liu 	if (!pgd_present(*pgd))
6946219049aSBob Liu 		goto out;
6956219049aSBob Liu 
696c2febafcSKirill A. Shutemov 	p4d = p4d_offset(pgd, address);
697c2febafcSKirill A. Shutemov 	if (!p4d_present(*p4d))
698c2febafcSKirill A. Shutemov 		goto out;
699c2febafcSKirill A. Shutemov 
700c2febafcSKirill A. Shutemov 	pud = pud_offset(p4d, address);
7016219049aSBob Liu 	if (!pud_present(*pud))
7026219049aSBob Liu 		goto out;
7036219049aSBob Liu 
7046219049aSBob Liu 	pmd = pmd_offset(pud, address);
705f72e7dcdSHugh Dickins 	/*
7068809aa2dSAneesh Kumar K.V 	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
707f72e7dcdSHugh Dickins 	 * without holding anon_vma lock for write.  So when looking for a
708f72e7dcdSHugh Dickins 	 * genuine pmde (in which to find pte), test present and !THP together.
709f72e7dcdSHugh Dickins 	 */
710e37c6982SChristian Borntraeger 	pmde = *pmd;
711e37c6982SChristian Borntraeger 	barrier();
712f72e7dcdSHugh Dickins 	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
7136219049aSBob Liu 		pmd = NULL;
7146219049aSBob Liu out:
7156219049aSBob Liu 	return pmd;
7166219049aSBob Liu }
7176219049aSBob Liu 
7189f32624bSJoonsoo Kim struct page_referenced_arg {
7199f32624bSJoonsoo Kim 	int mapcount;
7209f32624bSJoonsoo Kim 	int referenced;
7219f32624bSJoonsoo Kim 	unsigned long vm_flags;
7229f32624bSJoonsoo Kim 	struct mem_cgroup *memcg;
7239f32624bSJoonsoo Kim };
72481b4082dSNikita Danilov /*
7259f32624bSJoonsoo Kim  * arg: a page_referenced_arg struct is passed in by the rmap walk
7261da177e4SLinus Torvalds  */
727ac769501SKirill A. Shutemov static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
7289f32624bSJoonsoo Kim 			unsigned long address, void *arg)
7291da177e4SLinus Torvalds {
7309f32624bSJoonsoo Kim 	struct page_referenced_arg *pra = arg;
7318eaededeSKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
7328eaededeSKirill A. Shutemov 		.page = page,
7338eaededeSKirill A. Shutemov 		.vma = vma,
7348eaededeSKirill A. Shutemov 		.address = address,
7358eaededeSKirill A. Shutemov 	};
7368749cfeaSVladimir Davydov 	int referenced = 0;
7372da28bfdSAndrea Arcangeli 
7388eaededeSKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
7398eaededeSKirill A. Shutemov 		address = pvmw.address;
7402da28bfdSAndrea Arcangeli 
741b20ce5e0SKirill A. Shutemov 		if (vma->vm_flags & VM_LOCKED) {
7428eaededeSKirill A. Shutemov 			page_vma_mapped_walk_done(&pvmw);
7439f32624bSJoonsoo Kim 			pra->vm_flags |= VM_LOCKED;
7449f32624bSJoonsoo Kim 			return SWAP_FAIL; /* To break the loop */
7452da28bfdSAndrea Arcangeli 		}
7462da28bfdSAndrea Arcangeli 
7478eaededeSKirill A. Shutemov 		if (pvmw.pte) {
7488eaededeSKirill A. Shutemov 			if (ptep_clear_flush_young_notify(vma, address,
7498eaededeSKirill A. Shutemov 						pvmw.pte)) {
7504917e5d0SJohannes Weiner 				/*
7518eaededeSKirill A. Shutemov 				 * Don't treat a reference through
7528eaededeSKirill A. Shutemov 				 * a sequentially read mapping as such.
7538eaededeSKirill A. Shutemov 				 * If the page has been used in another mapping,
7548eaededeSKirill A. Shutemov 				 * we will catch it; if this other mapping is
7558eaededeSKirill A. Shutemov 				 * already gone, the unmap path will have set
7568eaededeSKirill A. Shutemov 				 * PG_referenced or activated the page.
7574917e5d0SJohannes Weiner 				 */
75864363aadSJoe Perches 				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
7591da177e4SLinus Torvalds 					referenced++;
7604917e5d0SJohannes Weiner 			}
7618749cfeaSVladimir Davydov 		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
7628eaededeSKirill A. Shutemov 			if (pmdp_clear_flush_young_notify(vma, address,
7638eaededeSKirill A. Shutemov 						pvmw.pmd))
7648749cfeaSVladimir Davydov 				referenced++;
7658749cfeaSVladimir Davydov 		} else {
7668749cfeaSVladimir Davydov 			/* unexpected pmd-mapped page? */
7678749cfeaSVladimir Davydov 			WARN_ON_ONCE(1);
7688749cfeaSVladimir Davydov 		}
7698eaededeSKirill A. Shutemov 
7708eaededeSKirill A. Shutemov 		pra->mapcount--;
7718eaededeSKirill A. Shutemov 	}
77271e3aac0SAndrea Arcangeli 
77333c3fc71SVladimir Davydov 	if (referenced)
77433c3fc71SVladimir Davydov 		clear_page_idle(page);
77533c3fc71SVladimir Davydov 	if (test_and_clear_page_young(page))
77633c3fc71SVladimir Davydov 		referenced++;
77733c3fc71SVladimir Davydov 
7789f32624bSJoonsoo Kim 	if (referenced) {
7799f32624bSJoonsoo Kim 		pra->referenced++;
7809f32624bSJoonsoo Kim 		pra->vm_flags |= vma->vm_flags;
7811da177e4SLinus Torvalds 	}
7821da177e4SLinus Torvalds 
7839f32624bSJoonsoo Kim 	if (!pra->mapcount)
7849f32624bSJoonsoo Kim 		return SWAP_SUCCESS; /* To break the loop */
7859f32624bSJoonsoo Kim 
7869f32624bSJoonsoo Kim 	return SWAP_AGAIN;
7879f32624bSJoonsoo Kim }
7889f32624bSJoonsoo Kim 
7899f32624bSJoonsoo Kim static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
7901da177e4SLinus Torvalds {
7919f32624bSJoonsoo Kim 	struct page_referenced_arg *pra = arg;
7929f32624bSJoonsoo Kim 	struct mem_cgroup *memcg = pra->memcg;
7931da177e4SLinus Torvalds 
7949f32624bSJoonsoo Kim 	if (!mm_match_cgroup(vma->vm_mm, memcg))
7959f32624bSJoonsoo Kim 		return true;
7961da177e4SLinus Torvalds 
7979f32624bSJoonsoo Kim 	return false;
7981da177e4SLinus Torvalds }
7991da177e4SLinus Torvalds 
8001da177e4SLinus Torvalds /**
8011da177e4SLinus Torvalds  * page_referenced - test if the page was referenced
8021da177e4SLinus Torvalds  * @page: the page to test
8031da177e4SLinus Torvalds  * @is_locked: caller holds lock on the page
80472835c86SJohannes Weiner  * @memcg: target memory cgroup
8056fe6b7e3SWu Fengguang  * @vm_flags: collect the vm_flags of the vmas that actually referenced the page
8061da177e4SLinus Torvalds  *
8071da177e4SLinus Torvalds  * Quick test_and_clear_referenced for all mappings to a page,
8081da177e4SLinus Torvalds  * returns the number of ptes which referenced the page.
8091da177e4SLinus Torvalds  */
8106fe6b7e3SWu Fengguang int page_referenced(struct page *page,
8116fe6b7e3SWu Fengguang 		    int is_locked,
81272835c86SJohannes Weiner 		    struct mem_cgroup *memcg,
8136fe6b7e3SWu Fengguang 		    unsigned long *vm_flags)
8141da177e4SLinus Torvalds {
8159f32624bSJoonsoo Kim 	int ret;
8165ad64688SHugh Dickins 	int we_locked = 0;
8179f32624bSJoonsoo Kim 	struct page_referenced_arg pra = {
818b20ce5e0SKirill A. Shutemov 		.mapcount = total_mapcount(page),
8199f32624bSJoonsoo Kim 		.memcg = memcg,
8209f32624bSJoonsoo Kim 	};
8219f32624bSJoonsoo Kim 	struct rmap_walk_control rwc = {
8229f32624bSJoonsoo Kim 		.rmap_one = page_referenced_one,
8239f32624bSJoonsoo Kim 		.arg = (void *)&pra,
8249f32624bSJoonsoo Kim 		.anon_lock = page_lock_anon_vma_read,
8259f32624bSJoonsoo Kim 	};
8261da177e4SLinus Torvalds 
8276fe6b7e3SWu Fengguang 	*vm_flags = 0;
8289f32624bSJoonsoo Kim 	if (!page_mapped(page))
8299f32624bSJoonsoo Kim 		return 0;
8309f32624bSJoonsoo Kim 
8319f32624bSJoonsoo Kim 	if (!page_rmapping(page))
8329f32624bSJoonsoo Kim 		return 0;
8339f32624bSJoonsoo Kim 
8345ad64688SHugh Dickins 	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
8355ad64688SHugh Dickins 		we_locked = trylock_page(page);
8369f32624bSJoonsoo Kim 		if (!we_locked)
8379f32624bSJoonsoo Kim 			return 1;
8385ad64688SHugh Dickins 	}
8399f32624bSJoonsoo Kim 
8409f32624bSJoonsoo Kim 	/*
8419f32624bSJoonsoo Kim 	 * If we are reclaiming on behalf of a cgroup, skip
8429f32624bSJoonsoo Kim 	 * counting references that come from different
8439f32624bSJoonsoo Kim 	 * cgroups.
8449f32624bSJoonsoo Kim 	 */
8459f32624bSJoonsoo Kim 	if (memcg) {
8469f32624bSJoonsoo Kim 		rwc.invalid_vma = invalid_page_referenced_vma;
8475ad64688SHugh Dickins 	}
8489f32624bSJoonsoo Kim 
8499f32624bSJoonsoo Kim 	ret = rmap_walk(page, &rwc);
8509f32624bSJoonsoo Kim 	*vm_flags = pra.vm_flags;
8519f32624bSJoonsoo Kim 
8525ad64688SHugh Dickins 	if (we_locked)
8531da177e4SLinus Torvalds 		unlock_page(page);
8549f32624bSJoonsoo Kim 
8559f32624bSJoonsoo Kim 	return pra.referenced;
8561da177e4SLinus Torvalds }
8571da177e4SLinus Torvalds 
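/*
 * Sketch of the reclaim-side usage (simplified from the logic of
 * page_check_references() in mm/vmscan.c): the return value counts the
 * referencing ptes and *vm_flags accumulates the flags of those vmas:
 *
 *	unsigned long vm_flags;
 *	int referenced = page_referenced(page, 1, memcg, &vm_flags);
 *
 *	if (vm_flags & VM_LOCKED)
 *		...keep the page; it is mlocked somewhere...
 *	else if (referenced)
 *		...recently used; consider keeping or activating it...
 */
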
8581cb1729bSHugh Dickins static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
8599853a407SJoonsoo Kim 			    unsigned long address, void *arg)
860d08b3851SPeter Zijlstra {
861f27176cfSKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
862f27176cfSKirill A. Shutemov 		.page = page,
863f27176cfSKirill A. Shutemov 		.vma = vma,
864f27176cfSKirill A. Shutemov 		.address = address,
865f27176cfSKirill A. Shutemov 		.flags = PVMW_SYNC,
866f27176cfSKirill A. Shutemov 	};
8679853a407SJoonsoo Kim 	int *cleaned = arg;
868d08b3851SPeter Zijlstra 
869f27176cfSKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
870f27176cfSKirill A. Shutemov 		int ret = 0;
871f27176cfSKirill A. Shutemov 		address = pvmw.address;
872f27176cfSKirill A. Shutemov 		if (pvmw.pte) {
873c2fda5feSPeter Zijlstra 			pte_t entry;
874f27176cfSKirill A. Shutemov 			pte_t *pte = pvmw.pte;
875f27176cfSKirill A. Shutemov 
876f27176cfSKirill A. Shutemov 			if (!pte_dirty(*pte) && !pte_write(*pte))
877f27176cfSKirill A. Shutemov 				continue;
878d08b3851SPeter Zijlstra 
879c2fda5feSPeter Zijlstra 			flush_cache_page(vma, address, pte_pfn(*pte));
8802ec74c3eSSagi Grimberg 			entry = ptep_clear_flush(vma, address, pte);
881d08b3851SPeter Zijlstra 			entry = pte_wrprotect(entry);
882c2fda5feSPeter Zijlstra 			entry = pte_mkclean(entry);
883f27176cfSKirill A. Shutemov 			set_pte_at(vma->vm_mm, address, pte, entry);
884d08b3851SPeter Zijlstra 			ret = 1;
885f27176cfSKirill A. Shutemov 		} else {
886f27176cfSKirill A. Shutemov #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
887f27176cfSKirill A. Shutemov 			pmd_t *pmd = pvmw.pmd;
888f27176cfSKirill A. Shutemov 			pmd_t entry;
889d08b3851SPeter Zijlstra 
890f27176cfSKirill A. Shutemov 			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
891f27176cfSKirill A. Shutemov 				continue;
892f27176cfSKirill A. Shutemov 
893f27176cfSKirill A. Shutemov 			flush_cache_page(vma, address, page_to_pfn(page));
894f27176cfSKirill A. Shutemov 			entry = pmdp_huge_clear_flush(vma, address, pmd);
895f27176cfSKirill A. Shutemov 			entry = pmd_wrprotect(entry);
896f27176cfSKirill A. Shutemov 			entry = pmd_mkclean(entry);
897f27176cfSKirill A. Shutemov 			set_pmd_at(vma->vm_mm, address, pmd, entry);
898f27176cfSKirill A. Shutemov 			ret = 1;
899f27176cfSKirill A. Shutemov #else
900f27176cfSKirill A. Shutemov 			/* unexpected pmd-mapped page? */
901f27176cfSKirill A. Shutemov 			WARN_ON_ONCE(1);
902f27176cfSKirill A. Shutemov #endif
903f27176cfSKirill A. Shutemov 		}
9042ec74c3eSSagi Grimberg 
9059853a407SJoonsoo Kim 		if (ret) {
906f27176cfSKirill A. Shutemov 			mmu_notifier_invalidate_page(vma->vm_mm, address);
9079853a407SJoonsoo Kim 			(*cleaned)++;
9089853a407SJoonsoo Kim 		}
909f27176cfSKirill A. Shutemov 	}
910f27176cfSKirill A. Shutemov 
9119853a407SJoonsoo Kim 	return SWAP_AGAIN;
912d08b3851SPeter Zijlstra }
913d08b3851SPeter Zijlstra 
9149853a407SJoonsoo Kim static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
915d08b3851SPeter Zijlstra {
9169853a407SJoonsoo Kim 	if (vma->vm_flags & VM_SHARED)
917871beb8cSFengguang Wu 		return false;
918d08b3851SPeter Zijlstra 
919871beb8cSFengguang Wu 	return true;
920d08b3851SPeter Zijlstra }
921d08b3851SPeter Zijlstra 
922d08b3851SPeter Zijlstra int page_mkclean(struct page *page)
923d08b3851SPeter Zijlstra {
9249853a407SJoonsoo Kim 	int cleaned = 0;
9259853a407SJoonsoo Kim 	struct address_space *mapping;
9269853a407SJoonsoo Kim 	struct rmap_walk_control rwc = {
9279853a407SJoonsoo Kim 		.arg = (void *)&cleaned,
9289853a407SJoonsoo Kim 		.rmap_one = page_mkclean_one,
9299853a407SJoonsoo Kim 		.invalid_vma = invalid_mkclean_vma,
9309853a407SJoonsoo Kim 	};
931d08b3851SPeter Zijlstra 
932d08b3851SPeter Zijlstra 	BUG_ON(!PageLocked(page));
933d08b3851SPeter Zijlstra 
9349853a407SJoonsoo Kim 	if (!page_mapped(page))
9359853a407SJoonsoo Kim 		return 0;
936d08b3851SPeter Zijlstra 
9379853a407SJoonsoo Kim 	mapping = page_mapping(page);
9389853a407SJoonsoo Kim 	if (!mapping)
9399853a407SJoonsoo Kim 		return 0;
9409853a407SJoonsoo Kim 
9419853a407SJoonsoo Kim 	rmap_walk(page, &rwc);
9429853a407SJoonsoo Kim 
9439853a407SJoonsoo Kim 	return cleaned;
944d08b3851SPeter Zijlstra }
94560b59beaSJaya Kumar EXPORT_SYMBOL_GPL(page_mkclean);
946d08b3851SPeter Zijlstra 
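/*
 * A hedged sketch of the writeback-side usage (compare
 * clear_page_dirty_for_io() in mm/page-writeback.c): the ptes are
 * write-protected and cleaned first, so a later write refaults and
 * re-dirties the page instead of racing with the IO:
 *
 *	if (page_mkclean(page))
 *		set_page_dirty(page);	// a pte was dirty: keep the page dirty
 *	if (TestClearPageDirty(page))
 *		...account the page and let writeback proceed...
 */
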
9471da177e4SLinus Torvalds /**
948c44b6743SRik van Riel  * page_move_anon_rmap - move a page to our anon_vma
949c44b6743SRik van Riel  * @page:	the page to move to our anon_vma
950c44b6743SRik van Riel  * @vma:	the vma the page belongs to
951c44b6743SRik van Riel  *
952c44b6743SRik van Riel  * When a page belongs exclusively to one process after a COW event,
953c44b6743SRik van Riel  * that page can be moved into the anon_vma that belongs to just that
954c44b6743SRik van Riel  * process, so the rmap code will not search the parent or sibling
955c44b6743SRik van Riel  * processes.
956c44b6743SRik van Riel  */
9575a49973dSHugh Dickins void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
958c44b6743SRik van Riel {
959c44b6743SRik van Riel 	struct anon_vma *anon_vma = vma->anon_vma;
960c44b6743SRik van Riel 
9615a49973dSHugh Dickins 	page = compound_head(page);
9625a49973dSHugh Dickins 
963309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageLocked(page), page);
96481d1b09cSSasha Levin 	VM_BUG_ON_VMA(!anon_vma, vma);
965c44b6743SRik van Riel 
966c44b6743SRik van Riel 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
967414e2fb8SVladimir Davydov 	/*
968414e2fb8SVladimir Davydov 	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
969414e2fb8SVladimir Davydov 	 * simultaneously, so a concurrent reader (eg page_referenced()'s
970414e2fb8SVladimir Davydov 	 * PageAnon()) will not see one without the other.
971414e2fb8SVladimir Davydov 	 */
972414e2fb8SVladimir Davydov 	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
973c44b6743SRik van Riel }
974c44b6743SRik van Riel 
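/*
 * Sketch of the intended caller, assuming the usual COW-reuse path in
 * do_wp_page() (simplified; the real check uses reuse_swap_page() and the
 * page lock): once the page is proven exclusive, privatise its rmap before
 * making the pte writable again:
 *
 *	if (page_is_exclusively_owned)		// hypothetical condition name
 *		page_move_anon_rmap(page, vma);
 *	wp_page_reuse(...);			// mark pte writable, drop locks
 */
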
975c44b6743SRik van Riel /**
97643d8eac4SRandy Dunlap  * __page_set_anon_rmap - set up new anonymous rmap
9774e1c1975SAndi Kleen  * @page:	Page to add to rmap
9784e1c1975SAndi Kleen  * @vma:	VM area to add page to.
9794e1c1975SAndi Kleen  * @address:	User virtual address of the mapping
980e8a03febSRik van Riel  * @exclusive:	the page is exclusively owned by the current process
9811da177e4SLinus Torvalds  */
9829617d95eSNick Piggin static void __page_set_anon_rmap(struct page *page,
983e8a03febSRik van Riel 	struct vm_area_struct *vma, unsigned long address, int exclusive)
9841da177e4SLinus Torvalds {
985e8a03febSRik van Riel 	struct anon_vma *anon_vma = vma->anon_vma;
9862822c1aaSNick Piggin 
987e8a03febSRik van Riel 	BUG_ON(!anon_vma);
988ea90002bSLinus Torvalds 
9894e1c1975SAndi Kleen 	if (PageAnon(page))
9904e1c1975SAndi Kleen 		return;
9914e1c1975SAndi Kleen 
992ea90002bSLinus Torvalds 	/*
993e8a03febSRik van Riel 	 * If the page isn't exclusively mapped into this vma,
994e8a03febSRik van Riel 	 * we must use the _oldest_ possible anon_vma for the
995e8a03febSRik van Riel 	 * page mapping!
996ea90002bSLinus Torvalds 	 */
9974e1c1975SAndi Kleen 	if (!exclusive)
998288468c3SAndrea Arcangeli 		anon_vma = anon_vma->root;
999ea90002bSLinus Torvalds 
10001da177e4SLinus Torvalds 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
10012822c1aaSNick Piggin 	page->mapping = (struct address_space *) anon_vma;
10024d7670e0SNick Piggin 	page->index = linear_page_index(vma, address);
10031da177e4SLinus Torvalds }
10049617d95eSNick Piggin 
10059617d95eSNick Piggin /**
100643d8eac4SRandy Dunlap  * __page_check_anon_rmap - sanity check anonymous rmap addition
1007c97a9e10SNick Piggin  * @page:	the page to add the mapping to
1008c97a9e10SNick Piggin  * @vma:	the vm area in which the mapping is added
1009c97a9e10SNick Piggin  * @address:	the user virtual address mapped
1010c97a9e10SNick Piggin  */
1011c97a9e10SNick Piggin static void __page_check_anon_rmap(struct page *page,
1012c97a9e10SNick Piggin 	struct vm_area_struct *vma, unsigned long address)
1013c97a9e10SNick Piggin {
1014c97a9e10SNick Piggin #ifdef CONFIG_DEBUG_VM
1015c97a9e10SNick Piggin 	/*
1016c97a9e10SNick Piggin 	 * The page's anon-rmap details (mapping and index) are guaranteed to
1017c97a9e10SNick Piggin 	 * be set up correctly at this point.
1018c97a9e10SNick Piggin 	 *
1019c97a9e10SNick Piggin 	 * We have exclusion against page_add_anon_rmap because the caller
1020c97a9e10SNick Piggin 	 * always holds the page locked, except if called from page_dup_rmap,
1021c97a9e10SNick Piggin 	 * in which case the page is already known to be set up.
1022c97a9e10SNick Piggin 	 *
1023c97a9e10SNick Piggin 	 * We have exclusion against page_add_new_anon_rmap because those pages
1024c97a9e10SNick Piggin 	 * are initially only visible via the pagetables, and the pte is locked
1025c97a9e10SNick Piggin 	 * over the call to page_add_new_anon_rmap.
1026c97a9e10SNick Piggin 	 */
102744ab57a0SAndrea Arcangeli 	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
102853f9263bSKirill A. Shutemov 	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
1029c97a9e10SNick Piggin #endif
1030c97a9e10SNick Piggin }
1031c97a9e10SNick Piggin 
1032c97a9e10SNick Piggin /**
10339617d95eSNick Piggin  * page_add_anon_rmap - add pte mapping to an anonymous page
10349617d95eSNick Piggin  * @page:	the page to add the mapping to
10359617d95eSNick Piggin  * @vma:	the vm area in which the mapping is added
10369617d95eSNick Piggin  * @address:	the user virtual address mapped
1037d281ee61SKirill A. Shutemov  * @compound:	charge the page as compound or small page
10389617d95eSNick Piggin  *
10395ad64688SHugh Dickins  * The caller needs to hold the pte lock, and the page must be locked in
104080e14822SHugh Dickins  * the anon_vma case: to serialize mapping,index checking after setting,
104180e14822SHugh Dickins  * and to ensure that PageAnon is not being upgraded racily to PageKsm
104280e14822SHugh Dickins  * (but PageKsm is never downgraded to PageAnon).
10439617d95eSNick Piggin  */
10449617d95eSNick Piggin void page_add_anon_rmap(struct page *page,
1045d281ee61SKirill A. Shutemov 	struct vm_area_struct *vma, unsigned long address, bool compound)
10469617d95eSNick Piggin {
1047d281ee61SKirill A. Shutemov 	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1048ad8c2ee8SRik van Riel }
1049ad8c2ee8SRik van Riel 
1050ad8c2ee8SRik van Riel /*
1051ad8c2ee8SRik van Riel  * Special version of the above for do_swap_page, which often runs
1052ad8c2ee8SRik van Riel  * into pages that are exclusively owned by the current process.
1053ad8c2ee8SRik van Riel  * Everybody else should continue to use page_add_anon_rmap above.
1054ad8c2ee8SRik van Riel  */
1055ad8c2ee8SRik van Riel void do_page_add_anon_rmap(struct page *page,
1056d281ee61SKirill A. Shutemov 	struct vm_area_struct *vma, unsigned long address, int flags)
1057ad8c2ee8SRik van Riel {
1058d281ee61SKirill A. Shutemov 	bool compound = flags & RMAP_COMPOUND;
105953f9263bSKirill A. Shutemov 	bool first;
106053f9263bSKirill A. Shutemov 
106153f9263bSKirill A. Shutemov 	if (compound) {
106253f9263bSKirill A. Shutemov 		atomic_t *mapcount;
1063e9b61f19SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageLocked(page), page);
106453f9263bSKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
106553f9263bSKirill A. Shutemov 		mapcount = compound_mapcount_ptr(page);
106653f9263bSKirill A. Shutemov 		first = atomic_inc_and_test(mapcount);
106753f9263bSKirill A. Shutemov 	} else {
106853f9263bSKirill A. Shutemov 		first = atomic_inc_and_test(&page->_mapcount);
106953f9263bSKirill A. Shutemov 	}
107053f9263bSKirill A. Shutemov 
107153f9263bSKirill A. Shutemov 	if (first) {
1072d281ee61SKirill A. Shutemov 		int nr = compound ? hpage_nr_pages(page) : 1;
1073bea04b07SJianyu Zhan 		/*
1074bea04b07SJianyu Zhan 		 * We use the irq-unsafe __{inc|mod}_node_page_state because
1075bea04b07SJianyu Zhan 		 * these counters are not modified in interrupt context, and
1076bea04b07SJianyu Zhan 		 * pte lock (a spinlock) is held, which implies preemption
1077bea04b07SJianyu Zhan 		 * disabled.
1078bea04b07SJianyu Zhan 		 */
107965c45377SKirill A. Shutemov 		if (compound)
108011fb9989SMel Gorman 			__inc_node_page_state(page, NR_ANON_THPS);
10814b9d0fabSMel Gorman 		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
108279134171SAndrea Arcangeli 	}
10835ad64688SHugh Dickins 	if (unlikely(PageKsm(page)))
10845ad64688SHugh Dickins 		return;
10855ad64688SHugh Dickins 
1086309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageLocked(page), page);
108753f9263bSKirill A. Shutemov 
10885dbe0af4SHugh Dickins 	/* address might be in next vma when migration races vma_adjust */
10895ad64688SHugh Dickins 	if (first)
1090d281ee61SKirill A. Shutemov 		__page_set_anon_rmap(page, vma, address,
1091d281ee61SKirill A. Shutemov 				flags & RMAP_EXCLUSIVE);
109269029cd5SKAMEZAWA Hiroyuki 	else
1093c97a9e10SNick Piggin 		__page_check_anon_rmap(page, vma, address);
10941da177e4SLinus Torvalds }
10951da177e4SLinus Torvalds 
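/*
 * Minimal usage sketch (illustrative only, not part of the original file):
 * a do_swap_page()-style caller is expected to hold both the pte lock and
 * the page lock, and to pass RMAP_EXCLUSIVE only when the page is known to
 * be exclusively owned by this mm, which lets __page_set_anon_rmap() keep
 * the vma's own anon_vma instead of falling back to the root.
 */
static inline void example_swapin_add_rmap(struct page *page,
		struct vm_area_struct *vma, unsigned long address,
		bool exclusive)
{
	/* pte lock and page lock are assumed to be held by the caller */
	do_page_add_anon_rmap(page, vma, address,
			      exclusive ? RMAP_EXCLUSIVE : 0);
}
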
109643d8eac4SRandy Dunlap /**
10979617d95eSNick Piggin  * page_add_new_anon_rmap - add pte mapping to a new anonymous page
10989617d95eSNick Piggin  * @page:	the page to add the mapping to
10999617d95eSNick Piggin  * @vma:	the vm area in which the mapping is added
11009617d95eSNick Piggin  * @address:	the user virtual address mapped
1101d281ee61SKirill A. Shutemov  * @compound:	charge the page as compound or small page
11029617d95eSNick Piggin  *
11039617d95eSNick Piggin  * Same as page_add_anon_rmap but must only be called on *new* pages.
11049617d95eSNick Piggin  * This means the inc-and-test can be bypassed.
1105c97a9e10SNick Piggin  * Page does not have to be locked.
11069617d95eSNick Piggin  */
11079617d95eSNick Piggin void page_add_new_anon_rmap(struct page *page,
1108d281ee61SKirill A. Shutemov 	struct vm_area_struct *vma, unsigned long address, bool compound)
11099617d95eSNick Piggin {
1110d281ee61SKirill A. Shutemov 	int nr = compound ? hpage_nr_pages(page) : 1;
1111d281ee61SKirill A. Shutemov 
111281d1b09cSSasha Levin 	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1113fa9949daSHugh Dickins 	__SetPageSwapBacked(page);
1114d281ee61SKirill A. Shutemov 	if (compound) {
1115d281ee61SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
111653f9263bSKirill A. Shutemov 		/* increment count (starts at -1) */
111753f9263bSKirill A. Shutemov 		atomic_set(compound_mapcount_ptr(page), 0);
111811fb9989SMel Gorman 		__inc_node_page_state(page, NR_ANON_THPS);
111953f9263bSKirill A. Shutemov 	} else {
112053f9263bSKirill A. Shutemov 		/* Anon THP always mapped first with PMD */
112153f9263bSKirill A. Shutemov 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
112253f9263bSKirill A. Shutemov 		/* increment count (starts at -1) */
112353f9263bSKirill A. Shutemov 		atomic_set(&page->_mapcount, 0);
1124d281ee61SKirill A. Shutemov 	}
11254b9d0fabSMel Gorman 	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
1126e8a03febSRik van Riel 	__page_set_anon_rmap(page, vma, address, 1);
11279617d95eSNick Piggin }
11289617d95eSNick Piggin 
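/*
 * Minimal usage sketch (illustrative only): an anonymous-fault-style caller
 * wires a freshly allocated, not-yet-visible page into the rmap under the
 * pte lock; no page lock is needed because nothing else can find the page
 * until the pte below is installed. The helper name and parameters are
 * hypothetical.
 */
static inline void example_map_new_anon_page(struct page *page,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pte_t entry)
{
	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, vma, address, false);
	lru_cache_add_active_or_unevictable(page, vma);
	set_pte_at(vma->vm_mm, address, pte, entry);
}
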
11291da177e4SLinus Torvalds /**
11301da177e4SLinus Torvalds  * page_add_file_rmap - add pte mapping to a file page
11311da177e4SLinus Torvalds  * @page: the page to add the mapping to
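 * @compound: charge the page as compound or small page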
11321da177e4SLinus Torvalds  *
1133b8072f09SHugh Dickins  * The caller needs to hold the pte lock.
11341da177e4SLinus Torvalds  */
1135dd78feddSKirill A. Shutemov void page_add_file_rmap(struct page *page, bool compound)
11361da177e4SLinus Torvalds {
1137dd78feddSKirill A. Shutemov 	int i, nr = 1;
1138dd78feddSKirill A. Shutemov 
1139dd78feddSKirill A. Shutemov 	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
114062cccb8cSJohannes Weiner 	lock_page_memcg(page);
1141dd78feddSKirill A. Shutemov 	if (compound && PageTransHuge(page)) {
1142dd78feddSKirill A. Shutemov 		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1143dd78feddSKirill A. Shutemov 			if (atomic_inc_and_test(&page[i]._mapcount))
1144dd78feddSKirill A. Shutemov 				nr++;
1145d69b042fSBalbir Singh 		}
1146dd78feddSKirill A. Shutemov 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1147dd78feddSKirill A. Shutemov 			goto out;
114865c45377SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
114911fb9989SMel Gorman 		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1150dd78feddSKirill A. Shutemov 	} else {
1151c8efc390SKirill A. Shutemov 		if (PageTransCompound(page) && page_mapping(page)) {
1152c8efc390SKirill A. Shutemov 			VM_WARN_ON_ONCE(!PageLocked(page));
1153c8efc390SKirill A. Shutemov 
11549a73f61bSKirill A. Shutemov 			SetPageDoubleMap(compound_head(page));
11559a73f61bSKirill A. Shutemov 			if (PageMlocked(page))
11569a73f61bSKirill A. Shutemov 				clear_page_mlock(compound_head(page));
11579a73f61bSKirill A. Shutemov 		}
1158dd78feddSKirill A. Shutemov 		if (!atomic_inc_and_test(&page->_mapcount))
1159dd78feddSKirill A. Shutemov 			goto out;
1160dd78feddSKirill A. Shutemov 	}
116150658e2eSMel Gorman 	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
1162553af430SJohannes Weiner 	mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
1163dd78feddSKirill A. Shutemov out:
116462cccb8cSJohannes Weiner 	unlock_page_memcg(page);
11651da177e4SLinus Torvalds }
11661da177e4SLinus Torvalds 
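/*
 * Minimal usage sketch (illustrative only): a file-fault-style caller bumps
 * the file rmap count for the page-cache page it is about to map, under the
 * pte lock. The helper name and parameters are hypothetical.
 */
static inline void example_map_file_page(struct page *page,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pte_t entry)
{
	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(vma->vm_mm, address, pte, entry);
}
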
1167dd78feddSKirill A. Shutemov static void page_remove_file_rmap(struct page *page, bool compound)
11688186eb6aSJohannes Weiner {
1169dd78feddSKirill A. Shutemov 	int i, nr = 1;
1170dd78feddSKirill A. Shutemov 
117157dea93aSSteve Capper 	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
117262cccb8cSJohannes Weiner 	lock_page_memcg(page);
11738186eb6aSJohannes Weiner 
117453f9263bSKirill A. Shutemov 	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
117553f9263bSKirill A. Shutemov 	if (unlikely(PageHuge(page))) {
117653f9263bSKirill A. Shutemov 		/* hugetlb pages are always mapped with pmds */
117753f9263bSKirill A. Shutemov 		atomic_dec(compound_mapcount_ptr(page));
117853f9263bSKirill A. Shutemov 		goto out;
117953f9263bSKirill A. Shutemov 	}
118053f9263bSKirill A. Shutemov 
11818186eb6aSJohannes Weiner 	/* page still mapped by someone else? */
1182dd78feddSKirill A. Shutemov 	if (compound && PageTransHuge(page)) {
1183dd78feddSKirill A. Shutemov 		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1184dd78feddSKirill A. Shutemov 			if (atomic_add_negative(-1, &page[i]._mapcount))
1185dd78feddSKirill A. Shutemov 				nr++;
1186dd78feddSKirill A. Shutemov 		}
1187dd78feddSKirill A. Shutemov 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1188dd78feddSKirill A. Shutemov 			goto out;
118965c45377SKirill A. Shutemov 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
119011fb9989SMel Gorman 		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1191dd78feddSKirill A. Shutemov 	} else {
11928186eb6aSJohannes Weiner 		if (!atomic_add_negative(-1, &page->_mapcount))
11938186eb6aSJohannes Weiner 			goto out;
1194dd78feddSKirill A. Shutemov 	}
11958186eb6aSJohannes Weiner 
11968186eb6aSJohannes Weiner 	/*
119750658e2eSMel Gorman 	 * We use the irq-unsafe __{inc|mod}_node_page_state because
11988186eb6aSJohannes Weiner 	 * these counters are not modified in interrupt context, and
11998186eb6aSJohannes Weiner 	 * pte lock (a spinlock) is held, which implies preemption disabled.
12008186eb6aSJohannes Weiner 	 */
120150658e2eSMel Gorman 	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
1202553af430SJohannes Weiner 	mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
12038186eb6aSJohannes Weiner 
12048186eb6aSJohannes Weiner 	if (unlikely(PageMlocked(page)))
12058186eb6aSJohannes Weiner 		clear_page_mlock(page);
12068186eb6aSJohannes Weiner out:
120762cccb8cSJohannes Weiner 	unlock_page_memcg(page);
12088186eb6aSJohannes Weiner }
12098186eb6aSJohannes Weiner 
121053f9263bSKirill A. Shutemov static void page_remove_anon_compound_rmap(struct page *page)
121153f9263bSKirill A. Shutemov {
121253f9263bSKirill A. Shutemov 	int i, nr;
121353f9263bSKirill A. Shutemov 
121453f9263bSKirill A. Shutemov 	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
121553f9263bSKirill A. Shutemov 		return;
121653f9263bSKirill A. Shutemov 
121753f9263bSKirill A. Shutemov 	/* Hugepages are not counted in NR_ANON_MAPPED for now. */
121853f9263bSKirill A. Shutemov 	if (unlikely(PageHuge(page)))
121953f9263bSKirill A. Shutemov 		return;
122053f9263bSKirill A. Shutemov 
122153f9263bSKirill A. Shutemov 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
122253f9263bSKirill A. Shutemov 		return;
122353f9263bSKirill A. Shutemov 
122411fb9989SMel Gorman 	__dec_node_page_state(page, NR_ANON_THPS);
122553f9263bSKirill A. Shutemov 
122653f9263bSKirill A. Shutemov 	if (TestClearPageDoubleMap(page)) {
122753f9263bSKirill A. Shutemov 		/*
122853f9263bSKirill A. Shutemov 		 * Subpages can be mapped with PTEs too. Check how many of
122953f9263bSKirill A. Shutemov 	 * them are still mapped.
123053f9263bSKirill A. Shutemov 		 */
123153f9263bSKirill A. Shutemov 		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
123253f9263bSKirill A. Shutemov 			if (atomic_add_negative(-1, &page[i]._mapcount))
123353f9263bSKirill A. Shutemov 				nr++;
123453f9263bSKirill A. Shutemov 		}
123553f9263bSKirill A. Shutemov 	} else {
123653f9263bSKirill A. Shutemov 		nr = HPAGE_PMD_NR;
123753f9263bSKirill A. Shutemov 	}
123853f9263bSKirill A. Shutemov 
1239e90309c9SKirill A. Shutemov 	if (unlikely(PageMlocked(page)))
1240e90309c9SKirill A. Shutemov 		clear_page_mlock(page);
1241e90309c9SKirill A. Shutemov 
12429a982250SKirill A. Shutemov 	if (nr) {
12434b9d0fabSMel Gorman 		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
12449a982250SKirill A. Shutemov 		deferred_split_huge_page(page);
12459a982250SKirill A. Shutemov 	}
124653f9263bSKirill A. Shutemov }
124753f9263bSKirill A. Shutemov 
12481da177e4SLinus Torvalds /**
12491da177e4SLinus Torvalds  * page_remove_rmap - take down pte mapping from a page
12501da177e4SLinus Torvalds  * @page:	page to remove mapping from
1251d281ee61SKirill A. Shutemov  * @compound:	uncharge the page as compound or small page
12521da177e4SLinus Torvalds  *
1253b8072f09SHugh Dickins  * The caller needs to hold the pte lock.
12541da177e4SLinus Torvalds  */
1255d281ee61SKirill A. Shutemov void page_remove_rmap(struct page *page, bool compound)
12561da177e4SLinus Torvalds {
1257dd78feddSKirill A. Shutemov 	if (!PageAnon(page))
1258dd78feddSKirill A. Shutemov 		return page_remove_file_rmap(page, compound);
125989c06bd5SKAMEZAWA Hiroyuki 
126053f9263bSKirill A. Shutemov 	if (compound)
126153f9263bSKirill A. Shutemov 		return page_remove_anon_compound_rmap(page);
126253f9263bSKirill A. Shutemov 
1263b904dcfeSKOSAKI Motohiro 	/* page still mapped by someone else? */
1264b904dcfeSKOSAKI Motohiro 	if (!atomic_add_negative(-1, &page->_mapcount))
12658186eb6aSJohannes Weiner 		return;
12668186eb6aSJohannes Weiner 
12671da177e4SLinus Torvalds 	/*
1268bea04b07SJianyu Zhan 	 * We use the irq-unsafe __{inc|mod}_node_page_state because
1269bea04b07SJianyu Zhan 	 * these counters are not modified in interrupt context, and
1270bea04b07SJianyu Zhan 	 * pte lock (a spinlock) is held, which implies preemption disabled.
12710fe6e20bSNaoya Horiguchi 	 */
12724b9d0fabSMel Gorman 	__dec_node_page_state(page, NR_ANON_MAPPED);
12738186eb6aSJohannes Weiner 
1274e6c509f8SHugh Dickins 	if (unlikely(PageMlocked(page)))
1275e6c509f8SHugh Dickins 		clear_page_mlock(page);
12768186eb6aSJohannes Weiner 
12779a982250SKirill A. Shutemov 	if (PageTransCompound(page))
12789a982250SKirill A. Shutemov 		deferred_split_huge_page(compound_head(page));
12799a982250SKirill A. Shutemov 
128016f8c5b2SHugh Dickins 	/*
12811da177e4SLinus Torvalds 	 * It would be tidy to reset the PageAnon mapping here,
12821da177e4SLinus Torvalds 	 * but that might overwrite a racing page_add_anon_rmap
12831da177e4SLinus Torvalds 	 * which increments mapcount after us but sets mapping
12841da177e4SLinus Torvalds 	 * before us: so leave the reset to free_hot_cold_page,
12851da177e4SLinus Torvalds 	 * and remember that it's only reliable while mapped.
12861da177e4SLinus Torvalds 	 * Leaving it set also helps swapoff to reinstate ptes
12871da177e4SLinus Torvalds 	 * faster for those pages still in swapcache.
12881da177e4SLinus Torvalds 	 */
12891da177e4SLinus Torvalds }
12901da177e4SLinus Torvalds 
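/*
 * Minimal usage sketch (illustrative only): a pte-teardown path clears the
 * pte first and only then drops the reverse mapping, all under the pte
 * lock. TLB flushing and deferred page freeing (as done by the real zap
 * code via the mmu_gather batch) are omitted; the helper is hypothetical.
 */
static inline void example_zap_one_pte(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address, pte_t *pte)
{
	pte_t ptent = ptep_get_and_clear(mm, address, pte);
	struct page *page = vm_normal_page(vma, address, ptent);

	if (!page)
		return;
	if (pte_dirty(ptent))
		set_page_dirty(page);
	dec_mm_counter(mm, mm_counter(page));
	page_remove_rmap(page, false);
	put_page(page);		/* release the reference the pte held */
}
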
12911da177e4SLinus Torvalds /*
129252629506SJoonsoo Kim  * @arg: enum ttu_flags will be passed to this argument
12931da177e4SLinus Torvalds  */
1294ac769501SKirill A. Shutemov static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
129552629506SJoonsoo Kim 		     unsigned long address, void *arg)
12961da177e4SLinus Torvalds {
12971da177e4SLinus Torvalds 	struct mm_struct *mm = vma->vm_mm;
1298c7ab0d2fSKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
1299c7ab0d2fSKirill A. Shutemov 		.page = page,
1300c7ab0d2fSKirill A. Shutemov 		.vma = vma,
1301c7ab0d2fSKirill A. Shutemov 		.address = address,
1302c7ab0d2fSKirill A. Shutemov 	};
13031da177e4SLinus Torvalds 	pte_t pteval;
1304c7ab0d2fSKirill A. Shutemov 	struct page *subpage;
13051da177e4SLinus Torvalds 	int ret = SWAP_AGAIN;
1306*802a3a92SShaohua Li 	enum ttu_flags flags = (enum ttu_flags)arg;
13071da177e4SLinus Torvalds 
1308b87537d9SHugh Dickins 	/* munlock has nothing to gain from examining un-locked vmas */
1309b87537d9SHugh Dickins 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
1310c7ab0d2fSKirill A. Shutemov 		return SWAP_AGAIN;
1311b87537d9SHugh Dickins 
1312fec89c10SKirill A. Shutemov 	if (flags & TTU_SPLIT_HUGE_PMD) {
1313fec89c10SKirill A. Shutemov 		split_huge_pmd_address(vma, address,
1314fec89c10SKirill A. Shutemov 				flags & TTU_MIGRATION, page);
1315fec89c10SKirill A. Shutemov 	}
1316fec89c10SKirill A. Shutemov 
1317c7ab0d2fSKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
13181da177e4SLinus Torvalds 		/*
13191da177e4SLinus Torvalds 		 * If the page is mlock()d, we cannot swap it out.
13201da177e4SLinus Torvalds 		 * If it's recently referenced (perhaps page_referenced
13211da177e4SLinus Torvalds 		 * skipped over this mm) then we should reactivate it.
13221da177e4SLinus Torvalds 		 */
132314fa31b8SAndi Kleen 		if (!(flags & TTU_IGNORE_MLOCK)) {
1324b87537d9SHugh Dickins 			if (vma->vm_flags & VM_LOCKED) {
13259a73f61bSKirill A. Shutemov 				/* PTE-mapped THP are never mlocked */
13269a73f61bSKirill A. Shutemov 				if (!PageTransCompound(page)) {
13279a73f61bSKirill A. Shutemov 					/*
13289a73f61bSKirill A. Shutemov 					 * Holding pte lock, we do *not* need
13299a73f61bSKirill A. Shutemov 					 * mmap_sem here
13309a73f61bSKirill A. Shutemov 					 */
1331b87537d9SHugh Dickins 					mlock_vma_page(page);
13329a73f61bSKirill A. Shutemov 				}
1333b87537d9SHugh Dickins 				ret = SWAP_MLOCK;
1334c7ab0d2fSKirill A. Shutemov 				page_vma_mapped_walk_done(&pvmw);
1335c7ab0d2fSKirill A. Shutemov 				break;
1336b87537d9SHugh Dickins 			}
1337daa5ba76SKonstantin Khlebnikov 			if (flags & TTU_MUNLOCK)
1338c7ab0d2fSKirill A. Shutemov 				continue;
133914fa31b8SAndi Kleen 		}
1340c7ab0d2fSKirill A. Shutemov 
13418346242aSKirill A. Shutemov 		/* Unexpected PMD-mapped THP? */
13428346242aSKirill A. Shutemov 		VM_BUG_ON_PAGE(!pvmw.pte, page);
13438346242aSKirill A. Shutemov 
13448346242aSKirill A. Shutemov 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
13458346242aSKirill A. Shutemov 		address = pvmw.address;
13468346242aSKirill A. Shutemov 
134814fa31b8SAndi Kleen 		if (!(flags & TTU_IGNORE_ACCESS)) {
1349c7ab0d2fSKirill A. Shutemov 			if (ptep_clear_flush_young_notify(vma, address,
1350c7ab0d2fSKirill A. Shutemov 						pvmw.pte)) {
13511da177e4SLinus Torvalds 				ret = SWAP_FAIL;
1352c7ab0d2fSKirill A. Shutemov 				page_vma_mapped_walk_done(&pvmw);
1353c7ab0d2fSKirill A. Shutemov 				break;
13541da177e4SLinus Torvalds 			}
1355b291f000SNick Piggin 		}
13561da177e4SLinus Torvalds 
13571da177e4SLinus Torvalds 		/* Nuke the page table entry. */
1358c7ab0d2fSKirill A. Shutemov 		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
135972b252aeSMel Gorman 		if (should_defer_flush(mm, flags)) {
136072b252aeSMel Gorman 			/*
1361c7ab0d2fSKirill A. Shutemov 			 * We clear the PTE but do not flush so potentially
1362c7ab0d2fSKirill A. Shutemov 			 * a remote CPU could still be writing to the page.
1363c7ab0d2fSKirill A. Shutemov 			 * If the entry was previously clean then the
1364c7ab0d2fSKirill A. Shutemov 			 * architecture must guarantee that a clear->dirty
1365c7ab0d2fSKirill A. Shutemov 			 * transition on a cached TLB entry is written through
1366c7ab0d2fSKirill A. Shutemov 			 * and traps if the PTE is unmapped.
136772b252aeSMel Gorman 			 */
1368c7ab0d2fSKirill A. Shutemov 			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
136972b252aeSMel Gorman 
1370c7ab0d2fSKirill A. Shutemov 			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
137172b252aeSMel Gorman 		} else {
1372c7ab0d2fSKirill A. Shutemov 			pteval = ptep_clear_flush(vma, address, pvmw.pte);
137372b252aeSMel Gorman 		}
13741da177e4SLinus Torvalds 
1375c7ab0d2fSKirill A. Shutemov 		/* Move the dirty bit to the page. Now the pte is gone. */
13761da177e4SLinus Torvalds 		if (pte_dirty(pteval))
13771da177e4SLinus Torvalds 			set_page_dirty(page);
13781da177e4SLinus Torvalds 
1379365e9c87SHugh Dickins 		/* Update high watermark before we lower rss */
1380365e9c87SHugh Dickins 		update_hiwater_rss(mm);
1381365e9c87SHugh Dickins 
1382888b9f7cSAndi Kleen 		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
13835d317b2bSNaoya Horiguchi 			if (PageHuge(page)) {
1384c7ab0d2fSKirill A. Shutemov 				int nr = 1 << compound_order(page);
1385c7ab0d2fSKirill A. Shutemov 				hugetlb_count_sub(nr, mm);
13865d317b2bSNaoya Horiguchi 			} else {
1387eca56ff9SJerome Marchand 				dec_mm_counter(mm, mm_counter(page));
13885f24ae58SNaoya Horiguchi 			}
1389c7ab0d2fSKirill A. Shutemov 
1390c7ab0d2fSKirill A. Shutemov 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1391c7ab0d2fSKirill A. Shutemov 			set_pte_at(mm, address, pvmw.pte, pteval);
139245961722SKonstantin Weitz 		} else if (pte_unused(pteval)) {
139345961722SKonstantin Weitz 			/*
139445961722SKonstantin Weitz 			 * The guest indicated that the page content is of no
139545961722SKonstantin Weitz 			 * interest anymore. Simply discard the pte, vmscan
139645961722SKonstantin Weitz 			 * will take care of the rest.
139745961722SKonstantin Weitz 			 */
1398eca56ff9SJerome Marchand 			dec_mm_counter(mm, mm_counter(page));
1399c7ab0d2fSKirill A. Shutemov 		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
1400c7ab0d2fSKirill A. Shutemov 				(flags & TTU_MIGRATION)) {
1401470f119fSHugh Dickins 			swp_entry_t entry;
1402470f119fSHugh Dickins 			pte_t swp_pte;
1403470f119fSHugh Dickins 			/*
1404470f119fSHugh Dickins 			 * Store the pfn of the page in a special migration
1405470f119fSHugh Dickins 			 * pte. do_swap_page() will wait until the migration
1406470f119fSHugh Dickins 			 * pte is removed and then restart fault handling.
1407470f119fSHugh Dickins 			 */
1408c7ab0d2fSKirill A. Shutemov 			entry = make_migration_entry(subpage,
1409c7ab0d2fSKirill A. Shutemov 					pte_write(pteval));
1410470f119fSHugh Dickins 			swp_pte = swp_entry_to_pte(entry);
1411470f119fSHugh Dickins 			if (pte_soft_dirty(pteval))
1412470f119fSHugh Dickins 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1413c7ab0d2fSKirill A. Shutemov 			set_pte_at(mm, address, pvmw.pte, swp_pte);
1414888b9f7cSAndi Kleen 		} else if (PageAnon(page)) {
1415c7ab0d2fSKirill A. Shutemov 			swp_entry_t entry = { .val = page_private(subpage) };
1416179ef71cSCyrill Gorcunov 			pte_t swp_pte;
14171da177e4SLinus Torvalds 			/*
14181da177e4SLinus Torvalds 			 * Store the swap location in the pte.
14191da177e4SLinus Torvalds 			 * See handle_pte_fault() ...
14201da177e4SLinus Torvalds 			 */
1421d44d363fSShaohua Li 			VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page),
1422d44d363fSShaohua Li 				page);
1423854e9ed0SMinchan Kim 
1424*802a3a92SShaohua Li 			/* MADV_FREE page check */
1425*802a3a92SShaohua Li 			if (!PageSwapBacked(page)) {
1426a128ca71SShaohua Li 				if (!PageDirty(page)) {
1427854e9ed0SMinchan Kim 					dec_mm_counter(mm, MM_ANONPAGES);
1428854e9ed0SMinchan Kim 					goto discard;
1429854e9ed0SMinchan Kim 				}
1430854e9ed0SMinchan Kim 
1431*802a3a92SShaohua Li 				/*
1432*802a3a92SShaohua Li 				 * If the page was redirtied, it cannot be
1433*802a3a92SShaohua Li 				 * discarded. Remap the page to the page table.
1434*802a3a92SShaohua Li 				 */
1435*802a3a92SShaohua Li 				set_pte_at(mm, address, pvmw.pte, pteval);
1436*802a3a92SShaohua Li 				ret = SWAP_DIRTY;
1437*802a3a92SShaohua Li 				page_vma_mapped_walk_done(&pvmw);
1438*802a3a92SShaohua Li 				break;
1439*802a3a92SShaohua Li 			}
1440*802a3a92SShaohua Li 
1441570a335bSHugh Dickins 			if (swap_duplicate(entry) < 0) {
1442c7ab0d2fSKirill A. Shutemov 				set_pte_at(mm, address, pvmw.pte, pteval);
1443570a335bSHugh Dickins 				ret = SWAP_FAIL;
1444c7ab0d2fSKirill A. Shutemov 				page_vma_mapped_walk_done(&pvmw);
1445c7ab0d2fSKirill A. Shutemov 				break;
1446570a335bSHugh Dickins 			}
14471da177e4SLinus Torvalds 			if (list_empty(&mm->mmlist)) {
14481da177e4SLinus Torvalds 				spin_lock(&mmlist_lock);
1449f412ac08SHugh Dickins 				if (list_empty(&mm->mmlist))
14501da177e4SLinus Torvalds 					list_add(&mm->mmlist, &init_mm.mmlist);
14511da177e4SLinus Torvalds 				spin_unlock(&mmlist_lock);
14521da177e4SLinus Torvalds 			}
1453d559db08SKAMEZAWA Hiroyuki 			dec_mm_counter(mm, MM_ANONPAGES);
1454b084d435SKAMEZAWA Hiroyuki 			inc_mm_counter(mm, MM_SWAPENTS);
1455179ef71cSCyrill Gorcunov 			swp_pte = swp_entry_to_pte(entry);
1456179ef71cSCyrill Gorcunov 			if (pte_soft_dirty(pteval))
1457179ef71cSCyrill Gorcunov 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1458c7ab0d2fSKirill A. Shutemov 			set_pte_at(mm, address, pvmw.pte, swp_pte);
145904e62a29SChristoph Lameter 		} else
1460eca56ff9SJerome Marchand 			dec_mm_counter(mm, mm_counter_file(page));
1461854e9ed0SMinchan Kim discard:
1462c7ab0d2fSKirill A. Shutemov 		page_remove_rmap(subpage, PageHuge(page));
146309cbfeafSKirill A. Shutemov 		put_page(page);
14642ec74c3eSSagi Grimberg 		mmu_notifier_invalidate_page(mm, address);
1465c7ab0d2fSKirill A. Shutemov 	}
1466caed0f48SKOSAKI Motohiro 	return ret;
14671da177e4SLinus Torvalds }
14681da177e4SLinus Torvalds 
146971e3aac0SAndrea Arcangeli bool is_vma_temporary_stack(struct vm_area_struct *vma)
1470a8bef8ffSMel Gorman {
1471a8bef8ffSMel Gorman 	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1472a8bef8ffSMel Gorman 
1473a8bef8ffSMel Gorman 	if (!maybe_stack)
1474a8bef8ffSMel Gorman 		return false;
1475a8bef8ffSMel Gorman 
1476a8bef8ffSMel Gorman 	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1477a8bef8ffSMel Gorman 						VM_STACK_INCOMPLETE_SETUP)
1478a8bef8ffSMel Gorman 		return true;
1479a8bef8ffSMel Gorman 
1480a8bef8ffSMel Gorman 	return false;
1481a8bef8ffSMel Gorman }
1482a8bef8ffSMel Gorman 
148352629506SJoonsoo Kim static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
148452629506SJoonsoo Kim {
148552629506SJoonsoo Kim 	return is_vma_temporary_stack(vma);
148652629506SJoonsoo Kim }
148752629506SJoonsoo Kim 
14882a52bcbcSKirill A. Shutemov static int page_mapcount_is_zero(struct page *page)
148952629506SJoonsoo Kim {
1490c7ab0d2fSKirill A. Shutemov 	return !total_mapcount(page);
14912a52bcbcSKirill A. Shutemov }
149252629506SJoonsoo Kim 
14931da177e4SLinus Torvalds /**
14941da177e4SLinus Torvalds  * try_to_unmap - try to remove all page table mappings to a page
14951da177e4SLinus Torvalds  * @page: the page to get unmapped
149614fa31b8SAndi Kleen  * @flags: action and flags
14971da177e4SLinus Torvalds  *
14981da177e4SLinus Torvalds  * Tries to remove all the page table entries which are mapping this
14991da177e4SLinus Torvalds  * page, used in the pageout path.  Caller must hold the page lock.
15001da177e4SLinus Torvalds  * Return values are:
15011da177e4SLinus Torvalds  *
15021da177e4SLinus Torvalds  * SWAP_SUCCESS	- we succeeded in removing all mappings
15031da177e4SLinus Torvalds  * SWAP_AGAIN	- we missed a mapping, try again later
15041da177e4SLinus Torvalds  * SWAP_FAIL	- the page is unswappable
1505b291f000SNick Piggin  * SWAP_MLOCK	- page is mlocked.
1506*802a3a92SShaohua Li  * SWAP_DIRTY	- page is dirty MADV_FREE page
15071da177e4SLinus Torvalds  */
150814fa31b8SAndi Kleen int try_to_unmap(struct page *page, enum ttu_flags flags)
15091da177e4SLinus Torvalds {
15101da177e4SLinus Torvalds 	int ret;
1511854e9ed0SMinchan Kim 
151252629506SJoonsoo Kim 	struct rmap_walk_control rwc = {
151352629506SJoonsoo Kim 		.rmap_one = try_to_unmap_one,
1514*802a3a92SShaohua Li 		.arg = (void *)flags,
15152a52bcbcSKirill A. Shutemov 		.done = page_mapcount_is_zero,
151652629506SJoonsoo Kim 		.anon_lock = page_lock_anon_vma_read,
151752629506SJoonsoo Kim 	};
15181da177e4SLinus Torvalds 
151952629506SJoonsoo Kim 	/*
152052629506SJoonsoo Kim 	 * During exec, a temporary VMA is setup and later moved.
152152629506SJoonsoo Kim 	 * The VMA is moved under the anon_vma lock but not the
152252629506SJoonsoo Kim 	 * page tables leading to a race where migration cannot
152352629506SJoonsoo Kim 	 * find the migration ptes. Rather than increasing the
152452629506SJoonsoo Kim 	 * locking requirements of exec(), migration skips
152552629506SJoonsoo Kim 	 * temporary VMAs until after exec() completes.
152652629506SJoonsoo Kim 	 */
1527daa5ba76SKonstantin Khlebnikov 	if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
152852629506SJoonsoo Kim 		rwc.invalid_vma = invalid_migration_vma;
152952629506SJoonsoo Kim 
15302a52bcbcSKirill A. Shutemov 	if (flags & TTU_RMAP_LOCKED)
15312a52bcbcSKirill A. Shutemov 		ret = rmap_walk_locked(page, &rwc);
15322a52bcbcSKirill A. Shutemov 	else
153352629506SJoonsoo Kim 		ret = rmap_walk(page, &rwc);
153452629506SJoonsoo Kim 
1535*802a3a92SShaohua Li 	if (ret != SWAP_MLOCK && !page_mapcount(page))
15361da177e4SLinus Torvalds 		ret = SWAP_SUCCESS;
15371da177e4SLinus Torvalds 	return ret;
15381da177e4SLinus Torvalds }
153981b4082dSNikita Danilov 
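/*
 * Minimal usage sketch (illustrative only): a reclaim-style caller holds the
 * page lock and dispatches on the SWAP_* result, roughly in the way
 * shrink_page_list() does; a SWAP_DIRTY result means a clean MADV_FREE page
 * was redirtied and must be treated as swap-backed again.
 */
static inline bool example_unmap_for_reclaim(struct page *page,
		enum ttu_flags ttu_flags)
{
	switch (try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
	case SWAP_DIRTY:
		SetPageSwapBacked(page);
		/* fall through */
	case SWAP_FAIL:
	case SWAP_AGAIN:
	case SWAP_MLOCK:
		return false;	/* keep the page for now */
	case SWAP_SUCCESS:
	default:
		return true;	/* all ptes gone, safe to reclaim */
	}
}
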
15402a52bcbcSKirill A. Shutemov static int page_not_mapped(struct page *page)
15412a52bcbcSKirill A. Shutemov {
15422a52bcbcSKirill A. Shutemov 	return !page_mapped(page);
15432a52bcbcSKirill A. Shutemov };
15442a52bcbcSKirill A. Shutemov 
1545b291f000SNick Piggin /**
1546b291f000SNick Piggin  * try_to_munlock - try to munlock a page
1547b291f000SNick Piggin  * @page: the page to be munlocked
1548b291f000SNick Piggin  *
1549b291f000SNick Piggin  * Called from munlock code.  Checks all of the VMAs mapping the page
1550b291f000SNick Piggin  * to make sure nobody else has this page mlocked. The page will be
1551b291f000SNick Piggin  * returned with PG_mlocked cleared if no other vmas have it mlocked.
1552b291f000SNick Piggin  *
1553b291f000SNick Piggin  * Return values are:
1554b291f000SNick Piggin  *
155553f79acbSHugh Dickins  * SWAP_AGAIN	- no vma is holding page mlocked, or,
1556b291f000SNick Piggin  * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
15575ad64688SHugh Dickins  * SWAP_FAIL	- page cannot be located at present
1558b291f000SNick Piggin  * SWAP_MLOCK	- page is now mlocked.
1559b291f000SNick Piggin  */
1560b291f000SNick Piggin int try_to_munlock(struct page *page)
1561b291f000SNick Piggin {
1562e8351ac9SJoonsoo Kim 	int ret;
1563854e9ed0SMinchan Kim 
1564e8351ac9SJoonsoo Kim 	struct rmap_walk_control rwc = {
1565e8351ac9SJoonsoo Kim 		.rmap_one = try_to_unmap_one,
1566*802a3a92SShaohua Li 		.arg = (void *)TTU_MUNLOCK,
1567e8351ac9SJoonsoo Kim 		.done = page_not_mapped,
1568e8351ac9SJoonsoo Kim 		.anon_lock = page_lock_anon_vma_read,
1569e8351ac9SJoonsoo Kim 
1570e8351ac9SJoonsoo Kim 	};
1571e8351ac9SJoonsoo Kim 
1572309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1573b291f000SNick Piggin 
1574e8351ac9SJoonsoo Kim 	ret = rmap_walk(page, &rwc);
1575e8351ac9SJoonsoo Kim 	return ret;
1576b291f000SNick Piggin }
1577e9995ef9SHugh Dickins 
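/*
 * Minimal usage sketch (illustrative only): the munlock path calls
 * try_to_munlock() on a locked page that has been isolated from the LRU,
 * roughly as __munlock_isolated_page() in mm/mlock.c does, and only when
 * some other VMA might still hold the page mlocked.
 */
static inline void example_munlock_isolated(struct page *page)
{
	int ret = SWAP_AGAIN;

	/* If ours is the only mapping, no other VMA can have it mlocked. */
	if (page_mapcount(page) > 1)
		ret = try_to_munlock(page);

	if (ret != SWAP_MLOCK)
		count_vm_event(UNEVICTABLE_PGMUNLOCKED);

	putback_lru_page(page);
}
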
157801d8b20dSPeter Zijlstra void __put_anon_vma(struct anon_vma *anon_vma)
157976545066SRik van Riel {
158076545066SRik van Riel 	struct anon_vma *root = anon_vma->root;
158176545066SRik van Riel 
1582624483f3SAndrey Ryabinin 	anon_vma_free(anon_vma);
158301d8b20dSPeter Zijlstra 	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
158476545066SRik van Riel 		anon_vma_free(root);
158576545066SRik van Riel }
158676545066SRik van Riel 
15870dd1c7bbSJoonsoo Kim static struct anon_vma *rmap_walk_anon_lock(struct page *page,
15880dd1c7bbSJoonsoo Kim 					struct rmap_walk_control *rwc)
1589faecd8ddSJoonsoo Kim {
1590faecd8ddSJoonsoo Kim 	struct anon_vma *anon_vma;
1591faecd8ddSJoonsoo Kim 
15920dd1c7bbSJoonsoo Kim 	if (rwc->anon_lock)
15930dd1c7bbSJoonsoo Kim 		return rwc->anon_lock(page);
15940dd1c7bbSJoonsoo Kim 
1595faecd8ddSJoonsoo Kim 	/*
1596faecd8ddSJoonsoo Kim 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1597faecd8ddSJoonsoo Kim 	 * because that depends on page_mapped(); but not all its usages
1598faecd8ddSJoonsoo Kim 	 * are holding mmap_sem. Users without mmap_sem are required to
1599faecd8ddSJoonsoo Kim 	 * take a reference count to prevent the anon_vma disappearing
1600faecd8ddSJoonsoo Kim 	 */
1601faecd8ddSJoonsoo Kim 	anon_vma = page_anon_vma(page);
1602faecd8ddSJoonsoo Kim 	if (!anon_vma)
1603faecd8ddSJoonsoo Kim 		return NULL;
1604faecd8ddSJoonsoo Kim 
1605faecd8ddSJoonsoo Kim 	anon_vma_lock_read(anon_vma);
1606faecd8ddSJoonsoo Kim 	return anon_vma;
1607faecd8ddSJoonsoo Kim }
1608faecd8ddSJoonsoo Kim 
1609e9995ef9SHugh Dickins /*
1610e8351ac9SJoonsoo Kim  * rmap_walk_anon - do something to anonymous page using the object-based
1611e8351ac9SJoonsoo Kim  * rmap method
1612e8351ac9SJoonsoo Kim  * @page: the page to be handled
1613e8351ac9SJoonsoo Kim  * @rwc: control variable according to each walk type
1614e8351ac9SJoonsoo Kim  *
1615e8351ac9SJoonsoo Kim  * Find all the mappings of a page using the mapping pointer and the vma chains
1616e8351ac9SJoonsoo Kim  * contained in the anon_vma struct it points to.
1617e8351ac9SJoonsoo Kim  *
1618e8351ac9SJoonsoo Kim  * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1619e8351ac9SJoonsoo Kim  * where the page was found will be held for write.  So, we won't recheck
1620e8351ac9SJoonsoo Kim  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
1621e8351ac9SJoonsoo Kim  * LOCKED.
1622e9995ef9SHugh Dickins  */
1623b9773199SKirill A. Shutemov static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
1624b9773199SKirill A. Shutemov 		bool locked)
1625e9995ef9SHugh Dickins {
1626e9995ef9SHugh Dickins 	struct anon_vma *anon_vma;
1627a8fa41adSKirill A. Shutemov 	pgoff_t pgoff_start, pgoff_end;
16285beb4930SRik van Riel 	struct anon_vma_chain *avc;
1629e9995ef9SHugh Dickins 	int ret = SWAP_AGAIN;
1630e9995ef9SHugh Dickins 
1631b9773199SKirill A. Shutemov 	if (locked) {
1632b9773199SKirill A. Shutemov 		anon_vma = page_anon_vma(page);
1633b9773199SKirill A. Shutemov 		/* anon_vma disappear under us? */
1634b9773199SKirill A. Shutemov 		VM_BUG_ON_PAGE(!anon_vma, page);
1635b9773199SKirill A. Shutemov 	} else {
16360dd1c7bbSJoonsoo Kim 		anon_vma = rmap_walk_anon_lock(page, rwc);
1637b9773199SKirill A. Shutemov 	}
1638e9995ef9SHugh Dickins 	if (!anon_vma)
1639e9995ef9SHugh Dickins 		return ret;
1640faecd8ddSJoonsoo Kim 
1641a8fa41adSKirill A. Shutemov 	pgoff_start = page_to_pgoff(page);
1642a8fa41adSKirill A. Shutemov 	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
1643a8fa41adSKirill A. Shutemov 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
1644a8fa41adSKirill A. Shutemov 			pgoff_start, pgoff_end) {
16455beb4930SRik van Riel 		struct vm_area_struct *vma = avc->vma;
1646e9995ef9SHugh Dickins 		unsigned long address = vma_address(page, vma);
16470dd1c7bbSJoonsoo Kim 
1648ad12695fSAndrea Arcangeli 		cond_resched();
1649ad12695fSAndrea Arcangeli 
16500dd1c7bbSJoonsoo Kim 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
16510dd1c7bbSJoonsoo Kim 			continue;
16520dd1c7bbSJoonsoo Kim 
1653051ac83aSJoonsoo Kim 		ret = rwc->rmap_one(page, vma, address, rwc->arg);
1654e9995ef9SHugh Dickins 		if (ret != SWAP_AGAIN)
1655e9995ef9SHugh Dickins 			break;
16560dd1c7bbSJoonsoo Kim 		if (rwc->done && rwc->done(page))
16570dd1c7bbSJoonsoo Kim 			break;
1658e9995ef9SHugh Dickins 	}
1659b9773199SKirill A. Shutemov 
1660b9773199SKirill A. Shutemov 	if (!locked)
16614fc3f1d6SIngo Molnar 		anon_vma_unlock_read(anon_vma);
1662e9995ef9SHugh Dickins 	return ret;
1663e9995ef9SHugh Dickins }
1664e9995ef9SHugh Dickins 
1665e8351ac9SJoonsoo Kim /*
1666e8351ac9SJoonsoo Kim  * rmap_walk_file - do something to file page using the object-based rmap method
1667e8351ac9SJoonsoo Kim  * @page: the page to be handled
1668e8351ac9SJoonsoo Kim  * @rwc: control variable according to each walk type
1669e8351ac9SJoonsoo Kim  *
1670e8351ac9SJoonsoo Kim  * Find all the mappings of a page using the mapping pointer and the vma chains
1671e8351ac9SJoonsoo Kim  * contained in the address_space struct it points to.
1672e8351ac9SJoonsoo Kim  *
1673e8351ac9SJoonsoo Kim  * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1674e8351ac9SJoonsoo Kim  * where the page was found will be held for write.  So, we won't recheck
1675e8351ac9SJoonsoo Kim  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
1676e8351ac9SJoonsoo Kim  * LOCKED.
1677e8351ac9SJoonsoo Kim  */
1678b9773199SKirill A. Shutemov static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
1679b9773199SKirill A. Shutemov 		bool locked)
1680e9995ef9SHugh Dickins {
1681b9773199SKirill A. Shutemov 	struct address_space *mapping = page_mapping(page);
1682a8fa41adSKirill A. Shutemov 	pgoff_t pgoff_start, pgoff_end;
1683e9995ef9SHugh Dickins 	struct vm_area_struct *vma;
1684e9995ef9SHugh Dickins 	int ret = SWAP_AGAIN;
1685e9995ef9SHugh Dickins 
16869f32624bSJoonsoo Kim 	/*
16879f32624bSJoonsoo Kim 	 * The page lock not only makes sure that page->mapping cannot
16889f32624bSJoonsoo Kim 	 * suddenly be NULLified by truncation, it makes sure that the
16899f32624bSJoonsoo Kim 	 * structure at mapping cannot be freed and reused yet,
1690c8c06efaSDavidlohr Bueso 	 * so we can safely take mapping->i_mmap_rwsem.
16919f32624bSJoonsoo Kim 	 */
169281d1b09cSSasha Levin 	VM_BUG_ON_PAGE(!PageLocked(page), page);
16939f32624bSJoonsoo Kim 
1694e9995ef9SHugh Dickins 	if (!mapping)
1695e9995ef9SHugh Dickins 		return ret;
16963dec0ba0SDavidlohr Bueso 
1697a8fa41adSKirill A. Shutemov 	pgoff_start = page_to_pgoff(page);
1698a8fa41adSKirill A. Shutemov 	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
1699b9773199SKirill A. Shutemov 	if (!locked)
17003dec0ba0SDavidlohr Bueso 		i_mmap_lock_read(mapping);
1701a8fa41adSKirill A. Shutemov 	vma_interval_tree_foreach(vma, &mapping->i_mmap,
1702a8fa41adSKirill A. Shutemov 			pgoff_start, pgoff_end) {
1703e9995ef9SHugh Dickins 		unsigned long address = vma_address(page, vma);
17040dd1c7bbSJoonsoo Kim 
1705ad12695fSAndrea Arcangeli 		cond_resched();
1706ad12695fSAndrea Arcangeli 
17070dd1c7bbSJoonsoo Kim 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
17080dd1c7bbSJoonsoo Kim 			continue;
17090dd1c7bbSJoonsoo Kim 
1710051ac83aSJoonsoo Kim 		ret = rwc->rmap_one(page, vma, address, rwc->arg);
1711e9995ef9SHugh Dickins 		if (ret != SWAP_AGAIN)
17120dd1c7bbSJoonsoo Kim 			goto done;
17130dd1c7bbSJoonsoo Kim 		if (rwc->done && rwc->done(page))
17140dd1c7bbSJoonsoo Kim 			goto done;
1715e9995ef9SHugh Dickins 	}
17160dd1c7bbSJoonsoo Kim 
17170dd1c7bbSJoonsoo Kim done:
1718b9773199SKirill A. Shutemov 	if (!locked)
17193dec0ba0SDavidlohr Bueso 		i_mmap_unlock_read(mapping);
1720e9995ef9SHugh Dickins 	return ret;
1721e9995ef9SHugh Dickins }
1722e9995ef9SHugh Dickins 
1723051ac83aSJoonsoo Kim int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
1724e9995ef9SHugh Dickins {
1725e9995ef9SHugh Dickins 	if (unlikely(PageKsm(page)))
1726051ac83aSJoonsoo Kim 		return rmap_walk_ksm(page, rwc);
1727e9995ef9SHugh Dickins 	else if (PageAnon(page))
1728b9773199SKirill A. Shutemov 		return rmap_walk_anon(page, rwc, false);
1729e9995ef9SHugh Dickins 	else
1730b9773199SKirill A. Shutemov 		return rmap_walk_file(page, rwc, false);
1731b9773199SKirill A. Shutemov }
1732b9773199SKirill A. Shutemov 
1733b9773199SKirill A. Shutemov /* Like rmap_walk, but caller holds relevant rmap lock */
1734b9773199SKirill A. Shutemov int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
1735b9773199SKirill A. Shutemov {
1736b9773199SKirill A. Shutemov 	/* no ksm support for now */
1737b9773199SKirill A. Shutemov 	VM_BUG_ON_PAGE(PageKsm(page), page);
1738b9773199SKirill A. Shutemov 	if (PageAnon(page))
1739b9773199SKirill A. Shutemov 		return rmap_walk_anon(page, rwc, true);
1740b9773199SKirill A. Shutemov 	else
1741b9773199SKirill A. Shutemov 		return rmap_walk_file(page, rwc, true);
1742e9995ef9SHugh Dickins }
17430fe6e20bSNaoya Horiguchi 
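/*
 * Minimal usage sketch (illustrative only): an rmap_walk() user supplies a
 * rmap_one callback through rmap_walk_control, in the style of
 * page_referenced() and page_mkclean(). The callback below simply counts
 * the VMAs that currently map the page; names are hypothetical.
 */
static int example_count_one(struct page *page, struct vm_area_struct *vma,
		unsigned long address, void *arg)
{
	int *count = arg;

	(*count)++;
	return SWAP_AGAIN;	/* keep walking the remaining VMAs */
}

static int example_count_mappings(struct page *page)
{
	int count = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = example_count_one,
		.arg = &count,
	};

	rmap_walk(page, &rwc);	/* the caller is expected to hold the page lock */
	return count;
}
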
1744e3390f67SNaoya Horiguchi #ifdef CONFIG_HUGETLB_PAGE
17450fe6e20bSNaoya Horiguchi /*
17460fe6e20bSNaoya Horiguchi  * The following three functions are for anonymous (private mapped) hugepages.
17470fe6e20bSNaoya Horiguchi  * Unlike common anonymous pages, anonymous hugepages have no accounting code
17480fe6e20bSNaoya Horiguchi  * and no lru code, because we handle hugepages differently from common pages.
17490fe6e20bSNaoya Horiguchi  */
17500fe6e20bSNaoya Horiguchi static void __hugepage_set_anon_rmap(struct page *page,
17510fe6e20bSNaoya Horiguchi 	struct vm_area_struct *vma, unsigned long address, int exclusive)
17520fe6e20bSNaoya Horiguchi {
17530fe6e20bSNaoya Horiguchi 	struct anon_vma *anon_vma = vma->anon_vma;
1754433abed6SNaoya Horiguchi 
17550fe6e20bSNaoya Horiguchi 	BUG_ON(!anon_vma);
1756433abed6SNaoya Horiguchi 
1757433abed6SNaoya Horiguchi 	if (PageAnon(page))
1758433abed6SNaoya Horiguchi 		return;
1759433abed6SNaoya Horiguchi 	if (!exclusive)
1760433abed6SNaoya Horiguchi 		anon_vma = anon_vma->root;
1761433abed6SNaoya Horiguchi 
17620fe6e20bSNaoya Horiguchi 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
17630fe6e20bSNaoya Horiguchi 	page->mapping = (struct address_space *) anon_vma;
17640fe6e20bSNaoya Horiguchi 	page->index = linear_page_index(vma, address);
17650fe6e20bSNaoya Horiguchi }
17660fe6e20bSNaoya Horiguchi 
17670fe6e20bSNaoya Horiguchi void hugepage_add_anon_rmap(struct page *page,
17680fe6e20bSNaoya Horiguchi 			    struct vm_area_struct *vma, unsigned long address)
17690fe6e20bSNaoya Horiguchi {
17700fe6e20bSNaoya Horiguchi 	struct anon_vma *anon_vma = vma->anon_vma;
17710fe6e20bSNaoya Horiguchi 	int first;
1772a850ea30SNaoya Horiguchi 
1773a850ea30SNaoya Horiguchi 	BUG_ON(!PageLocked(page));
17740fe6e20bSNaoya Horiguchi 	BUG_ON(!anon_vma);
17755dbe0af4SHugh Dickins 	/* address might be in next vma when migration races vma_adjust */
177653f9263bSKirill A. Shutemov 	first = atomic_inc_and_test(compound_mapcount_ptr(page));
17770fe6e20bSNaoya Horiguchi 	if (first)
17780fe6e20bSNaoya Horiguchi 		__hugepage_set_anon_rmap(page, vma, address, 0);
17790fe6e20bSNaoya Horiguchi }
17800fe6e20bSNaoya Horiguchi 
17810fe6e20bSNaoya Horiguchi void hugepage_add_new_anon_rmap(struct page *page,
17820fe6e20bSNaoya Horiguchi 			struct vm_area_struct *vma, unsigned long address)
17830fe6e20bSNaoya Horiguchi {
17840fe6e20bSNaoya Horiguchi 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
178553f9263bSKirill A. Shutemov 	atomic_set(compound_mapcount_ptr(page), 0);
17860fe6e20bSNaoya Horiguchi 	__hugepage_set_anon_rmap(page, vma, address, 1);
17870fe6e20bSNaoya Horiguchi }
1788e3390f67SNaoya Horiguchi #endif /* CONFIG_HUGETLB_PAGE */
1789