xref: /linux/include/linux/rmap.h (revision 832d95b5314eea558cf4cc9ca40db10122ce8f63)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_RMAP_H
3 #define _LINUX_RMAP_H
4 /*
5  * Declarations for Reverse Mapping functions in mm/rmap.c
6  */
7 
8 #include <linux/list.h>
9 #include <linux/slab.h>
10 #include <linux/mm.h>
11 #include <linux/rwsem.h>
12 #include <linux/memcontrol.h>
13 #include <linux/highmem.h>
14 #include <linux/pagemap.h>
15 #include <linux/memremap.h>
16 #include <linux/bit_spinlock.h>
17 
18 /*
19  * The anon_vma heads a list of private "related" vmas, to scan if
20  * an anonymous page pointing to this anon_vma needs to be unmapped:
21  * the vmas on the list will be related by forking, or by splitting.
22  *
23  * Since vmas come and go as they are split and merged (particularly
24  * in mprotect), the mapping field of an anonymous page cannot point
25  * directly to a vma: instead it points to an anon_vma, on whose list
26  * the related vmas can be easily linked or unlinked.
27  *
28  * After unlinking the last vma on the list, we must garbage collect
29  * the anon_vma object itself: we're guaranteed no page can be
30  * pointing to this anon_vma once its vma list is empty.
31  */
32 struct anon_vma {
33 	struct anon_vma *root;		/* Root of this anon_vma tree */
34 	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
35 	/*
36 	 * The refcount is taken on an anon_vma when there is no
37	 * guarantee that the VMA or its page tables will exist for
38	 * the duration of the operation. A caller that takes
39	 * the reference is responsible for freeing the
40	 * anon_vma if it is the last user on release.
41 	 */
42 	atomic_t refcount;
43 
44 	/*
45	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
46 	 * have ->parent pointing to this one, including itself.
47 	 *
48	 * This counter is used when deciding whether to reuse an anon_vma
49	 * instead of forking a new one. See the comments in anon_vma_clone().
50 	 */
51 	unsigned long num_children;
52 	/* Count of VMAs whose ->anon_vma pointer points to this object. */
53 	unsigned long num_active_vmas;
54 
55 	struct anon_vma *parent;	/* Parent of this anon_vma */
56 
57 	/*
58 	 * NOTE: the LSB of the rb_root.rb_node is set by
59 	 * mm_take_all_locks() _after_ taking the above lock. So the
60 	 * rb_root must only be read/written after taking the above lock
61 	 * to be sure to see a valid next pointer. The LSB bit itself
62 	 * is serialized by a system wide lock only visible to
63 	 * mm_take_all_locks() (mm_all_locks_mutex).
64 	 */
65 
66 	/* Interval tree of private "related" vmas */
67 	struct rb_root_cached rb_root;
68 };
69 
70 /*
71  * The copy-on-write semantics of fork mean that an anon_vma
72  * can become associated with multiple processes. Furthermore,
73  * each child process will have its own anon_vma, where new
74  * pages for that process are instantiated.
75  *
76  * This structure allows us to find the anon_vmas associated
77  * with a VMA, or the VMAs associated with an anon_vma.
78  * The "same_vma" list contains the anon_vma_chains linking
79  * all the anon_vmas associated with this VMA.
80  * The "rb" field indexes on an interval tree the anon_vma_chains
81  * which link all the VMAs associated with this anon_vma.
82  */
83 struct anon_vma_chain {
84 	struct vm_area_struct *vma;
85 	struct anon_vma *anon_vma;
86 	struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
87 	struct rb_node rb;			/* locked by anon_vma->rwsem */
88 	unsigned long rb_subtree_last;
89 #ifdef CONFIG_DEBUG_VM_RB
90 	unsigned long cached_vma_start, cached_vma_last;
91 #endif
92 };
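
/*
 * Editor's sketch (not part of this header): this is how mm/rmap.c walks the
 * anon_vma_chains hanging off a VMA, e.g. in anon_vma_clone() and
 * unlink_anon_vmas(). The helper name is hypothetical.
 */
static inline void example_for_each_vma_anon_vma(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	/* The mmap_lock (or the anon_vma lock) must keep the list stable. */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
		/* Each chain entry links this VMA to one anon_vma in the tree. */
		struct anon_vma *anon_vma = avc->anon_vma;

		(void)anon_vma;
	}
}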
93 
94 enum ttu_flags {
95 	TTU_USE_SHARED_ZEROPAGE	= 0x2,	/* for unused pages of large folios */
96 	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
97 	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
98 	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
99 	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
100 	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
101 					 * and caller guarantees they will
102 					 * do a final flush if necessary */
103 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
104 					 * caller holds it */
105 };
106 
107 #ifdef CONFIG_MMU
108 
109 void anon_vma_init(void);	/* create anon_vma_cachep */
110 
111 #ifdef CONFIG_MM_ID
112 static __always_inline void folio_lock_large_mapcount(struct folio *folio)
113 {
114 	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
115 }
116 
117 static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
118 {
119 	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
120 }
121 
122 static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
123 {
124 	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
125 	return folio->_mm_id[idx] & MM_ID_MASK;
126 }
127 
128 static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
129 {
130 	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
131 	folio->_mm_id[idx] &= ~MM_ID_MASK;
132 	folio->_mm_id[idx] |= id;
133 }
134 
135 static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
136 		int diff, mm_id_t mm_id)
137 {
138 	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
139 	VM_WARN_ON_ONCE(diff <= 0);
140 	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);
141 
142 	/*
143 	 * Make sure we can detect at least one complete PTE mapping of the
144 	 * folio in a single MM as "exclusively mapped". This is primarily
145 	 * a check on 32bit, where we currently reduce the size of the per-MM
146 	 * mapcount to a short.
147 	 */
148 	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
149 	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);
150 
151 	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
152 			folio->_mm_id_mapcount[0] != -1);
153 	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
154 			folio->_mm_id_mapcount[0] < 0);
155 	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
156 			folio->_mm_id_mapcount[1] != -1);
157 	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
158 			folio->_mm_id_mapcount[1] < 0);
159 	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
160 			test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
161 }
162 
163 static __always_inline void folio_set_large_mapcount(struct folio *folio,
164 		int mapcount, struct vm_area_struct *vma)
165 {
166 	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);
167 
168 	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
169 	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);
170 
171 	/* Note: mapcounts start at -1. */
172 	atomic_set(&folio->_large_mapcount, mapcount - 1);
173 	folio->_mm_id_mapcount[0] = mapcount - 1;
174 	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
175 }
176 
177 static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
178 		int diff, struct vm_area_struct *vma)
179 {
180 	const mm_id_t mm_id = vma->vm_mm->mm_id;
181 	int new_mapcount_val;
182 
183 	folio_lock_large_mapcount(folio);
184 	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);
185 
186 	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
187 	atomic_set(&folio->_large_mapcount, new_mapcount_val);
188 
189 	/*
190 	 * If a folio is mapped more than once into an MM on 32bit, we
191 	 * can in theory overflow the per-MM mapcount (although only for
192 	 * fairly large folios), turning it negative. In that case, just
193 	 * free up the slot and mark the folio "mapped shared", otherwise
194 	 * we might be in trouble when unmapping pages later.
195 	 */
196 	if (folio_mm_id(folio, 0) == mm_id) {
197 		folio->_mm_id_mapcount[0] += diff;
198 		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
199 			folio->_mm_id_mapcount[0] = -1;
200 			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
201 			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
202 		}
203 	} else if (folio_mm_id(folio, 1) == mm_id) {
204 		folio->_mm_id_mapcount[1] += diff;
205 		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
206 			folio->_mm_id_mapcount[1] = -1;
207 			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
208 			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
209 		}
210 	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
211 		folio_set_mm_id(folio, 0, mm_id);
212 		folio->_mm_id_mapcount[0] = diff - 1;
213 		/* We might have other mappings already. */
214 		if (new_mapcount_val != diff - 1)
215 			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
216 	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
217 		folio_set_mm_id(folio, 1, mm_id);
218 		folio->_mm_id_mapcount[1] = diff - 1;
219 		/* Slot 0 certainly has mappings as well. */
220 		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
221 	}
222 	folio_unlock_large_mapcount(folio);
223 	return new_mapcount_val + 1;
224 }
225 #define folio_add_large_mapcount folio_add_return_large_mapcount
226 
227 static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
228 		int diff, struct vm_area_struct *vma)
229 {
230 	const mm_id_t mm_id = vma->vm_mm->mm_id;
231 	int new_mapcount_val;
232 
233 	folio_lock_large_mapcount(folio);
234 	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);
235 
236 	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
237 	atomic_set(&folio->_large_mapcount, new_mapcount_val);
238 
239 	/*
240 	 * There are valid corner cases where we might underflow a per-MM
241 	 * mapcount (some mappings added when no slot was free, some mappings
242 	 * added once a slot was free), so we always set it to -1 once we go
243 	 * negative.
244 	 */
245 	if (folio_mm_id(folio, 0) == mm_id) {
246 		folio->_mm_id_mapcount[0] -= diff;
247 		if (folio->_mm_id_mapcount[0] >= 0)
248 			goto out;
249 		folio->_mm_id_mapcount[0] = -1;
250 		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
251 	} else if (folio_mm_id(folio, 1) == mm_id) {
252 		folio->_mm_id_mapcount[1] -= diff;
253 		if (folio->_mm_id_mapcount[1] >= 0)
254 			goto out;
255 		folio->_mm_id_mapcount[1] = -1;
256 		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
257 	}
258 
259 	/*
260 	 * If one MM slot owns all mappings, the folio is mapped exclusively.
261 	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
262 	 * slots must be free (mapcount == -1), and we'll also mark it as
263 	 * exclusive.
264 	 */
265 	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
266 	    folio->_mm_id_mapcount[1] == new_mapcount_val)
267 		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
268 out:
269 	folio_unlock_large_mapcount(folio);
270 	return new_mapcount_val + 1;
271 }
272 #define folio_sub_large_mapcount folio_sub_return_large_mapcount
273 #else /* !CONFIG_MM_ID */
274 /*
275  * See __folio_rmap_sanity_checks(): we might map large folios even without
276  * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
277  */
278 static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
279 		struct vm_area_struct *vma)
280 {
281 	/* Note: mapcounts start at -1. */
282 	atomic_set(&folio->_large_mapcount, mapcount - 1);
283 }
284 
285 static inline void folio_add_large_mapcount(struct folio *folio,
286 		int diff, struct vm_area_struct *vma)
287 {
288 	atomic_add(diff, &folio->_large_mapcount);
289 }
290 
291 static inline int folio_add_return_large_mapcount(struct folio *folio,
292 		int diff, struct vm_area_struct *vma)
293 {
294 	BUILD_BUG();
295 }
296 
297 static inline void folio_sub_large_mapcount(struct folio *folio,
298 		int diff, struct vm_area_struct *vma)
299 {
300 	atomic_sub(diff, &folio->_large_mapcount);
301 }
302 
303 static inline int folio_sub_return_large_mapcount(struct folio *folio,
304 		int diff, struct vm_area_struct *vma)
305 {
306 	BUILD_BUG();
307 }
308 #endif /* CONFIG_MM_ID */
309 
310 #define folio_inc_large_mapcount(folio, vma) \
311 	folio_add_large_mapcount(folio, 1, vma)
312 #define folio_inc_return_large_mapcount(folio, vma) \
313 	folio_add_return_large_mapcount(folio, 1, vma)
314 #define folio_dec_large_mapcount(folio, vma) \
315 	folio_sub_large_mapcount(folio, 1, vma)
316 #define folio_dec_return_large_mapcount(folio, vma) \
317 	folio_sub_return_large_mapcount(folio, 1, vma)
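
/*
 * Editor's sketch of how the helpers above pair up over a large folio's
 * lifetime. This is a condensed, hypothetical wrapper; the real callers are
 * the folio_add_*_rmap_*()/folio_remove_rmap_*() implementations in
 * mm/rmap.c, and the additional mapping would normally come from a different
 * VMA (e.g. the child's dst_vma at fork time).
 */
static inline void example_large_mapcount_lifecycle(struct folio *folio,
		struct vm_area_struct *vma, int nr_pages)
{
	/* First mapping of a freshly allocated large folio: */
	folio_set_large_mapcount(folio, nr_pages, vma);
	/* An additional mapping of the same page range: */
	folio_add_large_mapcount(folio, nr_pages, vma);
	/* ... and the matching teardown when each mapping is removed: */
	folio_sub_large_mapcount(folio, nr_pages, vma);
	folio_sub_large_mapcount(folio, nr_pages, vma);
}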
318 
319 /* RMAP flags, currently only relevant for some anon rmap operations. */
320 typedef int __bitwise rmap_t;
321 
322 /*
323  * No special request: A mapped anonymous (sub)page is possibly shared between
324  * processes.
325  */
326 #define RMAP_NONE		((__force rmap_t)0)
327 
328 /* The anonymous (sub)page is exclusive to a single process. */
329 #define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))
330 
331 static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
332 		const struct page *page, int nr_pages, enum pgtable_level level)
333 {
334 	/* hugetlb folios are handled separately. */
335 	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
336 
337 	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
338 	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);
339 
340 	/*
341 	 * TODO: we get driver-allocated folios that have nothing to do with
342 	 * the rmap using vm_insert_page(); therefore, we cannot assume that
343 	 * folio_test_large_rmappable() holds for large folios. We should
344 	 * handle any desired mapcount+stats accounting for these folios in
345 	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
346 	 * we really only get rmappable folios.
347 	 */
348 
349 	VM_WARN_ON_ONCE(nr_pages <= 0);
350 	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
351 	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
352 
353 	switch (level) {
354 	case PGTABLE_LEVEL_PTE:
355 		break;
356 	case PGTABLE_LEVEL_PMD:
357 		/*
358 		 * We don't support folios larger than a single PMD yet. So
359 		 * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
360 		 * a single "entire" mapping of the folio.
361 		 */
362 		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
363 		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
364 		break;
365 	case PGTABLE_LEVEL_PUD:
366 		/*
367 		 * Assume that we are creating a single "entire" mapping of the
368 		 * folio.
369 		 */
370 		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
371 		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
372 		break;
373 	default:
374 		BUILD_BUG();
375 	}
376 
377 	/*
378 	 * Anon folios must have an associated live anon_vma as long as they're
379 	 * mapped into userspace.
380 	 * Note that the atomic_read() mainly does two things:
381 	 *
382 	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
383 	 *    check that the associated anon_vma has not yet been freed (subject
384 	 *    to KASAN's usual limitations). This check will pass if the
385 	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
386 	 *    period hasn't passed since then.
387 	 * 2. If the anon_vma has not yet been freed, it checks that the
388 	 *    anon_vma still has a nonzero refcount (as opposed to being in the
389 	 *    middle of an RCU delay for getting freed).
390 	 */
391 	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
392 		unsigned long mapping = (unsigned long)folio->mapping;
393 		struct anon_vma *anon_vma;
394 
395 		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
396 		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
397 	}
398 }
399 
400 /*
401  * rmap interfaces called when adding or removing pte of page
402  */
403 void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
404 void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
405 		struct vm_area_struct *, unsigned long address, rmap_t flags);
406 #define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
407 	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
408 void folio_add_anon_rmap_pmd(struct folio *, struct page *,
409 		struct vm_area_struct *, unsigned long address, rmap_t flags);
410 void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
411 		unsigned long address, rmap_t flags);
412 void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
413 		struct vm_area_struct *);
414 #define folio_add_file_rmap_pte(folio, page, vma) \
415 	folio_add_file_rmap_ptes(folio, page, 1, vma)
416 void folio_add_file_rmap_pmd(struct folio *, struct page *,
417 		struct vm_area_struct *);
418 void folio_add_file_rmap_pud(struct folio *, struct page *,
419 		struct vm_area_struct *);
420 void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
421 		struct vm_area_struct *);
422 #define folio_remove_rmap_pte(folio, page, vma) \
423 	folio_remove_rmap_ptes(folio, page, 1, vma)
424 void folio_remove_rmap_pmd(struct folio *, struct page *,
425 		struct vm_area_struct *);
426 void folio_remove_rmap_pud(struct folio *, struct page *,
427 		struct vm_area_struct *);
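
/*
 * Editor's sketch (hypothetical fault-path excerpt): a freshly allocated
 * anonymous folio is exclusive to the faulting process, so it is added with
 * RMAP_EXCLUSIVE, much like do_anonymous_page() in mm/memory.c does; the
 * matching teardown on unmap goes through folio_remove_rmap_pte().
 */
static inline void example_map_then_unmap_anon_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *vma,
		unsigned long addr)
{
	/* Caller holds the PTE lock; the folio is new, charged and zeroed. */
	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);

	/* ... later, when the PTE is torn down (again under the PTE lock): */
	folio_remove_rmap_pte(folio, page, vma);
}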
428 
429 void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
430 		unsigned long address, rmap_t flags);
431 void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
432 		unsigned long address);
433 
434 /* See folio_try_dup_anon_rmap_*() */
435 static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
436 		struct vm_area_struct *vma)
437 {
438 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
439 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
440 
441 	if (PageAnonExclusive(&folio->page)) {
442 		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
443 			return -EBUSY;
444 		ClearPageAnonExclusive(&folio->page);
445 	}
446 	atomic_inc(&folio->_entire_mapcount);
447 	atomic_inc(&folio->_large_mapcount);
448 	return 0;
449 }
450 
451 /* See folio_try_share_anon_rmap_*() */
452 static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
453 {
454 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
455 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
456 	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
457 
458 	/* Paired with the memory barrier in try_grab_folio(). */
459 	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
460 		smp_mb();
461 
462 	if (unlikely(folio_maybe_dma_pinned(folio)))
463 		return -EBUSY;
464 	ClearPageAnonExclusive(&folio->page);
465 
466 	/*
467 	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
468 	 * gup_must_unshare().
469 	 */
470 	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
471 		smp_mb__after_atomic();
472 	return 0;
473 }
474 
475 static inline void hugetlb_add_file_rmap(struct folio *folio)
476 {
477 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
478 	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
479 
480 	atomic_inc(&folio->_entire_mapcount);
481 	atomic_inc(&folio->_large_mapcount);
482 }
483 
484 static inline void hugetlb_remove_rmap(struct folio *folio)
485 {
486 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
487 
488 	atomic_dec(&folio->_entire_mapcount);
489 	atomic_dec(&folio->_large_mapcount);
490 }
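
/*
 * Editor's sketch (hypothetical helper): the hugetlb fault and migration
 * paths pick between the anon and file variants above depending on the
 * mapping type, roughly like this.
 */
static inline void example_hugetlb_add_rmap(struct folio *folio,
		struct vm_area_struct *vma, unsigned long address, bool new_anon)
{
	if (!folio_test_anon(folio))
		hugetlb_add_file_rmap(folio);
	else if (new_anon)
		hugetlb_add_new_anon_rmap(folio, vma, address);
	else
		hugetlb_add_anon_rmap(folio, vma, address, RMAP_NONE);
}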
491 
492 static __always_inline void __folio_dup_file_rmap(struct folio *folio,
493 		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
494 		enum pgtable_level level)
495 {
496 	const int orig_nr_pages = nr_pages;
497 
498 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
499 
500 	switch (level) {
501 	case PGTABLE_LEVEL_PTE:
502 		if (!folio_test_large(folio)) {
503 			atomic_inc(&folio->_mapcount);
504 			break;
505 		}
506 
507 		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
508 			do {
509 				atomic_inc(&page->_mapcount);
510 			} while (page++, --nr_pages > 0);
511 		}
512 		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
513 		break;
514 	case PGTABLE_LEVEL_PMD:
515 	case PGTABLE_LEVEL_PUD:
516 		atomic_inc(&folio->_entire_mapcount);
517 		folio_inc_large_mapcount(folio, dst_vma);
518 		break;
519 	default:
520 		BUILD_BUG();
521 	}
522 }
523 
524 /**
525  * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
526  * @folio:	The folio to duplicate the mappings of
527  * @page:	The first page to duplicate the mappings of
528  * @nr_pages:	The number of pages of which the mapping will be duplicated
529  * @dst_vma:	The destination vm area
530  *
531  * The page range of the folio is defined by [page, page + nr_pages)
532  *
533  * The caller needs to hold the page table lock.
534  */
535 static inline void folio_dup_file_rmap_ptes(struct folio *folio,
536 		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
537 {
538 	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
539 }
540 
541 static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
542 		struct page *page, struct vm_area_struct *dst_vma)
543 {
544 	__folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
545 }
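
/*
 * Editor's sketch (hypothetical fork-path excerpt): when a batch of
 * file-backed PTEs is copied from parent to child, the rmap of the whole
 * range is duplicated in one call, similar to what copy_present_ptes() in
 * mm/memory.c ends up doing.
 */
static inline void example_copy_file_ptes_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	/* The source page table lock is held by the caller. */
	folio_dup_file_rmap_ptes(folio, page, nr_pages, dst_vma);
}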
546 
547 /**
548  * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
549  * @folio:	The folio to duplicate the mapping of
550  * @page:	The first page to duplicate the mapping of
551  * @dst_vma:	The destination vm area
552  *
553  * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
554  *
555  * The caller needs to hold the page table lock.
556  */
557 static inline void folio_dup_file_rmap_pmd(struct folio *folio,
558 		struct page *page, struct vm_area_struct *dst_vma)
559 {
560 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
561	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PMD);
562 #else
563 	WARN_ON_ONCE(true);
564 #endif
565 }
566 
567 static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
568 		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
569 		struct vm_area_struct *src_vma, enum pgtable_level level)
570 {
571 	const int orig_nr_pages = nr_pages;
572 	bool maybe_pinned;
573 	int i;
574 
575 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
576 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
577 
578 	/*
579 	 * If this folio may have been pinned by the parent process,
580	 * don't allow duplicating the mappings; instead require, e.g.,
581	 * copying the subpage immediately for the child so that we'll always
582 	 * guarantee the pinned folio won't be randomly replaced in the
583 	 * future on write faults.
584 	 */
585 	maybe_pinned = likely(!folio_is_device_private(folio)) &&
586 		       unlikely(folio_needs_cow_for_dma(src_vma, folio));
587 
588 	/*
589 	 * No need to check+clear for already shared PTEs/PMDs of the
590	 * folio. But if any page is PageAnonExclusive, we must fall back to
591	 * copying if the folio may be pinned.
592 	 */
593 	switch (level) {
594 	case PGTABLE_LEVEL_PTE:
595 		if (unlikely(maybe_pinned)) {
596 			for (i = 0; i < nr_pages; i++)
597 				if (PageAnonExclusive(page + i))
598 					return -EBUSY;
599 		}
600 
601 		if (!folio_test_large(folio)) {
602 			if (PageAnonExclusive(page))
603 				ClearPageAnonExclusive(page);
604 			atomic_inc(&folio->_mapcount);
605 			break;
606 		}
607 
608 		do {
609 			if (PageAnonExclusive(page))
610 				ClearPageAnonExclusive(page);
611 			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
612 				atomic_inc(&page->_mapcount);
613 		} while (page++, --nr_pages > 0);
614 		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
615 		break;
616 	case PGTABLE_LEVEL_PMD:
617 	case PGTABLE_LEVEL_PUD:
618 		if (PageAnonExclusive(page)) {
619 			if (unlikely(maybe_pinned))
620 				return -EBUSY;
621 			ClearPageAnonExclusive(page);
622 		}
623 		atomic_inc(&folio->_entire_mapcount);
624 		folio_inc_large_mapcount(folio, dst_vma);
625 		break;
626 	default:
627 		BUILD_BUG();
628 	}
629 	return 0;
630 }
631 
632 /**
633  * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
634  *				  of a folio
635  * @folio:	The folio to duplicate the mappings of
636  * @page:	The first page to duplicate the mappings of
637  * @nr_pages:	The number of pages of which the mapping will be duplicated
638  * @dst_vma:	The destination vm area
639  * @src_vma:	The vm area from which the mappings are duplicated
640  *
641  * The page range of the folio is defined by [page, page + nr_pages)
642  *
643  * The caller needs to hold the page table lock and the
644  * vma->vm_mm->write_protect_seq.
645  *
646  * Duplicating the mappings can only fail if the folio may be pinned; device
647  * private folios cannot get pinned and consequently this function cannot fail
648  * for them.
649  *
650  * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
651  * the parent and the child. They must *not* be writable after this call
652  * succeeded.
653  *
654  * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
655  */
656 static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
657 		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
658 		struct vm_area_struct *src_vma)
659 {
660 	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
661 					 src_vma, PGTABLE_LEVEL_PTE);
662 }
663 
664 static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
665 		struct page *page, struct vm_area_struct *dst_vma,
666 		struct vm_area_struct *src_vma)
667 {
668 	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
669 					 PGTABLE_LEVEL_PTE);
670 }
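
/*
 * Editor's sketch (hypothetical, condensed from the fork path in
 * mm/memory.c): if duplicating the anon rmap fails because the folio may be
 * DMA-pinned, the page must be copied for the child instead of being shared.
 */
static inline int example_fork_share_or_copy(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	if (folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma)) {
		/* -EBUSY: the caller must allocate and copy a page instead. */
		return -EAGAIN;
	}
	/* Success: the PTE must now be mapped read-only in both processes. */
	return 0;
}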
671 
672 /**
673  * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
674  *				 of a folio
675  * @folio:	The folio to duplicate the mapping of
676  * @page:	The first page to duplicate the mapping of
677  * @dst_vma:	The destination vm area
678  * @src_vma:	The vm area from which the mapping is duplicated
679  *
680  * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
681  *
682  * The caller needs to hold the page table lock and the
683  * vma->vm_mm->write_protect_seq.
684  *
685  * Duplicating the mapping can only fail if the folio may be pinned; device
686  * private folios cannot get pinned and consequently this function cannot fail
687  * for them.
688  *
689  * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
690  * the parent and the child. They must *not* be writable after this call
691  * succeeded.
692  *
693  * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
694  */
695 static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
696 		struct page *page, struct vm_area_struct *dst_vma,
697 		struct vm_area_struct *src_vma)
698 {
699 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
700 	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
701 					 src_vma, PGTABLE_LEVEL_PMD);
702 #else
703 	WARN_ON_ONCE(true);
704 	return -EBUSY;
705 #endif
706 }
707 
708 static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
709 		struct page *page, int nr_pages, enum pgtable_level level)
710 {
711 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
712 	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
713 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
714 
715 	/* device private folios cannot get pinned via GUP. */
716 	if (unlikely(folio_is_device_private(folio))) {
717 		ClearPageAnonExclusive(page);
718 		return 0;
719 	}
720 
721 	/*
722	 * We have to make sure that when we clear PageAnonExclusive, the
723	 * page is not pinned and that concurrent GUP-fast won't succeed in
724	 * pinning the page.
725 	 *
726 	 * Conceptually, PageAnonExclusive clearing consists of:
727 	 * (A1) Clear PTE
728 	 * (A2) Check if the page is pinned; back off if so.
729 	 * (A3) Clear PageAnonExclusive
730 	 * (A4) Restore PTE (optional, but certainly not writable)
731 	 *
732 	 * When clearing PageAnonExclusive, we cannot possibly map the page
733 	 * writable again, because anon pages that may be shared must never
734 	 * be writable. So in any case, if the PTE was writable it cannot
735	 * be writable anymore afterwards and there would be a PTE change. Only
736	 * if the PTE wasn't writable might there be no PTE change.
737 	 *
738 	 * Conceptually, GUP-fast pinning of an anon page consists of:
739 	 * (B1) Read the PTE
740 	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
741 	 * (B3) Pin the mapped page
742 	 * (B4) Check if the PTE changed by re-reading it; back off if so.
743 	 * (B5) If the original PTE is not writable, check if
744 	 *	PageAnonExclusive is not set; back off if so.
745 	 *
746 	 * If the PTE was writable, we only have to make sure that GUP-fast
747 	 * observes a PTE change and properly backs off.
748 	 *
749 	 * If the PTE was not writable, we have to make sure that GUP-fast either
750 	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
751 	 * and properly backs off.
752 	 *
753 	 * Consequently, when clearing PageAnonExclusive(), we have to make
754 	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
755 	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
756 	 * and (B5) happen in the right memory order.
757 	 *
758 	 * We assume that there might not be a memory barrier after
759 	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
760 	 * so we use explicit ones here.
761 	 */
762 
763 	/* Paired with the memory barrier in try_grab_folio(). */
764 	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
765 		smp_mb();
766 
767 	if (unlikely(folio_maybe_dma_pinned(folio)))
768 		return -EBUSY;
769 	ClearPageAnonExclusive(page);
770 
771 	/*
772 	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
773 	 * gup_must_unshare().
774 	 */
775 	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
776 		smp_mb__after_atomic();
777 	return 0;
778 }
779 
780 /**
781  * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
782  *				   mapped by a PTE possibly shared to prepare
783  *				   for KSM or temporary unmapping
784  * @folio:	The folio to share a mapping of
785  * @page:	The mapped exclusive page
786  *
787  * The caller needs to hold the page table lock and has to have the page table
788  * entries cleared/invalidated.
789  *
790  * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
791  * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
792  * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
793  *
794  * Marking the mapped page shared can only fail if the folio may be pinned;
795  * device private folios cannot get pinned and consequently this function cannot
796  * fail.
797  *
798  * Returns 0 if marking the mapped page possibly shared succeeded. Returns
799  * -EBUSY otherwise.
800  */
801 static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
802 		struct page *page)
803 {
804 	return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
805 }
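
/*
 * Editor's sketch of the (A1)..(A4) sequence described above, as used on the
 * swapout path in mm/rmap.c (heavily condensed; the pte handling of the real
 * code is omitted and the helper name is hypothetical).
 */
static inline bool example_share_before_swap(struct folio *folio,
		struct page *page)
{
	/* (A1) The caller has already cleared/invalidated the PTE. */
	if (folio_try_share_anon_rmap_pte(folio, page)) {
		/* (A2)/(A3) failed: folio may be pinned; (A4) restore the PTE. */
		return false;
	}
	/* PageAnonExclusive is clear; a swap entry may now be installed. */
	return true;
}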
806 
807 /**
808  * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
809  *				   range mapped by a PMD possibly shared to
810  *				   prepare for temporary unmapping
811  * @folio:	The folio to share the mapping of
812  * @page:	The first page to share the mapping of
813  *
814  * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
815  *
816  * The caller needs to hold the page table lock and has to have the page table
817  * entries cleared/invalidated.
818  *
819  * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
820  * fork() to duplicate a mapping, but instead to prepare for temporarily
821  * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
822  *
823  * Marking the mapped pages shared can only fail if the folio may be pinned;
824  * device private folios cannot get pinned and consequently this function cannot
825  * fail.
826  *
827  * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
828  * -EBUSY otherwise.
829  */
830 static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
831 		struct page *page)
832 {
833 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
834 	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
835 					   PGTABLE_LEVEL_PMD);
836 #else
837 	WARN_ON_ONCE(true);
838 	return -EBUSY;
839 #endif
840 }
841 
842 /*
843  * Called from mm/vmscan.c to handle paging out
844  */
845 int folio_referenced(struct folio *, int is_locked,
846 			struct mem_cgroup *memcg, vm_flags_t *vm_flags);
847 
848 void try_to_migrate(struct folio *folio, enum ttu_flags flags);
849 void try_to_unmap(struct folio *, enum ttu_flags flags);
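
/*
 * Editor's sketch (hypothetical helper): typical reclaim-style use of the
 * TTU_* flags declared above, loosely mirroring mm/vmscan.c. The folio must
 * be locked by the caller.
 */
static inline void example_unmap_for_reclaim(struct folio *folio)
{
	enum ttu_flags flags = TTU_BATCH_FLUSH;

	/* Split a huge PMD mapping so the folio can be unmapped per PTE. */
	if (folio_test_large(folio))
		flags |= TTU_SPLIT_HUGE_PMD;

	try_to_unmap(folio, flags);
	/* With TTU_BATCH_FLUSH, the caller owes a final TLB flush. */
}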
850 
851 struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
852 		void *owner, struct folio **foliop);
853 
854 /* Avoid racy checks */
855 #define PVMW_SYNC		(1 << 0)
856 /* Look for migration entries rather than present PTEs */
857 #define PVMW_MIGRATION		(1 << 1)
858 
859 /* Result flags */
860 
861 /* The page is mapped across a page table boundary */
862 #define PVMW_PGTABLE_CROSSED	(1 << 16)
863 
864 struct page_vma_mapped_walk {
865 	unsigned long pfn;
866 	unsigned long nr_pages;
867 	pgoff_t pgoff;
868 	struct vm_area_struct *vma;
869 	unsigned long address;
870 	pmd_t *pmd;
871 	pte_t *pte;
872 	spinlock_t *ptl;
873 	unsigned int flags;
874 };
875 
876 #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
877 	struct page_vma_mapped_walk name = {				\
878 		.pfn = folio_pfn(_folio),				\
879 		.nr_pages = folio_nr_pages(_folio),			\
880 		.pgoff = folio_pgoff(_folio),				\
881 		.vma = _vma,						\
882 		.address = _address,					\
883 		.flags = _flags,					\
884 	}
885 
886 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
887 {
888	/* HugeTLB ptes are not mapped via pte_offset_map(), so don't pte_unmap() them. */
889 	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
890 		pte_unmap(pvmw->pte);
891 	if (pvmw->ptl)
892 		spin_unlock(pvmw->ptl);
893 }
894 
895 /**
896  * page_vma_mapped_walk_restart - Restart the page table walk.
897  * @pvmw: Pointer to struct page_vma_mapped_walk.
898  *
899  * It restarts the page table walk when changes occur in the page
900  * table, such as splitting a PMD. Ensures that the PTL held during
901  * the previous walk is released and resets the state to allow for
902  * a new walk starting at the current address stored in pvmw->address.
903  */
904 static inline void
905 page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
906 {
907 	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);
908 
909 	if (likely(pvmw->ptl))
910 		spin_unlock(pvmw->ptl);
911 	else
912 		WARN_ON_ONCE(1);
913 
914 	pvmw->ptl = NULL;
915 	pvmw->pmd = NULL;
916 	pvmw->pte = NULL;
917 }
918 
919 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
920 unsigned long page_address_in_vma(const struct folio *folio,
921 		const struct page *, const struct vm_area_struct *);
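
/*
 * Editor's sketch of the canonical page_vma_mapped_walk() loop used by the
 * rmap_one() implementations in mm/rmap.c (the helper name is hypothetical):
 */
static inline int example_count_mappings_in_vma(struct folio *folio,
		struct vm_area_struct *vma, unsigned long address)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	int nr = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		/* pvmw.pte (or pvmw.pmd) and pvmw.address describe one mapping. */
		nr++;
		/*
		 * Breaking out early would require page_vma_mapped_walk_done();
		 * when the walk itself returns false it has already cleaned up.
		 */
	}
	return nr;
}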
922 
923 /*
924  * Clean the PTEs of shared mappings
925  * (and, since clean PTEs should also be read-only, write-protect them too).
926  *
927  * Returns the number of cleaned PTEs.
928  */
929 int folio_mkclean(struct folio *);
930 
931 int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
932 		unsigned long pfn, unsigned long nr_pages);
933 
934 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
935 		      struct vm_area_struct *vma);
936 
937 void remove_migration_ptes(struct folio *src, struct folio *dst,
938 		enum ttu_flags flags);
939 
940 /*
941  * rmap_walk_control: controls the rmap traversal for specific needs
942  *
943  * arg: passed to rmap_one() and invalid_vma()
944  * try_lock: bail out if the rmap lock is contended
945  * contended: indicates that the rmap traversal bailed out due to lock contention
946  * rmap_one: executed on each vma where the folio is mapped
947  * done: checks whether the traversal should terminate
948  * anon_lock: takes the anon_vma lock in an optimized way rather than the default
949  * invalid_vma: skips VMAs that are of no interest
950  */
951 struct rmap_walk_control {
952 	void *arg;
953 	bool try_lock;
954 	bool contended;
955 	/*
956 	 * Return false if page table scanning in rmap_walk should be stopped.
957 	 * Otherwise, return true.
958 	 */
959 	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
960 					unsigned long addr, void *arg);
961 	int (*done)(struct folio *folio);
962 	struct anon_vma *(*anon_lock)(const struct folio *folio,
963 				      struct rmap_walk_control *rwc);
964 	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
965 };
966 
967 void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
968 void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
969 struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
970 					  struct rmap_walk_control *rwc);
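
/*
 * Editor's sketch of a minimal rmap walk: count the VMAs a folio is mapped
 * into. The callback and helper names are hypothetical; real users such as
 * folio_referenced() in mm/rmap.c follow the same pattern. The caller
 * typically holds the folio lock.
 */
static inline bool example_rmap_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr, void *arg)
{
	(*(int *)arg)++;
	return true;	/* keep walking the remaining VMAs */
}

static inline int example_count_mapping_vmas(struct folio *folio)
{
	int nr_vmas = 0;
	struct rmap_walk_control rwc = {
		.rmap_one = example_rmap_one,
		.arg = &nr_vmas,
	};

	rmap_walk(folio, &rwc);
	return nr_vmas;
}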
971 
972 #else	/* !CONFIG_MMU */
973 
974 #define anon_vma_init()		do {} while (0)
975 #define anon_vma_prepare(vma)	(0)
976 
977 static inline int folio_referenced(struct folio *folio, int is_locked,
978 				  struct mem_cgroup *memcg,
979 				  vm_flags_t *vm_flags)
980 {
981 	*vm_flags = 0;
982 	return 0;
983 }
984 
985 static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
986 {
987 }
988 
989 static inline int folio_mkclean(struct folio *folio)
990 {
991 	return 0;
992 }
993 #endif	/* CONFIG_MMU */
994 
995 #endif	/* _LINUX_RMAP_H */
996