xref: /linux/include/linux/mm.h (revision cc7a9f6e57c4f71e8e1fee3274b1ae8770f2a743)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_MM_H
3 #define _LINUX_MM_H
4 
5 #include <linux/args.h>
6 #include <linux/errno.h>
7 #include <linux/mmdebug.h>
8 #include <linux/gfp.h>
9 #include <linux/pgalloc_tag.h>
10 #include <linux/bug.h>
11 #include <linux/list.h>
12 #include <linux/mmzone.h>
13 #include <linux/rbtree.h>
14 #include <linux/atomic.h>
15 #include <linux/debug_locks.h>
16 #include <linux/compiler.h>
17 #include <linux/mm_types.h>
18 #include <linux/mmap_lock.h>
19 #include <linux/range.h>
20 #include <linux/pfn.h>
21 #include <linux/percpu-refcount.h>
22 #include <linux/bit_spinlock.h>
23 #include <linux/shrinker.h>
24 #include <linux/resource.h>
25 #include <linux/page_ext.h>
26 #include <linux/err.h>
27 #include <linux/page-flags.h>
28 #include <linux/page_ref.h>
29 #include <linux/overflow.h>
30 #include <linux/sched.h>
31 #include <linux/pgtable.h>
32 #include <linux/kasan.h>
33 #include <linux/memremap.h>
34 #include <linux/slab.h>
35 #include <linux/cacheinfo.h>
36 #include <linux/rcuwait.h>
37 #include <linux/bitmap.h>
38 #include <linux/bitops.h>
39 #include <linux/iommu-debug-pagealloc.h>
40 
41 struct mempolicy;
42 struct anon_vma;
43 struct anon_vma_chain;
44 struct user_struct;
45 struct pt_regs;
46 struct folio_batch;
47 
48 void arch_mm_preinit(void);
49 void mm_core_init_early(void);
50 void mm_core_init(void);
51 void init_mm_internals(void);
52 
53 extern atomic_long_t _totalram_pages;
54 static inline unsigned long totalram_pages(void)
55 {
56 	return (unsigned long)atomic_long_read(&_totalram_pages);
57 }
58 
/* Increment the _totalram_pages counter by one via atomic_long_inc(). */
59 static inline void totalram_pages_inc(void)
60 {
61 	atomic_long_inc(&_totalram_pages);
62 }
63 
/* Decrement the _totalram_pages counter by one via atomic_long_dec(). */
64 static inline void totalram_pages_dec(void)
65 {
66 	atomic_long_dec(&_totalram_pages);
67 }
68 
/*
 * Add @count to the _totalram_pages counter via atomic_long_add(). @count is
 * signed, so a negative value lowers the counter.
 */
69 static inline void totalram_pages_add(long count)
70 {
71 	atomic_long_add(count, &_totalram_pages);
72 }
73 
74 extern void * high_memory;
75 
76 /*
77  * Convert between pages and MB
78  * 20 is the shift for 1MB (2^20 = 1MB)
79  * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages)
80  * So (20 - PAGE_SHIFT) converts between pages and MB
81  */
82 #define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT))
83 #define MB_TO_PAGES(mb)    ((mb) << (20 - PAGE_SHIFT))
84 
85 #ifdef CONFIG_SYSCTL
86 extern int sysctl_legacy_va_layout;
87 #else
88 #define sysctl_legacy_va_layout 0
89 #endif
90 
91 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
92 extern const int mmap_rnd_bits_min;
93 extern int mmap_rnd_bits_max __ro_after_init;
94 extern int mmap_rnd_bits __read_mostly;
95 #endif
96 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
97 extern const int mmap_rnd_compat_bits_min;
98 extern const int mmap_rnd_compat_bits_max;
99 extern int mmap_rnd_compat_bits __read_mostly;
100 #endif
101 
102 #ifndef DIRECT_MAP_PHYSMEM_END
103 # ifdef MAX_PHYSMEM_BITS
104 # define DIRECT_MAP_PHYSMEM_END	((1ULL << MAX_PHYSMEM_BITS) - 1)
105 # else
106 # define DIRECT_MAP_PHYSMEM_END	(((phys_addr_t)-1)&~(1ULL<<63))
107 # endif
108 #endif
109 
110 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
111 
112 #include <asm/page.h>
113 #include <asm/processor.h>
114 
115 #ifndef __pa_symbol
116 #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
117 #endif
118 
119 #ifndef page_to_virt
120 #define page_to_virt(x)	__va(PFN_PHYS(page_to_pfn(x)))
121 #endif
122 
123 #ifndef lm_alias
124 #define lm_alias(x)	__va(__pa_symbol(x))
125 #endif
126 
127 /*
128  * To prevent common memory management code establishing
129  * a zero page mapping on a read fault.
130  * This macro should be defined within <asm/pgtable.h>.
131  * s390 does this to prevent multiplexing of hardware bits
132  * related to the physical page in case of virtualization.
133  */
134 #ifndef mm_forbids_zeropage
135 #define mm_forbids_zeropage(X)	(0)
136 #endif
137 
138 /*
139  * On some architectures it is expensive to call memset() for small sizes.
140  * If an architecture decides to implement their own version of
141  * mm_zero_struct_page they should wrap the defines below in a #ifndef and
142  * define their own version of this macro in <asm/pgtable.h>
143  */
144 #if BITS_PER_LONG == 64
145 /* This function must be updated when the size of struct page grows above 96
146  * or reduces below 56. The idea is that the compiler optimizes out the
147  * switch() statement and leaves only move/store instructions. The compiler
148  * can also combine write statements if they are both assignments and can be
149  * reordered; this can result in several of the writes here being dropped.
150  */
151 #define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
152 static inline void __mm_zero_struct_page(struct page *page)
153 {
	/* View the struct page as an array of unsigned long (8-byte) words. */
154 	unsigned long *_pp = (void *)page;
155 
156 	 /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
157 	BUILD_BUG_ON(sizeof(struct page) & 7);
158 	BUILD_BUG_ON(sizeof(struct page) < 56);
159 	BUILD_BUG_ON(sizeof(struct page) > 96);
160 
	/*
	 * Enter at the case matching sizeof(struct page) and fall through, so
	 * every word from the highest index down to word 0 is cleared.
	 */
161 	switch (sizeof(struct page)) {
162 	case 96:
163 		_pp[11] = 0;
164 		fallthrough;
165 	case 88:
166 		_pp[10] = 0;
167 		fallthrough;
168 	case 80:
169 		_pp[9] = 0;
170 		fallthrough;
171 	case 72:
172 		_pp[8] = 0;
173 		fallthrough;
174 	case 64:
175 		_pp[7] = 0;
176 		fallthrough;
177 	case 56:
178 		_pp[6] = 0;
179 		_pp[5] = 0;
180 		_pp[4] = 0;
181 		_pp[3] = 0;
182 		_pp[2] = 0;
183 		_pp[1] = 0;
184 		_pp[0] = 0;
185 	}
186 }
187 #else
188 #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
189 #endif
190 
191 /*
192  * Default maximum number of active map areas, this limits the number of vmas
193  * per mm struct. Users can overwrite this number by sysctl but there is a
194  * problem.
195  *
196  * When a program's coredump is generated in ELF format, a section is created
197  * per vma. In ELF, the number of sections is stored as an unsigned short.
198  * This means the number of sections should be smaller than 65535 at coredump.
199  * Because the kernel adds some informative sections to the image of the
200  * program when generating a coredump, we need some margin. The number of extra
201  * sections is 1-3 now and depends on arch. We use "5" as a safe margin here.
202  *
203  * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
204  * not a hard limit any more. Although some userspace tools can be surprised by
205  * that.
206  */
207 #define MAPCOUNT_ELF_CORE_MARGIN	(5)
208 #define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
209 
210 extern unsigned long sysctl_user_reserve_kbytes;
211 extern unsigned long sysctl_admin_reserve_kbytes;
212 
/*
 * page_range_contiguous() is only implemented out of line when SPARSEMEM is
 * used without SPARSEMEM_VMEMMAP. In every other configuration the inline
 * stub below unconditionally reports true.
 */
#if !defined(CONFIG_SPARSEMEM) || defined(CONFIG_SPARSEMEM_VMEMMAP)
static inline bool page_range_contiguous(const struct page *page,
					 unsigned long nr_pages)
{
	return true;
}
#else
bool page_range_contiguous(const struct page *page, unsigned long nr_pages);
#endif
222 
223 /* to align the pointer to the (next) page boundary */
224 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
225 
226 /* to align the pointer to the (prev) page boundary */
227 #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)
228 
229 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
230 #define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
231 
232 /**
233  * folio_page_idx - Return the number of a page in a folio.
234  * @folio: The folio.
235  * @page: The folio page.
236  *
237  * This function expects that the page is actually part of the folio.
238  * The returned number is relative to the start of the folio.
239  */
240 static inline unsigned long folio_page_idx(const struct folio *folio,
241 		const struct page *page)
242 {
243 	return page - &folio->page;
244 }
245 
246 static inline struct folio *lru_to_folio(struct list_head *head)
247 {
248 	return list_entry((head)->prev, struct folio, lru);
249 }
250 
251 void setup_initial_init_mm(void *start_code, void *end_code,
252 			   void *end_data, void *brk);
253 
254 /*
255  * Linux kernel virtual memory manager primitives.
256  * The idea being to have a "virtual" mm in the same way
257  * we have a virtual fs - giving a cleaner interface to the
258  * mm details, and allowing different kinds of memory mappings
259  * (from shared memory to executable loading to arbitrary
260  * mmap() functions).
261  */
262 
263 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
264 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
265 void vm_area_free(struct vm_area_struct *);
266 
267 #ifndef CONFIG_MMU
268 extern struct rb_root nommu_region_tree;
269 extern struct rw_semaphore nommu_region_sem;
270 
271 extern unsigned int kobjsize(const void *objp);
272 #endif
273 
274 /*
275  * vm_flags in vm_area_struct, see mm_types.h.
276  * When changing, update also include/trace/events/mmflags.h
277  */
278 
279 #define VM_NONE		0x00000000
280 
281 /**
282  * typedef vma_flag_t - specifies an individual VMA flag by bit number.
283  *
284  * This value is made type safe by sparse to avoid passing invalid flag values
285  * around.
286  */
287 typedef int __bitwise vma_flag_t;
288 
/*
 * Helpers used only to populate the enum below (both are #undef'd after it):
 *
 *   DECLARE_VMA_BIT(name, bitnum)        - defines VMA_<name>_BIT as bit number
 *                                          @bitnum, typed as vma_flag_t.
 *   DECLARE_VMA_BIT_ALIAS(name, aliased) - defines VMA_<name>_BIT with the same
 *                                          value as VMA_<aliased>_BIT.
 *
 * The enumerators are bit NUMBERS, not masks.
 */
289 #define DECLARE_VMA_BIT(name, bitnum) \
290 	VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
291 #define DECLARE_VMA_BIT_ALIAS(name, aliased) \
292 	VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT)
293 enum {
294 	DECLARE_VMA_BIT(READ, 0),
295 	DECLARE_VMA_BIT(WRITE, 1),
296 	DECLARE_VMA_BIT(EXEC, 2),
297 	DECLARE_VMA_BIT(SHARED, 3),
298 	/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
299 	DECLARE_VMA_BIT(MAYREAD, 4),	/* limits for mprotect() etc. */
300 	DECLARE_VMA_BIT(MAYWRITE, 5),
301 	DECLARE_VMA_BIT(MAYEXEC, 6),
302 	DECLARE_VMA_BIT(MAYSHARE, 7),
303 	DECLARE_VMA_BIT(GROWSDOWN, 8),	/* general info on the segment */
	/* Bit 9 has a different meaning depending on CONFIG_MMU. */
304 #ifdef CONFIG_MMU
305 	DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
306 #else
307 	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
308 	DECLARE_VMA_BIT(MAYOVERLAY, 9),
309 #endif /* CONFIG_MMU */
310 	/* Page-ranges managed without "struct page", just pure PFN */
311 	DECLARE_VMA_BIT(PFNMAP, 10),
312 	DECLARE_VMA_BIT(MAYBE_GUARD, 11),
313 	DECLARE_VMA_BIT(UFFD_WP, 12),	/* wrprotect pages tracking */
314 	DECLARE_VMA_BIT(LOCKED, 13),
315 	DECLARE_VMA_BIT(IO, 14),	/* Memory mapped I/O or similar */
316 	DECLARE_VMA_BIT(SEQ_READ, 15),	/* App will access data sequentially */
317 	DECLARE_VMA_BIT(RAND_READ, 16),	/* App will not benefit from clustered reads */
318 	DECLARE_VMA_BIT(DONTCOPY, 17),	/* Do not copy this vma on fork */
319 	DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
320 	DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
321 	DECLARE_VMA_BIT(ACCOUNT, 20),	/* Is a VM accounted object */
322 	DECLARE_VMA_BIT(NORESERVE, 21),	/* should the VM suppress accounting */
323 	DECLARE_VMA_BIT(HUGETLB, 22),	/* Huge TLB Page VM */
324 	DECLARE_VMA_BIT(SYNC, 23),	/* Synchronous page faults */
325 	DECLARE_VMA_BIT(ARCH_1, 24),	/* Architecture-specific flag */
326 	DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
327 	DECLARE_VMA_BIT(DONTDUMP, 26),	/* Do not include in the core dump */
328 	DECLARE_VMA_BIT(SOFTDIRTY, 27),	/* NOT soft dirty clean area */
329 	DECLARE_VMA_BIT(MIXEDMAP, 28),	/* Can contain struct page and pure PFN pages */
330 	DECLARE_VMA_BIT(HUGEPAGE, 29),	/* MADV_HUGEPAGE marked this vma */
331 	DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
332 	DECLARE_VMA_BIT(MERGEABLE, 31),	/* KSM may merge identical pages */
333 	/* These bits are reused, we define specific uses below. */
334 	DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
335 	DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
336 	DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
337 	DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
338 	DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
339 	DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
340 	DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
341 	/*
342 	 * This flag is used to connect VFIO to arch specific KVM code. It
343 	 * indicates that the memory under this VMA is safe for use with any
344 	 * non-cachable memory type inside KVM. Some VFIO devices, on some
345 	 * platforms, are thought to be unsafe and can cause machine crashes
346 	 * if KVM does not lock down the memory type.
347 	 */
348 	DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
	/*
	 * DROPPABLE aliases ARCH_1 on PPC32, uses bit 40 on 64-bit builds and is
	 * not declared at all in any other configuration.
	 */
349 #if defined(CONFIG_PPC32)
350 	DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
351 #elif defined(CONFIG_64BIT)
352 	DECLARE_VMA_BIT(DROPPABLE, 40),
353 #endif
354 	DECLARE_VMA_BIT(UFFD_MINOR, 41),
355 	DECLARE_VMA_BIT(SEALED, 42),
356 	/* Flags that reuse flags above. */
357 	DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
358 	DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
359 	DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
360 	DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
361 	DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
362 #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_RISCV_USER_CFI)
363 	/*
364 	 * VM_SHADOW_STACK should not be set with VM_SHARED because of a lack
365 	 * of support in core mm.
366 	 *
367 	 * These VMAs will get a single end guard page. This helps userspace
368 	 * protect itself from attacks. A single page is enough for current
369 	 * shadow stack archs (x86). See the comments near alloc_shstk() in
370 	 * arch/x86/kernel/shstk.c for more details on the guard size.
371 	 */
372 	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
373 #elif defined(CONFIG_ARM64_GCS)
374 	/*
375 	 * arm64's Guarded Control Stack implements similar functionality and
376 	 * has similar constraints to shadow stacks.
377 	 */
378 	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
379 #endif
380 	DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1),		/* Strong Access Ordering (powerpc) */
381 	DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1),		/* parisc */
382 	DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1),	/* sparc64 */
383 	DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1),	/* arm64 */
384 	DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1),	/* sparc64, arm64 */
385 	DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1),	/* !CONFIG_MMU */
386 	DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4),	/* arm64 */
387 	DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
	/*
	 * STACK follows the stack growth direction: GROWSUP when
	 * CONFIG_STACK_GROWSUP is set (where STACK_EARLY is GROWSDOWN),
	 * otherwise GROWSDOWN.
	 */
388 #ifdef CONFIG_STACK_GROWSUP
389 	DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
390 	DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
391 #else
392 	DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
393 #endif
394 };
395 #undef DECLARE_VMA_BIT
396 #undef DECLARE_VMA_BIT_ALIAS
397 
398 #define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
399 #define VM_READ		INIT_VM_FLAG(READ)
400 #define VM_WRITE	INIT_VM_FLAG(WRITE)
401 #define VM_EXEC		INIT_VM_FLAG(EXEC)
402 #define VM_SHARED	INIT_VM_FLAG(SHARED)
403 #define VM_MAYREAD	INIT_VM_FLAG(MAYREAD)
404 #define VM_MAYWRITE	INIT_VM_FLAG(MAYWRITE)
405 #define VM_MAYEXEC	INIT_VM_FLAG(MAYEXEC)
406 #define VM_MAYSHARE	INIT_VM_FLAG(MAYSHARE)
407 #define VM_GROWSDOWN	INIT_VM_FLAG(GROWSDOWN)
408 #ifdef CONFIG_MMU
409 #define VM_UFFD_MISSING	INIT_VM_FLAG(UFFD_MISSING)
410 #else
411 #define VM_UFFD_MISSING	VM_NONE
412 #define VM_MAYOVERLAY	INIT_VM_FLAG(MAYOVERLAY)
413 #endif
414 #define VM_PFNMAP	INIT_VM_FLAG(PFNMAP)
415 #define VM_MAYBE_GUARD	INIT_VM_FLAG(MAYBE_GUARD)
416 #define VM_UFFD_WP	INIT_VM_FLAG(UFFD_WP)
417 #define VM_LOCKED	INIT_VM_FLAG(LOCKED)
418 #define VM_IO		INIT_VM_FLAG(IO)
419 #define VM_SEQ_READ	INIT_VM_FLAG(SEQ_READ)
420 #define VM_RAND_READ	INIT_VM_FLAG(RAND_READ)
421 #define VM_DONTCOPY	INIT_VM_FLAG(DONTCOPY)
422 #define VM_DONTEXPAND	INIT_VM_FLAG(DONTEXPAND)
423 #define VM_LOCKONFAULT	INIT_VM_FLAG(LOCKONFAULT)
424 #define VM_ACCOUNT	INIT_VM_FLAG(ACCOUNT)
425 #define VM_NORESERVE	INIT_VM_FLAG(NORESERVE)
426 #define VM_HUGETLB	INIT_VM_FLAG(HUGETLB)
427 #define VM_SYNC		INIT_VM_FLAG(SYNC)
428 #define VM_ARCH_1	INIT_VM_FLAG(ARCH_1)
429 #define VM_WIPEONFORK	INIT_VM_FLAG(WIPEONFORK)
430 #define VM_DONTDUMP	INIT_VM_FLAG(DONTDUMP)
431 #ifdef CONFIG_MEM_SOFT_DIRTY
432 #define VM_SOFTDIRTY	INIT_VM_FLAG(SOFTDIRTY)
433 #else
434 #define VM_SOFTDIRTY	VM_NONE
435 #endif
436 #define VM_MIXEDMAP	INIT_VM_FLAG(MIXEDMAP)
437 #define VM_HUGEPAGE	INIT_VM_FLAG(HUGEPAGE)
438 #define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
439 #define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
440 #define VM_STACK	INIT_VM_FLAG(STACK)
441 #ifdef CONFIG_STACK_GROWSUP
442 #define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
443 #else
444 #define VM_STACK_EARLY	VM_NONE
445 #endif
446 #ifdef CONFIG_ARCH_HAS_PKEYS
447 #define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
448 /* Despite the naming, these are FLAGS not bits. */
449 #define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
450 #define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
451 #define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
452 #if CONFIG_ARCH_PKEY_BITS > 3
453 #define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
454 #else
455 #define VM_PKEY_BIT3  VM_NONE
456 #endif /* CONFIG_ARCH_PKEY_BITS > 3 */
457 #if CONFIG_ARCH_PKEY_BITS > 4
458 #define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
459 #else
460 #define VM_PKEY_BIT4  VM_NONE
461 #endif /* CONFIG_ARCH_PKEY_BITS > 4 */
462 #endif /* CONFIG_ARCH_HAS_PKEYS */
463 #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \
464 	defined(CONFIG_RISCV_USER_CFI)
465 #define VM_SHADOW_STACK	INIT_VM_FLAG(SHADOW_STACK)
466 #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT)
467 #else
468 #define VM_SHADOW_STACK	VM_NONE
469 #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT)
470 #endif
471 #if defined(CONFIG_PPC64)
472 #define VM_SAO		INIT_VM_FLAG(SAO)
473 #elif defined(CONFIG_PARISC)
474 #define VM_GROWSUP	INIT_VM_FLAG(GROWSUP)
475 #elif defined(CONFIG_SPARC64)
476 #define VM_SPARC_ADI	INIT_VM_FLAG(SPARC_ADI)
477 #define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
478 #elif defined(CONFIG_ARM64)
479 #define VM_ARM64_BTI	INIT_VM_FLAG(ARM64_BTI)
480 #define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
481 #elif !defined(CONFIG_MMU)
482 #define VM_MAPPED_COPY	INIT_VM_FLAG(MAPPED_COPY)
483 #endif
484 #ifndef VM_GROWSUP
485 #define VM_GROWSUP	VM_NONE
486 #endif
487 #ifdef CONFIG_ARM64_MTE
488 #define VM_MTE		INIT_VM_FLAG(MTE)
489 #define VM_MTE_ALLOWED	INIT_VM_FLAG(MTE_ALLOWED)
490 #else
491 #define VM_MTE		VM_NONE
492 #define VM_MTE_ALLOWED	VM_NONE
493 #endif
494 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
495 #define VM_UFFD_MINOR	INIT_VM_FLAG(UFFD_MINOR)
496 #else
497 #define VM_UFFD_MINOR	VM_NONE
498 #endif
499 
500 /*
501  * vma_flags_t masks for the userfaultfd VMA flags. VMA_UFFD_MINOR is gated on
502  * the same config as VM_UFFD_MINOR -- which implies 64BIT, where the bit fits
503  * -- so an out-of-range bit is never fed to mk_vma_flags() on a build whose
504  * bitmap cannot hold it.
505  */
506 #define VMA_UFFD_MISSING	mk_vma_flags(VMA_UFFD_MISSING_BIT)
507 #define VMA_UFFD_WP		mk_vma_flags(VMA_UFFD_WP_BIT)
508 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
509 #define VMA_UFFD_MINOR		mk_vma_flags(VMA_UFFD_MINOR_BIT)
510 #else
511 #define VMA_UFFD_MINOR		EMPTY_VMA_FLAGS
512 #endif
513 
514 #ifdef CONFIG_64BIT
515 #define VM_ALLOW_ANY_UNCACHED	INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
516 #define VM_SEALED		INIT_VM_FLAG(SEALED)
517 #else
518 #define VM_ALLOW_ANY_UNCACHED	VM_NONE
519 #define VM_SEALED		VM_NONE
520 #endif
521 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
522 #define VM_DROPPABLE		INIT_VM_FLAG(DROPPABLE)
523 #define VMA_DROPPABLE		mk_vma_flags(VMA_DROPPABLE_BIT)
524 #else
525 #define VM_DROPPABLE		VM_NONE
526 #define VMA_DROPPABLE		EMPTY_VMA_FLAGS
527 #endif
528 
529 /* Bits set in the VMA until the stack is in its final location */
530 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
531 
532 #define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \
533 		       VMA_EXEC_BIT : VMA_READ_BIT)
534 
535 /* Common data flag combinations */
536 #define VMA_DATA_FLAGS_TSK_EXEC	mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \
537 		TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT,	  \
538 		VMA_MAYEXEC_BIT)
539 #define VMA_DATA_FLAGS_NON_EXEC	mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \
540 		VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT)
541 #define VMA_DATA_FLAGS_EXEC	mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \
542 		VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT,	  \
543 		VMA_MAYEXEC_BIT)
544 
545 #ifndef VMA_DATA_DEFAULT_FLAGS		/* arch can override this */
546 #define VMA_DATA_DEFAULT_FLAGS  VMA_DATA_FLAGS_EXEC
547 #endif
548 
549 #ifndef VMA_STACK_DEFAULT_FLAGS		/* arch can override this */
550 #define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS
551 #endif
552 
553 #define VMA_STACK_FLAGS	append_vma_flags(VMA_STACK_DEFAULT_FLAGS,	\
554 		VMA_STACK_BIT, VMA_ACCOUNT_BIT)
555 
556 /* Temporary until VMA flags conversion complete. */
557 #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS)
558 
559 #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
560 #define VM_SEALED_SYSMAP	VM_SEALED
561 #else
562 #define VM_SEALED_SYSMAP	VM_NONE
563 #endif
564 
565 /* VMA basic access permission flags */
566 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
567 #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT)
568 
569 /*
570  * Special vmas that are non-mergable, non-mlock()able.
571  */
572 
573 #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \
574 				       VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT)
575 #define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS)
576 
577 /*
578  * Physically remapped pages are special. Tell the
579  * rest of the world about it:
580  *   IO tells people not to look at these pages
581  *	(accesses can have side effects).
582  *   PFNMAP tells the core MM that the base pages are just
583  *	raw PFN mappings, and do not have a "struct page" associated
584  *	with them.
585  *   DONTEXPAND
586  *      Disable vma merging and expanding with mremap().
587  *   DONTDUMP
588  *      Omit vma from core dump, even when VM_IO turned off.
589  */
590 #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT,	\
591 				     VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)
592 
593 /* This mask prevents VMA from being scanned with khugepaged */
594 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
595 
596 /* This mask defines which mm->def_flags a process can inherit its parent */
597 #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
598 
599 /* This mask represents all the VMA flag bits used by mlock */
600 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
601 
602 #define VMA_LOCKED_MASK	mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT)
603 
604 /* These flags can be updated atomically via VMA/mmap read lock. */
605 #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
606 
607 /* Arch-specific flags to clear when updating VM flags on protection change */
608 #ifndef VM_ARCH_CLEAR
609 #define VM_ARCH_CLEAR	VM_NONE
610 #endif
611 #define VM_FLAGS_CLEAR	(ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
612 
613 /*
614  * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
615  * possesses it but the other does not, the merged VMA should nonetheless have
616  * applied to it:
617  *
618  *   VMA_SOFTDIRTY_BIT - If a VMA is marked soft-dirty, that is, it has not had
619  *                       its references cleared via /proc/$pid/clear_refs, any
620  *                       merged VMA should be considered soft-dirty also, as it
621  *                       operates at a VMA granularity.
622  *
623  * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies
624  *                       that mapped page tables may contain metadata not
625  *                       described by the VMA and thus any merged VMA may also
626  *                       contain this metadata, and thus we must make this flag
627  *                       sticky.
628  */
629 #ifdef CONFIG_MEM_SOFT_DIRTY
630 #define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT)
631 #else
632 #define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT)
633 #endif
634 
635 /*
636  * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
637  * of these flags and the other not does not preclude a merge.
638  *
639  *    VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they
640  *                       are 'sticky'. If any sticky flags exist in either VMA,
641  *                       we simply set all of them on the merged VMA.
642  */
643 #define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS
644 
645 /*
646  * Flags which should result in page tables being copied on fork. These are
647  * flags which indicate that the VMA maps page tables which cannot be
648  * reconstituted upon page fault, so necessitate page table copying upon fork.
649  *
650  * Note that these flags should be compared with the DESTINATION VMA not the
651  * source, as VM_UFFD_WP may not be propagated to destination, while all other
652  * flags will be.
653  *
654  * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
655  *                           reasonably reconstructed on page fault.
656  *
657  *              VM_UFFD_WP - Encodes metadata about an installed uffd
658  *                           write protect handler, which cannot be
659  *                           reconstructed on page fault.
660  *
661  *                           We always copy pgtables when dst_vma has uffd-wp
662  *                           enabled even if it's file-backed
663  *                           (e.g. shmem). Because when uffd-wp is enabled,
664  *                           pgtable contains uffd-wp protection information,
665  *                           that's something we can't retrieve from page cache,
666  *                           and skipping the copy would lose that info.
667  *
668  *          VM_MAYBE_GUARD - Could contain page guard region markers which
669  *                           by design are a property of the page tables
670  *                           only and thus cannot be reconstructed on page
671  *                           fault.
672  */
673 #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
674 
675 /*
676  * mapping from the currently active vm_flags protection bits (the
677  * low four bits) to a page protection mask..
678  */
679 
680 /*
681  * The default fault flags that should be used by most of the
682  * arch-specific page fault handlers.
683  */
684 #define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
685 			     FAULT_FLAG_KILLABLE | \
686 			     FAULT_FLAG_INTERRUPTIBLE)
687 
688 /**
689  * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
690  * @flags: Fault flags.
691  *
692  * This is mostly used for places where we want to try to avoid taking
693  * the mmap_lock for too long a time when waiting for another condition
694  * to change, in which case we can try to be polite to release the
695  * mmap_lock in the first round to avoid potential starvation of other
696  * processes that would also want the mmap_lock.
697  *
698  * Return: true if the page fault allows retry and this is the first
699  * attempt of the fault handling; false otherwise.
700  */
701 static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
702 {
703 	return (flags & FAULT_FLAG_ALLOW_RETRY) &&
704 	    (!(flags & FAULT_FLAG_TRIED));
705 }
706 
707 #define FAULT_FLAG_TRACE \
708 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
709 	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
710 	{ FAULT_FLAG_ALLOW_RETRY,	"ALLOW_RETRY" }, \
711 	{ FAULT_FLAG_RETRY_NOWAIT,	"RETRY_NOWAIT" }, \
712 	{ FAULT_FLAG_KILLABLE,		"KILLABLE" }, \
713 	{ FAULT_FLAG_TRIED,		"TRIED" }, \
714 	{ FAULT_FLAG_USER,		"USER" }, \
715 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
716 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
717 	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
718 	{ FAULT_FLAG_VMA_LOCK,		"VMA_LOCK" }
719 
720 /*
721  * vm_fault is filled by the pagefault handler and passed to the vma's
722  * ->fault function. The vma's ->fault is responsible for returning a bitmask
723  * of VM_FAULT_xxx flags that give details about how the fault was handled.
724  *
725  * MM layer fills up gfp_mask for page allocations but fault handler might
726  * alter it if its implementation requires a different allocation context.
727  *
728  * pgoff should be used in favour of virtual_address, if possible.
729  */
730 struct vm_fault {
	/*
	 * Members of this anonymous struct are const-qualified, so they cannot
	 * be assigned through a struct vm_fault.
	 */
731 	const struct {
732 		struct vm_area_struct *vma;	/* Target VMA */
733 		gfp_t gfp_mask;			/* gfp mask to be used for allocations */
734 		pgoff_t pgoff;			/* Logical page offset based on vma */
735 		unsigned long address;		/* Faulting virtual address - masked */
736 		unsigned long real_address;	/* Faulting virtual address - unmasked */
737 	};
738 	enum fault_flag flags;		/* FAULT_FLAG_xxx flags
739 					 * XXX: should really be 'const' */
740 	pmd_t *pmd;			/* Pointer to pmd entry matching
741 					 * the 'address' */
742 	pud_t *pud;			/* Pointer to pud entry matching
743 					 * the 'address'
744 					 */
	/* orig_pte and orig_pmd share storage; only one is meaningful per fault. */
745 	union {
746 		pte_t orig_pte;		/* Value of PTE at the time of fault */
747 		pmd_t orig_pmd;		/* Value of PMD at the time of fault,
748 					 * used by PMD fault only.
749 					 */
750 	};
751 
752 	struct page *cow_page;		/* Page handler may use for COW fault */
753 	struct page *page;		/* ->fault handlers should return a
754 					 * page here, unless VM_FAULT_NOPAGE
755 					 * is set (which is also implied by
756 					 * VM_FAULT_ERROR).
757 					 */
758 	/* These three entries are valid only while holding ptl lock */
759 	pte_t *pte;			/* Pointer to pte entry matching
760 					 * the 'address'. NULL if the page
761 					 * table hasn't been allocated.
762 					 */
763 	spinlock_t *ptl;		/* Page table lock.
764 					 * Protects pte page table if 'pte'
765 					 * is not NULL, otherwise pmd.
766 					 */
767 	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
768 					 * vm_ops->map_pages() sets up a page
769 					 * table from atomic context.
770 					 * do_fault_around() pre-allocates
771 					 * page table to avoid allocation from
772 					 * atomic context.
773 					 */
774 };
775 
776 struct vm_uffd_ops;
777 
/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 *
 * At least some hooks are optional: vma_kernel_pagesize(), for example,
 * checks ->pagesize for NULL before calling it.
 */
struct vm_operations_struct {
	/**
	 * @open: Called when a VMA is remapped, split or forked. Not called
	 * upon first mapping a VMA.
	 * Context: User context.  May sleep.  Caller holds mmap_lock.
	 */
	void (*open)(struct vm_area_struct *vma);
	/**
	 * @close: Called when the VMA is being removed from the MM.
	 * Context: User context.  May sleep.  Caller holds mmap_lock.
	 */
	void (*close)(struct vm_area_struct *vma);
	/**
	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
	 * the new VMA is merged with an adjacent VMA.
	 *
	 * The @vm_private_data field is an output field allowing the user to
	 * modify vma->vm_private_data as necessary.
	 *
	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
	 * set from f_op->mmap.
	 *
	 * Returns %0 on success, or an error otherwise. On error, the VMA will
	 * be unmapped.
	 *
	 * Context: User context.  May sleep.  Caller holds mmap_lock.
	 */
	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
		      const struct file *file, void **vm_private_data);
	/* Called any time before splitting to check if it's allowed */
	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
	/*
	 * NOTE(review): not described here; presumably invoked when the VMA is
	 * moved by mremap() - confirm against the caller.
	 */
	int (*mremap)(struct vm_area_struct *vma);
	/*
	 * Called by mprotect() to make driver-specific permission
	 * checks before mprotect() is finalised.   The VMA must not
	 * be modified.  Returns 0 if mprotect() can proceed.
	 */
	int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, unsigned long newflags);
	/*
	 * NOTE(review): the contracts of the three fault hooks below are not
	 * documented in this structure; consult their callers.
	 */
	vm_fault_t (*fault)(struct vm_fault *vmf);
	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
	vm_fault_t (*map_pages)(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff);
	/*
	 * Optional. When set, vma_kernel_pagesize() returns this hook's result
	 * instead of PAGE_SIZE.
	 */
	unsigned long (*pagesize)(struct vm_area_struct *vma);

	/*
	 * Notification that a previously read-only page is about to become
	 * writable. If an error is returned it will cause a SIGBUS.
	 */
	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

	/* Same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

	/*
	 * Called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs. See also generic_access_phys() for a generic
	 * implementation useful for any iomem mapping.
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);

	/*
	 * Called by the /proc/PID/maps code to ask the vma whether it
	 * has a special name.  Returning non-NULL will also cause this
	 * vma to be dumped unconditionally.
	 */
	const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr, pgoff_t *ilx);
#endif
#ifdef CONFIG_FIND_NORMAL_PAGE
	/*
	 * Called by vm_normal_page() for special PTEs in @vma at @addr. This
	 * allows for returning a "normal" page from vm_normal_page() even
	 * though the PTE indicates that the "struct page" either does not exist
	 * or should not be touched: "special".
	 *
	 * Do not add new users: this really only works when a "normal" page
	 * was mapped, but then the PTE got changed to something weird (+
	 * marked special) that would not make pte_pfn() identify the originally
	 * inserted page.
	 */
	struct page *(*find_normal_page)(struct vm_area_struct *vma,
					 unsigned long addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
#ifdef CONFIG_USERFAULTFD
	/* userfaultfd-specific operations for this mapping type. */
	const struct vm_uffd_ops *uffd_ops;
#endif
};
889 
/*
 * Per-VMA NUMA balancing state helpers. Both are empty stubs unless
 * CONFIG_NUMA_BALANCING is enabled.
 */
#ifndef CONFIG_NUMA_BALANCING
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
}

static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
}
#else /* CONFIG_NUMA_BALANCING */
/* Start the VMA off with no NUMA balancing state attached. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
	vma->numab_state = NULL;
}

/* kfree() whatever NUMA balancing state the VMA points at. */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
	kfree(vma->numab_state);
}
#endif /* CONFIG_NUMA_BALANCING */
903 
904 /*
905  * These must be here rather than mmap_lock.h as dependent on vm_fault type,
906  * declared in this header.
907  */
908 #ifdef CONFIG_PER_VMA_LOCK
909 static inline void release_fault_lock(struct vm_fault *vmf)
910 {
911 	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
912 		vma_end_read(vmf->vma);
913 	else
914 		mmap_read_unlock(vmf->vma->vm_mm);
915 }
916 
917 static inline void assert_fault_locked(const struct vm_fault *vmf)
918 {
919 	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
920 		vma_assert_locked(vmf->vma);
921 	else
922 		mmap_assert_locked(vmf->vma->vm_mm);
923 }
924 #else
925 static inline void release_fault_lock(struct vm_fault *vmf)
926 {
927 	mmap_read_unlock(vmf->vma->vm_mm);
928 }
929 
930 static inline void assert_fault_locked(const struct vm_fault *vmf)
931 {
932 	mmap_assert_locked(vmf->vma->vm_mm);
933 }
934 #endif /* CONFIG_PER_VMA_LOCK */
935 
936 static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
937 {
938 	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
939 }
940 
941 static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm)
942 {
943 	return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
944 }
945 
946 static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm)
947 {
948 	return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
949 }
950 
951 static inline void mm_flags_set(int flag, struct mm_struct *mm)
952 {
953 	set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
954 }
955 
956 static inline void mm_flags_clear(int flag, struct mm_struct *mm)
957 {
958 	clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
959 }
960 
961 static inline void mm_flags_clear_all(struct mm_struct *mm)
962 {
963 	bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS);
964 }
965 
966 extern const struct vm_operations_struct vma_dummy_vm_ops;
967 
968 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
969 {
970 	memset(vma, 0, sizeof(*vma));
971 	vma->vm_mm = mm;
972 	vma->vm_ops = &vma_dummy_vm_ops;
973 	INIT_LIST_HEAD(&vma->anon_vma_chain);
974 	vma_lock_init(vma, false);
975 }
976 
/*
 * Overwrite the VMA's flags with @flags: all flag bits are cleared first, then
 * the flag word is written.
 *
 * Use when VMA is not part of the VMA tree and needs no locking. Warns (debug
 * builds) if VM_SOFTDIRTY is requested while soft-dirty is unsupported.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
				 vm_flags_t flags)
{
	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
	vma_flags_clear_all(&vma->flags);
	vma_flags_overwrite_word(&vma->flags, flags);
}
985 
/*
 * Use when VMA is part of the VMA tree and modifications need coordination.
 * Note: vm_flags_reset() and vma_flags_reset_once() do not lock the VMA; it
 * should be write-locked explicitly beforehand. vm_flags_reset() asserts that
 * the write lock is held before replacing the flags via vm_flags_init().
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
				  vm_flags_t flags)
{
	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
	vma_assert_write_locked(vma);
	vm_flags_init(vma, flags);
}
998 
999 static inline void vma_flags_reset_once(struct vm_area_struct *vma,
1000 					vma_flags_t *flags)
1001 {
1002 	const unsigned long word = flags->__vma_flags[0];
1003 
1004 	/* It is assumed only the first system word must be written once. */
1005 	vma_flags_overwrite_word_once(&vma->flags, word);
1006 	/* The remainder can be copied normally. */
1007 	if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
1008 		unsigned long *dst = &vma->flags.__vma_flags[1];
1009 		const unsigned long *src = &flags->__vma_flags[1];
1010 
1011 		bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG);
1012 	}
1013 }
1014 
/*
 * Set the bits of @flags in the VMA's flag word. vma_start_write() is called
 * on the VMA first, so the caller need not have done so.
 */
static inline void vm_flags_set(struct vm_area_struct *vma,
				vm_flags_t flags)
{
	vma_start_write(vma);
	vma_flags_set_word(&vma->flags, flags);
}
1021 
/*
 * Clear the bits of @flags in the VMA's flag word. vma_start_write() is
 * called on the VMA first. Warns (debug builds) if VM_SOFTDIRTY is passed
 * while soft-dirty is unsupported.
 */
static inline void vm_flags_clear(struct vm_area_struct *vma,
				  vm_flags_t flags)
{
	VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
	vma_start_write(vma);
	vma_flags_clear_word(&vma->flags, flags);
}
1029 
1030 /*
1031  * Use only if VMA is not part of the VMA tree or has no other users and
1032  * therefore needs no locking.
1033  */
1034 static inline void __vm_flags_mod(struct vm_area_struct *vma,
1035 				  vm_flags_t set, vm_flags_t clear)
1036 {
1037 	vm_flags_init(vma, (vma->vm_flags | set) & ~clear);
1038 }
1039 
/*
 * Set and clear flags on the VMA in one step, calling vma_start_write() on
 * it first and then delegating to __vm_flags_mod().
 *
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
				vm_flags_t set, vm_flags_t clear)
{
	vma_start_write(vma);
	__vm_flags_mod(vma, set, clear);
}
1050 
1051 static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma,
1052 		vma_flag_t bit)
1053 {
1054 	const vm_flags_t mask = BIT((__force int)bit);
1055 
1056 	/* Only specific flags are permitted */
1057 	if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
1058 		return false;
1059 
1060 	return true;
1061 }
1062 
1063 /*
1064  * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
1065  * valid flags are allowed to do this.
1066  */
1067 static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma,
1068 		vma_flag_t bit)
1069 {
1070 	unsigned long *bitmap = vma->flags.__vma_flags;
1071 
1072 	vma_assert_stabilised(vma);
1073 	if (__vma_atomic_valid_flag(vma, bit))
1074 		set_bit((__force int)bit, bitmap);
1075 }
1076 
1077 /*
1078  * Test for VMA flag atomically. Requires no locks. Only specific valid flags
1079  * are allowed to do this.
1080  *
1081  * This is necessarily racey, so callers must ensure that serialisation is
1082  * achieved through some other means, or that races are permissible.
1083  */
1084 static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma,
1085 		vma_flag_t bit)
1086 {
1087 	if (__vma_atomic_valid_flag(vma, bit))
1088 		return test_bit((__force int)bit, &vma->vm_flags);
1089 
1090 	return false;
1091 }
1092 
1093 /* Set an individual VMA flag in flags, non-atomically. */
1094 static __always_inline void vma_flags_set_flag(vma_flags_t *flags,
1095 		vma_flag_t bit)
1096 {
1097 	unsigned long *bitmap = flags->__vma_flags;
1098 
1099 	__set_bit((__force int)bit, bitmap);
1100 }
1101 
1102 static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags,
1103 		size_t count, const vma_flag_t *bits)
1104 {
1105 	int i;
1106 
1107 	for (i = 0; i < count; i++)
1108 		vma_flags_set_flag(&flags, bits[i]);
1109 	return flags;
1110 }
1111 
/*
 * Helper macro which bitwise-or combines the specified input flags into a
 * vma_flags_t bitmap value. The arguments are flag bit numbers (vma_flag_t),
 * not masks. E.g.:
 *
 * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT,
 *              VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT);
 *
 * The compiler cleverly optimises away all of the work and this ends up being
 * equivalent to aggregating the values manually.
 */
#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS,			\
		COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__})
1124 
/*
 * Helper macro which acts like mk_vma_flags, only appending to a copy of the
 * specified flags rather than establishing new flags. The flags argument is
 * passed by value and is not modified. E.g.:
 *
 * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT,
 *              VMA_ACCOUNT_BIT);
 */
#define append_vma_flags(flags, ...) __mk_vma_flags(flags,			\
		COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__})
1134 
1135 /* Calculates the number of set bits in the specified VMA flags. */
1136 static __always_inline int vma_flags_count(const vma_flags_t *flags)
1137 {
1138 	const unsigned long *bitmap = flags->__vma_flags;
1139 
1140 	return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS);
1141 }
1142 
1143 /*
1144  * Test whether a specific VMA flag is set, e.g.:
1145  *
1146  * if (vma_flags_test(flags, VMA_READ_BIT)) { ... }
1147  */
1148 static __always_inline bool vma_flags_test(const vma_flags_t *flags,
1149 		vma_flag_t bit)
1150 {
1151 	const unsigned long *bitmap = flags->__vma_flags;
1152 
1153 	return test_bit((__force int)bit, bitmap);
1154 }
1155 
1156 /*
1157  * Obtain a set of VMA flags which contain the overlapping flags contained
1158  * within flags and to_and.
1159  */
1160 static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags,
1161 						      vma_flags_t to_and)
1162 {
1163 	vma_flags_t dst;
1164 	unsigned long *bitmap_dst = dst.__vma_flags;
1165 	const unsigned long *bitmap = flags->__vma_flags;
1166 	const unsigned long *bitmap_to_and = to_and.__vma_flags;
1167 
1168 	bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS);
1169 	return dst;
1170 }
1171 
/*
 * Obtain a set of VMA flags which contains the specified overlapping flags,
 * e.g.:
 *
 * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT,
 *                                        VMA_MAYREAD_BIT);
 */
#define vma_flags_and(flags, ...)				\
	vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__))
1181 
1182 /*  Test each of to_test flags in flags, non-atomically. */
1183 static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags,
1184 		vma_flags_t to_test)
1185 {
1186 	const unsigned long *bitmap = flags->__vma_flags;
1187 	const unsigned long *bitmap_to_test = to_test.__vma_flags;
1188 
1189 	return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
1190 }
1191 
/*
 * Test whether any specified VMA flag is set, e.g.:
 *
 * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
 *
 * where flags is a pointer to a vma_flags_t.
 */
#define vma_flags_test_any(flags, ...) \
	vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__))
1199 
1200 /* Test that ALL of the to_test flags are set, non-atomically. */
1201 static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
1202 		vma_flags_t to_test)
1203 {
1204 	const unsigned long *bitmap = flags->__vma_flags;
1205 	const unsigned long *bitmap_to_test = to_test.__vma_flags;
1206 
1207 	return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
1208 }
1209 
/*
 * Test whether ALL specified VMA flags are set, e.g.:
 *
 * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
 *
 * where flags is a pointer to a vma_flags_t.
 */
#define vma_flags_test_all(flags, ...) \
	vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
1217 
/*
 * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set
 * (returning false if flagmask has no flags set).
 *
 * This is defined to make the semantics clearer when testing an optionally
 * defined VMA flags mask, e.g.:
 *
 * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... }
 *
 * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS
 * otherwise.
 *
 * Passing a flagmask with more than one flag set is a caller bug and triggers
 * a one-off warning on debug builds.
 */
static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags,
		vma_flags_t flagmask)
{
	VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1);

	return vma_flags_test_any_mask(flags, flagmask);
}
1237 
1238 /* Set each of the to_set flags in flags, non-atomically. */
1239 static __always_inline void vma_flags_set_mask(vma_flags_t *flags,
1240 		vma_flags_t to_set)
1241 {
1242 	unsigned long *bitmap = flags->__vma_flags;
1243 	const unsigned long *bitmap_to_set = to_set.__vma_flags;
1244 
1245 	bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
1246 }
1247 
/*
 * Set all specified VMA flags in the vma_flags_t pointed to by flags,
 * non-atomically, e.g.:
 *
 * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
 */
#define vma_flags_set(flags, ...) \
	vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
1255 
1256 static __always_inline vma_flags_t __mk_vma_flags_from_masks(size_t count,
1257 		const vma_flags_t *masks)
1258 {
1259 	vma_flags_t flags = EMPTY_VMA_FLAGS;
1260 	size_t i;
1261 
1262 	for (i = 0; i < count; i++)
1263 		vma_flags_set_mask(&flags, masks[i]);
1264 	return flags;
1265 }
1266 
/*
 * Combine pre-computed vma_flags_t masks into one value (their union), e.g.:
 *
 * vma_flags_t flags = mk_vma_flags_from_masks(VMA_UFFD_WP, VMA_UFFD_MINOR);
 *
 * Unlike mk_vma_flags(), which takes bit numbers, this takes whole masks --
 * each of which may be EMPTY_VMA_FLAGS when its feature is unavailable -- so a
 * bit that does not exist on the current build is never materialised.
 */
#define mk_vma_flags_from_masks(...)					\
	__mk_vma_flags_from_masks(COUNT_ARGS(__VA_ARGS__),		\
		(const vma_flags_t []){__VA_ARGS__})
1279 
1280 /* Clear all of the to-clear flags in flags, non-atomically. */
1281 static __always_inline void vma_flags_clear_mask(vma_flags_t *flags,
1282 		vma_flags_t to_clear)
1283 {
1284 	unsigned long *bitmap = flags->__vma_flags;
1285 	const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
1286 
1287 	bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
1288 }
1289 
/*
 * Clear all specified individual flags in the vma_flags_t pointed to by
 * flags, non-atomically, e.g.:
 *
 * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
 */
#define vma_flags_clear(flags, ...) \
	vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
1297 
1298 /*
1299  * Obtain a VMA flags value containing those flags that are present in flags or
1300  * flags_other but not in both.
1301  */
1302 static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags,
1303 		const vma_flags_t *flags_other)
1304 {
1305 	vma_flags_t dst;
1306 	const unsigned long *bitmap_other = flags_other->__vma_flags;
1307 	const unsigned long *bitmap = flags->__vma_flags;
1308 	unsigned long *bitmap_dst = dst.__vma_flags;
1309 
1310 	bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS);
1311 	return dst;
1312 }
1313 
1314 /* Determine if flags and flags_other have precisely the same flags set. */
1315 static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags,
1316 						const vma_flags_t *flags_other)
1317 {
1318 	const unsigned long *bitmap = flags->__vma_flags;
1319 	const unsigned long *bitmap_other = flags_other->__vma_flags;
1320 
1321 	return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS);
1322 }
1323 
1324 /* Determine if flags and flags_other have precisely the same flags set.  */
1325 static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags,
1326 						vma_flags_t flags_other)
1327 {
1328 	const unsigned long *bitmap = flags->__vma_flags;
1329 	const unsigned long *bitmap_other = flags_other.__vma_flags;
1330 
1331 	return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS);
1332 }
1333 
/*
 * Helper macro to determine if only the specific flags are set, e.g.:
 *
 * if (vma_flags_same(&flags, VMA_WRITE_BIT)) { ... }
 */
#define vma_flags_same(flags, ...) \
	vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__))
1341 
/*
 * Test whether a specific flag in the VMA is set, e.g.:
 *
 * if (vma_test(vma, VMA_READ_BIT)) { ... }
 *
 * No lock is taken or asserted by this helper.
 */
static __always_inline bool vma_test(const struct vm_area_struct *vma,
		vma_flag_t bit)
{
	return vma_flags_test(&vma->flags, bit);
}
1352 
/* Helper to test whether ANY of the given flags are set in a VMA. */
static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma,
		vma_flags_t flags)
{
	return vma_flags_test_any_mask(&vma->flags, flags);
}
1359 
/*
 * Helper macro for testing whether any of the listed VMA flags are set in a
 * VMA, e.g.:
 *
 * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT,
 *		VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
 */
#define vma_test_any(vma, ...) \
	vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__))
1369 
/*
 * Helper to test that ALL specified flags are set in a VMA, i.e. that flags
 * is a subset of the VMA's flags.
 *
 * Note: appropriate locks must be held, this function does not acquire them for
 * you.
 */
static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma,
		vma_flags_t flags)
{
	return vma_flags_test_all_mask(&vma->flags, flags);
}
1381 
/*
 * Helper macro for checking that ALL specified flags are set in a VMA, e.g.:
 *
 * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
 */
#define vma_test_all(vma, ...) \
	vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__))
1389 
/*
 * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set
 * (returning false if flagmask has no flags set).
 *
 * This is useful when a flag needs to be either defined or not depending upon
 * kernel configuration, e.g.:
 *
 * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... }
 *
 * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS
 * otherwise.
 *
 * Thin wrapper around vma_flags_test_single_mask() applied to the VMA's flags.
 */
static __always_inline bool
vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask)
{
	return vma_flags_test_single_mask(&vma->flags, flagmask);
}
1407 
/*
 * Helper to set all of the given VMA flags in a VMA (non-atomically; existing
 * flags are kept).
 *
 * Note: appropriate locks must be held, this function does not acquire them for
 * you.
 */
static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma,
		vma_flags_t flags)
{
	vma_flags_set_mask(&vma->flags, flags);
}
1419 
/*
 * Helper macro for setting the listed VMA flags in a VMA, e.g.:
 *
 * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
 * 		VMA_DONTDUMP_BIT);
 *
 * Note: appropriate locks must be held, this function does not acquire them for
 * you.
 */
#define vma_set_flags(vma, ...) \
	vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
1431 
/*
 * Helper to clear all of the given VMA flags in a VMA (non-atomically). No
 * lock is taken or asserted by this helper.
 */
static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma,
		vma_flags_t flags)
{
	vma_flags_clear_mask(&vma->flags, flags);
}
1438 
/*
 * Helper macro for clearing the listed VMA flags in a VMA, e.g.:
 *
 * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
 * 		VMA_DONTDUMP_BIT);
 */
#define vma_clear_flags(vma, ...) \
	vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
1447 
/*
 * Test whether a specific VMA flag is set in a VMA descriptor's vma_flags,
 * e.g.:
 *
 * if (vma_desc_test(desc, VMA_READ_BIT)) { ... }
 */
static __always_inline bool vma_desc_test(const struct vm_area_desc *desc,
		vma_flag_t bit)
{
	return vma_flags_test(&desc->vma_flags, bit);
}
1458 
/* Helper to test whether ANY of the given flags are set in a VMA descriptor. */
static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc,
		vma_flags_t flags)
{
	return vma_flags_test_any_mask(&desc->vma_flags, flags);
}
1465 
/*
 * Helper macro for testing whether any of the listed VMA flags are set in a
 * VMA descriptor, e.g.:
 *
 * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT,
 *		VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
 */
#define vma_desc_test_any(desc, ...) \
	vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__))
1475 
/* Helper to test that ALL of the given flags are set in a VMA descriptor. */
static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc,
		vma_flags_t flags)
{
	return vma_flags_test_all_mask(&desc->vma_flags, flags);
}
1482 
/*
 * Helper macro for testing whether ALL of the listed VMA flags are set in a
 * VMA descriptor, e.g.:
 *
 * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
 */
#define vma_desc_test_all(desc, ...) \
	vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__))
1491 
/*
 * Helper to set all of the given VMA flags in a VMA descriptor (existing flags
 * are kept).
 */
static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
		vma_flags_t flags)
{
	vma_flags_set_mask(&desc->vma_flags, flags);
}
1498 
/*
 * Helper macro for setting the listed VMA flags via an input pointer to a
 * struct vm_area_desc object describing a proposed VMA, e.g.:
 *
 * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
 * 		VMA_DONTDUMP_BIT);
 */
#define vma_desc_set_flags(desc, ...) \
	vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
1508 
/* Helper to clear all of the given VMA flags in a VMA descriptor. */
static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
		vma_flags_t flags)
{
	vma_flags_clear_mask(&desc->vma_flags, flags);
}
1515 
/*
 * Helper macro for clearing the listed VMA flags via an input pointer to a
 * struct vm_area_desc object describing a proposed VMA, e.g.:
 *
 * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
 * 		VMA_DONTDUMP_BIT);
 */
#define vma_desc_clear_flags(desc, ...) \
	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
1525 
/*
 * Mark @vma as anonymous by clearing its vm_ops; vma_is_anonymous() tests
 * exactly this.
 */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
	vma->vm_ops = NULL;
}
1530 
/* Descriptor counterpart of vma_set_anonymous(): clear the proposed vm_ops. */
static inline void vma_desc_set_anonymous(struct vm_area_desc *desc)
{
	desc->vm_ops = NULL;
}
1535 
1536 static inline bool vma_is_anonymous(struct vm_area_struct *vma)
1537 {
1538 	return !vma->vm_ops;
1539 }
1540 
1541 /*
1542  * Indicate if the VMA is a heap for the given task; for
1543  * /proc/PID/maps that is the heap of the main task.
1544  */
1545 static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
1546 {
1547 	return vma->vm_start < vma->vm_mm->brk &&
1548 		vma->vm_end > vma->vm_mm->start_brk;
1549 }
1550 
1551 /*
1552  * Indicate if the VMA is a stack for the given task; for
1553  * /proc/PID/maps that is the stack of the main task.
1554  */
1555 static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
1556 {
1557 	/*
1558 	 * We make no effort to guess what a given thread considers to be
1559 	 * its "stack".  It's not even well-defined for programs written
1560 	 * languages like Go.
1561 	 */
1562 	return vma->vm_start <= vma->vm_mm->start_stack &&
1563 		vma->vm_end >= vma->vm_mm->start_stack;
1564 }
1565 
1566 static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma)
1567 {
1568 	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1569 
1570 	if (!maybe_stack)
1571 		return false;
1572 
1573 	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
1574 						VM_STACK_INCOMPLETE_SETUP)
1575 		return true;
1576 
1577 	return false;
1578 }
1579 
1580 static inline bool vma_is_foreign(const struct vm_area_struct *vma)
1581 {
1582 	if (!current->mm)
1583 		return true;
1584 
1585 	if (current->mm != vma->vm_mm)
1586 		return true;
1587 
1588 	return false;
1589 }
1590 
1591 static inline bool vma_is_accessible(const struct vm_area_struct *vma)
1592 {
1593 	return vma->vm_flags & VM_ACCESS_FLAGS;
1594 }
1595 
/* Return true if BOTH the shared and the may-write flags are set in @flags. */
static inline bool is_shared_maywrite(const vma_flags_t *flags)
{
	return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
}
1600 
/* VMA wrapper around is_shared_maywrite(), applied to the VMA's flags. */
static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
{
	return is_shared_maywrite(&vma->flags);
}
1605 
1606 /**
1607  * vma_kernel_pagesize - Default page size granularity for this VMA.
1608  * @vma: The user mapping.
1609  *
1610  * The kernel page size specifies in which granularity VMA modifications
1611  * can be performed. Folios in this VMA will be aligned to, and at least
1612  * the size of the number of bytes returned by this function.
1613  *
1614  * The default kernel page size is not affected by Transparent Huge Pages
1615  * being in effect.
1616  *
1617  * Return: The default page size granularity for this VMA.
1618  */
1619 static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1620 {
1621 	if (unlikely(vma->vm_ops && vma->vm_ops->pagesize))
1622 		return vma->vm_ops->pagesize(vma);
1623 	return PAGE_SIZE;
1624 }
1625 
1626 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);
1627 
1628 static inline
1629 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
1630 {
1631 	return mas_find(&vmi->mas, max - 1);
1632 }
1633 
/*
 * Advance the iterator and return the VMA found, searching up to ULONG_MAX.
 * for_each_vma() treats a NULL return as the end of the walk.
 */
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
	/*
	 * Uses mas_find() to get the first VMA when the iterator starts.
	 * Calling mas_next() could skip the first entry.
	 */
	return mas_find(&vmi->mas, ULONG_MAX);
}
1642 
/*
 * Step the iterator with mas_next_range() (limit ULONG_MAX) and return its
 * result.
 * NOTE(review): presumably this may land on an empty range and return NULL
 * without the walk being over - confirm against the maple tree documentation.
 */
static inline
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
{
	return mas_next_range(&vmi->mas, ULONG_MAX);
}
1648 
1649 
/*
 * Step the iterator backwards with mas_prev(), with a lower limit of 0, and
 * return its result.
 */
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
	return mas_prev(&vmi->mas, 0);
}
1654 
1655 static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
1656 			unsigned long start, unsigned long end, gfp_t gfp)
1657 {
1658 	__mas_set_range(&vmi->mas, start, end - 1);
1659 	mas_store_gfp(&vmi->mas, NULL, gfp);
1660 	if (unlikely(mas_is_err(&vmi->mas)))
1661 		return -ENOMEM;
1662 
1663 	return 0;
1664 }
1665 
/* Free any unused preallocations held by the iterator, via mas_destroy(). */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
	mas_destroy(&vmi->mas);
}
1671 
1672 static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
1673 				      struct vm_area_struct *vma)
1674 {
1675 	vmi->mas.index = vma->vm_start;
1676 	vmi->mas.last = vma->vm_end - 1;
1677 	mas_store(&vmi->mas, vma);
1678 	if (unlikely(mas_is_err(&vmi->mas)))
1679 		return -ENOMEM;
1680 
1681 	vma_mark_attached(vma);
1682 	return 0;
1683 }
1684 
/*
 * Invalidate the iterator's cached position by pausing the underlying maple
 * state with mas_pause().
 */
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
	mas_pause(&vmi->mas);
}
1689 
/* Reposition the iterator at address @addr, via mas_set(). */
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
{
	mas_set(&vmi->mas, addr);
}
1694 
/*
 * Iterate over the VMAs reachable from iterator __vmi (a struct vma_iterator,
 * not a pointer), assigning each to __vma until vma_next() returns NULL.
 */
#define for_each_vma(__vmi, __vma)					\
	while (((__vma) = vma_next(&(__vmi))) != NULL)
1697 
/*
 * As for_each_vma(), but stop at __end. The MM code likes to work with
 * exclusive end addresses, so __end itself is not included (see vma_find()).
 */
#define for_each_vma_range(__vmi, __vma, __end)				\
	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
1701 
#ifndef CONFIG_SHMEM
/* Without CONFIG_SHMEM no VMA is ever reported as shmem-backed. */
static inline bool vma_is_shmem(const struct vm_area_struct *vma)
{
	return false;
}

static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
	return false;
}
#else /* CONFIG_SHMEM */
/*
 * The vma_is_shmem is not inline because it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(const struct vm_area_struct *vma);
bool vma_is_anon_shmem(const struct vm_area_struct *vma);
#endif /* CONFIG_SHMEM */
1713 
1714 int vma_is_stack_for_current(const struct vm_area_struct *vma);
1715 
/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 *
 * This expands to a designated initialiser that sets only .vm_mm and
 * .vm_flags, for building a throwaway VMA to pass to it.
 */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
1718 
1719 struct mmu_gather;
1720 struct inode;
1721 
1722 extern void prep_compound_page(struct page *page, unsigned int order);
1723 
/*
 * Return the order stored in the low 8 bits of the folio's _flags_1 word.
 * No check is made that the folio is large; folio_order() and
 * compound_order() perform that check before calling this.
 */
static inline unsigned int folio_large_order(const struct folio *folio)
{
	return folio->_flags_1 & 0xff;
}
1728 
/*
 * Number of pages in a large folio: read from the cached _nr_pages field when
 * NR_PAGES_IN_LARGE_FOLIO is defined, otherwise derived from the order.
 */
#ifndef NR_PAGES_IN_LARGE_FOLIO
static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
	return 1L << folio_large_order(folio);
}
#else /* NR_PAGES_IN_LARGE_FOLIO */
static inline unsigned long folio_large_nr_pages(const struct folio *folio)
{
	return folio->_nr_pages;
}
#endif /* NR_PAGES_IN_LARGE_FOLIO */
1740 
1741 /*
1742  * compound_order() can be called without holding a reference, which means
1743  * that niceties like page_folio() don't work.  These callers should be
1744  * prepared to handle wild return values.  For example, PG_head may be
1745  * set before the order is initialised, or this may be a tail page.
1746  * See compaction.c for some good examples.
1747  */
1748 static inline unsigned int compound_order(const struct page *page)
1749 {
1750 	const struct folio *folio = (struct folio *)page;
1751 
1752 	if (!test_bit(PG_head, &folio->flags.f))
1753 		return 0;
1754 	return folio_large_order(folio);
1755 }
1756 
1757 /**
1758  * folio_order - The allocation order of a folio.
1759  * @folio: The folio.
1760  *
1761  * A folio is composed of 2^order pages.  See get_order() for the definition
1762  * of order.
1763  *
1764  * Return: The order of the folio.
1765  */
static inline unsigned int folio_order(const struct folio *folio)
{
	/* A folio that is not large is reported as order 0. */
	return folio_test_large(folio) ? folio_large_order(folio) : 0;
}
1772 
1773 /**
1774  * folio_reset_order - Reset the folio order and derived _nr_pages
1775  * @folio: The folio.
1776  *
1777  * Reset the order and derived _nr_pages to 0. Must only be used in the
1778  * process of splitting large folios.
1779  */
1780 static inline void folio_reset_order(struct folio *folio)
1781 {
1782 	if (WARN_ON_ONCE(!folio_test_large(folio)))
1783 		return;
1784 	folio->_flags_1 &= ~0xffUL;
1785 #ifdef NR_PAGES_IN_LARGE_FOLIO
1786 	folio->_nr_pages = 0;
1787 #endif
1788 }
1789 
1790 #include <linux/huge_mm.h>
1791 
1792 /*
1793  * Methods to modify the page usage count.
1794  *
1795  * What counts for a page usage:
1796  * - cache mapping   (page->mapping)
1797  * - private data    (page->private)
1798  * - page mapped in a task's page tables, each mapping
1799  *   is counted separately
1800  *
1801  * Also, many kernel routines increase the page count before a critical
1802  * routine so they can be sure the page doesn't go away from under them.
1803  */
1804 
1805 /*
1806  * Drop a ref, return true if the refcount fell to zero (the page has no users)
1807  */
static inline int put_page_testzero(struct page *page)
{
	int hit_zero;

	/* There must be a reference to drop. */
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);

	hit_zero = page_ref_dec_and_test(page);
	return hit_zero;
}
1813 
1814 static inline int folio_put_testzero(struct folio *folio)
1815 {
1816 	return put_page_testzero(&folio->page);
1817 }
1818 
1819 /*
1820  * Try to grab a ref unless the page has a refcount of zero, return false if
1821  * that is the case.
1822  * This can be called when MMU is off so it must not access
1823  * any of the virtual mappings.
1824  */
1825 static inline bool get_page_unless_zero(struct page *page)
1826 {
1827 	return page_ref_add_unless_zero(page, 1);
1828 }
1829 
1830 static inline struct folio *folio_get_nontail_page(struct page *page)
1831 {
1832 	if (unlikely(!get_page_unless_zero(page)))
1833 		return NULL;
1834 	return (struct folio *)page;
1835 }
1836 
1837 extern int page_is_ram(unsigned long pfn);
1838 
/*
 * Region classification codes (presumably the results of
 * region_intersects() declared just below -- confirm against its
 * implementation).  The numeric values are spelled out explicitly.
 */
enum {
	REGION_INTERSECTS = 0,
	REGION_DISJOINT = 1,
	REGION_MIXED = 2,
};
1844 
1845 int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
1846 		      unsigned long desc);
1847 
1848 /* Support for virtually mapped pages */
1849 struct page *vmalloc_to_page(const void *addr);
1850 unsigned long vmalloc_to_pfn(const void *addr);
1851 
1852 /*
1853  * Determine if an address is within the vmalloc range
1854  *
1855  * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
1856  * is no special casing required.
1857  */
1858 #ifdef CONFIG_MMU
1859 extern bool is_vmalloc_addr(const void *x);
1860 extern int is_vmalloc_or_module_addr(const void *x);
1861 #else
/* !CONFIG_MMU stub: every address is reported as not being vmalloc. */
static inline bool is_vmalloc_addr(const void *x)
{
	return false;
}
/* !CONFIG_MMU stub: always returns 0, whatever the address. */
static inline int is_vmalloc_or_module_addr(const void *x)
{
	return 0;
}
1870 #endif
1871 
1872 /*
1873  * How many times the entire folio is mapped as a single unit (eg by a
1874  * PMD or PUD entry).  This is probably not what you want, except for
1875  * debugging purposes or implementation of other core folio_*() primitives.
1876  */
1877 static inline int folio_entire_mapcount(const struct folio *folio)
1878 {
1879 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1880 	if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1))
1881 		return 0;
1882 	return atomic_read(&folio->_entire_mapcount) + 1;
1883 }
1884 
1885 static inline int folio_large_mapcount(const struct folio *folio)
1886 {
1887 	VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
1888 	return atomic_read(&folio->_large_mapcount) + 1;
1889 }
1890 
1891 /**
1892  * folio_mapcount() - Number of mappings of this folio.
1893  * @folio: The folio.
1894  *
1895  * The folio mapcount corresponds to the number of present user page table
1896  * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
1898  *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
1900  * exactly once.
1901  *
1902  * For hugetlb folios, each abstracted "hugetlb" user page table entry that
1903  * references the entire folio counts exactly once, even when such special
1904  * page table entries are comprised of multiple ordinary page table entries.
1905  *
1906  * Will report 0 for pages which cannot be mapped into userspace, such as
1907  * slab, page tables and similar.
1908  *
1909  * Return: The number of times this folio is mapped.
1910  */
1911 static inline int folio_mapcount(const struct folio *folio)
1912 {
1913 	int mapcount;
1914 
1915 	if (likely(!folio_test_large(folio))) {
1916 		mapcount = atomic_read(&folio->_mapcount) + 1;
1917 		if (page_mapcount_is_type(mapcount))
1918 			mapcount = 0;
1919 		return mapcount;
1920 	}
1921 	return folio_large_mapcount(folio);
1922 }
1923 
1924 /**
1925  * folio_mapped - Is this folio mapped into userspace?
1926  * @folio: The folio.
1927  *
1928  * Return: True if any page in this folio is referenced by user page tables.
1929  */
1930 static inline bool folio_mapped(const struct folio *folio)
1931 {
1932 	return folio_mapcount(folio) >= 1;
1933 }
1934 
/*
 * Translate address @x with virt_to_page(), then return
 * compound_head() of the resulting page.
 */
static inline struct page *virt_to_head_page(const void *x)
{
	struct page *pg = virt_to_page(x);

	return compound_head(pg);
}
1941 
/*
 * Translate address @x with virt_to_page(), then return
 * page_folio() of the resulting page.
 */
static inline struct folio *virt_to_folio(const void *x)
{
	struct page *pg = virt_to_page(x);

	return page_folio(pg);
}
1948 
1949 void __folio_put(struct folio *folio);
1950 
1951 void split_page(struct page *page, unsigned int order);
1952 void folio_copy(struct folio *dst, struct folio *src);
1953 int folio_mc_copy(struct folio *dst, struct folio *src);
1954 
1955 unsigned long nr_free_buffer_pages(void);
1956 
1957 /* Returns the number of bytes in this potentially compound page. */
1958 static inline unsigned long page_size(const struct page *page)
1959 {
1960 	return PAGE_SIZE << compound_order(page);
1961 }
1962 
1963 /* Returns the number of bits needed for the number of bytes in a page */
1964 static inline unsigned int page_shift(struct page *page)
1965 {
1966 	return PAGE_SHIFT + compound_order(page);
1967 }
1968 
1969 /**
1970  * thp_order - Order of a transparent huge page.
1971  * @page: Head page of a transparent huge page.
1972  */
static inline unsigned int thp_order(struct page *page)
{
	unsigned int order;

	/* Tail pages are not accepted here. */
	VM_BUG_ON_PGFLAGS(PageTail(page), page);

	order = compound_order(page);
	return order;
}
1978 
1979 /**
1980  * thp_size - Size of a transparent huge page.
1981  * @page: Head page of a transparent huge page.
1982  *
1983  * Return: Number of bytes in this page.
1984  */
1985 static inline unsigned long thp_size(struct page *page)
1986 {
1987 	return PAGE_SIZE << thp_order(page);
1988 }
1989 
1990 #ifdef CONFIG_MMU
1991 /*
1992  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
1994  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
1995  * that do not have writing enabled, when used by access_process_vm.
1996  */
1997 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1998 {
1999 	if (likely(vma->vm_flags & VM_WRITE))
2000 		pte = pte_mkwrite(pte, vma);
2001 	return pte;
2002 }
2003 
2004 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page);
2005 void set_pte_range(struct vm_fault *vmf, struct folio *folio,
2006 		struct page *page, unsigned int nr, unsigned long addr);
2007 
2008 vm_fault_t finish_fault(struct vm_fault *vmf);
2009 #endif
2010 
2011 /*
2012  * Multiple processes may "see" the same page. E.g. for untouched
2013  * mappings of /dev/null, all processes see the same page full of
2014  * zeroes, and text pages of executables and shared libraries have
2015  * only one copy in memory, at most, normally.
2016  *
2017  * For the non-reserved pages, page_count(page) denotes a reference count.
2018  *   page_count() == 0 means the page is free. page->lru is then used for
2019  *   freelist management in the buddy allocator.
2020  *   page_count() > 0  means the page has been allocated.
2021  *
2022  * Pages are allocated by the slab allocator in order to provide memory
2023  * to kmalloc and kmem_cache_alloc. In this case, the management of the
2024  * page, and the fields in 'struct page' are the responsibility of mm/slab.c
2025  * unless a particular usage is carefully commented. (the responsibility of
2026  * freeing the kmalloc memory is the caller's, of course).
2027  *
2028  * A page may be used by anyone else who does a __get_free_page().
2029  * In this case, page_count still tracks the references, and should only
2030  * be used through the normal accessor functions. The top bits of page->flags
2031  * and page->virtual store page management information, but all other fields
2032  * are unused and could be used privately, carefully. The management of this
2033  * page is the responsibility of the one who allocated it, and those who have
2034  * subsequently been given references to it.
2035  *
2036  * The other pages (we may call them "pagecache pages") are completely
2037  * managed by the Linux memory manager: I/O, buffers, swapping etc.
2038  * The following discussion applies only to them.
2039  *
2040  * A pagecache page contains an opaque `private' member, which belongs to the
2041  * page's address_space. Usually, this is the address of a circular list of
2042  * the page's disk buffers. PG_private must be set to tell the VM to call
2043  * into the filesystem to release these pages.
2044  *
2045  * A folio may belong to an inode's memory mapping. In this case,
2046  * folio->mapping points to the inode, and folio->index is the file
2047  * offset of the folio, in units of PAGE_SIZE.
2048  *
2049  * If pagecache pages are not associated with an inode, they are said to be
2050  * anonymous pages. These may become associated with the swapcache, and in that
2051  * case PG_swapcache is set, and page->private is an offset into the swapcache.
2052  *
2053  * In either case (swapcache or inode backed), the pagecache itself holds one
2054  * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
2056  *
2057  * The pagecache pages are stored in a per-mapping radix tree, which is
2058  * rooted at mapping->i_pages, and indexed by offset.
2059  * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
2060  * lists, we instead now tag pages as dirty/writeback in the radix tree.
2061  *
2062  * All pagecache pages may be subject to I/O:
2063  * - inode pages may need to be read from disk,
2064  * - inode pages which have been modified and are MAP_SHARED may need
2065  *   to be written back to the inode on disk,
2066  * - anonymous pages (including MAP_PRIVATE file mappings) which have been
2067  *   modified may need to be swapped out to swap space and (later) to be read
2068  *   back into memory.
2069  */
2070 
2071 /* 127: arbitrary random number, small enough to assemble well */
2072 #define folio_ref_zero_or_close_to_overflow(folio) \
2073 	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
2074 
2075 /**
2076  * folio_get - Increment the reference count on a folio.
2077  * @folio: The folio.
2078  *
2079  * Context: May be called in any context, as long as you know that
2080  * you have a refcount on the folio.  If you do not already have one,
2081  * folio_try_get() may be the right interface for you to use.
2082  */
static inline void folio_get(struct folio *folio)
{
	/* The refcount must be neither zero nor close to overflowing. */
	VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);

	folio_ref_inc(folio);
}
2088 
/*
 * Take a reference on the folio containing @page.  If that folio tests
 * as slab or large-kmalloc, warn once and take no reference.
 */
static inline void get_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return;

	if (WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
		return;

	folio_get(folio);
}
2098 
2099 static inline __must_check bool try_get_page(struct page *page)
2100 {
2101 	page = compound_head(page);
2102 	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
2103 		return false;
2104 	page_ref_inc(page);
2105 	return true;
2106 }
2107 
2108 /**
2109  * folio_put - Decrement the reference count on a folio.
2110  * @folio: The folio.
2111  *
2112  * If the folio's reference count reaches zero, the memory will be
2113  * released back to the page allocator and may be used by another
2114  * allocation immediately.  Do not access the memory or the struct folio
2115  * after calling folio_put() unless you can be sure that it wasn't the
2116  * last reference.
2117  *
2118  * Context: May be called in process or interrupt context, but not in NMI
2119  * context.  May be called while holding a spinlock.
2120  */
static inline void folio_put(struct folio *folio)
{
	if (!folio_put_testzero(folio))
		return;

	/* The count reached zero: hand the folio to __folio_put(). */
	__folio_put(folio);
}
2126 
2127 /**
2128  * folio_put_refs - Reduce the reference count on a folio.
2129  * @folio: The folio.
2130  * @refs: The amount to subtract from the folio's reference count.
2131  *
2132  * If the folio's reference count reaches zero, the memory will be
2133  * released back to the page allocator and may be used by another
2134  * allocation immediately.  Do not access the memory or the struct folio
2135  * after calling folio_put_refs() unless you can be sure that these weren't
2136  * the last references.
2137  *
2138  * Context: May be called in process or interrupt context, but not in NMI
2139  * context.  May be called while holding a spinlock.
2140  */
static inline void folio_put_refs(struct folio *folio, int refs)
{
	if (!folio_ref_sub_and_test(folio, refs))
		return;

	/* The count reached zero: hand the folio to __folio_put(). */
	__folio_put(folio);
}
2146 
2147 void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
2148 
2149 /*
2150  * union release_pages_arg - an array of pages or folios
2151  *
2152  * release_pages() releases a simple array of multiple pages, and
2153  * accepts various different forms of said page array: either
2154  * a regular old boring array of pages, an array of folios, or
2155  * an array of encoded page pointers.
2156  *
2157  * The transparent union syntax for this kind of "any of these
2158  * argument types" is all kinds of ugly, so look away.
2159  */
2160 typedef union {
2161 	struct page **pages;
2162 	struct folio **folios;
2163 	struct encoded_page **encoded_pages;
2164 } release_pages_arg __attribute__ ((__transparent_union__));
2165 
2166 void release_pages(release_pages_arg, int nr);
2167 
2168 /**
2169  * folios_put - Decrement the reference count on an array of folios.
2170  * @folios: The folios.
2171  *
2172  * Like folio_put(), but for a batch of folios.  This is more efficient
2173  * than writing the loop yourself as it will optimise the locks which need
2174  * to be taken if the folios are freed.  The folios batch is returned
2175  * empty and ready to be reused for another batch; there is no need to
2176  * reinitialise it.
2177  *
2178  * Context: May be called in process or interrupt context, but not in NMI
2179  * context.  May be called while holding a spinlock.
2180  */
static inline void folios_put(struct folio_batch *folios)
{
	/* No per-folio reference array is supplied. */
	unsigned int *refs = NULL;

	folios_put_refs(folios, refs);
}
2185 
/*
 * Drop a reference on the folio containing @page.  Folios that test as
 * slab or large-kmalloc are left untouched.
 */
static inline void put_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (folio_test_slab(folio))
		return;
	if (folio_test_large_kmalloc(folio))
		return;

	folio_put(folio);
}
2195 
2196 /*
2197  * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
2198  * the page's refcount so that two separate items are tracked: the original page
2199  * reference count, and also a new count of how many pin_user_pages() calls were
2200  * made against the page. ("gup-pinned" is another term for the latter).
2201  *
2202  * With this scheme, pin_user_pages() becomes special: such pages are marked as
2203  * distinct from normal pages. As such, the unpin_user_page() call (and its
2204  * variants) must be used in order to release gup-pinned pages.
2205  *
2206  * Choice of value:
2207  *
2208  * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
2209  * counts with respect to pin_user_pages() and unpin_user_page() becomes
2210  * simpler, due to the fact that adding an even power of two to the page
2211  * refcount has the effect of using only the upper N bits, for the code that
2212  * counts up using the bias value. This means that the lower bits are left for
2213  * the exclusive use of the original code that increments and decrements by one
2214  * (or at least, by much smaller values than the bias value).
2215  *
2216  * Of course, once the lower bits overflow into the upper bits (and this is
2217  * OK, because subtraction recovers the original values), then visual inspection
2218  * no longer suffices to directly view the separate counts. However, for normal
2219  * applications that don't have huge page reference counts, this won't be an
2220  * issue.
2221  *
2222  * Locking: the lockless algorithm described in folio_try_get_rcu()
2223  * provides safe operation for get_user_pages(), folio_mkclean() and
2224  * other calls that race to set up page table entries.
2225  */
2226 #define GUP_PIN_COUNTING_BIAS (1U << 10)
2227 
2228 void unpin_user_page(struct page *page);
2229 void unpin_folio(struct folio *folio);
2230 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
2231 				 bool make_dirty);
2232 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
2233 				      bool make_dirty);
2234 void unpin_user_pages(struct page **pages, unsigned long npages);
2235 void unpin_user_folio(struct folio *folio, unsigned long npages);
2236 void unpin_folios(struct folio **folios, unsigned long nfolios);
2237 
2238 static inline bool is_cow_mapping(vm_flags_t flags)
2239 {
2240 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2241 }
2242 
2243 static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc)
2244 {
2245 	const vma_flags_t *flags = &desc->vma_flags;
2246 
2247 	return vma_flags_test(flags, VMA_MAYWRITE_BIT) &&
2248 		!vma_flags_test(flags, VMA_SHARED_BIT);
2249 }
2250 
2251 #ifndef CONFIG_MMU
2252 static inline bool is_nommu_shared_mapping(vm_flags_t flags)
2253 {
2254 	/*
2255 	 * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
2256 	 * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
2257 	 * a file mapping. R/O MAP_PRIVATE mappings might still modify
2258 	 * underlying memory if ptrace is active, so this is only possible if
2259 	 * ptrace does not apply. Note that there is no mprotect() to upgrade
2260 	 * write permissions later.
2261 	 */
2262 	return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
2263 }
2264 
2265 static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags)
2266 {
2267 	return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT);
2268 }
2269 #endif
2270 
2271 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
2272 #define SECTION_IN_PAGE_FLAGS
2273 #endif
2274 
2275 /*
2276  * The identification function is mainly used by the buddy allocator for
2277  * determining if two pages could be buddies. We are not really identifying
2278  * the zone since we could be using the section number id if we do not have
2279  * node id available in page flags.
2280  * We only guarantee that it will return the same value for two combinable
2281  * pages in a zone.
2282  */
2283 static inline int page_zone_id(struct page *page)
2284 {
2285 	return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK;
2286 }
2287 
2288 #ifdef NODE_NOT_IN_PAGE_FLAGS
2289 int memdesc_nid(memdesc_flags_t mdf);
2290 #else
2291 static inline int memdesc_nid(memdesc_flags_t mdf)
2292 {
2293 	return (mdf.f >> NODES_PGSHIFT) & NODES_MASK;
2294 }
2295 #endif
2296 
2297 static inline int page_to_nid(const struct page *page)
2298 {
2299 	return memdesc_nid(PF_POISONED_CHECK(page)->flags);
2300 }
2301 
2302 static inline int folio_nid(const struct folio *folio)
2303 {
2304 	return memdesc_nid(folio->flags);
2305 }
2306 
2307 #ifdef CONFIG_NUMA_BALANCING
2308 /* page access time bits needs to hold at least 4 seconds */
2309 #define PAGE_ACCESS_TIME_MIN_BITS	12
2310 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
2311 #define PAGE_ACCESS_TIME_BUCKETS				\
2312 	(PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
2313 #else
2314 #define PAGE_ACCESS_TIME_BUCKETS	0
2315 #endif
2316 
2317 #define PAGE_ACCESS_TIME_MASK				\
2318 	(LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)
2319 
2320 static inline int cpu_pid_to_cpupid(int cpu, int pid)
2321 {
2322 	return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
2323 }
2324 
2325 static inline int cpupid_to_pid(int cpupid)
2326 {
2327 	return cpupid & LAST__PID_MASK;
2328 }
2329 
2330 static inline int cpupid_to_cpu(int cpupid)
2331 {
2332 	return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
2333 }
2334 
/* Node of the cpu encoded in @cpupid, as reported by cpu_to_node(). */
static inline int cpupid_to_nid(int cpupid)
{
	int cpu = cpupid_to_cpu(cpupid);

	return cpu_to_node(cpu);
}
2339 
2340 static inline bool cpupid_pid_unset(int cpupid)
2341 {
2342 	return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
2343 }
2344 
2345 static inline bool cpupid_cpu_unset(int cpupid)
2346 {
2347 	return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
2348 }
2349 
2350 static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
2351 {
2352 	return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
2353 }
2354 
2355 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
2356 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
2357 static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
2358 {
2359 	return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
2360 }
2361 
2362 static inline int folio_last_cpupid(struct folio *folio)
2363 {
2364 	return folio->_last_cpupid;
2365 }
2366 static inline void page_cpupid_reset_last(struct page *page)
2367 {
2368 	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
2369 }
2370 #else
2371 static inline int folio_last_cpupid(struct folio *folio)
2372 {
2373 	return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
2374 }
2375 
2376 int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
2377 
2378 static inline void page_cpupid_reset_last(struct page *page)
2379 {
2380 	page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
2381 }
2382 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
2383 
2384 static inline int folio_xchg_access_time(struct folio *folio, int time)
2385 {
2386 	int last_time;
2387 
2388 	last_time = folio_xchg_last_cpupid(folio,
2389 					   time >> PAGE_ACCESS_TIME_BUCKETS);
2390 	return last_time << PAGE_ACCESS_TIME_BUCKETS;
2391 }
2392 
2393 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
2394 {
2395 	unsigned int pid_bit;
2396 
2397 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
2398 	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
2399 		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
2400 	}
2401 }
2402 
2403 bool folio_use_access_time(struct folio *folio);
2404 #else /* !CONFIG_NUMA_BALANCING */
/* !CONFIG_NUMA_BALANCING stub: nothing is stored; returns the folio's nid. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
	int nid = folio_nid(folio); /* XXX */

	return nid;
}
2409 
/* !CONFIG_NUMA_BALANCING stub: nothing is stored; always returns 0. */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
	return 0;
}
2414 
/* !CONFIG_NUMA_BALANCING stub: returns the folio's nid. */
static inline int folio_last_cpupid(struct folio *folio)
{
	int nid = folio_nid(folio); /* XXX */

	return nid;
}
2419 
/* !CONFIG_NUMA_BALANCING stub: always -1. */
static inline int cpupid_to_nid(int cpupid)
{
	return -1;
}
2424 
/* !CONFIG_NUMA_BALANCING stub: always -1. */
static inline int cpupid_to_pid(int cpupid)
{
	return -1;
}
2429 
/* !CONFIG_NUMA_BALANCING stub: always -1. */
static inline int cpupid_to_cpu(int cpupid)
{
	return -1;
}
2434 
/* !CONFIG_NUMA_BALANCING stub: both arguments are ignored; always -1. */
static inline int cpu_pid_to_cpupid(int nid, int pid)
{
	return -1;
}
2439 
/* !CONFIG_NUMA_BALANCING stub: the pid is always reported as unset. */
static inline bool cpupid_pid_unset(int cpupid)
{
	return true;
}
2444 
/* !CONFIG_NUMA_BALANCING stub: intentionally does nothing. */
static inline void page_cpupid_reset_last(struct page *page)
{
}
2448 
/* !CONFIG_NUMA_BALANCING stub: never reports a match. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
	return false;
}
2453 
/* !CONFIG_NUMA_BALANCING stub: intentionally does nothing. */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
/* !CONFIG_NUMA_BALANCING stub: always answers false. */
static inline bool folio_use_access_time(struct folio *folio)
{
	return false;
}
2461 #endif /* CONFIG_NUMA_BALANCING */
2462 
2463 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
2464 
2465 /*
2466  * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
2467  * setting tags for all pages to native kernel tag value 0xff, as the default
2468  * value 0x00 maps to 0xff.
2469  */
2470 
2471 static inline u8 page_kasan_tag(const struct page *page)
2472 {
2473 	u8 tag = KASAN_TAG_KERNEL;
2474 
2475 	if (kasan_enabled()) {
2476 		tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
2477 		tag ^= 0xff;
2478 	}
2479 
2480 	return tag;
2481 }
2482 
2483 static inline void page_kasan_tag_set(struct page *page, u8 tag)
2484 {
2485 	unsigned long old_flags, flags;
2486 
2487 	if (!kasan_enabled())
2488 		return;
2489 
2490 	tag ^= 0xff;
2491 	old_flags = READ_ONCE(page->flags.f);
2492 	do {
2493 		flags = old_flags;
2494 		flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
2495 		flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
2496 	} while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags)));
2497 }
2498 
2499 static inline void page_kasan_tag_reset(struct page *page)
2500 {
2501 	if (kasan_enabled())
2502 		page_kasan_tag_set(page, KASAN_TAG_KERNEL);
2503 }
2504 
2505 #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
2506 
2507 static inline u8 page_kasan_tag(const struct page *page)
2508 {
2509 	return 0xff;
2510 }
2511 
2512 static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
/* Stub without tag-based KASAN: intentionally does nothing. */
static inline void page_kasan_tag_reset(struct page *page)
{
}
2514 
2515 #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
2516 
2517 static inline struct zone *page_zone(const struct page *page)
2518 {
2519 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
2520 }
2521 
2522 static inline pg_data_t *page_pgdat(const struct page *page)
2523 {
2524 	return NODE_DATA(page_to_nid(page));
2525 }
2526 
2527 static inline pg_data_t *folio_pgdat(const struct folio *folio)
2528 {
2529 	return NODE_DATA(folio_nid(folio));
2530 }
2531 
2532 static inline struct zone *folio_zone(const struct folio *folio)
2533 {
2534 	return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)];
2535 }
2536 
2537 #ifdef SECTION_IN_PAGE_FLAGS
2538 static inline void set_page_section(struct page *page, unsigned long section)
2539 {
2540 	page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
2541 	page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
2542 }
2543 
2544 static inline unsigned long memdesc_section(memdesc_flags_t mdf)
2545 {
2546 	return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
2547 }
2548 #else /* !SECTION_IN_PAGE_FLAGS */
2549 static inline unsigned long memdesc_section(memdesc_flags_t mdf)
2550 {
2551 	return 0;
2552 }
2553 #endif /* SECTION_IN_PAGE_FLAGS */
2554 
2555 /**
2556  * folio_pfn - Return the Page Frame Number of a folio.
2557  * @folio: The folio.
2558  *
2559  * A folio may contain multiple pages.  The pages have consecutive
2560  * Page Frame Numbers.
2561  *
2562  * Return: The Page Frame Number of the first page in the folio.
2563  */
2564 static inline unsigned long folio_pfn(const struct folio *folio)
2565 {
2566 	return page_to_pfn(&folio->page);
2567 }
2568 
/* Folio containing the page at Page Frame Number @pfn. */
static inline struct folio *pfn_folio(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);

	return page_folio(page);
}
2573 
2574 #ifdef CONFIG_MMU
2575 static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot)
2576 {
2577 	return pfn_pte(page_to_pfn(page), pgprot);
2578 }
2579 
2580 /**
2581  * folio_mk_pte - Create a PTE for this folio
2582  * @folio: The folio to create a PTE for
2583  * @pgprot: The page protection bits to use
2584  *
2585  * Create a page table entry for the first page of this folio.
2586  * This is suitable for passing to set_ptes().
2587  *
2588  * Return: A page table entry suitable for mapping this folio.
2589  */
2590 static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot)
2591 {
2592 	return pfn_pte(folio_pfn(folio), pgprot);
2593 }
2594 
2595 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2596 /**
2597  * folio_mk_pmd - Create a PMD for this folio
2598  * @folio: The folio to create a PMD for
2599  * @pgprot: The page protection bits to use
2600  *
2601  * Create a page table entry for the first page of this folio.
2602  * This is suitable for passing to set_pmd_at().
2603  *
2604  * Return: A page table entry suitable for mapping this folio.
2605  */
2606 static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot)
2607 {
2608 	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
2609 }
2610 
2611 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2612 /**
2613  * folio_mk_pud - Create a PUD for this folio
2614  * @folio: The folio to create a PUD for
2615  * @pgprot: The page protection bits to use
2616  *
2617  * Create a page table entry for the first page of this folio.
2618  * This is suitable for passing to set_pud_at().
2619  *
2620  * Return: A page table entry suitable for mapping this folio.
2621  */
2622 static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot)
2623 {
2624 	return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
2625 }
2626 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2627 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2628 #endif /* CONFIG_MMU */
2629 
2630 static inline bool folio_has_pincount(const struct folio *folio)
2631 {
2632 	if (IS_ENABLED(CONFIG_64BIT))
2633 		return folio_test_large(folio);
2634 	return folio_order(folio) > 1;
2635 }
2636 
2637 /**
2638  * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
2639  * @folio: The folio.
2640  *
2641  * This function checks if a folio has been pinned via a call to
2642  * a function in the pin_user_pages() family.
2643  *
2644  * For small folios, the return value is partially fuzzy: false is not fuzzy,
2645  * because it means "definitely not pinned for DMA", but true means "probably
2646  * pinned for DMA, but possibly a false positive due to having at least
2647  * GUP_PIN_COUNTING_BIAS worth of normal folio references".
2648  *
2649  * False positives are OK, because: a) it's unlikely for a folio to
2650  * get that many refcounts, and b) all the callers of this routine are
2651  * expected to be able to deal gracefully with a false positive.
2652  *
2653  * For most large folios, the result will be exactly correct. That's because
2654  * we have more tracking data available: the _pincount field is used
2655  * instead of the GUP_PIN_COUNTING_BIAS scheme.
2656  *
2657  * For more information, please see Documentation/core-api/pin_user_pages.rst.
2658  *
2659  * Return: True, if it is likely that the folio has been "dma-pinned".
2660  * False, if the folio is definitely not dma-pinned.
2661  */
2662 static inline bool folio_maybe_dma_pinned(struct folio *folio)
2663 {
2664 	if (folio_has_pincount(folio))
2665 		return atomic_read(&folio->_pincount) > 0;
2666 
2667 	/*
2668 	 * folio_ref_count() is signed. If that refcount overflows, then
2669 	 * folio_ref_count() returns a negative value, and callers will avoid
2670 	 * further incrementing the refcount.
2671 	 *
2672 	 * Here, for that overflow case, use the sign bit to count a little
2673 	 * bit higher via unsigned math, and thus still get an accurate result.
2674 	 */
2675 	return ((unsigned int)folio_ref_count(folio)) >=
2676 		GUP_PIN_COUNTING_BIAS;
2677 }
2678 
2679 /*
2680  * This should most likely only be called during fork() to see whether we
2681  * should break the cow immediately for an anon page on the src mm.
2682  *
2683  * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
2684  */
2685 static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
2686 					  struct folio *folio)
2687 {
2688 	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
2689 
2690 	if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))
2691 		return false;
2692 
2693 	return folio_maybe_dma_pinned(folio);
2694 }
2695 
2696 /**
2697  * is_zero_page - Query if a page is a zero page
2698  * @page: The page to query
2699  *
2700  * This returns true if @page is one of the permanent zero pages.
2701  */
2702 static inline bool is_zero_page(const struct page *page)
2703 {
2704 	return is_zero_pfn(page_to_pfn(page));
2705 }
2706 
2707 /**
2708  * is_zero_folio - Query if a folio is a zero page
2709  * @folio: The folio to query
2710  *
2711  * This returns true if @folio is one of the permanent zero pages.
2712  */
2713 static inline bool is_zero_folio(const struct folio *folio)
2714 {
2715 	return is_zero_page(&folio->page);
2716 }
2717 
2718 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */
2719 #ifdef CONFIG_MIGRATION
2720 static inline bool folio_is_longterm_pinnable(struct folio *folio)
2721 {
2722 #ifdef CONFIG_CMA
2723 	int mt = folio_migratetype(folio);
2724 
2725 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
2726 		return false;
2727 #endif
2728 	/* The zero page can be "pinned" but gets special handling. */
2729 	if (is_zero_folio(folio))
2730 		return true;
2731 
2732 	/* Coherent device memory must always allow eviction. */
2733 	if (folio_is_device_coherent(folio))
2734 		return false;
2735 
2736 	/*
2737 	 * Filesystems can only tolerate transient delays to truncate and
2738 	 * hole-punch operations
2739 	 */
2740 	if (folio_is_fsdax(folio))
2741 		return false;
2742 
2743 	/* Otherwise, non-movable zone folios can be pinned. */
2744 	return !folio_is_zone_movable(folio);
2745 
2746 }
2747 #else
2748 static inline bool folio_is_longterm_pinnable(struct folio *folio)
2749 {
2750 	return true;
2751 }
2752 #endif
2753 
2754 static inline void set_page_zone(struct page *page, enum zone_type zone)
2755 {
2756 	page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT);
2757 	page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
2758 }
2759 
2760 static inline void set_page_node(struct page *page, unsigned long node)
2761 {
2762 	page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT);
2763 	page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT;
2764 }
2765 
2766 static inline void set_page_links(struct page *page, enum zone_type zone,
2767 	unsigned long node, unsigned long pfn)
2768 {
2769 	set_page_zone(page, zone);
2770 	set_page_node(page, node);
2771 #ifdef SECTION_IN_PAGE_FLAGS
2772 	set_page_section(page, pfn_to_section_nr(pfn));
2773 #endif
2774 }
2775 
/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * A folio that is not large always consists of exactly one page.
 *
 * Return: A positive power of two.
 */
static inline unsigned long folio_nr_pages(const struct folio *folio)
{
	return folio_test_large(folio) ? folio_large_nr_pages(folio) : 1;
}
2788 
2789 /*
2790  * compound_nr() returns the number of pages in this potentially compound
2791  * page.  compound_nr() can be called on a tail page, and is defined to
2792  * return 1 in that case.
2793  */
2794 static inline unsigned long compound_nr(const struct page *page)
2795 {
2796 	const struct folio *folio = (struct folio *)page;
2797 
2798 	if (!test_bit(PG_head, &folio->flags.f))
2799 		return 1;
2800 	return folio_large_nr_pages(folio);
2801 }
2802 
/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * Use this function to step from one folio to the next when walking
 * physically contiguous memory that may span more than one folio
 * (eg a &struct bio_vec).  Do not use it if the memory is only virtually
 * contiguous, as the folios are almost certainly not adjacent to each
 * other.  This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
	unsigned long nr = folio_nr_pages(folio);

	/* The page right after this folio's last page starts the next folio. */
	return (struct folio *)folio_page(folio, nr);
}
2821 
2822 /**
2823  * folio_shift - The size of the memory described by this folio.
2824  * @folio: The folio.
2825  *
2826  * A folio represents a number of bytes which is a power-of-two in size.
2827  * This function tells you which power-of-two the folio is.  See also
2828  * folio_size() and folio_order().
2829  *
2830  * Context: The caller should have a reference on the folio to prevent
2831  * it from being split.  It is not necessary for the folio to be locked.
2832  * Return: The base-2 logarithm of the size of this folio.
2833  */
2834 static inline unsigned int folio_shift(const struct folio *folio)
2835 {
2836 	return PAGE_SHIFT + folio_order(folio);
2837 }
2838 
2839 /**
2840  * folio_size - The number of bytes in a folio.
2841  * @folio: The folio.
2842  *
2843  * Context: The caller should have a reference on the folio to prevent
2844  * it from being split.  It is not necessary for the folio to be locked.
2845  * Return: The number of bytes in this folio.
2846  */
2847 static inline size_t folio_size(const struct folio *folio)
2848 {
2849 	return PAGE_SIZE << folio_order(folio);
2850 }
2851 
2852 /**
2853  * folio_maybe_mapped_shared - Whether the folio is mapped into the page
2854  *			       tables of more than one MM
2855  * @folio: The folio.
2856  *
2857  * This function checks if the folio maybe currently mapped into more than one
2858  * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
2859  * MM ("mapped exclusively").
2860  *
2861  * For KSM folios, this function also returns "mapped shared" when a folio is
2862  * mapped multiple times into the same MM, because the individual page mappings
2863  * are independent.
2864  *
2865  * For small anonymous folios and anonymous hugetlb folios, the return
2866  * value will be exactly correct: non-KSM folios can only be mapped at most once
2867  * into an MM, and they cannot be partially mapped. KSM folios are
2868  * considered shared even if mapped multiple times into the same MM.
2869  *
2870  * For other folios, the result can be fuzzy:
2871  *    #. For partially-mappable large folios (THP), the return value can wrongly
2872  *       indicate "mapped shared" (false positive) if a folio was mapped by
2873  *       more than two MMs at one point in time.
2874  *    #. For pagecache folios (including hugetlb), the return value can wrongly
2875  *       indicate "mapped shared" (false positive) when two VMAs in the same MM
2876  *       cover the same file range.
2877  *
2878  * Further, this function only considers current page table mappings that
2879  * are tracked using the folio mapcount(s).
2880  *
2881  * This function does not consider:
2882  *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
2883  *       pagecache, temporary unmapping for migration).
2884  *    #. If the folio is mapped differently (VM_PFNMAP).
2885  *    #. If hugetlb page table sharing applies. Callers might want to check
2886  *       hugetlb_pmd_shared().
2887  *
2888  * Return: Whether the folio is estimated to be mapped into more than one MM.
2889  */
2890 static inline bool folio_maybe_mapped_shared(struct folio *folio)
2891 {
2892 	int mapcount = folio_mapcount(folio);
2893 
2894 	/* Only partially-mappable folios require more care. */
2895 	if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
2896 		return mapcount > 1;
2897 
2898 	/*
2899 	 * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
2900 	 * simply assume "mapped shared", nobody should really care
2901 	 * about this for arbitrary kernel allocations.
2902 	 */
2903 	if (!IS_ENABLED(CONFIG_MM_ID))
2904 		return true;
2905 
2906 	/*
2907 	 * A single mapping implies "mapped exclusively", even if the
2908 	 * folio flag says something different: it's easier to handle this
2909 	 * case here instead of on the RMAP hot path.
2910 	 */
2911 	if (mapcount <= 1)
2912 		return false;
2913 	return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
2914 }
2915 
2916 /**
2917  * folio_expected_ref_count - calculate the expected folio refcount
2918  * @folio: the folio
2919  *
2920  * Calculate the expected folio refcount, taking references from the pagecache,
2921  * swapcache, PG_private and page table mappings into account. Useful in
2922  * combination with folio_ref_count() to detect unexpected references (e.g.,
2923  * GUP or other temporary references).
2924  *
2925  * Does currently not consider references from the LRU cache. If the folio
2926  * was isolated from the LRU (which is the case during migration or split),
2927  * the LRU cache does not apply.
2928  *
2929  * Calling this function on an unmapped folio -- !folio_mapped() -- that is
2930  * locked will return a stable result.
2931  *
2932  * Calling this function on a mapped folio will not result in a stable result,
2933  * because nothing stops additional page table mappings from coming (e.g.,
2934  * fork()) or going (e.g., munmap()).
2935  *
2936  * Calling this function without the folio lock will also not result in a
2937  * stable result: for example, the folio might get dropped from the swapcache
2938  * concurrently.
2939  *
2940  * However, even when called without the folio lock or on a mapped folio,
2941  * this function can be used to detect unexpected references early (for example,
2942  * if it makes sense to even lock the folio and unmap it).
2943  *
2944  * The caller must add any reference (e.g., from folio_try_get()) it might be
2945  * holding itself to the result.
2946  *
2947  * Returns: the expected folio refcount.
2948  */
2949 static inline int folio_expected_ref_count(const struct folio *folio)
2950 {
2951 	const int order = folio_order(folio);
2952 	int ref_count = 0;
2953 
2954 	if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
2955 		return 0;
2956 
2957 	/* One reference per page from the swapcache. */
2958 	ref_count += folio_test_swapcache(folio) << order;
2959 
2960 	if (!folio_test_anon(folio)) {
2961 		/* One reference per page from the pagecache. */
2962 		ref_count += !!folio->mapping << order;
2963 		/* One reference from PG_private. */
2964 		ref_count += folio_test_private(folio);
2965 	}
2966 
2967 	/* One reference per page table mapping. */
2968 	return ref_count + folio_mapcount(folio);
2969 }
2970 
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
/*
 * Fallback used when HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE is not defined:
 * does nothing and returns 0.
 */
static inline int arch_make_folio_accessible(struct folio *folio)
{
	return 0;
}
#endif /* !HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE */
2977 
2978 /*
2979  * Some inline functions in vmstat.h depend on page_zone()
2980  */
2981 #include <linux/vmstat.h>
2982 
/*
 * HASHED_PAGE_VIRTUAL is selected when CONFIG_HIGHMEM is enabled and
 * WANT_PAGE_VIRTUAL is not defined.
 */
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#ifdef WANT_PAGE_VIRTUAL
/* With WANT_PAGE_VIRTUAL the address is kept in page->virtual. */
static inline void *page_address(const struct page *page)
{
	return page->virtual;
}

static inline void set_page_address(struct page *page, void *address)
{
	page->virtual = address;
}

/* Nothing to initialise in this configuration. */
#define page_address_init()	do { } while (0)
#endif /* WANT_PAGE_VIRTUAL */
2998 
2999 #if defined(HASHED_PAGE_VIRTUAL)
3000 void *page_address(const struct page *page);
3001 void set_page_address(struct page *page, void *virtual);
3002 void page_address_init(void);
3003 #endif
3004 
3005 static __always_inline void *lowmem_page_address(const struct page *page)
3006 {
3007 	return page_to_virt(page);
3008 }
3009 
/*
 * Neither HASHED_PAGE_VIRTUAL nor WANT_PAGE_VIRTUAL: page_address() maps
 * straight to lowmem_page_address() and the setters are no-ops.
 */
#if !(defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL))
#define page_address(page)		lowmem_page_address(page)
#define set_page_address(page, address)	do { } while (0)
#define page_address_init()		do { } while (0)
#endif
3015 
3016 static inline void *folio_address(const struct folio *folio)
3017 {
3018 	return page_address(&folio->page);
3019 }
3020 
3021 /*
3022  * Return true only if the page has been allocated with
3023  * ALLOC_NO_WATERMARKS and the low watermark was not
3024  * met implying that the system is under some pressure.
3025  */
3026 static inline bool page_is_pfmemalloc(const struct page *page)
3027 {
3028 	/*
3029 	 * lru.next has bit 1 set if the page is allocated from the
3030 	 * pfmemalloc reserves.  Callers may simply overwrite it if
3031 	 * they do not need to preserve that information.
3032 	 */
3033 	return (uintptr_t)page->lru.next & BIT(1);
3034 }
3035 
3036 /*
3037  * Return true only if the folio has been allocated with
3038  * ALLOC_NO_WATERMARKS and the low watermark was not
3039  * met implying that the system is under some pressure.
3040  */
3041 static inline bool folio_is_pfmemalloc(const struct folio *folio)
3042 {
3043 	/*
3044 	 * lru.next has bit 1 set if the page is allocated from the
3045 	 * pfmemalloc reserves.  Callers may simply overwrite it if
3046 	 * they do not need to preserve that information.
3047 	 */
3048 	return (uintptr_t)folio->lru.next & BIT(1);
3049 }
3050 
3051 /*
3052  * Only to be called by the page allocator on a freshly allocated
3053  * page.
3054  */
3055 static inline void set_page_pfmemalloc(struct page *page)
3056 {
3057 	page->lru.next = (void *)BIT(1);
3058 }
3059 
3060 static inline void clear_page_pfmemalloc(struct page *page)
3061 {
3062 	page->lru.next = NULL;
3063 }
3064 
3065 /*
3066  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
3067  */
3068 extern void pagefault_out_of_memory(void);
3069 
/* Offset of address @p within its page: the bits outside PAGE_MASK. */
#define offset_in_page(p)		((unsigned long)(p) & ~PAGE_MASK)
/* Offset of address @p within @folio: @p masked by (folio_size() - 1). */
#define offset_in_folio(folio, p)	((unsigned long)(p) & (folio_size(folio) - 1))
3072 
3073 /*
3074  * Parameter block passed down to zap_pte_range in exceptional cases.
3075  */
3076 struct zap_details {
3077 	struct folio *single_folio;	/* Locked folio to be unmapped */
3078 	bool skip_cows;			/* Do not zap COWed private pages */
3079 	bool reclaim_pt;		/* Need reclaim page tables? */
3080 	bool reaping;			/* Reaping, do not block. */
3081 	zap_flags_t zap_flags;		/* Extra flags for zapping */
3082 };
3083 
/*
 * ZAP_FLAG_DROP_MARKER: whether to drop the pte markers, for example, the
 * uffd-wp information for file-backed memory.  This should only be specified
 * when the page will be completely dropped from the mm, either by truncation
 * or by unmapping of the vma.  The flag is not set by default.
 */
#define ZAP_FLAG_DROP_MARKER	((__force zap_flags_t) BIT(0))

/*
 * ZAP_FLAG_UNMAP: set in unmap_vmas() to indicate a final unmap call.
 * Only used by hugetlb.
 */
#define ZAP_FLAG_UNMAP		((__force zap_flags_t) BIT(1))
3093 
3094 #ifdef CONFIG_MMU
3095 extern bool can_do_mlock(void);
3096 #else
3097 static inline bool can_do_mlock(void) { return false; }
3098 #endif
3099 extern int user_shm_lock(size_t, struct ucounts *);
3100 extern void user_shm_unlock(size_t, struct ucounts *);
3101 
3102 struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
3103 			     pte_t pte);
3104 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
3105 			     pte_t pte);
3106 struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
3107 				  unsigned long addr, pmd_t pmd);
3108 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
3109 				pmd_t pmd);
3110 struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
3111 		pud_t pud);
3112 
3113 void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address,
3114 		  unsigned long size);
3115 void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
3116 			   unsigned long size);
3117 /**
3118  * zap_vma - zap all page table entries in a vma
3119  * @vma: The vma to zap.
3120  */
3121 static inline void zap_vma(struct vm_area_struct *vma)
3122 {
3123 	zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
3124 }
3125 struct mmu_notifier_range;
3126 
3127 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
3128 		unsigned long end, unsigned long floor, unsigned long ceiling);
3129 int
3130 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
3131 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3132 			void *buf, int len, int write);
3133 
3134 struct follow_pfnmap_args {
3135 	/**
3136 	 * Inputs:
3137 	 * @vma: Pointer to @vm_area_struct struct
3138 	 * @address: the virtual address to walk
3139 	 */
3140 	struct vm_area_struct *vma;
3141 	unsigned long address;
3142 	/**
3143 	 * Internals:
3144 	 *
3145 	 * The caller shouldn't touch any of these.
3146 	 */
3147 	spinlock_t *lock;
3148 	pte_t *ptep;
3149 	/**
3150 	 * Outputs:
3151 	 *
3152 	 * @pfn: the PFN of the address
3153 	 * @addr_mask: address mask covering pfn
3154 	 * @pgprot: the pgprot_t of the mapping
3155 	 * @writable: whether the mapping is writable
3156 	 * @special: whether the mapping is a special mapping (real PFN maps)
3157 	 */
3158 	unsigned long pfn;
3159 	unsigned long addr_mask;
3160 	pgprot_t pgprot;
3161 	bool writable;
3162 	bool special;
3163 };
3164 int follow_pfnmap_start(struct follow_pfnmap_args *args);
3165 void follow_pfnmap_end(struct follow_pfnmap_args *args);
3166 
3167 extern void truncate_pagecache(struct inode *inode, loff_t new);
3168 extern void truncate_setsize(struct inode *inode, loff_t newsize);
3169 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
3170 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
3171 int generic_error_remove_folio(struct address_space *mapping,
3172 		struct folio *folio);
3173 
3174 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
3175 		unsigned long address, struct pt_regs *regs);
3176 
3177 #ifdef CONFIG_MMU
3178 extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
3179 				  unsigned long address, unsigned int flags,
3180 				  struct pt_regs *regs);
3181 extern int fixup_user_fault(struct mm_struct *mm,
3182 			    unsigned long address, unsigned int fault_flags,
3183 			    bool *unlocked);
3184 void unmap_mapping_pages(struct address_space *mapping,
3185 		pgoff_t start, pgoff_t nr, bool even_cows);
3186 void unmap_mapping_range(struct address_space *mapping,
3187 		loff_t const holebegin, loff_t const holelen, int even_cows);
3188 #else
3189 static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
3190 					 unsigned long address, unsigned int flags,
3191 					 struct pt_regs *regs)
3192 {
3193 	/* should never happen if there's no MMU */
3194 	BUG();
3195 	return VM_FAULT_SIGBUS;
3196 }
3197 static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
3198 		unsigned int fault_flags, bool *unlocked)
3199 {
3200 	/* should never happen if there's no MMU */
3201 	BUG();
3202 	return -EFAULT;
3203 }
3204 static inline void unmap_mapping_pages(struct address_space *mapping,
3205 		pgoff_t start, pgoff_t nr, bool even_cows) { }
3206 static inline void unmap_mapping_range(struct address_space *mapping,
3207 		loff_t const holebegin, loff_t const holelen, int even_cows) { }
3208 #endif
3209 
3210 static inline void unmap_shared_mapping_range(struct address_space *mapping,
3211 		loff_t const holebegin, loff_t const holelen)
3212 {
3213 	unmap_mapping_range(mapping, holebegin, holelen, 0);
3214 }
3215 
3216 static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
3217 						unsigned long addr);
3218 
3219 extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
3220 		void *buf, int len, unsigned int gup_flags);
3221 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3222 		void *buf, int len, unsigned int gup_flags);
3223 
3224 #ifdef CONFIG_BPF_SYSCALL
3225 extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
3226 			      void *buf, int len, unsigned int gup_flags);
3227 #endif
3228 
3229 long get_user_pages_remote(struct mm_struct *mm,
3230 			   unsigned long start, unsigned long nr_pages,
3231 			   unsigned int gup_flags, struct page **pages,
3232 			   int *locked);
3233 long pin_user_pages_remote(struct mm_struct *mm,
3234 			   unsigned long start, unsigned long nr_pages,
3235 			   unsigned int gup_flags, struct page **pages,
3236 			   int *locked);
3237 
3238 /*
3239  * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
3240  */
3241 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
3242 						    unsigned long addr,
3243 						    int gup_flags,
3244 						    struct vm_area_struct **vmap)
3245 {
3246 	struct page *page;
3247 	struct vm_area_struct *vma;
3248 	int got;
3249 
3250 	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
3251 		return ERR_PTR(-EINVAL);
3252 
3253 	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
3254 
3255 	if (got < 0)
3256 		return ERR_PTR(got);
3257 
3258 	vma = vma_lookup(mm, addr);
3259 	if (WARN_ON_ONCE(!vma)) {
3260 		put_page(page);
3261 		return ERR_PTR(-EINVAL);
3262 	}
3263 
3264 	*vmap = vma;
3265 	return page;
3266 }
3267 
3268 long get_user_pages(unsigned long start, unsigned long nr_pages,
3269 		    unsigned int gup_flags, struct page **pages);
3270 long pin_user_pages(unsigned long start, unsigned long nr_pages,
3271 		    unsigned int gup_flags, struct page **pages);
3272 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
3273 		    struct page **pages, unsigned int gup_flags);
3274 long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
3275 		    struct page **pages, unsigned int gup_flags);
3276 long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
3277 		      struct folio **folios, unsigned int max_folios,
3278 		      pgoff_t *offset);
3279 int folio_add_pins(struct folio *folio, unsigned int pins);
3280 
3281 int get_user_pages_fast(unsigned long start, int nr_pages,
3282 			unsigned int gup_flags, struct page **pages);
3283 int pin_user_pages_fast(unsigned long start, int nr_pages,
3284 			unsigned int gup_flags, struct page **pages);
3285 void folio_add_pin(struct folio *folio);
3286 
3287 int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
3288 int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
3289 			const struct task_struct *task, bool bypass_rlim);
3290 
3291 struct kvec;
3292 struct page *get_dump_page(unsigned long addr, int *locked);
3293 
3294 bool folio_mark_dirty(struct folio *folio);
3295 bool folio_mark_dirty_lock(struct folio *folio);
3296 bool set_page_dirty(struct page *page);
3297 int set_page_dirty_lock(struct page *page);
3298 
3299 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
3300 
/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However,
 * for now all the callers only use one of the flags at a time.
 */

/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define MM_CP_TRY_CHANGE_WRITABLE	(1UL << 0)
/* Whether this protection change is for NUMA hints */
#define MM_CP_PROT_NUMA			(1UL << 1)
/* Whether this change is for write protecting */
#define MM_CP_UFFD_WP			(1UL << 2) /* do wp */
#define MM_CP_UFFD_WP_RESOLVE		(1UL << 3) /* Resolve wp */
/* Both uffd-wp bits combined */
#define MM_CP_UFFD_WP_ALL		(MM_CP_UFFD_WP | \
					 MM_CP_UFFD_WP_RESOLVE)
3320 
3321 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
3322 			     pte_t pte);
3323 extern long change_protection(struct mmu_gather *tlb,
3324 			      struct vm_area_struct *vma, unsigned long start,
3325 			      unsigned long end, unsigned long cp_flags);
3326 extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
3327 	  struct vm_area_struct *vma, struct vm_area_struct **pprev,
3328 	  unsigned long start, unsigned long end, vm_flags_t newflags);
3329 
/*
 * get_user_pages_fast_only() doesn't attempt to fault and will return short.
 */
3333 int get_user_pages_fast_only(unsigned long start, int nr_pages,
3334 			     unsigned int gup_flags, struct page **pages);
3335 
3336 static inline bool get_user_page_fast_only(unsigned long addr,
3337 			unsigned int gup_flags, struct page **pagep)
3338 {
3339 	return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
3340 }
3341 /*
3342  * per-process(per-mm_struct) statistics.
3343  */
3344 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
3345 {
3346 	return percpu_counter_read_positive(&mm->rss_stat[member]);
3347 }
3348 
3349 static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
3350 {
3351 	return percpu_counter_sum_positive(&mm->rss_stat[member]);
3352 }
3353 
3354 void mm_trace_rss_stat(struct mm_struct *mm, int member);
3355 
3356 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
3357 {
3358 	percpu_counter_add(&mm->rss_stat[member], value);
3359 
3360 	mm_trace_rss_stat(mm, member);
3361 }
3362 
3363 static inline void inc_mm_counter(struct mm_struct *mm, int member)
3364 {
3365 	percpu_counter_inc(&mm->rss_stat[member]);
3366 
3367 	mm_trace_rss_stat(mm, member);
3368 }
3369 
3370 static inline void dec_mm_counter(struct mm_struct *mm, int member)
3371 {
3372 	percpu_counter_dec(&mm->rss_stat[member]);
3373 
3374 	mm_trace_rss_stat(mm, member);
3375 }
3376 
3377 /* Optimized variant when folio is already known not to be anon */
3378 static inline int mm_counter_file(struct folio *folio)
3379 {
3380 	if (folio_test_swapbacked(folio))
3381 		return MM_SHMEMPAGES;
3382 	return MM_FILEPAGES;
3383 }
3384 
3385 static inline int mm_counter(struct folio *folio)
3386 {
3387 	if (folio_test_anon(folio))
3388 		return MM_ANONPAGES;
3389 	return mm_counter_file(folio);
3390 }
3391 
3392 static inline unsigned long get_mm_rss(struct mm_struct *mm)
3393 {
3394 	return get_mm_counter(mm, MM_FILEPAGES) +
3395 		get_mm_counter(mm, MM_ANONPAGES) +
3396 		get_mm_counter(mm, MM_SHMEMPAGES);
3397 }
3398 
3399 static inline unsigned long get_mm_rss_sum(struct mm_struct *mm)
3400 {
3401 	return get_mm_counter_sum(mm, MM_FILEPAGES) +
3402 		get_mm_counter_sum(mm, MM_ANONPAGES) +
3403 		get_mm_counter_sum(mm, MM_SHMEMPAGES);
3404 }
3405 
3406 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
3407 {
3408 	return max(mm->hiwater_rss, get_mm_rss(mm));
3409 }
3410 
3411 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
3412 {
3413 	return max(mm->hiwater_vm, mm->total_vm);
3414 }
3415 
3416 static inline void update_hiwater_rss(struct mm_struct *mm)
3417 {
3418 	unsigned long _rss = get_mm_rss(mm);
3419 
3420 	if (data_race(mm->hiwater_rss) < _rss)
3421 		data_race(mm->hiwater_rss = _rss);
3422 }
3423 
3424 static inline void update_hiwater_vm(struct mm_struct *mm)
3425 {
3426 	if (mm->hiwater_vm < mm->total_vm)
3427 		mm->hiwater_vm = mm->total_vm;
3428 }
3429 
3430 static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
3431 {
3432 	mm->hiwater_rss = get_mm_rss(mm);
3433 }
3434 
/*
 * Raise *@maxrss to get_mm_hiwater_rss(@mm) if that is larger; *@maxrss is
 * left alone otherwise.
 */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
					 struct mm_struct *mm)
{
	const unsigned long peak = get_mm_hiwater_rss(mm);

	if (peak > *maxrss)
		*maxrss = peak;
}
3443 
3444 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
3445 static inline int pte_special(pte_t pte)
3446 {
3447 	return 0;
3448 }
3449 
3450 static inline pte_t pte_mkspecial(pte_t pte)
3451 {
3452 	return pte;
3453 }
3454 #endif
3455 
3456 #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
3457 static inline bool pmd_special(pmd_t pmd)
3458 {
3459 	return false;
3460 }
3461 
3462 static inline pmd_t pmd_mkspecial(pmd_t pmd)
3463 {
3464 	return pmd;
3465 }
3466 #endif	/* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
3467 
3468 #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
3469 static inline bool pud_special(pud_t pud)
3470 {
3471 	return false;
3472 }
3473 
3474 static inline pud_t pud_mkspecial(pud_t pud)
3475 {
3476 	return pud;
3477 }
3478 #endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
3479 
3480 extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
3481 			     spinlock_t **ptl);
3482 
3483 #ifdef __PAGETABLE_P4D_FOLDED
3484 static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
3485 						unsigned long address)
3486 {
3487 	return 0;
3488 }
3489 #else
3490 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
3491 #endif
3492 
3493 #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
3494 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
3495 						unsigned long address)
3496 {
3497 	return 0;
3498 }
3499 static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
3500 static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
3501 
3502 #else
3503 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
3504 
3505 static inline void mm_inc_nr_puds(struct mm_struct *mm)
3506 {
3507 	if (mm_pud_folded(mm))
3508 		return;
3509 	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
3510 }
3511 
3512 static inline void mm_dec_nr_puds(struct mm_struct *mm)
3513 {
3514 	if (mm_pud_folded(mm))
3515 		return;
3516 	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
3517 }
3518 #endif
3519 
3520 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
3521 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
3522 						unsigned long address)
3523 {
3524 	return 0;
3525 }
3526 
3527 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
3528 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
3529 
3530 #else
3531 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
3532 
3533 static inline void mm_inc_nr_pmds(struct mm_struct *mm)
3534 {
3535 	if (mm_pmd_folded(mm))
3536 		return;
3537 	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
3538 }
3539 
3540 static inline void mm_dec_nr_pmds(struct mm_struct *mm)
3541 {
3542 	if (mm_pmd_folded(mm))
3543 		return;
3544 	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
3545 }
3546 #endif
3547 
3548 #ifdef CONFIG_MMU
3549 static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
3550 {
3551 	atomic_long_set(&mm->pgtables_bytes, 0);
3552 }
3553 
3554 static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
3555 {
3556 	return atomic_long_read(&mm->pgtables_bytes);
3557 }
3558 
3559 static inline void mm_inc_nr_ptes(struct mm_struct *mm)
3560 {
3561 	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
3562 }
3563 
3564 static inline void mm_dec_nr_ptes(struct mm_struct *mm)
3565 {
3566 	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
3567 }
3568 #else
3569 
/* !CONFIG_MMU: no page-table byte accounting; the counter always reads as 0. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
	return 0;
}
3575 
/* !CONFIG_MMU: PTE accounting is a no-op. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
3578 #endif
3579 
3580 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
3581 int __pte_alloc_kernel(pmd_t *pmd);
3582 
3583 #if defined(CONFIG_MMU)
3584 
3585 static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
3586 		unsigned long address)
3587 {
3588 	return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
3589 		NULL : p4d_offset(pgd, address);
3590 }
3591 
3592 static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
3593 		unsigned long address)
3594 {
3595 	return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
3596 		NULL : pud_offset(p4d, address);
3597 }
3598 
3599 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3600 {
3601 	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
3602 		NULL: pmd_offset(pud, address);
3603 }
3604 #endif /* CONFIG_MMU */
3605 
/* Bit numbers used with the pt_flags word of a struct ptdesc. */
enum pt_flags {
	/* Set/cleared/tested by ptdesc_{set,clear,test}_kernel(). */
	PT_kernel = PG_referenced,
	/* Tested by pagetable_is_reserved(). */
	PT_reserved = PG_reserved,
	/* High bits are used for zone/node/section */
};
3611 
/*
 * Map address @x to a ptdesc: virt_to_page() finds the struct page and
 * page_ptdesc() converts that to the page table descriptor.
 */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
	struct page *page = virt_to_page(x);

	return page_ptdesc(page);
}
3616 
/**
 * ptdesc_address - Virtual address of page table.
 * @pt: Page table descriptor.
 *
 * The address is obtained via folio_address() on the folio returned by
 * ptdesc_folio() for @pt.
 *
 * Return: The first byte of the page table described by @pt.
 */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
	const struct folio *folio = ptdesc_folio(pt);

	return folio_address(folio);
}
3627 
3628 static inline bool pagetable_is_reserved(struct ptdesc *pt)
3629 {
3630 	return test_bit(PT_reserved, &pt->pt_flags.f);
3631 }
3632 
3633 /**
3634  * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
3635  * @ptdesc: The ptdesc to be marked
3636  *
3637  * Kernel page tables often need special handling. Set a flag so that
3638  * the handling code knows this ptdesc will not be used for userspace.
3639  */
3640 static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
3641 {
3642 	set_bit(PT_kernel, &ptdesc->pt_flags.f);
3643 }
3644 
3645 /**
3646  * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
3647  * @ptdesc: The ptdesc to be unmarked
3648  *
3649  * Use when the ptdesc is no longer used to map the kernel and no longer
3650  * needs special handling.
3651  */
3652 static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
3653 {
3654 	/*
3655 	 * Note: the 'PG_referenced' bit does not strictly need to be
3656 	 * cleared before freeing the page. But this is nice for
3657 	 * symmetry.
3658 	 */
3659 	clear_bit(PT_kernel, &ptdesc->pt_flags.f);
3660 }
3661 
3662 /**
3663  * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
3664  * @ptdesc: The ptdesc being tested
3665  *
3666  * Call to tell if the ptdesc used to map the kernel.
3667  */
3668 static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
3669 {
3670 	return test_bit(PT_kernel, &ptdesc->pt_flags.f);
3671 }
3672 
3673 /**
3674  * pagetable_alloc - Allocate pagetables
3675  * @gfp:    GFP flags
3676  * @order:  desired pagetable order
3677  *
3678  * pagetable_alloc allocates memory for page tables as well as a page table
3679  * descriptor to describe that memory.
3680  *
3681  * Return: The ptdesc describing the allocated page tables.
3682  */
3683 static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
3684 {
3685 	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
3686 
3687 	return page_ptdesc(page);
3688 }
3689 #define pagetable_alloc(...)	alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
3690 
/*
 * Free the pages behind @pt. The order passed to __free_pages() is read
 * back from the page itself with compound_order().
 */
static inline void __pagetable_free(struct ptdesc *pt)
{
	struct page *page = ptdesc_page(pt);
	unsigned int order = compound_order(page);

	__free_pages(page, order);
}
3697 
3698 #ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
3699 void pagetable_free_kernel(struct ptdesc *pt);
3700 #else
/*
 * Without CONFIG_ASYNC_KERNEL_PGTABLE_FREE a kernel page table is freed
 * directly through __pagetable_free().
 */
static inline void pagetable_free_kernel(struct ptdesc *pt)
{
	__pagetable_free(pt);
}
3705 #endif
/**
 * pagetable_free - Free pagetables
 * @pt:	The page table descriptor
 *
 * pagetable_free frees the memory of all page tables described by a page
 * table descriptor and the memory for the descriptor itself.
 *
 * A descriptor marked with ptdesc_set_kernel() has the mark cleared and is
 * handed to pagetable_free_kernel(); any other descriptor is freed with
 * __pagetable_free().
 */
static inline void pagetable_free(struct ptdesc *pt)
{
	if (!ptdesc_test_kernel(pt)) {
		__pagetable_free(pt);
		return;
	}

	ptdesc_clear_kernel(pt);
	pagetable_free_kernel(pt);
}
3722 
3723 #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
3724 #if ALLOC_SPLIT_PTLOCKS
3725 void __init ptlock_cache_init(void);
3726 bool ptlock_alloc(struct ptdesc *ptdesc);
3727 void ptlock_free(struct ptdesc *ptdesc);
3728 
3729 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
3730 {
3731 	return ptdesc->ptl;
3732 }
3733 #else /* ALLOC_SPLIT_PTLOCKS */
/*
 * !ALLOC_SPLIT_PTLOCKS: no separate lock object exists in this variant
 * (ptlock_ptr() returns the address of ptdesc->ptl), so cache set-up,
 * allocation and freeing have nothing to do.
 */
static inline void ptlock_cache_init(void)
{
}

/* Always succeeds: there is nothing to allocate in this variant. */
static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
	return true;
}

/* Nothing to release in this variant. */
static inline void ptlock_free(struct ptdesc *ptdesc)
{
}
3746 
3747 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
3748 {
3749 	return &ptdesc->ptl;
3750 }
3751 #endif /* ALLOC_SPLIT_PTLOCKS */
3752 
3753 static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
3754 {
3755 	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
3756 }
3757 
3758 static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
3759 {
3760 	BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
3761 	BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
3762 	return ptlock_ptr(virt_to_ptdesc(pte));
3763 }
3764 
3765 static inline bool ptlock_init(struct ptdesc *ptdesc)
3766 {
3767 	/*
3768 	 * prep_new_page() initialize page->private (and therefore page->ptl)
3769 	 * with 0. Make sure nobody took it in use in between.
3770 	 *
3771 	 * It can happen if arch try to use slab for page table allocation:
3772 	 * slab code uses page->slab_cache, which share storage with page->ptl.
3773 	 */
3774 	VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
3775 	if (!ptlock_alloc(ptdesc))
3776 		return false;
3777 	spin_lock_init(ptlock_ptr(ptdesc));
3778 	return true;
3779 }
3780 
3781 #else	/* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 *
 * Without split PTE locks the lock for any PTE table is therefore
 * mm->page_table_lock; @pmd is not consulted.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return &mm->page_table_lock;
}
/* Without split PTE locks: always mm->page_table_lock; @pte is not consulted. */
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
	return &mm->page_table_lock;
}
/* Without split PTE locks there is no per-table lock to set up or tear down. */
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
3796 #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
3797 
3798 static inline void __pagetable_ctor(struct ptdesc *ptdesc)
3799 {
3800 	struct folio *folio = ptdesc_folio(ptdesc);
3801 
3802 	__folio_set_pgtable(folio);
3803 	lruvec_stat_add_folio(folio, NR_PAGETABLE);
3804 }
3805 
3806 static inline void pagetable_dtor(struct ptdesc *ptdesc)
3807 {
3808 	struct folio *folio = ptdesc_folio(ptdesc);
3809 
3810 	ptlock_free(ptdesc);
3811 	__folio_clear_pgtable(folio);
3812 	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
3813 }
3814 
/* Run pagetable_dtor() on @ptdesc and then free it with pagetable_free(). */
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
{
	pagetable_dtor(ptdesc);
	pagetable_free(ptdesc);
}
3820 
3821 static inline bool pagetable_pte_ctor(struct mm_struct *mm,
3822 				      struct ptdesc *ptdesc)
3823 {
3824 	if (mm != &init_mm && !ptlock_init(ptdesc))
3825 		return false;
3826 	__pagetable_ctor(ptdesc);
3827 	return true;
3828 }
3829 
3830 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
3831 
/* Convenience wrapper: __pte_offset_map() with a NULL @pmdvalp argument. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
	return __pte_offset_map(pmd, addr, NULL);
}
3836 
3837 pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
3838 			   unsigned long addr, spinlock_t **ptlp);
3839 
3840 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
3841 				unsigned long addr, spinlock_t **ptlp);
3842 pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
3843 				unsigned long addr, pmd_t *pmdvalp,
3844 				spinlock_t **ptlp);
3845 
/* Release @ptl with spin_unlock() and then pte_unmap() the @pte mapping. */
#define pte_unmap_unlock(pte, ptl)	do {		\
	spin_unlock(ptl);				\
	pte_unmap(pte);					\
} while (0)
3850 
/*
 * Non-zero only when *@pmd is none and __pte_alloc() returned non-zero;
 * the macros below treat that as failure.
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* NULL if pte_alloc() failed, otherwise the result of pte_offset_map(). */
#define pte_alloc_map(mm, pmd, address)			\
	(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/* NULL if pte_alloc() failed, otherwise the result of pte_offset_map_lock(). */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
	(pte_alloc(mm, pmd) ?			\
		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/*
 * Kernel counterpart: NULL if *@pmd is none and __pte_alloc_kernel()
 * returned non-zero, otherwise the result of pte_offset_kernel().
 */
#define pte_alloc_kernel(pmd, address)			\
	((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
		NULL: pte_offset_kernel(pmd, address))
3863 
3864 #if defined(CONFIG_SPLIT_PMD_PTLOCKS)
3865 
3866 static inline struct page *pmd_pgtable_page(pmd_t *pmd)
3867 {
3868 	unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
3869 	return virt_to_page((void *)((unsigned long) pmd & mask));
3870 }
3871 
3872 static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
3873 {
3874 	return page_ptdesc(pmd_pgtable_page(pmd));
3875 }
3876 
3877 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
3878 {
3879 	return ptlock_ptr(pmd_ptdesc(pmd));
3880 }
3881 
/*
 * Split PMD locks: initialise the lock of a PMD table through
 * ptlock_init(). With CONFIG_TRANSPARENT_HUGEPAGE the descriptor's
 * pmd_huge_pte pointer is reset to NULL first.
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptdesc->pmd_huge_pte = NULL;
#endif
	return ptlock_init(ptdesc);
}
3889 
/* Split PMD locks: pmd_huge_pte lives in the ptdesc of @pmd's table; @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
3891 
3892 #else
3893 
/* Without split PMD locks: always mm->page_table_lock; @pmd is not consulted. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return &mm->page_table_lock;
}
3898 
/* Without split PMD locks there is nothing to initialise; always succeeds. */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
3900 
/* Without split PMD locks: pmd_huge_pte is a member of the mm; @pmd is unused. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
3902 
3903 #endif
3904 
3905 static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
3906 {
3907 	spinlock_t *ptl = pmd_lockptr(mm, pmd);
3908 	spin_lock(ptl);
3909 	return ptl;
3910 }
3911 
3912 static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
3913 				      struct ptdesc *ptdesc)
3914 {
3915 	if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
3916 		return false;
3917 	ptdesc_pmd_pts_init(ptdesc);
3918 	__pagetable_ctor(ptdesc);
3919 	return true;
3920 }
3921 
/*
 * No scalability reason to split PUD locks yet, but follow the same pattern
 * as the PMD locks to make it easier if we decide to.  The VM should not be
 * considered ready to switch to split PUD locks yet; there may be places
 * which need to be converted from page_table_lock.
 *
 * For now the PUD lock is always mm->page_table_lock; @pud is not consulted.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
	return &mm->page_table_lock;
}
3932 
3933 static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
3934 {
3935 	spinlock_t *ptl = pud_lockptr(mm, pud);
3936 
3937 	spin_lock(ptl);
3938 	return ptl;
3939 }
3940 
/* Constructor for a PUD-level table: only the common __pagetable_ctor() step. */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}

/* Constructor for a P4D-level table: only the common __pagetable_ctor() step. */
static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}

/* Constructor for a PGD-level table: only the common __pagetable_ctor() step. */
static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}
3955 
3956 extern void __init pagecache_init(void);
3957 extern void free_initmem(void);
3958 
3959 /*
3960  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
3961  * into the buddy system. The freed pages will be poisoned with pattern
3962  * "poison" if it's within range [0, UCHAR_MAX].
3963  * Return pages freed into the buddy system.
3964  */
3965 extern unsigned long free_reserved_area(void *start, void *end,
3966 					int poison, const char *s);
3967 
3968 extern void adjust_managed_page_count(struct page *page, long count);
3969 
3970 /* Free the reserved page into the buddy system, so it gets managed. */
3971 void free_reserved_page(struct page *page);
3972 
/*
 * Set the Reserved flag on @page and pass a count of -1 to
 * adjust_managed_page_count() for it.
 */
static inline void mark_page_reserved(struct page *page)
{
	SetPageReserved(page);
	adjust_managed_page_count(page, -1);
}
3978 
/* Free the reserved page behind @pt via free_reserved_page(). */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
	struct page *page = ptdesc_page(pt);

	free_reserved_page(page);
}
3983 
/*
 * Default method to free all the __init memory into the buddy system: the
 * region between the __init_begin and __init_end symbols is passed to
 * free_reserved_area(). The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
	extern char __init_begin[], __init_end[];
	void *init_start = &__init_begin;
	void *init_end = &__init_end;

	return free_reserved_area(init_start, init_end, poison,
				  "unused kernel image (initmem)");
}
3997 
3998 static inline unsigned long get_num_physpages(void)
3999 {
4000 	int nid;
4001 	unsigned long phys_pages = 0;
4002 
4003 	for_each_online_node(nid)
4004 		phys_pages += node_present_pages(nid);
4005 
4006 	return phys_pages;
4007 }
4008 
4009 /*
4010  * FIXME: Using memblock node mappings, an architecture may initialise its
4011  * zones, allocate the backing mem_map and account for memory holes in an
4012  * architecture independent manner.
4013  *
4014  * An architecture is expected to register range of page frames backed by
4015  * physical memory with memblock_add[_node]() before calling
4016  * free_area_init() passing in the PFN each zone ends at. At a basic
4017  * usage, an architecture is expected to do something like
4018  *
4019  * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
4020  * 							 max_highmem_pfn};
4021  * for_each_valid_physical_page_range()
4022  *	memblock_add_node(base, size, nid, MEMBLOCK_NONE)
4023  * free_area_init(max_zone_pfns);
4024  */
4025 void arch_zone_limits_init(unsigned long *max_zone_pfn);
4026 unsigned long node_map_pfn_alignment(void);
4027 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
4028 						unsigned long end_pfn);
4029 extern void get_pfn_range_for_nid(unsigned int nid,
4030 			unsigned long *start_pfn, unsigned long *end_pfn);
4031 
4032 #ifndef CONFIG_NUMA
/* !CONFIG_NUMA: every PFN is reported as belonging to node 0. */
static inline int early_pfn_to_nid(unsigned long pfn)
{
	return 0;
}
4037 #else
4038 /* please see mm/page_alloc.c */
4039 extern int __meminit early_pfn_to_nid(unsigned long pfn);
4040 #endif
4041 
4042 extern void mem_init(void);
4043 extern void __init mmap_init(void);
4044 
4045 extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
4046 static inline void show_mem(void)
4047 {
4048 	__show_mem(0, NULL, MAX_NR_ZONES - 1);
4049 }
4050 extern long si_mem_available(void);
4051 extern void si_meminfo(struct sysinfo * val);
4052 extern void si_meminfo_node(struct sysinfo *val, int nid);
4053 
4054 extern __printf(3, 4)
4055 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
4056 
4057 extern void setup_per_cpu_pageset(void);
4058 
4059 /* nommu.c */
4060 extern atomic_long_t mmap_pages_allocated;
4061 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
4062 
4063 /* interval_tree.c */
4064 void vma_interval_tree_insert(struct vm_area_struct *node,
4065 			      struct rb_root_cached *root);
4066 void vma_interval_tree_insert_after(struct vm_area_struct *node,
4067 				    struct vm_area_struct *prev,
4068 				    struct rb_root_cached *root);
4069 void vma_interval_tree_remove(struct vm_area_struct *node,
4070 			      struct rb_root_cached *root);
4071 struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node,
4072 				unsigned long start, unsigned long last);
4073 struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
4074 				unsigned long start, unsigned long last);
4075 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
4076 				unsigned long start, unsigned long last);
4077 
/*
 * Loop over the VMAs returned by vma_interval_tree_iter_first() and
 * vma_interval_tree_iter_next() for @root and [@start, @last]; @vma is the
 * loop cursor and the loop ends when it becomes NULL.
 */
#define vma_interval_tree_foreach(vma, root, start, last)		\
	for (vma = vma_interval_tree_iter_first(root, start, last);	\
	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
4081 
4082 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
4083 				   struct rb_root_cached *root);
4084 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
4085 				   struct rb_root_cached *root);
4086 struct anon_vma_chain *
4087 anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
4088 				  unsigned long start, unsigned long last);
4089 struct anon_vma_chain *anon_vma_interval_tree_iter_next(
4090 	struct anon_vma_chain *node, unsigned long start, unsigned long last);
4091 #ifdef CONFIG_DEBUG_VM_RB
4092 void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
4093 #endif
4094 
/*
 * Loop over the anon_vma_chain entries returned by
 * anon_vma_interval_tree_iter_first() and
 * anon_vma_interval_tree_iter_next() for @root and [@start, @last]; @avc is
 * the loop cursor and the loop ends when it becomes NULL.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)		 \
	for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
	     avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
4098 
4099 /* mmap.c */
4100 extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin);
4101 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
4102 extern void exit_mmap(struct mm_struct *);
4103 bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
4104 				 unsigned long addr, bool write);
4105 
/*
 * Check (@new - @start) + (@end_data - @start_data) against @rlim.
 *
 * Returns 0 when @rlim is RLIM_INFINITY or the sum does not exceed @rlim,
 * and -ENOSPC when the sum is larger than @rlim.
 */
static inline int check_data_rlimit(unsigned long rlim,
				    unsigned long new,
				    unsigned long start,
				    unsigned long end_data,
				    unsigned long start_data)
{
	unsigned long usage;

	/* An unlimited rlimit can never be exceeded. */
	if (!(rlim < RLIM_INFINITY))
		return 0;

	usage = (new - start) + (end_data - start_data);

	return usage > rlim ? -ENOSPC : 0;
}
4119 
4120 extern int mm_take_all_locks(struct mm_struct *mm);
4121 extern void mm_drop_all_locks(struct mm_struct *mm);
4122 
4123 extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
4124 extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
4125 extern struct file *get_mm_exe_file(struct mm_struct *mm);
4126 extern struct file *get_task_exe_file(struct task_struct *task);
4127 
4128 extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
4129 
4130 extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
4131 				   const struct vm_special_mapping *sm);
4132 struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
4133 				   unsigned long addr, unsigned long len,
4134 				   vm_flags_t vm_flags,
4135 				   const struct vm_special_mapping *spec);
4136 
4137 unsigned long randomize_stack_top(unsigned long stack_top);
4138 unsigned long randomize_page(unsigned long start, unsigned long range);
4139 
4140 unsigned long
4141 __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
4142 		    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
4143 
4144 static inline unsigned long
4145 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
4146 		  unsigned long pgoff, unsigned long flags)
4147 {
4148 	return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
4149 }
4150 
4151 extern unsigned long do_mmap(struct file *file, unsigned long addr,
4152 	unsigned long len, unsigned long prot, unsigned long flags,
4153 	vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
4154 	struct list_head *uf);
4155 extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
4156 			 unsigned long start, size_t len, struct list_head *uf,
4157 			 bool unlock);
4158 int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
4159 		    struct mm_struct *mm, unsigned long start,
4160 		    unsigned long end, struct list_head *uf, bool unlock);
4161 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
4162 		     struct list_head *uf);
4163 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
4164 
4165 #ifdef CONFIG_MMU
4166 extern int __mm_populate(unsigned long addr, unsigned long len,
4167 			 int ignore_errors);
/*
 * Call __mm_populate() for [@addr, @addr + @len) with ignore_errors set;
 * its return value is deliberately discarded.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
	const int ignore_errors = 1;

	(void) __mm_populate(addr, len, ignore_errors);
}
4173 #else
/* !CONFIG_MMU: nothing to populate. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
4175 #endif
4176 
4177 /* This takes the mm semaphore itself */
4178 int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec);
4179 int vm_munmap(unsigned long start, size_t len);
4180 unsigned long __must_check vm_mmap(struct file *file, unsigned long addr,
4181 		unsigned long len, unsigned long prot,
4182 		unsigned long flag, unsigned long offset);
4183 unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr,
4184 		unsigned long len, unsigned long flags);
4185 
/* Value for vm_unmapped_area_info.flags (presumably a top-down search; confirm). */
#define VM_UNMAPPED_AREA_TOPDOWN 1

/* Search parameters handed to vm_unmapped_area(). */
struct vm_unmapped_area_info {
	unsigned long flags;
	unsigned long length;
	unsigned long low_limit;
	unsigned long high_limit;
	unsigned long align_mask;
	unsigned long align_offset;
	unsigned long start_gap;
};
4196 
4197 extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
4198 
4199 /* truncate.c */
4200 void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
4201 void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
4202 		uoff_t lend);
4203 void truncate_inode_pages_final(struct address_space *mapping);
4204 
4205 /* generic vm_area_ops exported for stackable file systems */
4206 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
4207 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
4208 		pgoff_t start_pgoff, pgoff_t end_pgoff);
4209 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
4210 
4211 extern unsigned long stack_guard_gap;
4212 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
4213 int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
4214 struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
4215 
4216 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
4217 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
4218 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
4219 					     struct vm_area_struct **pprev);
4220 
4221 /*
4222  * Look up the first VMA which intersects the interval [start_addr, end_addr)
4223  * NULL if none.  Assume start_addr < end_addr.
4224  */
4225 struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
4226 			unsigned long start_addr, unsigned long end_addr);
4227 
4228 /**
4229  * vma_lookup() - Find a VMA at a specific address
4230  * @mm: The process address space.
4231  * @addr: The user address.
4232  *
4233  * Return: The vm_area_struct at the given address, %NULL otherwise.
4234  */
4235 static inline
4236 struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
4237 {
4238 	return mtree_load(&mm->mm_mt, addr);
4239 }
4240 
4241 static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma)
4242 {
4243 	if (vma->vm_flags & VM_GROWSDOWN)
4244 		return stack_guard_gap;
4245 
4246 	/* See reasoning around the VM_SHADOW_STACK definition */
4247 	if (vma->vm_flags & VM_SHADOW_STACK)
4248 		return PAGE_SIZE;
4249 
4250 	return 0;
4251 }
4252 
4253 static inline unsigned long vm_start_gap(const struct vm_area_struct *vma)
4254 {
4255 	unsigned long gap = stack_guard_start_gap(vma);
4256 	unsigned long vm_start = vma->vm_start;
4257 
4258 	vm_start -= gap;
4259 	if (vm_start > vma->vm_start)
4260 		vm_start = 0;
4261 	return vm_start;
4262 }
4263 
4264 static inline unsigned long vm_end_gap(const struct vm_area_struct *vma)
4265 {
4266 	unsigned long vm_end = vma->vm_end;
4267 
4268 	if (vma->vm_flags & VM_GROWSUP) {
4269 		vm_end += stack_guard_gap;
4270 		if (vm_end < vma->vm_end)
4271 			vm_end = -PAGE_SIZE;
4272 	}
4273 	return vm_end;
4274 }
4275 
4276 static inline unsigned long vma_pages(const struct vm_area_struct *vma)
4277 {
4278 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
4279 }
4280 
4281 static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma)
4282 {
4283 	return vma->vm_pgoff + vma_pages(vma) - 1;
4284 }
4285 
4286 static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
4287 {
4288 	return desc->end - desc->start;
4289 }
4290 
4291 static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
4292 {
4293 	return vma_desc_size(desc) >> PAGE_SHIFT;
4294 }
4295 
4296 /**
4297  * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN
4298  * remap is required.
4299  * @desc: The VMA descriptor for the VMA requiring remap.
4300  * @start: The virtual address to start the remap from, must be within the VMA.
4301  * @start_pfn: The first PFN in the range to remap.
4302  * @size: The size of the range to remap, in bytes, at most spanning to the end
4303  * of the VMA.
4304  */
4305 static inline void mmap_action_remap(struct vm_area_desc *desc,
4306 				     unsigned long start,
4307 				     unsigned long start_pfn,
4308 				     unsigned long size)
4309 {
4310 	struct mmap_action *action = &desc->action;
4311 
4312 	/* [start, start + size) must be within the VMA. */
4313 	WARN_ON_ONCE(start < desc->start || start >= desc->end);
4314 	WARN_ON_ONCE(start + size > desc->end);
4315 
4316 	action->type = MMAP_REMAP_PFN;
4317 	action->remap.start = start;
4318 	action->remap.start_pfn = start_pfn;
4319 	action->remap.size = size;
4320 	action->remap.pgprot = desc->page_prot;
4321 }
4322 
4323 /**
4324  * mmap_action_remap_full - helper for mmap_prepare hook to specify that the
4325  * entirety of a VMA should be PFN remapped.
4326  * @desc: The VMA descriptor for the VMA requiring remap.
4327  * @start_pfn: The first PFN in the range to remap.
4328  */
4329 static inline void mmap_action_remap_full(struct vm_area_desc *desc,
4330 					  unsigned long start_pfn)
4331 {
4332 	mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc));
4333 }
4334 
4335 /**
4336  * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN
4337  * I/O remap is required.
4338  * @desc: The VMA descriptor for the VMA requiring remap.
4339  * @start: The virtual address to start the remap from, must be within the VMA.
4340  * @start_pfn: The first PFN in the range to remap.
4341  * @size: The size of the range to remap, in bytes, at most spanning to the end
4342  * of the VMA.
4343  */
4344 static inline void mmap_action_ioremap(struct vm_area_desc *desc,
4345 				       unsigned long start,
4346 				       unsigned long start_pfn,
4347 				       unsigned long size)
4348 {
4349 	mmap_action_remap(desc, start, start_pfn, size);
4350 	desc->action.type = MMAP_IO_REMAP_PFN;
4351 }
4352 
4353 /**
4354  * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the
4355  * entirety of a VMA should be PFN I/O remapped.
4356  * @desc: The VMA descriptor for the VMA requiring remap.
4357  * @start_pfn: The first PFN in the range to remap.
4358  */
4359 static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
4360 					    unsigned long start_pfn)
4361 {
4362 	mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
4363 }
4364 
4365 /**
4366  * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the
4367  * physical range in [start_phys_addr, start_phys_addr + size) should be I/O
4368  * remapped.
4369  * @desc: The VMA descriptor for the VMA requiring remap.
4370  * @start_phys_addr: Start of the physical memory to be mapped.
4371  * @size: Size of the area to map.
4372  *
4373  * NOTE: Some drivers might want to tweak desc->page_prot for purposes of
4374  * write-combine or similar.
4375  */
4376 static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc,
4377 					      phys_addr_t start_phys_addr,
4378 					      unsigned long size)
4379 {
4380 	struct mmap_action *action = &desc->action;
4381 
4382 	action->simple_ioremap.start_phys_addr = start_phys_addr;
4383 	action->simple_ioremap.size = size;
4384 	action->type = MMAP_SIMPLE_IO_REMAP;
4385 }
4386 
4387 /**
4388  * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that
4389  * @num kernel pages contained in the @pages array should be mapped to userland
4390  * starting at virtual address @start.
4391  * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped.
4392  * @start: The virtual address from which to map them.
4393  * @pages: An array of struct page pointers describing the memory to map.
4394  * @nr_pages: The number of entries in the @pages aray.
4395  */
4396 static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc,
4397 		unsigned long start, struct page **pages,
4398 		unsigned long nr_pages)
4399 {
4400 	struct mmap_action *action = &desc->action;
4401 
4402 	action->type = MMAP_MAP_KERNEL_PAGES;
4403 	action->map_kernel.start = start;
4404 	action->map_kernel.pages = pages;
4405 	action->map_kernel.nr_pages = nr_pages;
4406 	action->map_kernel.pgoff = desc->pgoff;
4407 }
4408 
4409 /**
4410  * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that
4411  * kernel pages contained in the @pages array should be mapped to userland
4412  * from @desc->start to @desc->end.
4413  * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped.
4414  * @pages: An array of struct page pointers describing the memory to map.
4415  *
4416  * The caller must ensure that @pages contains sufficient entries to cover the
4417  * entire range described by @desc.
4418  */
4419 static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc,
4420 		struct page **pages)
4421 {
4422 	mmap_action_map_kernel_pages(desc, desc->start, pages,
4423 				     vma_desc_pages(desc));
4424 }
4425 
4426 int mmap_action_prepare(struct vm_area_desc *desc);
4427 int mmap_action_complete(struct vm_area_struct *vma,
4428 			 struct mmap_action *action, bool is_compat);
4429 
4430 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */
4431 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
4432 				unsigned long vm_start, unsigned long vm_end)
4433 {
4434 	struct vm_area_struct *vma = vma_lookup(mm, vm_start);
4435 
4436 	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
4437 		vma = NULL;
4438 
4439 	return vma;
4440 }
4441 
/**
 * range_is_subset - Is the specified inner range a subset of the outer range?
 * @outer_start: The start of the outer range.
 * @outer_end: The exclusive end of the outer range.
 * @inner_start: The start of the inner range.
 * @inner_end: The exclusive end of the inner range.
 *
 * Returns: %true if [inner_start, inner_end) is a subset of [outer_start,
 * outer_end), otherwise %false.
 */
static inline bool range_is_subset(unsigned long outer_start,
				   unsigned long outer_end,
				   unsigned long inner_start,
				   unsigned long inner_end)
{
	/* The inner range must not begin before the outer one... */
	if (inner_start < outer_start)
		return false;

	/* ...and must not extend past its end. */
	return inner_end <= outer_end;
}
4459 
4460 /**
4461  * range_in_vma - is the specified [@start, @end) range a subset of the VMA?
4462  * @vma: The VMA against which we want to check [@start, @end).
4463  * @start: The start of the range we wish to check.
4464  * @end: The exclusive end of the range we wish to check.
4465  *
4466  * Returns: %true if [@start, @end) is a subset of [@vma->vm_start,
4467  * @vma->vm_end), %false otherwise.
4468  */
4469 static inline bool range_in_vma(const struct vm_area_struct *vma,
4470 				unsigned long start, unsigned long end)
4471 {
4472 	if (!vma)
4473 		return false;
4474 
4475 	return range_is_subset(vma->vm_start, vma->vm_end, start, end);
4476 }
4477 
4478 /**
4479  * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA
4480  * described by @desc, a VMA descriptor?
4481  * @desc: The VMA descriptor against which we want to check [@start, @end).
4482  * @start: The start of the range we wish to check.
4483  * @end: The exclusive end of the range we wish to check.
4484  *
4485  * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end),
4486  * %false otherwise.
4487  */
4488 static inline bool range_in_vma_desc(const struct vm_area_desc *desc,
4489 				     unsigned long start, unsigned long end)
4490 {
4491 	if (!desc)
4492 		return false;
4493 
4494 	return range_is_subset(desc->start, desc->end, start, end);
4495 }
4496 
4497 #ifdef CONFIG_MMU
4498 pgprot_t vm_get_page_prot(vm_flags_t vm_flags);
4499 
4500 static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
4501 {
4502 	const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags);
4503 
4504 	return vm_get_page_prot(vm_flags);
4505 }
4506 
4507 void vma_set_page_prot(struct vm_area_struct *vma);
4508 #else
4509 static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
4510 {
4511 	return __pgprot(0);
4512 }
4513 static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
4514 {
4515 	return __pgprot(0);
4516 }
4517 static inline void vma_set_page_prot(struct vm_area_struct *vma)
4518 {
4519 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
4520 }
4521 #endif
4522 
/*
 * Prototypes for out-of-line VMA helpers and for the page/PFN insertion and
 * remapping API. Only the signatures are visible in this header.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file);

/* Only declared when CONFIG_NUMA_BALANCING is enabled. */
#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
		unsigned long addr);
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t pgprot);

/* Insertion/mapping helpers returning an int status. */
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num);
int map_kernel_pages_prepare(struct vm_area_desc *desc);
int map_kernel_pages_complete(struct vm_area_struct *vma,
			      struct mmap_action *action);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num);
/* Insertion helpers returning a vm_fault_t. */
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
			bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, unsigned long pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
4556 
4557 static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
4558 				unsigned long addr, struct page *page)
4559 {
4560 	int err = vm_insert_page(vma, addr, page);
4561 
4562 	if (err == -ENOMEM)
4563 		return VM_FAULT_OOM;
4564 	if (err < 0 && err != -EBUSY)
4565 		return VM_FAULT_SIGBUS;
4566 
4567 	return VM_FAULT_NOPAGE;
4568 }
4569 
#ifndef io_remap_pfn_range_pfn
/*
 * Default PFN translation used by io_remap_pfn_range(): the PFN is returned
 * unchanged and @size is ignored. Only compiled when no
 * io_remap_pfn_range_pfn macro has been defined beforehand.
 */
static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
		unsigned long size)
{
	return pfn;
}
#endif
4577 
4578 static inline int io_remap_pfn_range(struct vm_area_struct *vma,
4579 				     unsigned long addr, unsigned long orig_pfn,
4580 				     unsigned long size, pgprot_t orig_prot)
4581 {
4582 	const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
4583 	const pgprot_t prot = pgprot_decrypted(orig_prot);
4584 
4585 	return remap_pfn_range(vma, addr, pfn, size, prot);
4586 }
4587 
4588 static inline vm_fault_t vmf_error(int err)
4589 {
4590 	if (err == -ENOMEM)
4591 		return VM_FAULT_OOM;
4592 	else if (err == -EHWPOISON)
4593 		return VM_FAULT_HWPOISON;
4594 	return VM_FAULT_SIGBUS;
4595 }
4596 
4597 /*
4598  * Convert errno to return value for ->page_mkwrite() calls.
4599  *
4600  * This should eventually be merged with vmf_error() above, but will need a
4601  * careful audit of all vmf_error() callers.
4602  */
4603 static inline vm_fault_t vmf_fs_error(int err)
4604 {
4605 	if (err == 0)
4606 		return VM_FAULT_LOCKED;
4607 	if (err == -EFAULT || err == -EAGAIN)
4608 		return VM_FAULT_NOPAGE;
4609 	if (err == -ENOMEM)
4610 		return VM_FAULT_OOM;
4611 	/* -ENOSPC, -EDQUOT, -EIO ... */
4612 	return VM_FAULT_SIGBUS;
4613 }
4614 
4615 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
4616 {
4617 	if (vm_fault & VM_FAULT_OOM)
4618 		return -ENOMEM;
4619 	if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
4620 		return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
4621 	if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
4622 		return -EFAULT;
4623 	return 0;
4624 }
4625 
4626 /*
4627  * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
4628  * a (NUMA hinting) fault is required.
4629  */
4630 static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
4631 					   unsigned int flags)
4632 {
4633 	/*
4634 	 * If callers don't want to honor NUMA hinting faults, no need to
4635 	 * determine if we would actually have to trigger a NUMA hinting fault.
4636 	 */
4637 	if (!(flags & FOLL_HONOR_NUMA_FAULT))
4638 		return true;
4639 
4640 	/*
4641 	 * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
4642 	 *
4643 	 * Requiring a fault here even for inaccessible VMAs would mean that
4644 	 * FOLL_FORCE cannot make any progress, because handle_mm_fault()
4645 	 * refuses to process NUMA hinting faults in inaccessible VMAs.
4646 	 */
4647 	return !vma_is_accessible(vma);
4648 }
4649 
/*
 * Callback type taken by apply_to_page_range() and
 * apply_to_existing_page_range(): it receives a PTE pointer, an address and
 * the caller-supplied @data pointer, and returns an int.
 */
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
			       unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
				   unsigned long address, unsigned long size,
				   pte_fn_t fn, void *data);
4656 
4657 #ifdef CONFIG_PAGE_POISONING
/* Out-of-line workers and state for CONFIG_PAGE_POISONING. */
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/*
 * Reports the plain boolean _page_poisoning_enabled_early rather than the
 * _page_poisoning_enabled static key.
 */
static inline bool page_poisoning_enabled(void)
{
	return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after mem_debugging_and_hardening_init() has run,
 * or when a false negative result is not harmful when called too early.
 *
 * Tests the _page_poisoning_enabled static key.
 */
static inline bool page_poisoning_enabled_static(void)
{
	return static_branch_unlikely(&_page_poisoning_enabled);
}
/*
 * Poison @numpages pages starting at @page, but only when
 * page_poisoning_enabled_static() reports that poisoning is on.
 */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
	if (!page_poisoning_enabled_static())
		return;

	__kernel_poison_pages(page, numpages);
}
/*
 * Unpoison @numpages pages starting at @page, but only when
 * page_poisoning_enabled_static() reports that poisoning is on.
 */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
	if (!page_poisoning_enabled_static())
		return;

	__kernel_unpoison_pages(page, numpages);
}
4684 #else
/*
 * !CONFIG_PAGE_POISONING: poisoning is never enabled and the poison/unpoison
 * helpers do nothing.
 */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
4690 #endif
4691 
4692 DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
4693 static inline bool want_init_on_alloc(gfp_t flags)
4694 {
4695 	if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
4696 				&init_on_alloc))
4697 		return true;
4698 	return flags & __GFP_ZERO;
4699 }
4700 
4701 DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
/* Reports the state of the init_on_free static key. */
static inline bool want_init_on_free(void)
{
	return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
				   &init_on_free);
}
4707 
4708 extern bool _debug_pagealloc_enabled_early;
4709 DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
4710 
4711 static inline bool debug_pagealloc_enabled(void)
4712 {
4713 	return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
4714 		_debug_pagealloc_enabled_early;
4715 }
4716 
4717 /*
4718  * For use in fast paths after mem_debugging_and_hardening_init() has run,
4719  * or when a false negative result is not harmful when called too early.
4720  */
4721 static inline bool debug_pagealloc_enabled_static(void)
4722 {
4723 	if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
4724 		return false;
4725 
4726 	return static_branch_unlikely(&_debug_pagealloc_enabled);
4727 }
4728 
/*
 * To support DEBUG_PAGEALLOC, an architecture must ensure that
 * __kernel_map_pages() never fails (it returns void, so it has no way to
 * report an error).
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
4734 #ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Run iommu_debug_check_unmapped() on the range, then - if debug_pagealloc is
 * enabled - call __kernel_map_pages() with enable == 1.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
	iommu_debug_check_unmapped(page, numpages);

	if (!debug_pagealloc_enabled_static())
		return;

	__kernel_map_pages(page, numpages, 1);
}
4742 
/*
 * Run iommu_debug_check_unmapped() on the range, then - if debug_pagealloc is
 * enabled - call __kernel_map_pages() with enable == 0.
 */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
	iommu_debug_check_unmapped(page, numpages);

	if (!debug_pagealloc_enabled_static())
		return;

	__kernel_map_pages(page, numpages, 0);
}
4750 
/* Guard page state, defined out of line. */
extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for _debug_guardpage_minorder. */
static inline unsigned int debug_guardpage_minorder(void)
{
	return _debug_guardpage_minorder;
}

/* Tests the _debug_guardpage_enabled static key. */
static inline bool debug_guardpage_enabled(void)
{
	return static_branch_unlikely(&_debug_guardpage_enabled);
}
4763 
4764 static inline bool page_is_guard(const struct page *page)
4765 {
4766 	if (!debug_guardpage_enabled())
4767 		return false;
4768 
4769 	return PageGuard(page);
4770 }
4771 
4772 bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
4773 static inline bool set_page_guard(struct zone *zone, struct page *page,
4774 				  unsigned int order)
4775 {
4776 	if (!debug_guardpage_enabled())
4777 		return false;
4778 	return __set_page_guard(zone, page, order);
4779 }
4780 
void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
/* Forward to __clear_page_guard(), but only when guard pages are enabled. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				    unsigned int order)
{
	if (debug_guardpage_enabled())
		__clear_page_guard(zone, page, order);
}
4789 
4790 #else	/* CONFIG_DEBUG_PAGEALLOC */
/*
 * !CONFIG_DEBUG_PAGEALLOC: the map/unmap hooks do nothing, guard pages are
 * never enabled, no page is a guard page and setting a guard always fails.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
}

static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
}

static inline unsigned int debug_guardpage_minorder(void)
{
	return 0;
}

static inline bool debug_guardpage_enabled(void)
{
	return false;
}

static inline bool page_is_guard(const struct page *page)
{
	return false;
}

static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order)
{
	return false;
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order)
{
}
4800 #endif	/* CONFIG_DEBUG_PAGEALLOC */
4801 
4802 #ifndef clear_pages
4803 /**
4804  * clear_pages() - clear a page range for kernel-internal use.
4805  * @addr: start address
4806  * @npages: number of pages
4807  *
4808  * Use clear_user_pages() instead when clearing a page range to be
4809  * mapped to user space.
4810  *
4811  * Does absolutely no exception handling.
4812  *
4813  * Note that even though the clearing operation is preemptible, clear_pages()
4814  * does not (and on architectures where it reduces to a few long-running
4815  * instructions, might not be able to) call cond_resched() to check if
4816  * rescheduling is required.
4817  *
4818  * When running under preemptible models this is not a problem. Under
4819  * cooperatively scheduled models, however, the caller is expected to
4820  * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH.
4821  */
4822 static inline void clear_pages(void *addr, unsigned int npages)
4823 {
4824 	do {
4825 		clear_page(addr);
4826 		addr += PAGE_SIZE;
4827 	} while (--npages);
4828 }
4829 #endif
4830 
/*
 * Default for PROCESS_PAGES_NON_PREEMPT_BATCH, used only if nothing has
 * defined it already. The value depends on whether clear_pages exists as a
 * preprocessor macro at this point.
 */
#ifndef PROCESS_PAGES_NON_PREEMPT_BATCH
#ifdef clear_pages
/*
 * The architecture defines clear_pages(), and we assume that it is
 * generally "fast". So choose a batch size large enough to allow the processor
 * headroom for optimizing the operation and yet small enough that we see
 * reasonable preemption latency for when this optimization is not possible
 * (ex. slow microarchitectures, memory bandwidth saturation.)
 *
 * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should
 * result in worst case preemption latency of around 3ms when clearing pages.
 *
 * (See comment above clear_pages() for why preemption latency is a concern
 * here.)
 */
#define PROCESS_PAGES_NON_PREEMPT_BATCH		(SZ_32M >> PAGE_SHIFT)
#else /* !clear_pages */
/*
 * The architecture does not provide a clear_pages() implementation. Assume
 * that clear_page() -- which clear_pages() will fallback to -- is relatively
 * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH.
 */
#define PROCESS_PAGES_NON_PREEMPT_BATCH		1
#endif
#endif
4856 
#ifdef __HAVE_ARCH_GATE_AREA
/* Implemented elsewhere when __HAVE_ARCH_GATE_AREA is defined. */
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
/* No gate area: there is no gate VMA and no address lies inside one. */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
	return NULL;
}

static inline int in_gate_area_no_mm(unsigned long addr)
{
	return 0;
}

static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	return 0;
}
#endif	/* __HAVE_ARCH_GATE_AREA */
4872 
/* Out-of-line helpers; only the prototypes are visible here. */
bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm);

void drop_slab(void);

/* Without an MMU randomize_va_space is the constant 0, otherwise a variable. */
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
/* print_vma_addr() does nothing when CONFIG_MMU is not set. */
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif
4891 
/*
 * Prototypes for the out-of-line section memmap / vmemmap population helpers.
 * Only the signatures are visible in this header.
 */
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
			      struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
		     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
		      unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
			       int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap);
int vmemmap_populate_hvo(unsigned long start, unsigned long end,
			 unsigned int order, struct zone *zone,
			 unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
			  unsigned long headsize);
void vmemmap_populate_print_last(void);
/* vmemmap_free() is only declared with CONFIG_MEMORY_HOTPLUG. */
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap);
#endif
4921 
4922 #ifdef CONFIG_SPARSEMEM_VMEMMAP
4923 static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
4924 {
4925 	/* number of pfns from base where pfn_to_page() is valid */
4926 	if (altmap)
4927 		return altmap->reserve + altmap->free;
4928 	return 0;
4929 }
4930 
4931 static inline void vmem_altmap_free(struct vmem_altmap *altmap,
4932 				    unsigned long nr_pfns)
4933 {
4934 	altmap->alloc -= nr_pfns;
4935 }
4936 #else
/* !CONFIG_SPARSEMEM_VMEMMAP: the offset is always 0, whatever @altmap is. */
static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap)
{
	return 0UL;
}

/* !CONFIG_SPARSEMEM_VMEMMAP: nothing to give back. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
				    unsigned long nr_pfns)
{
	return;
}
4946 #endif
4947 
4948 #define VMEMMAP_RESERVE_NR	2
4949 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
4950 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
4951 					  struct dev_pagemap *pgmap)
4952 {
4953 	unsigned long nr_pages;
4954 	unsigned long nr_vmemmap_pages;
4955 
4956 	if (!pgmap || !is_power_of_2(sizeof(struct page)))
4957 		return false;
4958 
4959 	nr_pages = pgmap_vmemmap_nr(pgmap);
4960 	nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
4961 	/*
4962 	 * For vmemmap optimization with DAX we need minimum 2 vmemmap
4963 	 * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
4964 	 */
4965 	return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
4966 }
/*
 * If we don't have an architecture override, use the generic rule:
 * vmemmap_can_optimize becomes an alias for __vmemmap_can_optimize unless a
 * vmemmap_can_optimize macro is already defined.
 */
#ifndef vmemmap_can_optimize
#define vmemmap_can_optimize __vmemmap_can_optimize
#endif
4973 
4974 #else
/* !CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP: the optimization is never available. */
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
					   struct dev_pagemap *pgmap)
{
	const bool can_optimize = false;

	return can_optimize;
}
4980 #endif
4981 
/*
 * Bit flags, one bit each so they can be OR-ed together. Presumably the
 * values for the int flags arguments of the memory-failure functions declared
 * below - confirm against mm/memory-failure.c.
 */
enum mf_flags {
	MF_COUNT_INCREASED = 1 << 0,
	MF_ACTION_REQUIRED = 1 << 1,
	MF_MUST_KILL = 1 << 2,
	MF_SOFT_OFFLINE = 1 << 3,
	MF_UNPOISON = 1 << 4,
	MF_SW_SIMULATED = 1 << 5,
	MF_NO_RETRY = 1 << 6,
	MF_MEM_PRE_REMOVE = 1 << 7,
};
/* Memory failure entry points, implemented out of line. */
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
		      unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Sysfs entries for memory failure handling statistics.
 */
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
/* !CONFIG_MEMORY_FAILURE: queueing and poison accounting do nothing. */
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}

static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}

static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
}
#endif
5019 
/*
 * The memblk_nr_poison_*() helpers are only implemented when both
 * CONFIG_MEMORY_FAILURE and CONFIG_MEMORY_HOTPLUG are enabled; otherwise they
 * do nothing.
 */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
#else
static inline void memblk_nr_poison_inc(unsigned long pfn)
{
}

static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
{
}
#endif
5032 
#ifndef arch_memory_failure
/*
 * Default used when no arch_memory_failure macro is defined: always returns
 * -ENXIO.
 */
static inline int arch_memory_failure(unsigned long pfn, int flags)
{
	return -ENXIO;
}
#endif

#ifndef arch_is_platform_page
/*
 * Default used when no arch_is_platform_page macro is defined: no address is
 * reported as a platform page.
 */
static inline bool arch_is_platform_page(u64 paddr)
{
	return false;
}
#endif
5046 
/*
 * Error handlers for various types of pages.
 *
 * enum mf_result lists the possible outcomes of handling a memory failure.
 */
enum mf_result {
	MF_IGNORED,	/* Error: cannot be handled */
	MF_FAILED,	/* Error: handling failed */
	MF_DELAYED,	/* Will be handled later */
	MF_RECOVERED,	/* Successfully recovered */
};

/*
 * Classifies the page type or situation a memory-failure action refers to.
 * NOTE(review): the MF_MSG_ prefix suggests these select a report message -
 * confirm against the users in mm/memory-failure.c.
 */
enum mf_action_page_type {
	MF_MSG_KERNEL,
	MF_MSG_KERNEL_HIGH_ORDER,
	MF_MSG_DIFFERENT_COMPOUND,
	MF_MSG_HUGE,
	MF_MSG_FREE_HUGE,
	MF_MSG_GET_HWPOISON,
	MF_MSG_UNMAP_FAILED,
	MF_MSG_DIRTY_SWAPCACHE,
	MF_MSG_CLEAN_SWAPCACHE,
	MF_MSG_DIRTY_MLOCKED_LRU,
	MF_MSG_CLEAN_MLOCKED_LRU,
	MF_MSG_DIRTY_UNEVICTABLE_LRU,
	MF_MSG_CLEAN_UNEVICTABLE_LRU,
	MF_MSG_DIRTY_LRU,
	MF_MSG_CLEAN_LRU,
	MF_MSG_TRUNCATED_LRU,
	MF_MSG_BUDDY,
	MF_MSG_DAX,
	MF_MSG_UNSPLIT_THP,
	MF_MSG_ALREADY_POISONED,
	MF_MSG_PFN_MAP,
	MF_MSG_UNKNOWN,
};
5081 
/*
 * Large folio zero/copy helpers; only declared when transparent huge pages or
 * hugetlbfs are configured.
 */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
			  unsigned long addr_hint,
			  struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
			   const void __user *usr_src,
			   bool allow_pagefault);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
5092 
/* With at most one possible node (MAX_NUMNODES <= 1) this does nothing. */
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void) {}
#endif
5098 
extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 if memcmp_pages() reports no difference between the pages, else 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
	return memcmp_pages(page1, page2) == 0;
}
5105 
/* Only declared when CONFIG_MAPPING_DIRTY_HELPERS is enabled. */
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
						pgoff_t first_index, pgoff_t nr,
						pgoff_t bitmap_pgoff,
						unsigned long *bitmap,
						pgoff_t *start,
						pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
				      pgoff_t first_index, pgoff_t nr);
#endif
5117 
/* Without CONFIG_ANON_VMA_NAME the request always fails with -EINVAL. */
#ifdef CONFIG_ANON_VMA_NAME
int set_anon_vma_name(unsigned long addr, unsigned long size,
		      const char __user *uname);
#else
static inline
int set_anon_vma_name(unsigned long addr, unsigned long size,
		      const char __user *uname)
{
	return -EINVAL;
}
#endif
5129 
/*
 * Unaccepted memory interface. Without CONFIG_UNACCEPTED_MEMORY no range ever
 * contains unaccepted memory and accept_memory() does nothing.
 */
#ifdef CONFIG_UNACCEPTED_MEMORY

bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
void accept_memory(phys_addr_t start, unsigned long size);

#else

static inline bool range_contains_unaccepted_memory(phys_addr_t start,
						    unsigned long size)
{
	return false;
}

static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}

#endif
5148 
5149 static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
5150 {
5151 	return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
5152 }
5153 
/* Out-of-line helpers; only the prototypes are visible here. */
void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);

/* Look up / release a reserved memory region by @name. */
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
int reserve_mem_release_by_name(const char *name);
5159 
/*
 * do_mseal() is only implemented on 64-bit kernels. On 32-bit it ignores its
 * arguments and returns 0.
 */
#ifdef CONFIG_64BIT
int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
#else
static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	/* noop on 32 bit */
	return 0;
}
#endif
5169 
5170 /*
5171  * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
5172  * be zeroed or not.
5173  */
5174 static inline bool user_alloc_needs_zeroing(void)
5175 {
5176 	/*
5177 	 * for user folios, arch with cache aliasing requires cache flush and
5178 	 * arc changes folio->flags to make icache coherent with dcache, so
5179 	 * always return false to make caller use
5180 	 * clear_user_page()/clear_user_highpage().
5181 	 */
5182 	return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() ||
5183 	       !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
5184 				   &init_on_alloc);
5185 }
5186 
/*
 * Shadow stack status accessors for task @t; only the prototypes are visible
 * here. The getter writes through a user-space pointer.
 */
int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
5190 
5191 /*
5192  * DMA mapping IDs for page_pool
5193  *
5194  * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and
5195  * stashes it in the upper bits of page->pp_magic. We always want to be able to
5196  * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP
5197  * pages can have arbitrary kernel pointers stored in the same field as pp_magic
5198  * (since it overlaps with page->lru.next), so we must ensure that we cannot
5199  * mistake a valid kernel pointer with any of the values we write into this
5200  * field.
5201  *
5202  * On architectures that set POISON_POINTER_DELTA, this is already ensured,
5203  * since this value becomes part of PP_SIGNATURE; meaning we can just use the
5204  * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the
5205  * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is
5206  * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is
5207  * known at compile-time.
5208  *
5209  * If the value of PAGE_OFFSET is not known at compile time, or if it is too
5210  * small to leave at least 8 bits available above PP_SIGNATURE, we define the
5211  * number of bits to be 0, which turns off the DMA index tracking altogether
5212  * (see page_pool_register_dma_index()).
5213  */
/*
 * The DMA index starts one bit above the highest set bit of PP_SIGNATURE
 * with POISON_POINTER_DELTA subtracted.
 */
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
#if POISON_POINTER_DELTA > 0
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
 * index to not overlap with that if set
 */
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
#else
/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */
#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8))
#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \
			    PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \
			    !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \
			      MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0)

#endif

/* The PP_DMA_INDEX_BITS bits starting at bit PP_DMA_INDEX_SHIFT. */
#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
				  PP_DMA_INDEX_SHIFT)

/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
 * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
 * the head page of compound page and bit 1 for pfmemalloc page, as well as the
 * bits used for the DMA index. page_is_pfmemalloc() is checked in
 * __page_pool_put_page() to avoid recycling the pfmemalloc page.
 */
#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
5240 
#ifdef CONFIG_PAGE_POOL
/*
 * A page is a page pool page if page->pp_magic, with the bits outside
 * PP_MAGIC_MASK (the DMA index bits and the two lowest bits) cleared, equals
 * PP_SIGNATURE.
 */
static inline bool page_pool_page_is_pp(const struct page *page)
{
	return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
}
#else
/* !CONFIG_PAGE_POOL: no page is ever a page pool page. */
static inline bool page_pool_page_is_pp(const struct page *page)
{
	return false;
}
#endif
5252 
/* Bits for page_snapshot::flags. */
#define PAGE_SNAPSHOT_FAITHFUL (1 << 0)
#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1)
#define PAGE_SNAPSHOT_PG_IDLE  (1 << 2)

/*
 * Filled in by snapshot_page(). It embeds a struct folio and a struct page by
 * value (not as pointers), plus a pfn, an index and PAGE_SNAPSHOT_* flags.
 */
struct page_snapshot {
	struct folio folio_snapshot;
	struct page page_snapshot;
	unsigned long pfn;
	unsigned long idx;
	unsigned long flags;	/* PAGE_SNAPSHOT_* bits */
};
5264 
5265 static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
5266 {
5267 	return ps->flags & PAGE_SNAPSHOT_FAITHFUL;
5268 }
5269 
/* Fill @ps from @page; implemented out of line. */
void snapshot_page(struct page_snapshot *ps, const struct page *page);

/* Out-of-line helper; only the prototype is visible here. */
void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte,
		struct vm_area_struct *vma, unsigned long addr,
		bool uffd_wp);
5275 
5276 #endif /* _LINUX_MM_H */
5277