1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _LINUX_MM_H 3 #define _LINUX_MM_H 4 5 #include <linux/args.h> 6 #include <linux/errno.h> 7 #include <linux/mmdebug.h> 8 #include <linux/gfp.h> 9 #include <linux/pgalloc_tag.h> 10 #include <linux/bug.h> 11 #include <linux/list.h> 12 #include <linux/mmzone.h> 13 #include <linux/rbtree.h> 14 #include <linux/atomic.h> 15 #include <linux/debug_locks.h> 16 #include <linux/compiler.h> 17 #include <linux/mm_types.h> 18 #include <linux/mmap_lock.h> 19 #include <linux/range.h> 20 #include <linux/pfn.h> 21 #include <linux/percpu-refcount.h> 22 #include <linux/bit_spinlock.h> 23 #include <linux/shrinker.h> 24 #include <linux/resource.h> 25 #include <linux/page_ext.h> 26 #include <linux/err.h> 27 #include <linux/page-flags.h> 28 #include <linux/page_ref.h> 29 #include <linux/overflow.h> 30 #include <linux/sched.h> 31 #include <linux/pgtable.h> 32 #include <linux/kasan.h> 33 #include <linux/memremap.h> 34 #include <linux/slab.h> 35 #include <linux/cacheinfo.h> 36 #include <linux/rcuwait.h> 37 #include <linux/bitmap.h> 38 #include <linux/bitops.h> 39 #include <linux/iommu-debug-pagealloc.h> 40 41 struct mempolicy; 42 struct anon_vma; 43 struct anon_vma_chain; 44 struct user_struct; 45 struct pt_regs; 46 struct folio_batch; 47 48 void arch_mm_preinit(void); 49 void mm_core_init_early(void); 50 void mm_core_init(void); 51 void init_mm_internals(void); 52 53 extern atomic_long_t _totalram_pages; 54 static inline unsigned long totalram_pages(void) 55 { 56 return (unsigned long)atomic_long_read(&_totalram_pages); 57 } 58 59 static inline void totalram_pages_inc(void) 60 { 61 atomic_long_inc(&_totalram_pages); 62 } 63 64 static inline void totalram_pages_dec(void) 65 { 66 atomic_long_dec(&_totalram_pages); 67 } 68 69 static inline void totalram_pages_add(long count) 70 { 71 atomic_long_add(count, &_totalram_pages); 72 } 73 74 extern void * high_memory; 75 76 /* 77 * Convert between pages and MB 78 * 20 is the shift for 1MB (2^20 = 1MB) 
79 * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) 80 * So (20 - PAGE_SHIFT) converts between pages and MB 81 */ 82 #define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) 83 #define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) 84 85 #ifdef CONFIG_SYSCTL 86 extern int sysctl_legacy_va_layout; 87 #else 88 #define sysctl_legacy_va_layout 0 89 #endif 90 91 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS 92 extern const int mmap_rnd_bits_min; 93 extern int mmap_rnd_bits_max __ro_after_init; 94 extern int mmap_rnd_bits __read_mostly; 95 #endif 96 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS 97 extern const int mmap_rnd_compat_bits_min; 98 extern const int mmap_rnd_compat_bits_max; 99 extern int mmap_rnd_compat_bits __read_mostly; 100 #endif 101 102 #ifndef DIRECT_MAP_PHYSMEM_END 103 # ifdef MAX_PHYSMEM_BITS 104 # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) 105 # else 106 # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) 107 # endif 108 #endif 109 110 #define INVALID_PHYS_ADDR (~(phys_addr_t)0) 111 112 #include <asm/page.h> 113 #include <asm/processor.h> 114 115 #ifndef __pa_symbol 116 #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) 117 #endif 118 119 #ifndef page_to_virt 120 #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) 121 #endif 122 123 #ifndef lm_alias 124 #define lm_alias(x) __va(__pa_symbol(x)) 125 #endif 126 127 /* 128 * To prevent common memory management code establishing 129 * a zero page mapping on a read fault. 130 * This macro should be defined within <asm/pgtable.h>. 131 * s390 does this to prevent multiplexing of hardware bits 132 * related to the physical page in case of virtualization. 133 */ 134 #ifndef mm_forbids_zeropage 135 #define mm_forbids_zeropage(X) (0) 136 #endif 137 138 /* 139 * On some architectures it is expensive to call memset() for small sizes. 
140 * If an architecture decides to implement their own version of 141 * mm_zero_struct_page they should wrap the defines below in a #ifndef and 142 * define their own version of this macro in <asm/pgtable.h> 143 */ 144 #if BITS_PER_LONG == 64 145 /* This function must be updated when the size of struct page grows above 96 146 * or reduces below 56. The idea that compiler optimizes out switch() 147 * statement, and only leaves move/store instructions. Also the compiler can 148 * combine write statements if they are both assignments and can be reordered, 149 * this can result in several of the writes here being dropped. 150 */ 151 #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) 152 static inline void __mm_zero_struct_page(struct page *page) 153 { 154 unsigned long *_pp = (void *)page; 155 156 /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ 157 BUILD_BUG_ON(sizeof(struct page) & 7); 158 BUILD_BUG_ON(sizeof(struct page) < 56); 159 BUILD_BUG_ON(sizeof(struct page) > 96); 160 161 switch (sizeof(struct page)) { 162 case 96: 163 _pp[11] = 0; 164 fallthrough; 165 case 88: 166 _pp[10] = 0; 167 fallthrough; 168 case 80: 169 _pp[9] = 0; 170 fallthrough; 171 case 72: 172 _pp[8] = 0; 173 fallthrough; 174 case 64: 175 _pp[7] = 0; 176 fallthrough; 177 case 56: 178 _pp[6] = 0; 179 _pp[5] = 0; 180 _pp[4] = 0; 181 _pp[3] = 0; 182 _pp[2] = 0; 183 _pp[1] = 0; 184 _pp[0] = 0; 185 } 186 } 187 #else 188 #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) 189 #endif 190 191 /* 192 * Default maximum number of active map areas, this limits the number of vmas 193 * per mm struct. Users can overwrite this number by sysctl but there is a 194 * problem. 195 * 196 * When a program's coredump is generated as ELF format, a section is created 197 * per a vma. In ELF, the number of sections is represented in unsigned short. 198 * This means the number of sections should be smaller than 65535 at coredump. 
199 * Because the kernel adds some informative sections to a image of program at 200 * generating coredump, we need some margin. The number of extra sections is 201 * 1-3 now and depends on arch. We use "5" as safe margin, here. 202 * 203 * ELF extended numbering allows more than 65535 sections, so 16-bit bound is 204 * not a hard limit any more. Although some userspace tools can be surprised by 205 * that. 206 */ 207 #define MAPCOUNT_ELF_CORE_MARGIN (5) 208 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 209 210 extern unsigned long sysctl_user_reserve_kbytes; 211 extern unsigned long sysctl_admin_reserve_kbytes; 212 213 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 214 bool page_range_contiguous(const struct page *page, unsigned long nr_pages); 215 #else 216 static inline bool page_range_contiguous(const struct page *page, 217 unsigned long nr_pages) 218 { 219 return true; 220 } 221 #endif 222 223 /* to align the pointer to the (next) page boundary */ 224 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) 225 226 /* to align the pointer to the (prev) page boundary */ 227 #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) 228 229 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ 230 #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) 231 232 /** 233 * folio_page_idx - Return the number of a page in a folio. 234 * @folio: The folio. 235 * @page: The folio page. 236 * 237 * This function expects that the page is actually part of the folio. 238 * The returned number is relative to the start of the folio. 
239 */ 240 static inline unsigned long folio_page_idx(const struct folio *folio, 241 const struct page *page) 242 { 243 return page - &folio->page; 244 } 245 246 static inline struct folio *lru_to_folio(struct list_head *head) 247 { 248 return list_entry((head)->prev, struct folio, lru); 249 } 250 251 void setup_initial_init_mm(void *start_code, void *end_code, 252 void *end_data, void *brk); 253 254 /* 255 * Linux kernel virtual memory manager primitives. 256 * The idea being to have a "virtual" mm in the same way 257 * we have a virtual fs - giving a cleaner interface to the 258 * mm details, and allowing different kinds of memory mappings 259 * (from shared memory to executable loading to arbitrary 260 * mmap() functions). 261 */ 262 263 struct vm_area_struct *vm_area_alloc(struct mm_struct *); 264 struct vm_area_struct *vm_area_dup(struct vm_area_struct *); 265 void vm_area_free(struct vm_area_struct *); 266 267 #ifndef CONFIG_MMU 268 extern struct rb_root nommu_region_tree; 269 extern struct rw_semaphore nommu_region_sem; 270 271 extern unsigned int kobjsize(const void *objp); 272 #endif 273 274 /* 275 * vm_flags in vm_area_struct, see mm_types.h. 276 * When changing, update also include/trace/events/mmflags.h 277 */ 278 279 #define VM_NONE 0x00000000 280 281 /** 282 * typedef vma_flag_t - specifies an individual VMA flag by bit number. 283 * 284 * This value is made type safe by sparse to avoid passing invalid flag values 285 * around. 286 */ 287 typedef int __bitwise vma_flag_t; 288 289 #define DECLARE_VMA_BIT(name, bitnum) \ 290 VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) 291 #define DECLARE_VMA_BIT_ALIAS(name, aliased) \ 292 VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) 293 enum { 294 DECLARE_VMA_BIT(READ, 0), 295 DECLARE_VMA_BIT(WRITE, 1), 296 DECLARE_VMA_BIT(EXEC, 2), 297 DECLARE_VMA_BIT(SHARED, 3), 298 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ 299 DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ 300 DECLARE_VMA_BIT(MAYWRITE, 5), 301 DECLARE_VMA_BIT(MAYEXEC, 6), 302 DECLARE_VMA_BIT(MAYSHARE, 7), 303 DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ 304 #ifdef CONFIG_MMU 305 DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ 306 #else 307 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ 308 DECLARE_VMA_BIT(MAYOVERLAY, 9), 309 #endif /* CONFIG_MMU */ 310 /* Page-ranges managed without "struct page", just pure PFN */ 311 DECLARE_VMA_BIT(PFNMAP, 10), 312 DECLARE_VMA_BIT(MAYBE_GUARD, 11), 313 DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ 314 DECLARE_VMA_BIT(LOCKED, 13), 315 DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ 316 DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ 317 DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ 318 DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ 319 DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ 320 DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ 321 DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ 322 DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ 323 DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ 324 DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ 325 DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ 326 DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ 327 DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ 328 DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ 329 DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ 330 DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ 331 DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ 332 DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ 333 /* These bits are reused, we define specific uses below. 
*/ 334 DECLARE_VMA_BIT(HIGH_ARCH_0, 32), 335 DECLARE_VMA_BIT(HIGH_ARCH_1, 33), 336 DECLARE_VMA_BIT(HIGH_ARCH_2, 34), 337 DECLARE_VMA_BIT(HIGH_ARCH_3, 35), 338 DECLARE_VMA_BIT(HIGH_ARCH_4, 36), 339 DECLARE_VMA_BIT(HIGH_ARCH_5, 37), 340 DECLARE_VMA_BIT(HIGH_ARCH_6, 38), 341 /* 342 * This flag is used to connect VFIO to arch specific KVM code. It 343 * indicates that the memory under this VMA is safe for use with any 344 * non-cachable memory type inside KVM. Some VFIO devices, on some 345 * platforms, are thought to be unsafe and can cause machine crashes 346 * if KVM does not lock down the memory type. 347 */ 348 DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), 349 #if defined(CONFIG_PPC32) 350 DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), 351 #elif defined(CONFIG_64BIT) 352 DECLARE_VMA_BIT(DROPPABLE, 40), 353 #endif 354 DECLARE_VMA_BIT(UFFD_MINOR, 41), 355 DECLARE_VMA_BIT(SEALED, 42), 356 /* Flags that reuse flags above. */ 357 DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), 358 DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), 359 DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), 360 DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), 361 DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), 362 #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_RISCV_USER_CFI) 363 /* 364 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of 365 * support core mm. 366 * 367 * These VMAs will get a single end guard page. This helps userspace 368 * protect itself from attacks. A single page is enough for current 369 * shadow stack archs (x86). See the comments near alloc_shstk() in 370 * arch/x86/kernel/shstk.c for more details on the guard size. 371 */ 372 DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), 373 #elif defined(CONFIG_ARM64_GCS) 374 /* 375 * arm64's Guarded Control Stack implements similar functionality and 376 * has similar constraints to shadow stacks. 
377 */ 378 DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), 379 #endif 380 DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ 381 DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ 382 DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ 383 DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ 384 DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ 385 DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ 386 DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ 387 DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ 388 #ifdef CONFIG_STACK_GROWSUP 389 DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), 390 DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), 391 #else 392 DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), 393 #endif 394 }; 395 #undef DECLARE_VMA_BIT 396 #undef DECLARE_VMA_BIT_ALIAS 397 398 #define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) 399 #define VM_READ INIT_VM_FLAG(READ) 400 #define VM_WRITE INIT_VM_FLAG(WRITE) 401 #define VM_EXEC INIT_VM_FLAG(EXEC) 402 #define VM_SHARED INIT_VM_FLAG(SHARED) 403 #define VM_MAYREAD INIT_VM_FLAG(MAYREAD) 404 #define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) 405 #define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) 406 #define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) 407 #define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) 408 #ifdef CONFIG_MMU 409 #define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) 410 #else 411 #define VM_UFFD_MISSING VM_NONE 412 #define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) 413 #endif 414 #define VM_PFNMAP INIT_VM_FLAG(PFNMAP) 415 #define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) 416 #define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) 417 #define VM_LOCKED INIT_VM_FLAG(LOCKED) 418 #define VM_IO INIT_VM_FLAG(IO) 419 #define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) 420 #define VM_RAND_READ INIT_VM_FLAG(RAND_READ) 421 #define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) 422 #define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) 423 #define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) 424 #define VM_ACCOUNT 
INIT_VM_FLAG(ACCOUNT) 425 #define VM_NORESERVE INIT_VM_FLAG(NORESERVE) 426 #define VM_HUGETLB INIT_VM_FLAG(HUGETLB) 427 #define VM_SYNC INIT_VM_FLAG(SYNC) 428 #define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) 429 #define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) 430 #define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) 431 #ifdef CONFIG_MEM_SOFT_DIRTY 432 #define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) 433 #else 434 #define VM_SOFTDIRTY VM_NONE 435 #endif 436 #define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) 437 #define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) 438 #define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) 439 #define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) 440 #define VM_STACK INIT_VM_FLAG(STACK) 441 #ifdef CONFIG_STACK_GROWSUP 442 #define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) 443 #else 444 #define VM_STACK_EARLY VM_NONE 445 #endif 446 #ifdef CONFIG_ARCH_HAS_PKEYS 447 #define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) 448 /* Despite the naming, these are FLAGS not bits. */ 449 #define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) 450 #define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) 451 #define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) 452 #if CONFIG_ARCH_PKEY_BITS > 3 453 #define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) 454 #else 455 #define VM_PKEY_BIT3 VM_NONE 456 #endif /* CONFIG_ARCH_PKEY_BITS > 3 */ 457 #if CONFIG_ARCH_PKEY_BITS > 4 458 #define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) 459 #else 460 #define VM_PKEY_BIT4 VM_NONE 461 #endif /* CONFIG_ARCH_PKEY_BITS > 4 */ 462 #endif /* CONFIG_ARCH_HAS_PKEYS */ 463 #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ 464 defined(CONFIG_RISCV_USER_CFI) 465 #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) 466 #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) 467 #else 468 #define VM_SHADOW_STACK VM_NONE 469 #define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) 470 #endif 471 #if defined(CONFIG_PPC64) 472 #define VM_SAO INIT_VM_FLAG(SAO) 473 #elif defined(CONFIG_PARISC) 474 #define VM_GROWSUP 
INIT_VM_FLAG(GROWSUP) 475 #elif defined(CONFIG_SPARC64) 476 #define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) 477 #define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) 478 #elif defined(CONFIG_ARM64) 479 #define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) 480 #define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) 481 #elif !defined(CONFIG_MMU) 482 #define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) 483 #endif 484 #ifndef VM_GROWSUP 485 #define VM_GROWSUP VM_NONE 486 #endif 487 #ifdef CONFIG_ARM64_MTE 488 #define VM_MTE INIT_VM_FLAG(MTE) 489 #define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) 490 #else 491 #define VM_MTE VM_NONE 492 #define VM_MTE_ALLOWED VM_NONE 493 #endif 494 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 495 #define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) 496 #else 497 #define VM_UFFD_MINOR VM_NONE 498 #endif 499 500 /* 501 * vma_flags_t masks for the userfaultfd VMA flags. VMA_UFFD_MINOR is gated on 502 * the same config as VM_UFFD_MINOR -- which implies 64BIT, where the bit fits 503 * -- so an out-of-range bit is never fed to mk_vma_flags() on a build whose 504 * bitmap cannot hold it. 
505 */ 506 #define VMA_UFFD_MISSING mk_vma_flags(VMA_UFFD_MISSING_BIT) 507 #define VMA_UFFD_WP mk_vma_flags(VMA_UFFD_WP_BIT) 508 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR 509 #define VMA_UFFD_MINOR mk_vma_flags(VMA_UFFD_MINOR_BIT) 510 #else 511 #define VMA_UFFD_MINOR EMPTY_VMA_FLAGS 512 #endif 513 514 #ifdef CONFIG_64BIT 515 #define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) 516 #define VM_SEALED INIT_VM_FLAG(SEALED) 517 #else 518 #define VM_ALLOW_ANY_UNCACHED VM_NONE 519 #define VM_SEALED VM_NONE 520 #endif 521 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) 522 #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) 523 #define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) 524 #else 525 #define VM_DROPPABLE VM_NONE 526 #define VMA_DROPPABLE EMPTY_VMA_FLAGS 527 #endif 528 529 /* Bits set in the VMA until the stack is in its final location */ 530 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) 531 532 #define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ 533 VMA_EXEC_BIT : VMA_READ_BIT) 534 535 /* Common data flag combinations */ 536 #define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ 537 TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ 538 VMA_MAYEXEC_BIT) 539 #define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ 540 VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) 541 #define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ 542 VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ 543 VMA_MAYEXEC_BIT) 544 545 #ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ 546 #define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC 547 #endif 548 549 #ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ 550 #define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS 551 #endif 552 553 #define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ 554 VMA_STACK_BIT, VMA_ACCOUNT_BIT) 555 556 /* Temporary until VMA flags conversion complete. 
*/ 557 #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) 558 559 #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS 560 #define VM_SEALED_SYSMAP VM_SEALED 561 #else 562 #define VM_SEALED_SYSMAP VM_NONE 563 #endif 564 565 /* VMA basic access permission flags */ 566 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) 567 #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) 568 569 /* 570 * Special vmas that are non-mergable, non-mlock()able. 571 */ 572 573 #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ 574 VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) 575 #define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) 576 577 /* 578 * Physically remapped pages are special. Tell the 579 * rest of the world about it: 580 * IO tells people not to look at these pages 581 * (accesses can have side effects). 582 * PFNMAP tells the core MM that the base pages are just 583 * raw PFN mappings, and do not have a "struct page" associated 584 * with them. 585 * DONTEXPAND 586 * Disable vma merging and expanding with mremap(). 587 * DONTDUMP 588 * Omit vma from core dump, even when VM_IO turned off. 589 */ 590 #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ 591 VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) 592 593 /* This mask prevents VMA from being scanned with khugepaged */ 594 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) 595 596 /* This mask defines which mm->def_flags a process can inherit its parent */ 597 #define VM_INIT_DEF_MASK VM_NOHUGEPAGE 598 599 /* This mask represents all the VMA flag bits used by mlock */ 600 #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) 601 602 #define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) 603 604 /* These flags can be updated atomically via VMA/mmap read lock. 
*/ 605 #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD 606 607 /* Arch-specific flags to clear when updating VM flags on protection change */ 608 #ifndef VM_ARCH_CLEAR 609 #define VM_ARCH_CLEAR VM_NONE 610 #endif 611 #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) 612 613 /* 614 * Flags which should be 'sticky' on merge - that is, flags which, when one VMA 615 * possesses it but the other does not, the merged VMA should nonetheless have 616 * applied to it: 617 * 618 * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is has not had its 619 * references cleared via /proc/$pid/clear_refs, any 620 * merged VMA should be considered soft-dirty also as it 621 * operates at a VMA granularity. 622 * 623 * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies 624 * that mapped page tables may contain metadata not 625 * described by the VMA and thus any merged VMA may also 626 * contain this metadata, and thus we must make this flag 627 * sticky. 628 */ 629 #ifdef CONFIG_MEM_SOFT_DIRTY 630 #define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) 631 #else 632 #define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) 633 #endif 634 635 /* 636 * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one 637 * of these flags and the other not does not preclude a merge. 638 * 639 * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they 640 * are 'sticky'. If any sticky flags exist in either VMA, 641 * we simply set all of them on the merged VMA. 642 */ 643 #define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS 644 645 /* 646 * Flags which should result in page tables being copied on fork. These are 647 * flags which indicate that the VMA maps page tables which cannot be 648 * reconsistuted upon page fault, so necessitate page table copying upon fork. 
649 * 650 * Note that these flags should be compared with the DESTINATION VMA not the 651 * source, as VM_UFFD_WP may not be propagated to destination, while all other 652 * flags will be. 653 * 654 * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be 655 * reasonably reconstructed on page fault. 656 * 657 * VM_UFFD_WP - Encodes metadata about an installed uffd 658 * write protect handler, which cannot be 659 * reconstructed on page fault. 660 * 661 * We always copy pgtables when dst_vma has uffd-wp 662 * enabled even if it's file-backed 663 * (e.g. shmem). Because when uffd-wp is enabled, 664 * pgtable contains uffd-wp protection information, 665 * that's something we can't retrieve from page cache, 666 * and skip copying will lose those info. 667 * 668 * VM_MAYBE_GUARD - Could contain page guard region markers which 669 * by design are a property of the page tables 670 * only and thus cannot be reconstructed on page 671 * fault. 672 */ 673 #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) 674 675 /* 676 * mapping from the currently active vm_flags protection bits (the 677 * low four bits) to a page protection mask.. 678 */ 679 680 /* 681 * The default fault flags that should be used by most of the 682 * arch-specific page fault handlers. 683 */ 684 #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ 685 FAULT_FLAG_KILLABLE | \ 686 FAULT_FLAG_INTERRUPTIBLE) 687 688 /** 689 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time 690 * @flags: Fault flags. 691 * 692 * This is mostly used for places where we want to try to avoid taking 693 * the mmap_lock for too long a time when waiting for another condition 694 * to change, in which case we can try to be polite to release the 695 * mmap_lock in the first round to avoid potential starvation of other 696 * processes that would also want the mmap_lock. 
697 * 698 * Return: true if the page fault allows retry and this is the first 699 * attempt of the fault handling; false otherwise. 700 */ 701 static inline bool fault_flag_allow_retry_first(enum fault_flag flags) 702 { 703 return (flags & FAULT_FLAG_ALLOW_RETRY) && 704 (!(flags & FAULT_FLAG_TRIED)); 705 } 706 707 #define FAULT_FLAG_TRACE \ 708 { FAULT_FLAG_WRITE, "WRITE" }, \ 709 { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ 710 { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ 711 { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ 712 { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ 713 { FAULT_FLAG_TRIED, "TRIED" }, \ 714 { FAULT_FLAG_USER, "USER" }, \ 715 { FAULT_FLAG_REMOTE, "REMOTE" }, \ 716 { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ 717 { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ 718 { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } 719 720 /* 721 * vm_fault is filled by the pagefault handler and passed to the vma's 722 * ->fault function. The vma's ->fault is responsible for returning a bitmask 723 * of VM_FAULT_xxx flags that give details about how the fault was handled. 724 * 725 * MM layer fills up gfp_mask for page allocations but fault handler might 726 * alter it if its implementation requires a different allocation context. 727 * 728 * pgoff should be used in favour of virtual_address, if possible. 
729 */ 730 struct vm_fault { 731 const struct { 732 struct vm_area_struct *vma; /* Target VMA */ 733 gfp_t gfp_mask; /* gfp mask to be used for allocations */ 734 pgoff_t pgoff; /* Logical page offset based on vma */ 735 unsigned long address; /* Faulting virtual address - masked */ 736 unsigned long real_address; /* Faulting virtual address - unmasked */ 737 }; 738 enum fault_flag flags; /* FAULT_FLAG_xxx flags 739 * XXX: should really be 'const' */ 740 pmd_t *pmd; /* Pointer to pmd entry matching 741 * the 'address' */ 742 pud_t *pud; /* Pointer to pud entry matching 743 * the 'address' 744 */ 745 union { 746 pte_t orig_pte; /* Value of PTE at the time of fault */ 747 pmd_t orig_pmd; /* Value of PMD at the time of fault, 748 * used by PMD fault only. 749 */ 750 }; 751 752 struct page *cow_page; /* Page handler may use for COW fault */ 753 struct page *page; /* ->fault handlers should return a 754 * page here, unless VM_FAULT_NOPAGE 755 * is set (which is also implied by 756 * VM_FAULT_ERROR). 757 */ 758 /* These three entries are valid only while holding ptl lock */ 759 pte_t *pte; /* Pointer to pte entry matching 760 * the 'address'. NULL if the page 761 * table hasn't been allocated. 762 */ 763 spinlock_t *ptl; /* Page table lock. 764 * Protects pte page table if 'pte' 765 * is not NULL, otherwise pmd. 766 */ 767 pgtable_t prealloc_pte; /* Pre-allocated pte page table. 768 * vm_ops->map_pages() sets up a page 769 * table from atomic context. 770 * do_fault_around() pre-allocates 771 * page table to avoid allocation from 772 * atomic context. 773 */ 774 }; 775 776 struct vm_uffd_ops; 777 778 /* 779 * These are the virtual MM functions - opening of an area, closing and 780 * unmapping it (needed to keep files on disk up-to-date etc), pointer 781 * to the functions called when a no-page or a wp-page exception occurs. 782 */ 783 struct vm_operations_struct { 784 /** 785 * @open: Called when a VMA is remapped, split or forked. 
Not called 786 * upon first mapping a VMA. 787 * Context: User context. May sleep. Caller holds mmap_lock. 788 */ 789 void (*open)(struct vm_area_struct *vma); 790 /** 791 * @close: Called when the VMA is being removed from the MM. 792 * Context: User context. May sleep. Caller holds mmap_lock. 793 */ 794 void (*close)(struct vm_area_struct *vma); 795 /** 796 * @mapped: Called when the VMA is first mapped in the MM. Not called if 797 * the new VMA is merged with an adjacent VMA. 798 * 799 * The @vm_private_data field is an output field allowing the user to 800 * modify vma->vm_private_data as necessary. 801 * 802 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if 803 * set from f_op->mmap. 804 * 805 * Returns %0 on success, or an error otherwise. On error, the VMA will 806 * be unmapped. 807 * 808 * Context: User context. May sleep. Caller holds mmap_lock. 809 */ 810 int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, 811 const struct file *file, void **vm_private_data); 812 /* Called any time before splitting to check if it's allowed */ 813 int (*may_split)(struct vm_area_struct *vma, unsigned long addr); 814 int (*mremap)(struct vm_area_struct *vma); 815 /* 816 * Called by mprotect() to make driver-specific permission 817 * checks before mprotect() is finalised. The VMA must not 818 * be modified. Returns 0 if mprotect() can proceed. 
819 */ 820 int (*mprotect)(struct vm_area_struct *vma, unsigned long start, 821 unsigned long end, unsigned long newflags); 822 vm_fault_t (*fault)(struct vm_fault *vmf); 823 vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); 824 vm_fault_t (*map_pages)(struct vm_fault *vmf, 825 pgoff_t start_pgoff, pgoff_t end_pgoff); 826 unsigned long (*pagesize)(struct vm_area_struct *vma); 827 828 /* notification that a previously read-only page is about to become 829 * writable, if an error is returned it will cause a SIGBUS */ 830 vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); 831 832 /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ 833 vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); 834 835 /* called by access_process_vm when get_user_pages() fails, typically 836 * for use by special VMAs. See also generic_access_phys() for a generic 837 * implementation useful for any iomem mapping. 838 */ 839 int (*access)(struct vm_area_struct *vma, unsigned long addr, 840 void *buf, int len, int write); 841 842 /* Called by the /proc/PID/maps code to ask the vma whether it 843 * has a special name. Returning non-NULL will also cause this 844 * vma to be dumped unconditionally. */ 845 const char *(*name)(struct vm_area_struct *vma); 846 847 #ifdef CONFIG_NUMA 848 /* 849 * set_policy() op must add a reference to any non-NULL @new mempolicy 850 * to hold the policy upon return. Caller should pass NULL @new to 851 * remove a policy and fall back to surrounding context--i.e. do not 852 * install a MPOL_DEFAULT policy, nor the task or system default 853 * mempolicy. 854 */ 855 int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); 856 857 /* 858 * get_policy() op must add reference [mpol_get()] to any policy at 859 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure 860 * in mm/mempolicy.c will do this automatically. 861 * get_policy() must NOT add a ref if the policy at (vma,addr) is not 862 * marked as MPOL_SHARED. 
vma policies are protected by the mmap_lock.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr, pgoff_t *ilx);
#endif
#ifdef CONFIG_FIND_NORMAL_PAGE
	/*
	 * Called by vm_normal_page() for special PTEs in @vma at @addr. This
	 * allows for returning a "normal" page from vm_normal_page() even
	 * though the PTE indicates that the "struct page" either does not exist
	 * or should not be touched: "special".
	 *
	 * Do not add new users: this really only works when a "normal" page
	 * was mapped, but then the PTE got changed to something weird (+
	 * marked special) that would not make pte_pfn() identify the originally
	 * inserted page.
	 */
	struct page *(*find_normal_page)(struct vm_area_struct *vma,
					 unsigned long addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
#ifdef CONFIG_USERFAULTFD
	/* Userfaultfd operations for this mapping type; only built with CONFIG_USERFAULTFD. */
	const struct vm_uffd_ops *uffd_ops;
#endif
};

#ifdef CONFIG_NUMA_BALANCING
/* Start the VMA off with no NUMA balancing state attached. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
	vma->numab_state = NULL;
}
/* kfree() the VMA's NUMA balancing state. The pointer itself is not reset. */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
	kfree(vma->numab_state);
}
#else
/* Without CONFIG_NUMA_BALANCING both helpers are empty stubs. */
static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * These must be here rather than mmap_lock.h as dependent on vm_fault type,
 * declared in this header.
907 */ 908 #ifdef CONFIG_PER_VMA_LOCK 909 static inline void release_fault_lock(struct vm_fault *vmf) 910 { 911 if (vmf->flags & FAULT_FLAG_VMA_LOCK) 912 vma_end_read(vmf->vma); 913 else 914 mmap_read_unlock(vmf->vma->vm_mm); 915 } 916 917 static inline void assert_fault_locked(const struct vm_fault *vmf) 918 { 919 if (vmf->flags & FAULT_FLAG_VMA_LOCK) 920 vma_assert_locked(vmf->vma); 921 else 922 mmap_assert_locked(vmf->vma->vm_mm); 923 } 924 #else 925 static inline void release_fault_lock(struct vm_fault *vmf) 926 { 927 mmap_read_unlock(vmf->vma->vm_mm); 928 } 929 930 static inline void assert_fault_locked(const struct vm_fault *vmf) 931 { 932 mmap_assert_locked(vmf->vma->vm_mm); 933 } 934 #endif /* CONFIG_PER_VMA_LOCK */ 935 936 static inline bool mm_flags_test(int flag, const struct mm_struct *mm) 937 { 938 return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); 939 } 940 941 static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) 942 { 943 return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); 944 } 945 946 static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) 947 { 948 return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); 949 } 950 951 static inline void mm_flags_set(int flag, struct mm_struct *mm) 952 { 953 set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); 954 } 955 956 static inline void mm_flags_clear(int flag, struct mm_struct *mm) 957 { 958 clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); 959 } 960 961 static inline void mm_flags_clear_all(struct mm_struct *mm) 962 { 963 bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); 964 } 965 966 extern const struct vm_operations_struct vma_dummy_vm_ops; 967 968 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) 969 { 970 memset(vma, 0, sizeof(*vma)); 971 vma->vm_mm = mm; 972 vma->vm_ops = &vma_dummy_vm_ops; 973 INIT_LIST_HEAD(&vma->anon_vma_chain); 974 vma_lock_init(vma, 
false); 975 } 976 977 /* Use when VMA is not part of the VMA tree and needs no locking */ 978 static inline void vm_flags_init(struct vm_area_struct *vma, 979 vm_flags_t flags) 980 { 981 VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); 982 vma_flags_clear_all(&vma->flags); 983 vma_flags_overwrite_word(&vma->flags, flags); 984 } 985 986 /* 987 * Use when VMA is part of the VMA tree and modifications need coordination 988 * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and 989 * it should be locked explicitly beforehand. 990 */ 991 static inline void vm_flags_reset(struct vm_area_struct *vma, 992 vm_flags_t flags) 993 { 994 VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); 995 vma_assert_write_locked(vma); 996 vm_flags_init(vma, flags); 997 } 998 999 static inline void vma_flags_reset_once(struct vm_area_struct *vma, 1000 vma_flags_t *flags) 1001 { 1002 const unsigned long word = flags->__vma_flags[0]; 1003 1004 /* It is assumed only the first system word must be written once. */ 1005 vma_flags_overwrite_word_once(&vma->flags, word); 1006 /* The remainder can be copied normally. */ 1007 if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { 1008 unsigned long *dst = &vma->flags.__vma_flags[1]; 1009 const unsigned long *src = &flags->__vma_flags[1]; 1010 1011 bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); 1012 } 1013 } 1014 1015 static inline void vm_flags_set(struct vm_area_struct *vma, 1016 vm_flags_t flags) 1017 { 1018 vma_start_write(vma); 1019 vma_flags_set_word(&vma->flags, flags); 1020 } 1021 1022 static inline void vm_flags_clear(struct vm_area_struct *vma, 1023 vm_flags_t flags) 1024 { 1025 VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); 1026 vma_start_write(vma); 1027 vma_flags_clear_word(&vma->flags, flags); 1028 } 1029 1030 /* 1031 * Use only if VMA is not part of the VMA tree or has no other users and 1032 * therefore needs no locking. 
1033 */ 1034 static inline void __vm_flags_mod(struct vm_area_struct *vma, 1035 vm_flags_t set, vm_flags_t clear) 1036 { 1037 vm_flags_init(vma, (vma->vm_flags | set) & ~clear); 1038 } 1039 1040 /* 1041 * Use only when the order of set/clear operations is unimportant, otherwise 1042 * use vm_flags_{set|clear} explicitly. 1043 */ 1044 static inline void vm_flags_mod(struct vm_area_struct *vma, 1045 vm_flags_t set, vm_flags_t clear) 1046 { 1047 vma_start_write(vma); 1048 __vm_flags_mod(vma, set, clear); 1049 } 1050 1051 static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, 1052 vma_flag_t bit) 1053 { 1054 const vm_flags_t mask = BIT((__force int)bit); 1055 1056 /* Only specific flags are permitted */ 1057 if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) 1058 return false; 1059 1060 return true; 1061 } 1062 1063 /* 1064 * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific 1065 * valid flags are allowed to do this. 1066 */ 1067 static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, 1068 vma_flag_t bit) 1069 { 1070 unsigned long *bitmap = vma->flags.__vma_flags; 1071 1072 vma_assert_stabilised(vma); 1073 if (__vma_atomic_valid_flag(vma, bit)) 1074 set_bit((__force int)bit, bitmap); 1075 } 1076 1077 /* 1078 * Test for VMA flag atomically. Requires no locks. Only specific valid flags 1079 * are allowed to do this. 1080 * 1081 * This is necessarily racey, so callers must ensure that serialisation is 1082 * achieved through some other means, or that races are permissible. 1083 */ 1084 static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, 1085 vma_flag_t bit) 1086 { 1087 if (__vma_atomic_valid_flag(vma, bit)) 1088 return test_bit((__force int)bit, &vma->vm_flags); 1089 1090 return false; 1091 } 1092 1093 /* Set an individual VMA flag in flags, non-atomically. 
*/ 1094 static __always_inline void vma_flags_set_flag(vma_flags_t *flags, 1095 vma_flag_t bit) 1096 { 1097 unsigned long *bitmap = flags->__vma_flags; 1098 1099 __set_bit((__force int)bit, bitmap); 1100 } 1101 1102 static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, 1103 size_t count, const vma_flag_t *bits) 1104 { 1105 int i; 1106 1107 for (i = 0; i < count; i++) 1108 vma_flags_set_flag(&flags, bits[i]); 1109 return flags; 1110 } 1111 1112 /* 1113 * Helper macro which bitwise-or combines the specified input flags into a 1114 * vma_flags_t bitmap value. E.g.: 1115 * 1116 * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, 1117 * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); 1118 * 1119 * The compiler cleverly optimises away all of the work and this ends up being 1120 * equivalent to aggregating the values manually. 1121 */ 1122 #define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ 1123 COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) 1124 1125 /* 1126 * Helper macro which acts like mk_vma_flags, only appending to a copy of the 1127 * specified flags rather than establishing new flags. E.g.: 1128 * 1129 * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, 1130 * VMA_ACCOUNT_BIT); 1131 */ 1132 #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ 1133 COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) 1134 1135 /* Calculates the number of set bits in the specified VMA flags. */ 1136 static __always_inline int vma_flags_count(const vma_flags_t *flags) 1137 { 1138 const unsigned long *bitmap = flags->__vma_flags; 1139 1140 return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); 1141 } 1142 1143 /* 1144 * Test whether a specific VMA flag is set, e.g.: 1145 * 1146 * if (vma_flags_test(flags, VMA_READ_BIT)) { ... 
} 1147 */ 1148 static __always_inline bool vma_flags_test(const vma_flags_t *flags, 1149 vma_flag_t bit) 1150 { 1151 const unsigned long *bitmap = flags->__vma_flags; 1152 1153 return test_bit((__force int)bit, bitmap); 1154 } 1155 1156 /* 1157 * Obtain a set of VMA flags which contain the overlapping flags contained 1158 * within flags and to_and. 1159 */ 1160 static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, 1161 vma_flags_t to_and) 1162 { 1163 vma_flags_t dst; 1164 unsigned long *bitmap_dst = dst.__vma_flags; 1165 const unsigned long *bitmap = flags->__vma_flags; 1166 const unsigned long *bitmap_to_and = to_and.__vma_flags; 1167 1168 bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); 1169 return dst; 1170 } 1171 1172 /* 1173 * Obtain a set of VMA flags which contains the specified overlapping flags, 1174 * e.g.: 1175 * 1176 * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, 1177 * VMA_MAY_READ_BIT); 1178 */ 1179 #define vma_flags_and(flags, ...) \ 1180 vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) 1181 1182 /* Test each of to_test flags in flags, non-atomically. */ 1183 static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, 1184 vma_flags_t to_test) 1185 { 1186 const unsigned long *bitmap = flags->__vma_flags; 1187 const unsigned long *bitmap_to_test = to_test.__vma_flags; 1188 1189 return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); 1190 } 1191 1192 /* 1193 * Test whether any specified VMA flag is set, e.g.: 1194 * 1195 * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } 1196 */ 1197 #define vma_flags_test_any(flags, ...) \ 1198 vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) 1199 1200 /* Test that ALL of the to_test flags are set, non-atomically. 
*/ 1201 static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, 1202 vma_flags_t to_test) 1203 { 1204 const unsigned long *bitmap = flags->__vma_flags; 1205 const unsigned long *bitmap_to_test = to_test.__vma_flags; 1206 1207 return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); 1208 } 1209 1210 /* 1211 * Test whether ALL specified VMA flags are set, e.g.: 1212 * 1213 * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } 1214 */ 1215 #define vma_flags_test_all(flags, ...) \ 1216 vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) 1217 1218 /* 1219 * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set 1220 * (returning false if flagmask has no flags set). 1221 * 1222 * This is defined to make the semantics clearer when testing an optionally 1223 * defined VMA flags mask, e.g.: 1224 * 1225 * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... } 1226 * 1227 * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS 1228 * otherwise. 1229 */ 1230 static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, 1231 vma_flags_t flagmask) 1232 { 1233 VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); 1234 1235 return vma_flags_test_any_mask(flags, flagmask); 1236 } 1237 1238 /* Set each of the to_set flags in flags, non-atomically. */ 1239 static __always_inline void vma_flags_set_mask(vma_flags_t *flags, 1240 vma_flags_t to_set) 1241 { 1242 unsigned long *bitmap = flags->__vma_flags; 1243 const unsigned long *bitmap_to_set = to_set.__vma_flags; 1244 1245 bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS); 1246 } 1247 1248 /* 1249 * Set all specified VMA flags, e.g.: 1250 * 1251 * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); 1252 */ 1253 #define vma_flags_set(flags, ...) 
\ 1254 vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) 1255 1256 static __always_inline vma_flags_t __mk_vma_flags_from_masks(size_t count, 1257 const vma_flags_t *masks) 1258 { 1259 vma_flags_t flags = EMPTY_VMA_FLAGS; 1260 size_t i; 1261 1262 for (i = 0; i < count; i++) 1263 vma_flags_set_mask(&flags, masks[i]); 1264 return flags; 1265 } 1266 1267 /* 1268 * Combine pre-computed vma_flags_t masks into one value, e.g.: 1269 * 1270 * vma_flags_t flags = mk_vma_flags_from_masks(VMA_UFFD_WP, VMA_UFFD_MINOR); 1271 * 1272 * Unlike mk_vma_flags(), which takes bit numbers, this takes whole masks -- 1273 * each of which may be EMPTY_VMA_FLAGS when its feature is unavailable -- so a 1274 * bit that does not exist on the current build is never materialised. 1275 */ 1276 #define mk_vma_flags_from_masks(...) \ 1277 __mk_vma_flags_from_masks(COUNT_ARGS(__VA_ARGS__), \ 1278 (const vma_flags_t []){__VA_ARGS__}) 1279 1280 /* Clear all of the to-clear flags in flags, non-atomically. */ 1281 static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, 1282 vma_flags_t to_clear) 1283 { 1284 unsigned long *bitmap = flags->__vma_flags; 1285 const unsigned long *bitmap_to_clear = to_clear.__vma_flags; 1286 1287 bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS); 1288 } 1289 1290 /* 1291 * Clear all specified individual flags, e.g.: 1292 * 1293 * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); 1294 */ 1295 #define vma_flags_clear(flags, ...) \ 1296 vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) 1297 1298 /* 1299 * Obtain a VMA flags value containing those flags that are present in flags or 1300 * flags_other but not in both. 
1301 */ 1302 static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, 1303 const vma_flags_t *flags_other) 1304 { 1305 vma_flags_t dst; 1306 const unsigned long *bitmap_other = flags_other->__vma_flags; 1307 const unsigned long *bitmap = flags->__vma_flags; 1308 unsigned long *bitmap_dst = dst.__vma_flags; 1309 1310 bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); 1311 return dst; 1312 } 1313 1314 /* Determine if flags and flags_other have precisely the same flags set. */ 1315 static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, 1316 const vma_flags_t *flags_other) 1317 { 1318 const unsigned long *bitmap = flags->__vma_flags; 1319 const unsigned long *bitmap_other = flags_other->__vma_flags; 1320 1321 return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); 1322 } 1323 1324 /* Determine if flags and flags_other have precisely the same flags set. */ 1325 static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, 1326 vma_flags_t flags_other) 1327 { 1328 const unsigned long *bitmap = flags->__vma_flags; 1329 const unsigned long *bitmap_other = flags_other.__vma_flags; 1330 1331 return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); 1332 } 1333 1334 /* 1335 * Helper macro to determine if only the specific flags are set, e.g.: 1336 * 1337 * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } 1338 */ 1339 #define vma_flags_same(flags, ...) \ 1340 vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) 1341 1342 /* 1343 * Test whether a specific flag in the VMA is set, e.g.: 1344 * 1345 * if (vma_test(vma, VMA_READ_BIT)) { ... } 1346 */ 1347 static __always_inline bool vma_test(const struct vm_area_struct *vma, 1348 vma_flag_t bit) 1349 { 1350 return vma_flags_test(&vma->flags, bit); 1351 } 1352 1353 /* Helper to test any VMA flags in a VMA . 
*/
static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma,
					      vma_flags_t flags)
{
	return vma_flags_test_any_mask(&vma->flags, flags);
}

/*
 * Helper macro for testing whether any VMA flags are set in a VMA,
 * e.g.:
 *
 * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT,
 *		VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
 */
#define vma_test_any(vma, ...)		\
	vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__))

/*
 * Helper to test that ALL specified flags are set in a VMA.
 *
 * Note: appropriate locks must be held, this function does not acquire them for
 * you.
 */
static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma,
					      vma_flags_t flags)
{
	return vma_flags_test_all_mask(&vma->flags, flags);
}

/*
 * Helper macro for checking that ALL specified flags are set in a VMA, e.g.:
 *
 * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
 */
#define vma_test_all(vma, ...)		\
	vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__))

/*
 * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set
 * (returning false if flagmask has no flags set).
 *
 * This is useful when a flag needs to be either defined or not depending upon
 * kernel configuration, e.g.:
 *
 * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... }
 *
 * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS
 * otherwise.
 */
static __always_inline bool
vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask)
{
	return vma_flags_test_single_mask(&vma->flags, flagmask);
}

/*
 * Helper to set all of the specified VMA flags in a VMA.
 *
 * Note: appropriate locks must be held, this function does not acquire them for
 * you.
1413 */ 1414 static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, 1415 vma_flags_t flags) 1416 { 1417 vma_flags_set_mask(&vma->flags, flags); 1418 } 1419 1420 /* 1421 * Helper macro for specifying VMA flags in a VMA, e.g.: 1422 * 1423 * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, 1424 * VMA_DONTDUMP_BIT); 1425 * 1426 * Note: appropriate locks must be held, this function does not acquire them for 1427 * you. 1428 */ 1429 #define vma_set_flags(vma, ...) \ 1430 vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) 1431 1432 /* Helper to clear all VMA flags in a VMA. */ 1433 static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, 1434 vma_flags_t flags) 1435 { 1436 vma_flags_clear_mask(&vma->flags, flags); 1437 } 1438 1439 /* 1440 * Helper macro for clearing VMA flags, e.g.: 1441 * 1442 * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, 1443 * VMA_DONTDUMP_BIT); 1444 */ 1445 #define vma_clear_flags(vma, ...) \ 1446 vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) 1447 1448 /* 1449 * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: 1450 * 1451 * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } 1452 */ 1453 static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, 1454 vma_flag_t bit) 1455 { 1456 return vma_flags_test(&desc->vma_flags, bit); 1457 } 1458 1459 /* Helper to test any VMA flags in a VMA descriptor. */ 1460 static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, 1461 vma_flags_t flags) 1462 { 1463 return vma_flags_test_any_mask(&desc->vma_flags, flags); 1464 } 1465 1466 /* 1467 * Helper macro for testing whether any VMA flags are set in a VMA descriptor, 1468 * e.g.: 1469 * 1470 * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, 1471 * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } 1472 */ 1473 #define vma_desc_test_any(desc, ...) 
\ 1474 vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) 1475 1476 /* Helper to test all VMA flags in a VMA descriptor. */ 1477 static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, 1478 vma_flags_t flags) 1479 { 1480 return vma_flags_test_all_mask(&desc->vma_flags, flags); 1481 } 1482 1483 /* 1484 * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, 1485 * e.g.: 1486 * 1487 * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } 1488 */ 1489 #define vma_desc_test_all(desc, ...) \ 1490 vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) 1491 1492 /* Helper to set all VMA flags in a VMA descriptor. */ 1493 static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, 1494 vma_flags_t flags) 1495 { 1496 vma_flags_set_mask(&desc->vma_flags, flags); 1497 } 1498 1499 /* 1500 * Helper macro for specifying VMA flags for an input pointer to a struct 1501 * vm_area_desc object describing a proposed VMA, e.g.: 1502 * 1503 * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, 1504 * VMA_DONTDUMP_BIT); 1505 */ 1506 #define vma_desc_set_flags(desc, ...) \ 1507 vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) 1508 1509 /* Helper to clear all VMA flags in a VMA descriptor. */ 1510 static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, 1511 vma_flags_t flags) 1512 { 1513 vma_flags_clear_mask(&desc->vma_flags, flags); 1514 } 1515 1516 /* 1517 * Helper macro for clearing VMA flags for an input pointer to a struct 1518 * vm_area_desc object describing a proposed VMA, e.g.: 1519 * 1520 * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, 1521 * VMA_DONTDUMP_BIT); 1522 */ 1523 #define vma_desc_clear_flags(desc, ...) 
\ 1524 vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) 1525 1526 static inline void vma_set_anonymous(struct vm_area_struct *vma) 1527 { 1528 vma->vm_ops = NULL; 1529 } 1530 1531 static inline void vma_desc_set_anonymous(struct vm_area_desc *desc) 1532 { 1533 desc->vm_ops = NULL; 1534 } 1535 1536 static inline bool vma_is_anonymous(struct vm_area_struct *vma) 1537 { 1538 return !vma->vm_ops; 1539 } 1540 1541 /* 1542 * Indicate if the VMA is a heap for the given task; for 1543 * /proc/PID/maps that is the heap of the main task. 1544 */ 1545 static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) 1546 { 1547 return vma->vm_start < vma->vm_mm->brk && 1548 vma->vm_end > vma->vm_mm->start_brk; 1549 } 1550 1551 /* 1552 * Indicate if the VMA is a stack for the given task; for 1553 * /proc/PID/maps that is the stack of the main task. 1554 */ 1555 static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) 1556 { 1557 /* 1558 * We make no effort to guess what a given thread considers to be 1559 * its "stack". It's not even well-defined for programs written 1560 * languages like Go. 
1561 */ 1562 return vma->vm_start <= vma->vm_mm->start_stack && 1563 vma->vm_end >= vma->vm_mm->start_stack; 1564 } 1565 1566 static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) 1567 { 1568 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1569 1570 if (!maybe_stack) 1571 return false; 1572 1573 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1574 VM_STACK_INCOMPLETE_SETUP) 1575 return true; 1576 1577 return false; 1578 } 1579 1580 static inline bool vma_is_foreign(const struct vm_area_struct *vma) 1581 { 1582 if (!current->mm) 1583 return true; 1584 1585 if (current->mm != vma->vm_mm) 1586 return true; 1587 1588 return false; 1589 } 1590 1591 static inline bool vma_is_accessible(const struct vm_area_struct *vma) 1592 { 1593 return vma->vm_flags & VM_ACCESS_FLAGS; 1594 } 1595 1596 static inline bool is_shared_maywrite(const vma_flags_t *flags) 1597 { 1598 return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); 1599 } 1600 1601 static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) 1602 { 1603 return is_shared_maywrite(&vma->flags); 1604 } 1605 1606 /** 1607 * vma_kernel_pagesize - Default page size granularity for this VMA. 1608 * @vma: The user mapping. 1609 * 1610 * The kernel page size specifies in which granularity VMA modifications 1611 * can be performed. Folios in this VMA will be aligned to, and at least 1612 * the size of the number of bytes returned by this function. 1613 * 1614 * The default kernel page size is not affected by Transparent Huge Pages 1615 * being in effect. 1616 * 1617 * Return: The default page size granularity for this VMA. 
1618 */ 1619 static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 1620 { 1621 if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) 1622 return vma->vm_ops->pagesize(vma); 1623 return PAGE_SIZE; 1624 } 1625 1626 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); 1627 1628 static inline 1629 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) 1630 { 1631 return mas_find(&vmi->mas, max - 1); 1632 } 1633 1634 static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) 1635 { 1636 /* 1637 * Uses mas_find() to get the first VMA when the iterator starts. 1638 * Calling mas_next() could skip the first entry. 1639 */ 1640 return mas_find(&vmi->mas, ULONG_MAX); 1641 } 1642 1643 static inline 1644 struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) 1645 { 1646 return mas_next_range(&vmi->mas, ULONG_MAX); 1647 } 1648 1649 1650 static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) 1651 { 1652 return mas_prev(&vmi->mas, 0); 1653 } 1654 1655 static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, 1656 unsigned long start, unsigned long end, gfp_t gfp) 1657 { 1658 __mas_set_range(&vmi->mas, start, end - 1); 1659 mas_store_gfp(&vmi->mas, NULL, gfp); 1660 if (unlikely(mas_is_err(&vmi->mas))) 1661 return -ENOMEM; 1662 1663 return 0; 1664 } 1665 1666 /* Free any unused preallocations */ 1667 static inline void vma_iter_free(struct vma_iterator *vmi) 1668 { 1669 mas_destroy(&vmi->mas); 1670 } 1671 1672 static inline int vma_iter_bulk_store(struct vma_iterator *vmi, 1673 struct vm_area_struct *vma) 1674 { 1675 vmi->mas.index = vma->vm_start; 1676 vmi->mas.last = vma->vm_end - 1; 1677 mas_store(&vmi->mas, vma); 1678 if (unlikely(mas_is_err(&vmi->mas))) 1679 return -ENOMEM; 1680 1681 vma_mark_attached(vma); 1682 return 0; 1683 } 1684 1685 static inline void vma_iter_invalidate(struct vma_iterator *vmi) 1686 { 1687 mas_pause(&vmi->mas); 1688 } 1689 1690 static inline void 
vma_iter_set(struct vma_iterator *vmi, unsigned long addr) 1691 { 1692 mas_set(&vmi->mas, addr); 1693 } 1694 1695 #define for_each_vma(__vmi, __vma) \ 1696 while (((__vma) = vma_next(&(__vmi))) != NULL) 1697 1698 /* The MM code likes to work with exclusive end addresses */ 1699 #define for_each_vma_range(__vmi, __vma, __end) \ 1700 while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) 1701 1702 #ifdef CONFIG_SHMEM 1703 /* 1704 * The vma_is_shmem is not inline because it is used only by slow 1705 * paths in userfault. 1706 */ 1707 bool vma_is_shmem(const struct vm_area_struct *vma); 1708 bool vma_is_anon_shmem(const struct vm_area_struct *vma); 1709 #else 1710 static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } 1711 static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } 1712 #endif 1713 1714 int vma_is_stack_for_current(const struct vm_area_struct *vma); 1715 1716 /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ 1717 #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } 1718 1719 struct mmu_gather; 1720 struct inode; 1721 1722 extern void prep_compound_page(struct page *page, unsigned int order); 1723 1724 static inline unsigned int folio_large_order(const struct folio *folio) 1725 { 1726 return folio->_flags_1 & 0xff; 1727 } 1728 1729 #ifdef NR_PAGES_IN_LARGE_FOLIO 1730 static inline unsigned long folio_large_nr_pages(const struct folio *folio) 1731 { 1732 return folio->_nr_pages; 1733 } 1734 #else 1735 static inline unsigned long folio_large_nr_pages(const struct folio *folio) 1736 { 1737 return 1L << folio_large_order(folio); 1738 } 1739 #endif 1740 1741 /* 1742 * compound_order() can be called without holding a reference, which means 1743 * that niceties like page_folio() don't work. These callers should be 1744 * prepared to handle wild return values. For example, PG_head may be 1745 * set before the order is initialised, or this may be a tail page. 
1746 * See compaction.c for some good examples. 1747 */ 1748 static inline unsigned int compound_order(const struct page *page) 1749 { 1750 const struct folio *folio = (struct folio *)page; 1751 1752 if (!test_bit(PG_head, &folio->flags.f)) 1753 return 0; 1754 return folio_large_order(folio); 1755 } 1756 1757 /** 1758 * folio_order - The allocation order of a folio. 1759 * @folio: The folio. 1760 * 1761 * A folio is composed of 2^order pages. See get_order() for the definition 1762 * of order. 1763 * 1764 * Return: The order of the folio. 1765 */ 1766 static inline unsigned int folio_order(const struct folio *folio) 1767 { 1768 if (!folio_test_large(folio)) 1769 return 0; 1770 return folio_large_order(folio); 1771 } 1772 1773 /** 1774 * folio_reset_order - Reset the folio order and derived _nr_pages 1775 * @folio: The folio. 1776 * 1777 * Reset the order and derived _nr_pages to 0. Must only be used in the 1778 * process of splitting large folios. 1779 */ 1780 static inline void folio_reset_order(struct folio *folio) 1781 { 1782 if (WARN_ON_ONCE(!folio_test_large(folio))) 1783 return; 1784 folio->_flags_1 &= ~0xffUL; 1785 #ifdef NR_PAGES_IN_LARGE_FOLIO 1786 folio->_nr_pages = 0; 1787 #endif 1788 } 1789 1790 #include <linux/huge_mm.h> 1791 1792 /* 1793 * Methods to modify the page usage count. 1794 * 1795 * What counts for a page usage: 1796 * - cache mapping (page->mapping) 1797 * - private data (page->private) 1798 * - page mapped in a task's page tables, each mapping 1799 * is counted separately 1800 * 1801 * Also, many kernel routines increase the page count before a critical 1802 * routine so they can be sure the page doesn't go away from under them. 
*/

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users).
 * With VM debugging enabled it is a bug to call this on a page whose refcount
 * is already zero.
 */
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
	return page_ref_dec_and_test(page);
}

/* Folio counterpart of put_page_testzero(), applied to &folio->page. */
static inline int folio_put_testzero(struct folio *folio)
{
	return put_page_testzero(&folio->page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 */
static inline bool get_page_unless_zero(struct page *page)
{
	return page_ref_add_unless_zero(page, 1);
}

/*
 * Take a reference on @page unless its refcount is zero and return it viewed
 * as a folio; returns NULL if no reference could be taken.
 */
static inline struct folio *folio_get_nontail_page(struct page *page)
{
	if (unlikely(!get_page_unless_zero(page)))
		return NULL;
	return (struct folio *)page;
}

extern int page_is_ram(unsigned long pfn);

/* Result codes; presumably those returned by region_intersects() -- confirm. */
enum {
	REGION_INTERSECTS,
	REGION_DISJOINT,
	REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
		      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
1857 */ 1858 #ifdef CONFIG_MMU 1859 extern bool is_vmalloc_addr(const void *x); 1860 extern int is_vmalloc_or_module_addr(const void *x); 1861 #else 1862 static inline bool is_vmalloc_addr(const void *x) 1863 { 1864 return false; 1865 } 1866 static inline int is_vmalloc_or_module_addr(const void *x) 1867 { 1868 return 0; 1869 } 1870 #endif 1871 1872 /* 1873 * How many times the entire folio is mapped as a single unit (eg by a 1874 * PMD or PUD entry). This is probably not what you want, except for 1875 * debugging purposes or implementation of other core folio_*() primitives. 1876 */ 1877 static inline int folio_entire_mapcount(const struct folio *folio) 1878 { 1879 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 1880 if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) 1881 return 0; 1882 return atomic_read(&folio->_entire_mapcount) + 1; 1883 } 1884 1885 static inline int folio_large_mapcount(const struct folio *folio) 1886 { 1887 VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); 1888 return atomic_read(&folio->_large_mapcount) + 1; 1889 } 1890 1891 /** 1892 * folio_mapcount() - Number of mappings of this folio. 1893 * @folio: The folio. 1894 * 1895 * The folio mapcount corresponds to the number of present user page table 1896 * entries that reference any part of a folio. Each such present user page 1897 * table entry must be paired with exactly on folio reference. 1898 * 1899 * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts 1900 * exactly once. 1901 * 1902 * For hugetlb folios, each abstracted "hugetlb" user page table entry that 1903 * references the entire folio counts exactly once, even when such special 1904 * page table entries are comprised of multiple ordinary page table entries. 1905 * 1906 * Will report 0 for pages which cannot be mapped into userspace, such as 1907 * slab, page tables and similar. 1908 * 1909 * Return: The number of times this folio is mapped. 
1910 */ 1911 static inline int folio_mapcount(const struct folio *folio) 1912 { 1913 int mapcount; 1914 1915 if (likely(!folio_test_large(folio))) { 1916 mapcount = atomic_read(&folio->_mapcount) + 1; 1917 if (page_mapcount_is_type(mapcount)) 1918 mapcount = 0; 1919 return mapcount; 1920 } 1921 return folio_large_mapcount(folio); 1922 } 1923 1924 /** 1925 * folio_mapped - Is this folio mapped into userspace? 1926 * @folio: The folio. 1927 * 1928 * Return: True if any page in this folio is referenced by user page tables. 1929 */ 1930 static inline bool folio_mapped(const struct folio *folio) 1931 { 1932 return folio_mapcount(folio) >= 1; 1933 } 1934 1935 static inline struct page *virt_to_head_page(const void *x) 1936 { 1937 struct page *page = virt_to_page(x); 1938 1939 return compound_head(page); 1940 } 1941 1942 static inline struct folio *virt_to_folio(const void *x) 1943 { 1944 struct page *page = virt_to_page(x); 1945 1946 return page_folio(page); 1947 } 1948 1949 void __folio_put(struct folio *folio); 1950 1951 void split_page(struct page *page, unsigned int order); 1952 void folio_copy(struct folio *dst, struct folio *src); 1953 int folio_mc_copy(struct folio *dst, struct folio *src); 1954 1955 unsigned long nr_free_buffer_pages(void); 1956 1957 /* Returns the number of bytes in this potentially compound page. */ 1958 static inline unsigned long page_size(const struct page *page) 1959 { 1960 return PAGE_SIZE << compound_order(page); 1961 } 1962 1963 /* Returns the number of bits needed for the number of bytes in a page */ 1964 static inline unsigned int page_shift(struct page *page) 1965 { 1966 return PAGE_SHIFT + compound_order(page); 1967 } 1968 1969 /** 1970 * thp_order - Order of a transparent huge page. 1971 * @page: Head page of a transparent huge page. 
1972 */ 1973 static inline unsigned int thp_order(struct page *page) 1974 { 1975 VM_BUG_ON_PGFLAGS(PageTail(page), page); 1976 return compound_order(page); 1977 } 1978 1979 /** 1980 * thp_size - Size of a transparent huge page. 1981 * @page: Head page of a transparent huge page. 1982 * 1983 * Return: Number of bytes in this page. 1984 */ 1985 static inline unsigned long thp_size(struct page *page) 1986 { 1987 return PAGE_SIZE << thp_order(page); 1988 } 1989 1990 #ifdef CONFIG_MMU 1991 /* 1992 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1993 * servicing faults for write access. In the normal case, do always want 1994 * pte_mkwrite. But get_user_pages can cause write faults for mappings 1995 * that do not have writing enabled, when used by access_process_vm. 1996 */ 1997 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) 1998 { 1999 if (likely(vma->vm_flags & VM_WRITE)) 2000 pte = pte_mkwrite(pte, vma); 2001 return pte; 2002 } 2003 2004 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); 2005 void set_pte_range(struct vm_fault *vmf, struct folio *folio, 2006 struct page *page, unsigned int nr, unsigned long addr); 2007 2008 vm_fault_t finish_fault(struct vm_fault *vmf); 2009 #endif 2010 2011 /* 2012 * Multiple processes may "see" the same page. E.g. for untouched 2013 * mappings of /dev/null, all processes see the same page full of 2014 * zeroes, and text pages of executables and shared libraries have 2015 * only one copy in memory, at most, normally. 2016 * 2017 * For the non-reserved pages, page_count(page) denotes a reference count. 2018 * page_count() == 0 means the page is free. page->lru is then used for 2019 * freelist management in the buddy allocator. 2020 * page_count() > 0 means the page has been allocated. 2021 * 2022 * Pages are allocated by the slab allocator in order to provide memory 2023 * to kmalloc and kmem_cache_alloc. 
* In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A folio may belong to an inode's memory mapping. In this case,
 * folio->mapping points to the inode, and folio->index is the file
 * offset of the folio, in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Then each user mapping also has a reference to the page.
2056 * 2057 * The pagecache pages are stored in a per-mapping radix tree, which is 2058 * rooted at mapping->i_pages, and indexed by offset. 2059 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space 2060 * lists, we instead now tag pages as dirty/writeback in the radix tree. 2061 * 2062 * All pagecache pages may be subject to I/O: 2063 * - inode pages may need to be read from disk, 2064 * - inode pages which have been modified and are MAP_SHARED may need 2065 * to be written back to the inode on disk, 2066 * - anonymous pages (including MAP_PRIVATE file mappings) which have been 2067 * modified may need to be swapped out to swap space and (later) to be read 2068 * back into memory. 2069 */ 2070 2071 /* 127: arbitrary random number, small enough to assemble well */ 2072 #define folio_ref_zero_or_close_to_overflow(folio) \ 2073 ((unsigned int) folio_ref_count(folio) + 127u <= 127u) 2074 2075 /** 2076 * folio_get - Increment the reference count on a folio. 2077 * @folio: The folio. 2078 * 2079 * Context: May be called in any context, as long as you know that 2080 * you have a refcount on the folio. If you do not already have one, 2081 * folio_try_get() may be the right interface for you to use. 2082 */ 2083 static inline void folio_get(struct folio *folio) 2084 { 2085 VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); 2086 folio_ref_inc(folio); 2087 } 2088 2089 static inline void get_page(struct page *page) 2090 { 2091 struct folio *folio = page_folio(page); 2092 if (WARN_ON_ONCE(folio_test_slab(folio))) 2093 return; 2094 if (WARN_ON_ONCE(folio_test_large_kmalloc(folio))) 2095 return; 2096 folio_get(folio); 2097 } 2098 2099 static inline __must_check bool try_get_page(struct page *page) 2100 { 2101 page = compound_head(page); 2102 if (WARN_ON_ONCE(page_ref_count(page) <= 0)) 2103 return false; 2104 page_ref_inc(page); 2105 return true; 2106 } 2107 2108 /** 2109 * folio_put - Decrement the reference count on a folio. 
2110 * @folio: The folio. 2111 * 2112 * If the folio's reference count reaches zero, the memory will be 2113 * released back to the page allocator and may be used by another 2114 * allocation immediately. Do not access the memory or the struct folio 2115 * after calling folio_put() unless you can be sure that it wasn't the 2116 * last reference. 2117 * 2118 * Context: May be called in process or interrupt context, but not in NMI 2119 * context. May be called while holding a spinlock. 2120 */ 2121 static inline void folio_put(struct folio *folio) 2122 { 2123 if (folio_put_testzero(folio)) 2124 __folio_put(folio); 2125 } 2126 2127 /** 2128 * folio_put_refs - Reduce the reference count on a folio. 2129 * @folio: The folio. 2130 * @refs: The amount to subtract from the folio's reference count. 2131 * 2132 * If the folio's reference count reaches zero, the memory will be 2133 * released back to the page allocator and may be used by another 2134 * allocation immediately. Do not access the memory or the struct folio 2135 * after calling folio_put_refs() unless you can be sure that these weren't 2136 * the last references. 2137 * 2138 * Context: May be called in process or interrupt context, but not in NMI 2139 * context. May be called while holding a spinlock. 2140 */ 2141 static inline void folio_put_refs(struct folio *folio, int refs) 2142 { 2143 if (folio_ref_sub_and_test(folio, refs)) 2144 __folio_put(folio); 2145 } 2146 2147 void folios_put_refs(struct folio_batch *folios, unsigned int *refs); 2148 2149 /* 2150 * union release_pages_arg - an array of pages or folios 2151 * 2152 * release_pages() releases a simple array of multiple pages, and 2153 * accepts various different forms of said page array: either 2154 * a regular old boring array of pages, an array of folios, or 2155 * an array of encoded page pointers. 2156 * 2157 * The transparent union syntax for this kind of "any of these 2158 * argument types" is all kinds of ugly, so look away. 
2159 */ 2160 typedef union { 2161 struct page **pages; 2162 struct folio **folios; 2163 struct encoded_page **encoded_pages; 2164 } release_pages_arg __attribute__ ((__transparent_union__)); 2165 2166 void release_pages(release_pages_arg, int nr); 2167 2168 /** 2169 * folios_put - Decrement the reference count on an array of folios. 2170 * @folios: The folios. 2171 * 2172 * Like folio_put(), but for a batch of folios. This is more efficient 2173 * than writing the loop yourself as it will optimise the locks which need 2174 * to be taken if the folios are freed. The folios batch is returned 2175 * empty and ready to be reused for another batch; there is no need to 2176 * reinitialise it. 2177 * 2178 * Context: May be called in process or interrupt context, but not in NMI 2179 * context. May be called while holding a spinlock. 2180 */ 2181 static inline void folios_put(struct folio_batch *folios) 2182 { 2183 folios_put_refs(folios, NULL); 2184 } 2185 2186 static inline void put_page(struct page *page) 2187 { 2188 struct folio *folio = page_folio(page); 2189 2190 if (folio_test_slab(folio) || folio_test_large_kmalloc(folio)) 2191 return; 2192 2193 folio_put(folio); 2194 } 2195 2196 /* 2197 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload 2198 * the page's refcount so that two separate items are tracked: the original page 2199 * reference count, and also a new count of how many pin_user_pages() calls were 2200 * made against the page. ("gup-pinned" is another term for the latter). 2201 * 2202 * With this scheme, pin_user_pages() becomes special: such pages are marked as 2203 * distinct from normal pages. As such, the unpin_user_page() call (and its 2204 * variants) must be used in order to release gup-pinned pages. 
*
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), folio_mkclean() and
 * other calls that race to set up page table entries.
2225 */ 2226 #define GUP_PIN_COUNTING_BIAS (1U << 10) 2227 2228 void unpin_user_page(struct page *page); 2229 void unpin_folio(struct folio *folio); 2230 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, 2231 bool make_dirty); 2232 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, 2233 bool make_dirty); 2234 void unpin_user_pages(struct page **pages, unsigned long npages); 2235 void unpin_user_folio(struct folio *folio, unsigned long npages); 2236 void unpin_folios(struct folio **folios, unsigned long nfolios); 2237 2238 static inline bool is_cow_mapping(vm_flags_t flags) 2239 { 2240 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2241 } 2242 2243 static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) 2244 { 2245 const vma_flags_t *flags = &desc->vma_flags; 2246 2247 return vma_flags_test(flags, VMA_MAYWRITE_BIT) && 2248 !vma_flags_test(flags, VMA_SHARED_BIT); 2249 } 2250 2251 #ifndef CONFIG_MMU 2252 static inline bool is_nommu_shared_mapping(vm_flags_t flags) 2253 { 2254 /* 2255 * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected 2256 * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of 2257 * a file mapping. R/O MAP_PRIVATE mappings might still modify 2258 * underlying memory if ptrace is active, so this is only possible if 2259 * ptrace does not apply. Note that there is no mprotect() to upgrade 2260 * write permissions later. 2261 */ 2262 return flags & (VM_MAYSHARE | VM_MAYOVERLAY); 2263 } 2264 2265 static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) 2266 { 2267 return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); 2268 } 2269 #endif 2270 2271 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 2272 #define SECTION_IN_PAGE_FLAGS 2273 #endif 2274 2275 /* 2276 * The identification function is mainly used by the buddy allocator for 2277 * determining if two pages could be buddies. 
We are not really identifying 2278 * the zone since we could be using the section number id if we do not have 2279 * node id available in page flags. 2280 * We only guarantee that it will return the same value for two combinable 2281 * pages in a zone. 2282 */ 2283 static inline int page_zone_id(struct page *page) 2284 { 2285 return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; 2286 } 2287 2288 #ifdef NODE_NOT_IN_PAGE_FLAGS 2289 int memdesc_nid(memdesc_flags_t mdf); 2290 #else 2291 static inline int memdesc_nid(memdesc_flags_t mdf) 2292 { 2293 return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; 2294 } 2295 #endif 2296 2297 static inline int page_to_nid(const struct page *page) 2298 { 2299 return memdesc_nid(PF_POISONED_CHECK(page)->flags); 2300 } 2301 2302 static inline int folio_nid(const struct folio *folio) 2303 { 2304 return memdesc_nid(folio->flags); 2305 } 2306 2307 #ifdef CONFIG_NUMA_BALANCING 2308 /* page access time bits needs to hold at least 4 seconds */ 2309 #define PAGE_ACCESS_TIME_MIN_BITS 12 2310 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS 2311 #define PAGE_ACCESS_TIME_BUCKETS \ 2312 (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) 2313 #else 2314 #define PAGE_ACCESS_TIME_BUCKETS 0 2315 #endif 2316 2317 #define PAGE_ACCESS_TIME_MASK \ 2318 (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) 2319 2320 static inline int cpu_pid_to_cpupid(int cpu, int pid) 2321 { 2322 return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); 2323 } 2324 2325 static inline int cpupid_to_pid(int cpupid) 2326 { 2327 return cpupid & LAST__PID_MASK; 2328 } 2329 2330 static inline int cpupid_to_cpu(int cpupid) 2331 { 2332 return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; 2333 } 2334 2335 static inline int cpupid_to_nid(int cpupid) 2336 { 2337 return cpu_to_node(cpupid_to_cpu(cpupid)); 2338 } 2339 2340 static inline bool cpupid_pid_unset(int cpupid) 2341 { 2342 return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); 2343 } 2344 2345 static inline bool 
cpupid_cpu_unset(int cpupid) 2346 { 2347 return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); 2348 } 2349 2350 static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) 2351 { 2352 return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); 2353 } 2354 2355 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) 2356 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 2357 static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) 2358 { 2359 return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); 2360 } 2361 2362 static inline int folio_last_cpupid(struct folio *folio) 2363 { 2364 return folio->_last_cpupid; 2365 } 2366 static inline void page_cpupid_reset_last(struct page *page) 2367 { 2368 page->_last_cpupid = -1 & LAST_CPUPID_MASK; 2369 } 2370 #else 2371 static inline int folio_last_cpupid(struct folio *folio) 2372 { 2373 return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; 2374 } 2375 2376 int folio_xchg_last_cpupid(struct folio *folio, int cpupid); 2377 2378 static inline void page_cpupid_reset_last(struct page *page) 2379 { 2380 page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; 2381 } 2382 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ 2383 2384 static inline int folio_xchg_access_time(struct folio *folio, int time) 2385 { 2386 int last_time; 2387 2388 last_time = folio_xchg_last_cpupid(folio, 2389 time >> PAGE_ACCESS_TIME_BUCKETS); 2390 return last_time << PAGE_ACCESS_TIME_BUCKETS; 2391 } 2392 2393 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) 2394 { 2395 unsigned int pid_bit; 2396 2397 pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); 2398 if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { 2399 __set_bit(pid_bit, &vma->numab_state->pids_active[1]); 2400 } 2401 } 2402 2403 bool folio_use_access_time(struct folio *folio); 2404 #else /* !CONFIG_NUMA_BALANCING */ 2405 static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) 2406 { 
2407 return folio_nid(folio); /* XXX */ 2408 } 2409 2410 static inline int folio_xchg_access_time(struct folio *folio, int time) 2411 { 2412 return 0; 2413 } 2414 2415 static inline int folio_last_cpupid(struct folio *folio) 2416 { 2417 return folio_nid(folio); /* XXX */ 2418 } 2419 2420 static inline int cpupid_to_nid(int cpupid) 2421 { 2422 return -1; 2423 } 2424 2425 static inline int cpupid_to_pid(int cpupid) 2426 { 2427 return -1; 2428 } 2429 2430 static inline int cpupid_to_cpu(int cpupid) 2431 { 2432 return -1; 2433 } 2434 2435 static inline int cpu_pid_to_cpupid(int nid, int pid) 2436 { 2437 return -1; 2438 } 2439 2440 static inline bool cpupid_pid_unset(int cpupid) 2441 { 2442 return true; 2443 } 2444 2445 static inline void page_cpupid_reset_last(struct page *page) 2446 { 2447 } 2448 2449 static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) 2450 { 2451 return false; 2452 } 2453 2454 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) 2455 { 2456 } 2457 static inline bool folio_use_access_time(struct folio *folio) 2458 { 2459 return false; 2460 } 2461 #endif /* CONFIG_NUMA_BALANCING */ 2462 2463 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) 2464 2465 /* 2466 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid 2467 * setting tags for all pages to native kernel tag value 0xff, as the default 2468 * value 0x00 maps to 0xff. 
2469 */ 2470 2471 static inline u8 page_kasan_tag(const struct page *page) 2472 { 2473 u8 tag = KASAN_TAG_KERNEL; 2474 2475 if (kasan_enabled()) { 2476 tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; 2477 tag ^= 0xff; 2478 } 2479 2480 return tag; 2481 } 2482 2483 static inline void page_kasan_tag_set(struct page *page, u8 tag) 2484 { 2485 unsigned long old_flags, flags; 2486 2487 if (!kasan_enabled()) 2488 return; 2489 2490 tag ^= 0xff; 2491 old_flags = READ_ONCE(page->flags.f); 2492 do { 2493 flags = old_flags; 2494 flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); 2495 flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; 2496 } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); 2497 } 2498 2499 static inline void page_kasan_tag_reset(struct page *page) 2500 { 2501 if (kasan_enabled()) 2502 page_kasan_tag_set(page, KASAN_TAG_KERNEL); 2503 } 2504 2505 #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ 2506 2507 static inline u8 page_kasan_tag(const struct page *page) 2508 { 2509 return 0xff; 2510 } 2511 2512 static inline void page_kasan_tag_set(struct page *page, u8 tag) { } 2513 static inline void page_kasan_tag_reset(struct page *page) { } 2514 2515 #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ 2516 2517 static inline struct zone *page_zone(const struct page *page) 2518 { 2519 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; 2520 } 2521 2522 static inline pg_data_t *page_pgdat(const struct page *page) 2523 { 2524 return NODE_DATA(page_to_nid(page)); 2525 } 2526 2527 static inline pg_data_t *folio_pgdat(const struct folio *folio) 2528 { 2529 return NODE_DATA(folio_nid(folio)); 2530 } 2531 2532 static inline struct zone *folio_zone(const struct folio *folio) 2533 { 2534 return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; 2535 } 2536 2537 #ifdef SECTION_IN_PAGE_FLAGS 2538 static inline void set_page_section(struct page *page, unsigned long section) 2539 { 2540 page->flags.f &= 
~(SECTIONS_MASK << SECTIONS_PGSHIFT); 2541 page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; 2542 } 2543 2544 static inline unsigned long memdesc_section(memdesc_flags_t mdf) 2545 { 2546 return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; 2547 } 2548 #else /* !SECTION_IN_PAGE_FLAGS */ 2549 static inline unsigned long memdesc_section(memdesc_flags_t mdf) 2550 { 2551 return 0; 2552 } 2553 #endif /* SECTION_IN_PAGE_FLAGS */ 2554 2555 /** 2556 * folio_pfn - Return the Page Frame Number of a folio. 2557 * @folio: The folio. 2558 * 2559 * A folio may contain multiple pages. The pages have consecutive 2560 * Page Frame Numbers. 2561 * 2562 * Return: The Page Frame Number of the first page in the folio. 2563 */ 2564 static inline unsigned long folio_pfn(const struct folio *folio) 2565 { 2566 return page_to_pfn(&folio->page); 2567 } 2568 2569 static inline struct folio *pfn_folio(unsigned long pfn) 2570 { 2571 return page_folio(pfn_to_page(pfn)); 2572 } 2573 2574 #ifdef CONFIG_MMU 2575 static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) 2576 { 2577 return pfn_pte(page_to_pfn(page), pgprot); 2578 } 2579 2580 /** 2581 * folio_mk_pte - Create a PTE for this folio 2582 * @folio: The folio to create a PTE for 2583 * @pgprot: The page protection bits to use 2584 * 2585 * Create a page table entry for the first page of this folio. 2586 * This is suitable for passing to set_ptes(). 2587 * 2588 * Return: A page table entry suitable for mapping this folio. 2589 */ 2590 static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) 2591 { 2592 return pfn_pte(folio_pfn(folio), pgprot); 2593 } 2594 2595 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2596 /** 2597 * folio_mk_pmd - Create a PMD for this folio 2598 * @folio: The folio to create a PMD for 2599 * @pgprot: The page protection bits to use 2600 * 2601 * Create a page table entry for the first page of this folio. 2602 * This is suitable for passing to set_pmd_at(). 
2603 * 2604 * Return: A page table entry suitable for mapping this folio. 2605 */ 2606 static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) 2607 { 2608 return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); 2609 } 2610 2611 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2612 /** 2613 * folio_mk_pud - Create a PUD for this folio 2614 * @folio: The folio to create a PUD for 2615 * @pgprot: The page protection bits to use 2616 * 2617 * Create a page table entry for the first page of this folio. 2618 * This is suitable for passing to set_pud_at(). 2619 * 2620 * Return: A page table entry suitable for mapping this folio. 2621 */ 2622 static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) 2623 { 2624 return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); 2625 } 2626 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2627 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2628 #endif /* CONFIG_MMU */ 2629 2630 static inline bool folio_has_pincount(const struct folio *folio) 2631 { 2632 if (IS_ENABLED(CONFIG_64BIT)) 2633 return folio_test_large(folio); 2634 return folio_order(folio) > 1; 2635 } 2636 2637 /** 2638 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. 2639 * @folio: The folio. 2640 * 2641 * This function checks if a folio has been pinned via a call to 2642 * a function in the pin_user_pages() family. 2643 * 2644 * For small folios, the return value is partially fuzzy: false is not fuzzy, 2645 * because it means "definitely not pinned for DMA", but true means "probably 2646 * pinned for DMA, but possibly a false positive due to having at least 2647 * GUP_PIN_COUNTING_BIAS worth of normal folio references". 2648 * 2649 * False positives are OK, because: a) it's unlikely for a folio to 2650 * get that many refcounts, and b) all the callers of this routine are 2651 * expected to be able to deal gracefully with a false positive. 2652 * 2653 * For most large folios, the result will be exactly correct. 
That's because 2654 * we have more tracking data available: the _pincount field is used 2655 * instead of the GUP_PIN_COUNTING_BIAS scheme. 2656 * 2657 * For more information, please see Documentation/core-api/pin_user_pages.rst. 2658 * 2659 * Return: True, if it is likely that the folio has been "dma-pinned". 2660 * False, if the folio is definitely not dma-pinned. 2661 */ 2662 static inline bool folio_maybe_dma_pinned(struct folio *folio) 2663 { 2664 if (folio_has_pincount(folio)) 2665 return atomic_read(&folio->_pincount) > 0; 2666 2667 /* 2668 * folio_ref_count() is signed. If that refcount overflows, then 2669 * folio_ref_count() returns a negative value, and callers will avoid 2670 * further incrementing the refcount. 2671 * 2672 * Here, for that overflow case, use the sign bit to count a little 2673 * bit higher via unsigned math, and thus still get an accurate result. 2674 */ 2675 return ((unsigned int)folio_ref_count(folio)) >= 2676 GUP_PIN_COUNTING_BIAS; 2677 } 2678 2679 /* 2680 * This should most likely only be called during fork() to see whether we 2681 * should break the cow immediately for an anon page on the src mm. 2682 * 2683 * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. 2684 */ 2685 static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, 2686 struct folio *folio) 2687 { 2688 VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); 2689 2690 if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) 2691 return false; 2692 2693 return folio_maybe_dma_pinned(folio); 2694 } 2695 2696 /** 2697 * is_zero_page - Query if a page is a zero page 2698 * @page: The page to query 2699 * 2700 * This returns true if @page is one of the permanent zero pages. 
2701 */ 2702 static inline bool is_zero_page(const struct page *page) 2703 { 2704 return is_zero_pfn(page_to_pfn(page)); 2705 } 2706 2707 /** 2708 * is_zero_folio - Query if a folio is a zero page 2709 * @folio: The folio to query 2710 * 2711 * This returns true if @folio is one of the permanent zero pages. 2712 */ 2713 static inline bool is_zero_folio(const struct folio *folio) 2714 { 2715 return is_zero_page(&folio->page); 2716 } 2717 2718 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ 2719 #ifdef CONFIG_MIGRATION 2720 static inline bool folio_is_longterm_pinnable(struct folio *folio) 2721 { 2722 #ifdef CONFIG_CMA 2723 int mt = folio_migratetype(folio); 2724 2725 if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) 2726 return false; 2727 #endif 2728 /* The zero page can be "pinned" but gets special handling. */ 2729 if (is_zero_folio(folio)) 2730 return true; 2731 2732 /* Coherent device memory must always allow eviction. */ 2733 if (folio_is_device_coherent(folio)) 2734 return false; 2735 2736 /* 2737 * Filesystems can only tolerate transient delays to truncate and 2738 * hole-punch operations 2739 */ 2740 if (folio_is_fsdax(folio)) 2741 return false; 2742 2743 /* Otherwise, non-movable zone folios can be pinned. 
*/ 2744 return !folio_is_zone_movable(folio); 2745 2746 } 2747 #else 2748 static inline bool folio_is_longterm_pinnable(struct folio *folio) 2749 { 2750 return true; 2751 } 2752 #endif 2753 2754 static inline void set_page_zone(struct page *page, enum zone_type zone) 2755 { 2756 page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); 2757 page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; 2758 } 2759 2760 static inline void set_page_node(struct page *page, unsigned long node) 2761 { 2762 page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); 2763 page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; 2764 } 2765 2766 static inline void set_page_links(struct page *page, enum zone_type zone, 2767 unsigned long node, unsigned long pfn) 2768 { 2769 set_page_zone(page, zone); 2770 set_page_node(page, node); 2771 #ifdef SECTION_IN_PAGE_FLAGS 2772 set_page_section(page, pfn_to_section_nr(pfn)); 2773 #endif 2774 } 2775 2776 /** 2777 * folio_nr_pages - The number of pages in the folio. 2778 * @folio: The folio. 2779 * 2780 * Return: A positive power of two. 2781 */ 2782 static inline unsigned long folio_nr_pages(const struct folio *folio) 2783 { 2784 if (!folio_test_large(folio)) 2785 return 1; 2786 return folio_large_nr_pages(folio); 2787 } 2788 2789 /* 2790 * compound_nr() returns the number of pages in this potentially compound 2791 * page. compound_nr() can be called on a tail page, and is defined to 2792 * return 1 in that case. 2793 */ 2794 static inline unsigned long compound_nr(const struct page *page) 2795 { 2796 const struct folio *folio = (struct folio *)page; 2797 2798 if (!test_bit(PG_head, &folio->flags.f)) 2799 return 1; 2800 return folio_large_nr_pages(folio); 2801 } 2802 2803 /** 2804 * folio_next - Move to the next physical folio. 2805 * @folio: The folio we're currently operating on. 2806 * 2807 * If you have physically contiguous memory which may span more than 2808 * one folio (eg a &struct bio_vec), use this function to move from one 2809 * folio to the next. 
Do not use it if the memory is only virtually 2810 * contiguous as the folios are almost certainly not adjacent to each 2811 * other. This is the folio equivalent to writing ``page++``. 2812 * 2813 * Context: We assume that the folios are refcounted and/or locked at a 2814 * higher level and do not adjust the reference counts. 2815 * Return: The next struct folio. 2816 */ 2817 static inline struct folio *folio_next(struct folio *folio) 2818 { 2819 return (struct folio *)folio_page(folio, folio_nr_pages(folio)); 2820 } 2821 2822 /** 2823 * folio_shift - The size of the memory described by this folio. 2824 * @folio: The folio. 2825 * 2826 * A folio represents a number of bytes which is a power-of-two in size. 2827 * This function tells you which power-of-two the folio is. See also 2828 * folio_size() and folio_order(). 2829 * 2830 * Context: The caller should have a reference on the folio to prevent 2831 * it from being split. It is not necessary for the folio to be locked. 2832 * Return: The base-2 logarithm of the size of this folio. 2833 */ 2834 static inline unsigned int folio_shift(const struct folio *folio) 2835 { 2836 return PAGE_SHIFT + folio_order(folio); 2837 } 2838 2839 /** 2840 * folio_size - The number of bytes in a folio. 2841 * @folio: The folio. 2842 * 2843 * Context: The caller should have a reference on the folio to prevent 2844 * it from being split. It is not necessary for the folio to be locked. 2845 * Return: The number of bytes in this folio. 2846 */ 2847 static inline size_t folio_size(const struct folio *folio) 2848 { 2849 return PAGE_SIZE << folio_order(folio); 2850 } 2851 2852 /** 2853 * folio_maybe_mapped_shared - Whether the folio is mapped into the page 2854 * tables of more than one MM 2855 * @folio: The folio. 2856 * 2857 * This function checks if the folio maybe currently mapped into more than one 2858 * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single 2859 * MM ("mapped exclusively"). 
2860 * 2861 * For KSM folios, this function also returns "mapped shared" when a folio is 2862 * mapped multiple times into the same MM, because the individual page mappings 2863 * are independent. 2864 * 2865 * For small anonymous folios and anonymous hugetlb folios, the return 2866 * value will be exactly correct: non-KSM folios can only be mapped at most once 2867 * into an MM, and they cannot be partially mapped. KSM folios are 2868 * considered shared even if mapped multiple times into the same MM. 2869 * 2870 * For other folios, the result can be fuzzy: 2871 * #. For partially-mappable large folios (THP), the return value can wrongly 2872 * indicate "mapped shared" (false positive) if a folio was mapped by 2873 * more than two MMs at one point in time. 2874 * #. For pagecache folios (including hugetlb), the return value can wrongly 2875 * indicate "mapped shared" (false positive) when two VMAs in the same MM 2876 * cover the same file range. 2877 * 2878 * Further, this function only considers current page table mappings that 2879 * are tracked using the folio mapcount(s). 2880 * 2881 * This function does not consider: 2882 * #. If the folio might get mapped in the (near) future (e.g., swapcache, 2883 * pagecache, temporary unmapping for migration). 2884 * #. If the folio is mapped differently (VM_PFNMAP). 2885 * #. If hugetlb page table sharing applies. Callers might want to check 2886 * hugetlb_pmd_shared(). 2887 * 2888 * Return: Whether the folio is estimated to be mapped into more than one MM. 2889 */ 2890 static inline bool folio_maybe_mapped_shared(struct folio *folio) 2891 { 2892 int mapcount = folio_mapcount(folio); 2893 2894 /* Only partially-mappable folios require more care. */ 2895 if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) 2896 return mapcount > 1; 2897 2898 /* 2899 * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... 
2900 * simply assume "mapped shared", nobody should really care 2901 * about this for arbitrary kernel allocations. 2902 */ 2903 if (!IS_ENABLED(CONFIG_MM_ID)) 2904 return true; 2905 2906 /* 2907 * A single mapping implies "mapped exclusively", even if the 2908 * folio flag says something different: it's easier to handle this 2909 * case here instead of on the RMAP hot path. 2910 */ 2911 if (mapcount <= 1) 2912 return false; 2913 return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); 2914 } 2915 2916 /** 2917 * folio_expected_ref_count - calculate the expected folio refcount 2918 * @folio: the folio 2919 * 2920 * Calculate the expected folio refcount, taking references from the pagecache, 2921 * swapcache, PG_private and page table mappings into account. Useful in 2922 * combination with folio_ref_count() to detect unexpected references (e.g., 2923 * GUP or other temporary references). 2924 * 2925 * Does currently not consider references from the LRU cache. If the folio 2926 * was isolated from the LRU (which is the case during migration or split), 2927 * the LRU cache does not apply. 2928 * 2929 * Calling this function on an unmapped folio -- !folio_mapped() -- that is 2930 * locked will return a stable result. 2931 * 2932 * Calling this function on a mapped folio will not result in a stable result, 2933 * because nothing stops additional page table mappings from coming (e.g., 2934 * fork()) or going (e.g., munmap()). 2935 * 2936 * Calling this function without the folio lock will also not result in a 2937 * stable result: for example, the folio might get dropped from the swapcache 2938 * concurrently. 2939 * 2940 * However, even when called without the folio lock or on a mapped folio, 2941 * this function can be used to detect unexpected references early (for example, 2942 * if it makes sense to even lock the folio and unmap it). 2943 * 2944 * The caller must add any reference (e.g., from folio_try_get()) it might be 2945 * holding itself to the result. 
2946 * 2947 * Returns: the expected folio refcount. 2948 */ 2949 static inline int folio_expected_ref_count(const struct folio *folio) 2950 { 2951 const int order = folio_order(folio); 2952 int ref_count = 0; 2953 2954 if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) 2955 return 0; 2956 2957 /* One reference per page from the swapcache. */ 2958 ref_count += folio_test_swapcache(folio) << order; 2959 2960 if (!folio_test_anon(folio)) { 2961 /* One reference per page from the pagecache. */ 2962 ref_count += !!folio->mapping << order; 2963 /* One reference from PG_private. */ 2964 ref_count += folio_test_private(folio); 2965 } 2966 2967 /* One reference per page table mapping. */ 2968 return ref_count + folio_mapcount(folio); 2969 } 2970 2971 #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE 2972 static inline int arch_make_folio_accessible(struct folio *folio) 2973 { 2974 return 0; 2975 } 2976 #endif 2977 2978 /* 2979 * Some inline functions in vmstat.h depend on page_zone() 2980 */ 2981 #include <linux/vmstat.h> 2982 2983 #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) 2984 #define HASHED_PAGE_VIRTUAL 2985 #endif 2986 2987 #if defined(WANT_PAGE_VIRTUAL) 2988 static inline void *page_address(const struct page *page) 2989 { 2990 return page->virtual; 2991 } 2992 static inline void set_page_address(struct page *page, void *address) 2993 { 2994 page->virtual = address; 2995 } 2996 #define page_address_init() do { } while(0) 2997 #endif 2998 2999 #if defined(HASHED_PAGE_VIRTUAL) 3000 void *page_address(const struct page *page); 3001 void set_page_address(struct page *page, void *virtual); 3002 void page_address_init(void); 3003 #endif 3004 3005 static __always_inline void *lowmem_page_address(const struct page *page) 3006 { 3007 return page_to_virt(page); 3008 } 3009 3010 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) 3011 #define page_address(page) lowmem_page_address(page) 3012 #define set_page_address(page, address) do { 
} while(0) 3013 #define page_address_init() do { } while(0) 3014 #endif 3015 3016 static inline void *folio_address(const struct folio *folio) 3017 { 3018 return page_address(&folio->page); 3019 } 3020 3021 /* 3022 * Return true only if the page has been allocated with 3023 * ALLOC_NO_WATERMARKS and the low watermark was not 3024 * met implying that the system is under some pressure. 3025 */ 3026 static inline bool page_is_pfmemalloc(const struct page *page) 3027 { 3028 /* 3029 * lru.next has bit 1 set if the page is allocated from the 3030 * pfmemalloc reserves. Callers may simply overwrite it if 3031 * they do not need to preserve that information. 3032 */ 3033 return (uintptr_t)page->lru.next & BIT(1); 3034 } 3035 3036 /* 3037 * Return true only if the folio has been allocated with 3038 * ALLOC_NO_WATERMARKS and the low watermark was not 3039 * met implying that the system is under some pressure. 3040 */ 3041 static inline bool folio_is_pfmemalloc(const struct folio *folio) 3042 { 3043 /* 3044 * lru.next has bit 1 set if the page is allocated from the 3045 * pfmemalloc reserves. Callers may simply overwrite it if 3046 * they do not need to preserve that information. 3047 */ 3048 return (uintptr_t)folio->lru.next & BIT(1); 3049 } 3050 3051 /* 3052 * Only to be called by the page allocator on a freshly allocated 3053 * page. 3054 */ 3055 static inline void set_page_pfmemalloc(struct page *page) 3056 { 3057 page->lru.next = (void *)BIT(1); 3058 } 3059 3060 static inline void clear_page_pfmemalloc(struct page *page) 3061 { 3062 page->lru.next = NULL; 3063 } 3064 3065 /* 3066 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 3067 */ 3068 extern void pagefault_out_of_memory(void); 3069 3070 #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) 3071 #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) 3072 3073 /* 3074 * Parameter block passed down to zap_pte_range in exceptional cases. 
3075 */ 3076 struct zap_details { 3077 struct folio *single_folio; /* Locked folio to be unmapped */ 3078 bool skip_cows; /* Do not zap COWed private pages */ 3079 bool reclaim_pt; /* Need reclaim page tables? */ 3080 bool reaping; /* Reaping, do not block. */ 3081 zap_flags_t zap_flags; /* Extra flags for zapping */ 3082 }; 3083 3084 /* 3085 * Whether to drop the pte markers, for example, the uffd-wp information for 3086 * file-backed memory. This should only be specified when we will completely 3087 * drop the page in the mm, either by truncation or unmapping of the vma. By 3088 * default, the flag is not set. 3089 */ 3090 #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) 3091 /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ 3092 #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) 3093 3094 #ifdef CONFIG_MMU 3095 extern bool can_do_mlock(void); 3096 #else 3097 static inline bool can_do_mlock(void) { return false; } 3098 #endif 3099 extern int user_shm_lock(size_t, struct ucounts *); 3100 extern void user_shm_unlock(size_t, struct ucounts *); 3101 3102 struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, 3103 pte_t pte); 3104 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 3105 pte_t pte); 3106 struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, 3107 unsigned long addr, pmd_t pmd); 3108 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, 3109 pmd_t pmd); 3110 struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, 3111 pud_t pud); 3112 3113 void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, 3114 unsigned long size); 3115 void zap_vma_range(struct vm_area_struct *vma, unsigned long address, 3116 unsigned long size); 3117 /** 3118 * zap_vma - zap all page table entries in a vma 3119 * @vma: The vma to zap. 
3120 */ 3121 static inline void zap_vma(struct vm_area_struct *vma) 3122 { 3123 zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); 3124 } 3125 struct mmu_notifier_range; 3126 3127 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 3128 unsigned long end, unsigned long floor, unsigned long ceiling); 3129 int 3130 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); 3131 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 3132 void *buf, int len, int write); 3133 3134 struct follow_pfnmap_args { 3135 /** 3136 * Inputs: 3137 * @vma: Pointer to @vm_area_struct struct 3138 * @address: the virtual address to walk 3139 */ 3140 struct vm_area_struct *vma; 3141 unsigned long address; 3142 /** 3143 * Internals: 3144 * 3145 * The caller shouldn't touch any of these. 3146 */ 3147 spinlock_t *lock; 3148 pte_t *ptep; 3149 /** 3150 * Outputs: 3151 * 3152 * @pfn: the PFN of the address 3153 * @addr_mask: address mask covering pfn 3154 * @pgprot: the pgprot_t of the mapping 3155 * @writable: whether the mapping is writable 3156 * @special: whether the mapping is a special mapping (real PFN maps) 3157 */ 3158 unsigned long pfn; 3159 unsigned long addr_mask; 3160 pgprot_t pgprot; 3161 bool writable; 3162 bool special; 3163 }; 3164 int follow_pfnmap_start(struct follow_pfnmap_args *args); 3165 void follow_pfnmap_end(struct follow_pfnmap_args *args); 3166 3167 extern void truncate_pagecache(struct inode *inode, loff_t new); 3168 extern void truncate_setsize(struct inode *inode, loff_t newsize); 3169 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); 3170 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); 3171 int generic_error_remove_folio(struct address_space *mapping, 3172 struct folio *folio); 3173 3174 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, 3175 unsigned long address, struct pt_regs *regs); 3176 3177 #ifdef CONFIG_MMU 3178 extern 
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, 3179 unsigned long address, unsigned int flags, 3180 struct pt_regs *regs); 3181 extern int fixup_user_fault(struct mm_struct *mm, 3182 unsigned long address, unsigned int fault_flags, 3183 bool *unlocked); 3184 void unmap_mapping_pages(struct address_space *mapping, 3185 pgoff_t start, pgoff_t nr, bool even_cows); 3186 void unmap_mapping_range(struct address_space *mapping, 3187 loff_t const holebegin, loff_t const holelen, int even_cows); 3188 #else 3189 static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, 3190 unsigned long address, unsigned int flags, 3191 struct pt_regs *regs) 3192 { 3193 /* should never happen if there's no MMU */ 3194 BUG(); 3195 return VM_FAULT_SIGBUS; 3196 } 3197 static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, 3198 unsigned int fault_flags, bool *unlocked) 3199 { 3200 /* should never happen if there's no MMU */ 3201 BUG(); 3202 return -EFAULT; 3203 } 3204 static inline void unmap_mapping_pages(struct address_space *mapping, 3205 pgoff_t start, pgoff_t nr, bool even_cows) { } 3206 static inline void unmap_mapping_range(struct address_space *mapping, 3207 loff_t const holebegin, loff_t const holelen, int even_cows) { } 3208 #endif 3209 3210 static inline void unmap_shared_mapping_range(struct address_space *mapping, 3211 loff_t const holebegin, loff_t const holelen) 3212 { 3213 unmap_mapping_range(mapping, holebegin, holelen, 0); 3214 } 3215 3216 static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, 3217 unsigned long addr); 3218 3219 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, 3220 void *buf, int len, unsigned int gup_flags); 3221 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, 3222 void *buf, int len, unsigned int gup_flags); 3223 3224 #ifdef CONFIG_BPF_SYSCALL 3225 extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, 3226 void *buf, int len, 
unsigned int gup_flags); 3227 #endif 3228 3229 long get_user_pages_remote(struct mm_struct *mm, 3230 unsigned long start, unsigned long nr_pages, 3231 unsigned int gup_flags, struct page **pages, 3232 int *locked); 3233 long pin_user_pages_remote(struct mm_struct *mm, 3234 unsigned long start, unsigned long nr_pages, 3235 unsigned int gup_flags, struct page **pages, 3236 int *locked); 3237 3238 /* 3239 * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. 3240 */ 3241 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, 3242 unsigned long addr, 3243 int gup_flags, 3244 struct vm_area_struct **vmap) 3245 { 3246 struct page *page; 3247 struct vm_area_struct *vma; 3248 int got; 3249 3250 if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) 3251 return ERR_PTR(-EINVAL); 3252 3253 got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); 3254 3255 if (got < 0) 3256 return ERR_PTR(got); 3257 3258 vma = vma_lookup(mm, addr); 3259 if (WARN_ON_ONCE(!vma)) { 3260 put_page(page); 3261 return ERR_PTR(-EINVAL); 3262 } 3263 3264 *vmap = vma; 3265 return page; 3266 } 3267 3268 long get_user_pages(unsigned long start, unsigned long nr_pages, 3269 unsigned int gup_flags, struct page **pages); 3270 long pin_user_pages(unsigned long start, unsigned long nr_pages, 3271 unsigned int gup_flags, struct page **pages); 3272 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 3273 struct page **pages, unsigned int gup_flags); 3274 long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 3275 struct page **pages, unsigned int gup_flags); 3276 long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, 3277 struct folio **folios, unsigned int max_folios, 3278 pgoff_t *offset); 3279 int folio_add_pins(struct folio *folio, unsigned int pins); 3280 3281 int get_user_pages_fast(unsigned long start, int nr_pages, 3282 unsigned int gup_flags, struct page **pages); 3283 int pin_user_pages_fast(unsigned 
long start, int nr_pages, 3284 unsigned int gup_flags, struct page **pages); 3285 void folio_add_pin(struct folio *folio); 3286 3287 int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); 3288 int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, 3289 const struct task_struct *task, bool bypass_rlim); 3290 3291 struct kvec; 3292 struct page *get_dump_page(unsigned long addr, int *locked); 3293 3294 bool folio_mark_dirty(struct folio *folio); 3295 bool folio_mark_dirty_lock(struct folio *folio); 3296 bool set_page_dirty(struct page *page); 3297 int set_page_dirty_lock(struct page *page); 3298 3299 int get_cmdline(struct task_struct *task, char *buffer, int buflen); 3300 3301 /* 3302 * Flags used by change_protection(). For now we make it a bitmap so 3303 * that we can pass in multiple flags just like parameters. However 3304 * for now all the callers are only use one of the flags at the same 3305 * time. 3306 */ 3307 /* 3308 * Whether we should manually check if we can map individual PTEs writable, 3309 * because something (e.g., COW, uffd-wp) blocks that from happening for all 3310 * PTEs automatically in a writable mapping. 
3311 */ 3312 #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) 3313 /* Whether this protection change is for NUMA hints */ 3314 #define MM_CP_PROT_NUMA (1UL << 1) 3315 /* Whether this change is for write protecting */ 3316 #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ 3317 #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ 3318 #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ 3319 MM_CP_UFFD_WP_RESOLVE) 3320 3321 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, 3322 pte_t pte); 3323 extern long change_protection(struct mmu_gather *tlb, 3324 struct vm_area_struct *vma, unsigned long start, 3325 unsigned long end, unsigned long cp_flags); 3326 extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, 3327 struct vm_area_struct *vma, struct vm_area_struct **pprev, 3328 unsigned long start, unsigned long end, vm_flags_t newflags); 3329 3330 /* 3331 * doesn't attempt to fault and will return short. 3332 */ 3333 int get_user_pages_fast_only(unsigned long start, int nr_pages, 3334 unsigned int gup_flags, struct page **pages); 3335 3336 static inline bool get_user_page_fast_only(unsigned long addr, 3337 unsigned int gup_flags, struct page **pagep) 3338 { 3339 return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; 3340 } 3341 /* 3342 * per-process(per-mm_struct) statistics. 
3343 */ 3344 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) 3345 { 3346 return percpu_counter_read_positive(&mm->rss_stat[member]); 3347 } 3348 3349 static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member) 3350 { 3351 return percpu_counter_sum_positive(&mm->rss_stat[member]); 3352 } 3353 3354 void mm_trace_rss_stat(struct mm_struct *mm, int member); 3355 3356 static inline void add_mm_counter(struct mm_struct *mm, int member, long value) 3357 { 3358 percpu_counter_add(&mm->rss_stat[member], value); 3359 3360 mm_trace_rss_stat(mm, member); 3361 } 3362 3363 static inline void inc_mm_counter(struct mm_struct *mm, int member) 3364 { 3365 percpu_counter_inc(&mm->rss_stat[member]); 3366 3367 mm_trace_rss_stat(mm, member); 3368 } 3369 3370 static inline void dec_mm_counter(struct mm_struct *mm, int member) 3371 { 3372 percpu_counter_dec(&mm->rss_stat[member]); 3373 3374 mm_trace_rss_stat(mm, member); 3375 } 3376 3377 /* Optimized variant when folio is already known not to be anon */ 3378 static inline int mm_counter_file(struct folio *folio) 3379 { 3380 if (folio_test_swapbacked(folio)) 3381 return MM_SHMEMPAGES; 3382 return MM_FILEPAGES; 3383 } 3384 3385 static inline int mm_counter(struct folio *folio) 3386 { 3387 if (folio_test_anon(folio)) 3388 return MM_ANONPAGES; 3389 return mm_counter_file(folio); 3390 } 3391 3392 static inline unsigned long get_mm_rss(struct mm_struct *mm) 3393 { 3394 return get_mm_counter(mm, MM_FILEPAGES) + 3395 get_mm_counter(mm, MM_ANONPAGES) + 3396 get_mm_counter(mm, MM_SHMEMPAGES); 3397 } 3398 3399 static inline unsigned long get_mm_rss_sum(struct mm_struct *mm) 3400 { 3401 return get_mm_counter_sum(mm, MM_FILEPAGES) + 3402 get_mm_counter_sum(mm, MM_ANONPAGES) + 3403 get_mm_counter_sum(mm, MM_SHMEMPAGES); 3404 } 3405 3406 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) 3407 { 3408 return max(mm->hiwater_rss, get_mm_rss(mm)); 3409 } 3410 3411 static inline unsigned 
long get_mm_hiwater_vm(struct mm_struct *mm) 3412 { 3413 return max(mm->hiwater_vm, mm->total_vm); 3414 } 3415 3416 static inline void update_hiwater_rss(struct mm_struct *mm) 3417 { 3418 unsigned long _rss = get_mm_rss(mm); 3419 3420 if (data_race(mm->hiwater_rss) < _rss) 3421 data_race(mm->hiwater_rss = _rss); 3422 } 3423 3424 static inline void update_hiwater_vm(struct mm_struct *mm) 3425 { 3426 if (mm->hiwater_vm < mm->total_vm) 3427 mm->hiwater_vm = mm->total_vm; 3428 } 3429 3430 static inline void reset_mm_hiwater_rss(struct mm_struct *mm) 3431 { 3432 mm->hiwater_rss = get_mm_rss(mm); 3433 } 3434 3435 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, 3436 struct mm_struct *mm) 3437 { 3438 unsigned long hiwater_rss = get_mm_hiwater_rss(mm); 3439 3440 if (*maxrss < hiwater_rss) 3441 *maxrss = hiwater_rss; 3442 } 3443 3444 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL 3445 static inline int pte_special(pte_t pte) 3446 { 3447 return 0; 3448 } 3449 3450 static inline pte_t pte_mkspecial(pte_t pte) 3451 { 3452 return pte; 3453 } 3454 #endif 3455 3456 #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP 3457 static inline bool pmd_special(pmd_t pmd) 3458 { 3459 return false; 3460 } 3461 3462 static inline pmd_t pmd_mkspecial(pmd_t pmd) 3463 { 3464 return pmd; 3465 } 3466 #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ 3467 3468 #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP 3469 static inline bool pud_special(pud_t pud) 3470 { 3471 return false; 3472 } 3473 3474 static inline pud_t pud_mkspecial(pud_t pud) 3475 { 3476 return pud; 3477 } 3478 #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ 3479 3480 extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 3481 spinlock_t **ptl); 3482 3483 #ifdef __PAGETABLE_P4D_FOLDED 3484 static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, 3485 unsigned long address) 3486 { 3487 return 0; 3488 } 3489 #else 3490 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); 3491 #endif 3492 3493 #if 
defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) 3494 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, 3495 unsigned long address) 3496 { 3497 return 0; 3498 } 3499 static inline void mm_inc_nr_puds(struct mm_struct *mm) {} 3500 static inline void mm_dec_nr_puds(struct mm_struct *mm) {} 3501 3502 #else 3503 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); 3504 3505 static inline void mm_inc_nr_puds(struct mm_struct *mm) 3506 { 3507 if (mm_pud_folded(mm)) 3508 return; 3509 atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); 3510 } 3511 3512 static inline void mm_dec_nr_puds(struct mm_struct *mm) 3513 { 3514 if (mm_pud_folded(mm)) 3515 return; 3516 atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); 3517 } 3518 #endif 3519 3520 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) 3521 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 3522 unsigned long address) 3523 { 3524 return 0; 3525 } 3526 3527 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} 3528 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} 3529 3530 #else 3531 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 3532 3533 static inline void mm_inc_nr_pmds(struct mm_struct *mm) 3534 { 3535 if (mm_pmd_folded(mm)) 3536 return; 3537 atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); 3538 } 3539 3540 static inline void mm_dec_nr_pmds(struct mm_struct *mm) 3541 { 3542 if (mm_pmd_folded(mm)) 3543 return; 3544 atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); 3545 } 3546 #endif 3547 3548 #ifdef CONFIG_MMU 3549 static inline void mm_pgtables_bytes_init(struct mm_struct *mm) 3550 { 3551 atomic_long_set(&mm->pgtables_bytes, 0); 3552 } 3553 3554 static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) 3555 { 3556 return atomic_long_read(&mm->pgtables_bytes); 3557 } 3558 3559 static inline void mm_inc_nr_ptes(struct mm_struct 
*mm) 3560 { 3561 atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); 3562 } 3563 3564 static inline void mm_dec_nr_ptes(struct mm_struct *mm) 3565 { 3566 atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); 3567 } 3568 #else 3569 3570 static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} 3571 static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) 3572 { 3573 return 0; 3574 } 3575 3576 static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} 3577 static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} 3578 #endif 3579 3580 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); 3581 int __pte_alloc_kernel(pmd_t *pmd); 3582 3583 #if defined(CONFIG_MMU) 3584 3585 static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, 3586 unsigned long address) 3587 { 3588 return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? 3589 NULL : p4d_offset(pgd, address); 3590 } 3591 3592 static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, 3593 unsigned long address) 3594 { 3595 return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? 3596 NULL : pud_offset(p4d, address); 3597 } 3598 3599 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3600 { 3601 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? 3602 NULL: pmd_offset(pud, address); 3603 } 3604 #endif /* CONFIG_MMU */ 3605 3606 enum pt_flags { 3607 PT_kernel = PG_referenced, 3608 PT_reserved = PG_reserved, 3609 /* High bits are used for zone/node/section */ 3610 }; 3611 3612 static inline struct ptdesc *virt_to_ptdesc(const void *x) 3613 { 3614 return page_ptdesc(virt_to_page(x)); 3615 } 3616 3617 /** 3618 * ptdesc_address - Virtual address of page table. 3619 * @pt: Page table descriptor. 3620 * 3621 * Return: The first byte of the page table described by @pt. 
3622 */ 3623 static inline void *ptdesc_address(const struct ptdesc *pt) 3624 { 3625 return folio_address(ptdesc_folio(pt)); 3626 } 3627 3628 static inline bool pagetable_is_reserved(struct ptdesc *pt) 3629 { 3630 return test_bit(PT_reserved, &pt->pt_flags.f); 3631 } 3632 3633 /** 3634 * ptdesc_set_kernel - Mark a ptdesc used to map the kernel 3635 * @ptdesc: The ptdesc to be marked 3636 * 3637 * Kernel page tables often need special handling. Set a flag so that 3638 * the handling code knows this ptdesc will not be used for userspace. 3639 */ 3640 static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) 3641 { 3642 set_bit(PT_kernel, &ptdesc->pt_flags.f); 3643 } 3644 3645 /** 3646 * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel 3647 * @ptdesc: The ptdesc to be unmarked 3648 * 3649 * Use when the ptdesc is no longer used to map the kernel and no longer 3650 * needs special handling. 3651 */ 3652 static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) 3653 { 3654 /* 3655 * Note: the 'PG_referenced' bit does not strictly need to be 3656 * cleared before freeing the page. But this is nice for 3657 * symmetry. 3658 */ 3659 clear_bit(PT_kernel, &ptdesc->pt_flags.f); 3660 } 3661 3662 /** 3663 * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel 3664 * @ptdesc: The ptdesc being tested 3665 * 3666 * Call to tell if the ptdesc used to map the kernel. 3667 */ 3668 static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) 3669 { 3670 return test_bit(PT_kernel, &ptdesc->pt_flags.f); 3671 } 3672 3673 /** 3674 * pagetable_alloc - Allocate pagetables 3675 * @gfp: GFP flags 3676 * @order: desired pagetable order 3677 * 3678 * pagetable_alloc allocates memory for page tables as well as a page table 3679 * descriptor to describe that memory. 3680 * 3681 * Return: The ptdesc describing the allocated page tables. 
3682 */ 3683 static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) 3684 { 3685 struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); 3686 3687 return page_ptdesc(page); 3688 } 3689 #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) 3690 3691 static inline void __pagetable_free(struct ptdesc *pt) 3692 { 3693 struct page *page = ptdesc_page(pt); 3694 3695 __free_pages(page, compound_order(page)); 3696 } 3697 3698 #ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE 3699 void pagetable_free_kernel(struct ptdesc *pt); 3700 #else 3701 static inline void pagetable_free_kernel(struct ptdesc *pt) 3702 { 3703 __pagetable_free(pt); 3704 } 3705 #endif 3706 /** 3707 * pagetable_free - Free pagetables 3708 * @pt: The page table descriptor 3709 * 3710 * pagetable_free frees the memory of all page tables described by a page 3711 * table descriptor and the memory for the descriptor itself. 3712 */ 3713 static inline void pagetable_free(struct ptdesc *pt) 3714 { 3715 if (ptdesc_test_kernel(pt)) { 3716 ptdesc_clear_kernel(pt); 3717 pagetable_free_kernel(pt); 3718 } else { 3719 __pagetable_free(pt); 3720 } 3721 } 3722 3723 #if defined(CONFIG_SPLIT_PTE_PTLOCKS) 3724 #if ALLOC_SPLIT_PTLOCKS 3725 void __init ptlock_cache_init(void); 3726 bool ptlock_alloc(struct ptdesc *ptdesc); 3727 void ptlock_free(struct ptdesc *ptdesc); 3728 3729 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) 3730 { 3731 return ptdesc->ptl; 3732 } 3733 #else /* ALLOC_SPLIT_PTLOCKS */ 3734 static inline void ptlock_cache_init(void) 3735 { 3736 } 3737 3738 static inline bool ptlock_alloc(struct ptdesc *ptdesc) 3739 { 3740 return true; 3741 } 3742 3743 static inline void ptlock_free(struct ptdesc *ptdesc) 3744 { 3745 } 3746 3747 static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) 3748 { 3749 return &ptdesc->ptl; 3750 } 3751 #endif /* ALLOC_SPLIT_PTLOCKS */ 3752 3753 static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) 3754 { 
3755 return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); 3756 } 3757 3758 static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) 3759 { 3760 BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); 3761 BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); 3762 return ptlock_ptr(virt_to_ptdesc(pte)); 3763 } 3764 3765 static inline bool ptlock_init(struct ptdesc *ptdesc) 3766 { 3767 /* 3768 * prep_new_page() initialize page->private (and therefore page->ptl) 3769 * with 0. Make sure nobody took it in use in between. 3770 * 3771 * It can happen if arch try to use slab for page table allocation: 3772 * slab code uses page->slab_cache, which share storage with page->ptl. 3773 */ 3774 VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); 3775 if (!ptlock_alloc(ptdesc)) 3776 return false; 3777 spin_lock_init(ptlock_ptr(ptdesc)); 3778 return true; 3779 } 3780 3781 #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ 3782 /* 3783 * We use mm->page_table_lock to guard all pagetable pages of the mm. 
3784 */ 3785 static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) 3786 { 3787 return &mm->page_table_lock; 3788 } 3789 static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) 3790 { 3791 return &mm->page_table_lock; 3792 } 3793 static inline void ptlock_cache_init(void) {} 3794 static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } 3795 static inline void ptlock_free(struct ptdesc *ptdesc) {} 3796 #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ 3797 3798 static inline void __pagetable_ctor(struct ptdesc *ptdesc) 3799 { 3800 struct folio *folio = ptdesc_folio(ptdesc); 3801 3802 __folio_set_pgtable(folio); 3803 lruvec_stat_add_folio(folio, NR_PAGETABLE); 3804 } 3805 3806 static inline void pagetable_dtor(struct ptdesc *ptdesc) 3807 { 3808 struct folio *folio = ptdesc_folio(ptdesc); 3809 3810 ptlock_free(ptdesc); 3811 __folio_clear_pgtable(folio); 3812 lruvec_stat_sub_folio(folio, NR_PAGETABLE); 3813 } 3814 3815 static inline void pagetable_dtor_free(struct ptdesc *ptdesc) 3816 { 3817 pagetable_dtor(ptdesc); 3818 pagetable_free(ptdesc); 3819 } 3820 3821 static inline bool pagetable_pte_ctor(struct mm_struct *mm, 3822 struct ptdesc *ptdesc) 3823 { 3824 if (mm != &init_mm && !ptlock_init(ptdesc)) 3825 return false; 3826 __pagetable_ctor(ptdesc); 3827 return true; 3828 } 3829 3830 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); 3831 3832 static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) 3833 { 3834 return __pte_offset_map(pmd, addr, NULL); 3835 } 3836 3837 pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, 3838 unsigned long addr, spinlock_t **ptlp); 3839 3840 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, 3841 unsigned long addr, spinlock_t **ptlp); 3842 pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, 3843 unsigned long addr, pmd_t *pmdvalp, 3844 spinlock_t **ptlp); 3845 3846 #define pte_unmap_unlock(pte, ptl) do { \ 3847 
spin_unlock(ptl); \ 3848 pte_unmap(pte); \ 3849 } while (0) 3850 3851 #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) 3852 3853 #define pte_alloc_map(mm, pmd, address) \ 3854 (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) 3855 3856 #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ 3857 (pte_alloc(mm, pmd) ? \ 3858 NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) 3859 3860 #define pte_alloc_kernel(pmd, address) \ 3861 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ 3862 NULL: pte_offset_kernel(pmd, address)) 3863 3864 #if defined(CONFIG_SPLIT_PMD_PTLOCKS) 3865 3866 static inline struct page *pmd_pgtable_page(pmd_t *pmd) 3867 { 3868 unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); 3869 return virt_to_page((void *)((unsigned long) pmd & mask)); 3870 } 3871 3872 static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) 3873 { 3874 return page_ptdesc(pmd_pgtable_page(pmd)); 3875 } 3876 3877 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) 3878 { 3879 return ptlock_ptr(pmd_ptdesc(pmd)); 3880 } 3881 3882 static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) 3883 { 3884 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3885 ptdesc->pmd_huge_pte = NULL; 3886 #endif 3887 return ptlock_init(ptdesc); 3888 } 3889 3890 #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) 3891 3892 #else 3893 3894 static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) 3895 { 3896 return &mm->page_table_lock; 3897 } 3898 3899 static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } 3900 3901 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) 3902 3903 #endif 3904 3905 static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) 3906 { 3907 spinlock_t *ptl = pmd_lockptr(mm, pmd); 3908 spin_lock(ptl); 3909 return ptl; 3910 } 3911 3912 static inline bool pagetable_pmd_ctor(struct mm_struct *mm, 3913 struct ptdesc *ptdesc) 3914 { 3915 if (mm != &init_mm && 
!pmd_ptlock_init(ptdesc)) 3916 return false; 3917 ptdesc_pmd_pts_init(ptdesc); 3918 __pagetable_ctor(ptdesc); 3919 return true; 3920 } 3921 3922 /* 3923 * No scalability reason to split PUD locks yet, but follow the same pattern 3924 * as the PMD locks to make it easier if we decide to. The VM should not be 3925 * considered ready to switch to split PUD locks yet; there may be places 3926 * which need to be converted from page_table_lock. 3927 */ 3928 static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) 3929 { 3930 return &mm->page_table_lock; 3931 } 3932 3933 static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) 3934 { 3935 spinlock_t *ptl = pud_lockptr(mm, pud); 3936 3937 spin_lock(ptl); 3938 return ptl; 3939 } 3940 3941 static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) 3942 { 3943 __pagetable_ctor(ptdesc); 3944 } 3945 3946 static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) 3947 { 3948 __pagetable_ctor(ptdesc); 3949 } 3950 3951 static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) 3952 { 3953 __pagetable_ctor(ptdesc); 3954 } 3955 3956 extern void __init pagecache_init(void); 3957 extern void free_initmem(void); 3958 3959 /* 3960 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) 3961 * into the buddy system. The freed pages will be poisoned with pattern 3962 * "poison" if it's within range [0, UCHAR_MAX]. 3963 * Return pages freed into the buddy system. 3964 */ 3965 extern unsigned long free_reserved_area(void *start, void *end, 3966 int poison, const char *s); 3967 3968 extern void adjust_managed_page_count(struct page *page, long count); 3969 3970 /* Free the reserved page into the buddy system, so it gets managed. 
*/ 3971 void free_reserved_page(struct page *page); 3972 3973 static inline void mark_page_reserved(struct page *page) 3974 { 3975 SetPageReserved(page); 3976 adjust_managed_page_count(page, -1); 3977 } 3978 3979 static inline void free_reserved_ptdesc(struct ptdesc *pt) 3980 { 3981 free_reserved_page(ptdesc_page(pt)); 3982 } 3983 3984 /* 3985 * Default method to free all the __init memory into the buddy system. 3986 * The freed pages will be poisoned with pattern "poison" if it's within 3987 * range [0, UCHAR_MAX]. 3988 * Return pages freed into the buddy system. 3989 */ 3990 static inline unsigned long free_initmem_default(int poison) 3991 { 3992 extern char __init_begin[], __init_end[]; 3993 3994 return free_reserved_area(&__init_begin, &__init_end, 3995 poison, "unused kernel image (initmem)"); 3996 } 3997 3998 static inline unsigned long get_num_physpages(void) 3999 { 4000 int nid; 4001 unsigned long phys_pages = 0; 4002 4003 for_each_online_node(nid) 4004 phys_pages += node_present_pages(nid); 4005 4006 return phys_pages; 4007 } 4008 4009 /* 4010 * FIXME: Using memblock node mappings, an architecture may initialise its 4011 * zones, allocate the backing mem_map and account for memory holes in an 4012 * architecture independent manner. 4013 * 4014 * An architecture is expected to register range of page frames backed by 4015 * physical memory with memblock_add[_node]() before calling 4016 * free_area_init() passing in the PFN each zone ends at. 
At a basic 4017 * usage, an architecture is expected to do something like 4018 * 4019 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, 4020 * max_highmem_pfn}; 4021 * for_each_valid_physical_page_range() 4022 * memblock_add_node(base, size, nid, MEMBLOCK_NONE) 4023 * free_area_init(max_zone_pfns); 4024 */ 4025 void arch_zone_limits_init(unsigned long *max_zone_pfn); 4026 unsigned long node_map_pfn_alignment(void); 4027 extern unsigned long absent_pages_in_range(unsigned long start_pfn, 4028 unsigned long end_pfn); 4029 extern void get_pfn_range_for_nid(unsigned int nid, 4030 unsigned long *start_pfn, unsigned long *end_pfn); 4031 4032 #ifndef CONFIG_NUMA 4033 static inline int early_pfn_to_nid(unsigned long pfn) 4034 { 4035 return 0; 4036 } 4037 #else 4038 /* please see mm/page_alloc.c */ 4039 extern int __meminit early_pfn_to_nid(unsigned long pfn); 4040 #endif 4041 4042 extern void mem_init(void); 4043 extern void __init mmap_init(void); 4044 4045 extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); 4046 static inline void show_mem(void) 4047 { 4048 __show_mem(0, NULL, MAX_NR_ZONES - 1); 4049 } 4050 extern long si_mem_available(void); 4051 extern void si_meminfo(struct sysinfo * val); 4052 extern void si_meminfo_node(struct sysinfo *val, int nid); 4053 4054 extern __printf(3, 4) 4055 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); 4056 4057 extern void setup_per_cpu_pageset(void); 4058 4059 /* nommu.c */ 4060 extern atomic_long_t mmap_pages_allocated; 4061 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); 4062 4063 /* interval_tree.c */ 4064 void vma_interval_tree_insert(struct vm_area_struct *node, 4065 struct rb_root_cached *root); 4066 void vma_interval_tree_insert_after(struct vm_area_struct *node, 4067 struct vm_area_struct *prev, 4068 struct rb_root_cached *root); 4069 void vma_interval_tree_remove(struct vm_area_struct *node, 4070 struct rb_root_cached 
*root); 4071 struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, 4072 unsigned long start, unsigned long last); 4073 struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, 4074 unsigned long start, unsigned long last); 4075 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, 4076 unsigned long start, unsigned long last); 4077 4078 #define vma_interval_tree_foreach(vma, root, start, last) \ 4079 for (vma = vma_interval_tree_iter_first(root, start, last); \ 4080 vma; vma = vma_interval_tree_iter_next(vma, start, last)) 4081 4082 void anon_vma_interval_tree_insert(struct anon_vma_chain *node, 4083 struct rb_root_cached *root); 4084 void anon_vma_interval_tree_remove(struct anon_vma_chain *node, 4085 struct rb_root_cached *root); 4086 struct anon_vma_chain * 4087 anon_vma_interval_tree_iter_first(struct rb_root_cached *root, 4088 unsigned long start, unsigned long last); 4089 struct anon_vma_chain *anon_vma_interval_tree_iter_next( 4090 struct anon_vma_chain *node, unsigned long start, unsigned long last); 4091 #ifdef CONFIG_DEBUG_VM_RB 4092 void anon_vma_interval_tree_verify(struct anon_vma_chain *node); 4093 #endif 4094 4095 #define anon_vma_interval_tree_foreach(avc, root, start, last) \ 4096 for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ 4097 avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) 4098 4099 /* mmap.c */ 4100 extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); 4101 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 4102 extern void exit_mmap(struct mm_struct *); 4103 bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, 4104 unsigned long addr, bool write); 4105 4106 static inline int check_data_rlimit(unsigned long rlim, 4107 unsigned long new, 4108 unsigned long start, 4109 unsigned long end_data, 4110 unsigned long start_data) 4111 { 4112 if 
(rlim < RLIM_INFINITY) { 4113 if (((new - start) + (end_data - start_data)) > rlim) 4114 return -ENOSPC; 4115 } 4116 4117 return 0; 4118 } 4119 4120 extern int mm_take_all_locks(struct mm_struct *mm); 4121 extern void mm_drop_all_locks(struct mm_struct *mm); 4122 4123 extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 4124 extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 4125 extern struct file *get_mm_exe_file(struct mm_struct *mm); 4126 extern struct file *get_task_exe_file(struct task_struct *task); 4127 4128 extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); 4129 4130 extern bool vma_is_special_mapping(const struct vm_area_struct *vma, 4131 const struct vm_special_mapping *sm); 4132 struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, 4133 unsigned long addr, unsigned long len, 4134 vm_flags_t vm_flags, 4135 const struct vm_special_mapping *spec); 4136 4137 unsigned long randomize_stack_top(unsigned long stack_top); 4138 unsigned long randomize_page(unsigned long start, unsigned long range); 4139 4140 unsigned long 4141 __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 4142 unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); 4143 4144 static inline unsigned long 4145 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 4146 unsigned long pgoff, unsigned long flags) 4147 { 4148 return __get_unmapped_area(file, addr, len, pgoff, flags, 0); 4149 } 4150 4151 extern unsigned long do_mmap(struct file *file, unsigned long addr, 4152 unsigned long len, unsigned long prot, unsigned long flags, 4153 vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, 4154 struct list_head *uf); 4155 extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 4156 unsigned long start, size_t len, struct list_head *uf, 4157 bool unlock); 4158 int do_vmi_align_munmap(struct vma_iterator *vmi, struct 
vm_area_struct *vma, 4159 struct mm_struct *mm, unsigned long start, 4160 unsigned long end, struct list_head *uf, bool unlock); 4161 extern int do_munmap(struct mm_struct *, unsigned long, size_t, 4162 struct list_head *uf); 4163 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); 4164 4165 #ifdef CONFIG_MMU 4166 extern int __mm_populate(unsigned long addr, unsigned long len, 4167 int ignore_errors); 4168 static inline void mm_populate(unsigned long addr, unsigned long len) 4169 { 4170 /* Ignore errors */ 4171 (void) __mm_populate(addr, len, 1); 4172 } 4173 #else 4174 static inline void mm_populate(unsigned long addr, unsigned long len) {} 4175 #endif 4176 4177 /* This takes the mm semaphore itself */ 4178 int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); 4179 int vm_munmap(unsigned long start, size_t len); 4180 unsigned long __must_check vm_mmap(struct file *file, unsigned long addr, 4181 unsigned long len, unsigned long prot, 4182 unsigned long flag, unsigned long offset); 4183 unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, 4184 unsigned long len, unsigned long flags); 4185 4186 struct vm_unmapped_area_info { 4187 #define VM_UNMAPPED_AREA_TOPDOWN 1 4188 unsigned long flags; 4189 unsigned long length; 4190 unsigned long low_limit; 4191 unsigned long high_limit; 4192 unsigned long align_mask; 4193 unsigned long align_offset; 4194 unsigned long start_gap; 4195 }; 4196 4197 extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); 4198 4199 /* truncate.c */ 4200 void truncate_inode_pages(struct address_space *mapping, loff_t lstart); 4201 void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, 4202 uoff_t lend); 4203 void truncate_inode_pages_final(struct address_space *mapping); 4204 4205 /* generic vm_area_ops exported for stackable file systems */ 4206 extern vm_fault_t filemap_fault(struct vm_fault *vmf); 4207 extern 
vm_fault_t filemap_map_pages(struct vm_fault *vmf, 4208 pgoff_t start_pgoff, pgoff_t end_pgoff); 4209 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); 4210 4211 extern unsigned long stack_guard_gap; 4212 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ 4213 int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); 4214 struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); 4215 4216 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 4217 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); 4218 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, 4219 struct vm_area_struct **pprev); 4220 4221 /* 4222 * Look up the first VMA which intersects the interval [start_addr, end_addr) 4223 * NULL if none. Assume start_addr < end_addr. 4224 */ 4225 struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, 4226 unsigned long start_addr, unsigned long end_addr); 4227 4228 /** 4229 * vma_lookup() - Find a VMA at a specific address 4230 * @mm: The process address space. 4231 * @addr: The user address. 4232 * 4233 * Return: The vm_area_struct at the given address, %NULL otherwise. 
4234 */ 4235 static inline 4236 struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) 4237 { 4238 return mtree_load(&mm->mm_mt, addr); 4239 } 4240 4241 static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) 4242 { 4243 if (vma->vm_flags & VM_GROWSDOWN) 4244 return stack_guard_gap; 4245 4246 /* See reasoning around the VM_SHADOW_STACK definition */ 4247 if (vma->vm_flags & VM_SHADOW_STACK) 4248 return PAGE_SIZE; 4249 4250 return 0; 4251 } 4252 4253 static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) 4254 { 4255 unsigned long gap = stack_guard_start_gap(vma); 4256 unsigned long vm_start = vma->vm_start; 4257 4258 vm_start -= gap; 4259 if (vm_start > vma->vm_start) 4260 vm_start = 0; 4261 return vm_start; 4262 } 4263 4264 static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) 4265 { 4266 unsigned long vm_end = vma->vm_end; 4267 4268 if (vma->vm_flags & VM_GROWSUP) { 4269 vm_end += stack_guard_gap; 4270 if (vm_end < vma->vm_end) 4271 vm_end = -PAGE_SIZE; 4272 } 4273 return vm_end; 4274 } 4275 4276 static inline unsigned long vma_pages(const struct vm_area_struct *vma) 4277 { 4278 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 4279 } 4280 4281 static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) 4282 { 4283 return vma->vm_pgoff + vma_pages(vma) - 1; 4284 } 4285 4286 static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) 4287 { 4288 return desc->end - desc->start; 4289 } 4290 4291 static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) 4292 { 4293 return vma_desc_size(desc) >> PAGE_SHIFT; 4294 } 4295 4296 /** 4297 * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN 4298 * remap is required. 4299 * @desc: The VMA descriptor for the VMA requiring remap. 4300 * @start: The virtual address to start the remap from, must be within the VMA. 4301 * @start_pfn: The first PFN in the range to remap. 
4302 * @size: The size of the range to remap, in bytes, at most spanning to the end 4303 * of the VMA. 4304 */ 4305 static inline void mmap_action_remap(struct vm_area_desc *desc, 4306 unsigned long start, 4307 unsigned long start_pfn, 4308 unsigned long size) 4309 { 4310 struct mmap_action *action = &desc->action; 4311 4312 /* [start, start + size) must be within the VMA. */ 4313 WARN_ON_ONCE(start < desc->start || start >= desc->end); 4314 WARN_ON_ONCE(start + size > desc->end); 4315 4316 action->type = MMAP_REMAP_PFN; 4317 action->remap.start = start; 4318 action->remap.start_pfn = start_pfn; 4319 action->remap.size = size; 4320 action->remap.pgprot = desc->page_prot; 4321 } 4322 4323 /** 4324 * mmap_action_remap_full - helper for mmap_prepare hook to specify that the 4325 * entirety of a VMA should be PFN remapped. 4326 * @desc: The VMA descriptor for the VMA requiring remap. 4327 * @start_pfn: The first PFN in the range to remap. 4328 */ 4329 static inline void mmap_action_remap_full(struct vm_area_desc *desc, 4330 unsigned long start_pfn) 4331 { 4332 mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); 4333 } 4334 4335 /** 4336 * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN 4337 * I/O remap is required. 4338 * @desc: The VMA descriptor for the VMA requiring remap. 4339 * @start: The virtual address to start the remap from, must be within the VMA. 4340 * @start_pfn: The first PFN in the range to remap. 4341 * @size: The size of the range to remap, in bytes, at most spanning to the end 4342 * of the VMA. 
4343 */ 4344 static inline void mmap_action_ioremap(struct vm_area_desc *desc, 4345 unsigned long start, 4346 unsigned long start_pfn, 4347 unsigned long size) 4348 { 4349 mmap_action_remap(desc, start, start_pfn, size); 4350 desc->action.type = MMAP_IO_REMAP_PFN; 4351 } 4352 4353 /** 4354 * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the 4355 * entirety of a VMA should be PFN I/O remapped. 4356 * @desc: The VMA descriptor for the VMA requiring remap. 4357 * @start_pfn: The first PFN in the range to remap. 4358 */ 4359 static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, 4360 unsigned long start_pfn) 4361 { 4362 mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); 4363 } 4364 4365 /** 4366 * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the 4367 * physical range in [start_phys_addr, start_phys_addr + size) should be I/O 4368 * remapped. 4369 * @desc: The VMA descriptor for the VMA requiring remap. 4370 * @start_phys_addr: Start of the physical memory to be mapped. 4371 * @size: Size of the area to map. 4372 * 4373 * NOTE: Some drivers might want to tweak desc->page_prot for purposes of 4374 * write-combine or similar. 4375 */ 4376 static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, 4377 phys_addr_t start_phys_addr, 4378 unsigned long size) 4379 { 4380 struct mmap_action *action = &desc->action; 4381 4382 action->simple_ioremap.start_phys_addr = start_phys_addr; 4383 action->simple_ioremap.size = size; 4384 action->type = MMAP_SIMPLE_IO_REMAP; 4385 } 4386 4387 /** 4388 * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that 4389 * @num kernel pages contained in the @pages array should be mapped to userland 4390 * starting at virtual address @start. 4391 * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped. 4392 * @start: The virtual address from which to map them. 
4393 * @pages: An array of struct page pointers describing the memory to map. 4394 * @nr_pages: The number of entries in the @pages aray. 4395 */ 4396 static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, 4397 unsigned long start, struct page **pages, 4398 unsigned long nr_pages) 4399 { 4400 struct mmap_action *action = &desc->action; 4401 4402 action->type = MMAP_MAP_KERNEL_PAGES; 4403 action->map_kernel.start = start; 4404 action->map_kernel.pages = pages; 4405 action->map_kernel.nr_pages = nr_pages; 4406 action->map_kernel.pgoff = desc->pgoff; 4407 } 4408 4409 /** 4410 * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that 4411 * kernel pages contained in the @pages array should be mapped to userland 4412 * from @desc->start to @desc->end. 4413 * @desc: The VMA descriptor for the VMA requiring kernel pags to be mapped. 4414 * @pages: An array of struct page pointers describing the memory to map. 4415 * 4416 * The caller must ensure that @pages contains sufficient entries to cover the 4417 * entire range described by @desc. 4418 */ 4419 static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, 4420 struct page **pages) 4421 { 4422 mmap_action_map_kernel_pages(desc, desc->start, pages, 4423 vma_desc_pages(desc)); 4424 } 4425 4426 int mmap_action_prepare(struct vm_area_desc *desc); 4427 int mmap_action_complete(struct vm_area_struct *vma, 4428 struct mmap_action *action, bool is_compat); 4429 4430 /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ 4431 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, 4432 unsigned long vm_start, unsigned long vm_end) 4433 { 4434 struct vm_area_struct *vma = vma_lookup(mm, vm_start); 4435 4436 if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) 4437 vma = NULL; 4438 4439 return vma; 4440 } 4441 4442 /** 4443 * range_is_subset - Is the specified inner range a subset of the outer range? 
4444 * @outer_start: The start of the outer range. 4445 * @outer_end: The exclusive end of the outer range. 4446 * @inner_start: The start of the inner range. 4447 * @inner_end: The exclusive end of the inner range. 4448 * 4449 * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, 4450 * outer_end), otherwise %false. 4451 */ 4452 static inline bool range_is_subset(unsigned long outer_start, 4453 unsigned long outer_end, 4454 unsigned long inner_start, 4455 unsigned long inner_end) 4456 { 4457 return outer_start <= inner_start && inner_end <= outer_end; 4458 } 4459 4460 /** 4461 * range_in_vma - is the specified [@start, @end) range a subset of the VMA? 4462 * @vma: The VMA against which we want to check [@start, @end). 4463 * @start: The start of the range we wish to check. 4464 * @end: The exclusive end of the range we wish to check. 4465 * 4466 * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, 4467 * @vma->vm_end), %false otherwise. 4468 */ 4469 static inline bool range_in_vma(const struct vm_area_struct *vma, 4470 unsigned long start, unsigned long end) 4471 { 4472 if (!vma) 4473 return false; 4474 4475 return range_is_subset(vma->vm_start, vma->vm_end, start, end); 4476 } 4477 4478 /** 4479 * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA 4480 * described by @desc, a VMA descriptor? 4481 * @desc: The VMA descriptor against which we want to check [@start, @end). 4482 * @start: The start of the range we wish to check. 4483 * @end: The exclusive end of the range we wish to check. 4484 * 4485 * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), 4486 * %false otherwise. 
4487 */ 4488 static inline bool range_in_vma_desc(const struct vm_area_desc *desc, 4489 unsigned long start, unsigned long end) 4490 { 4491 if (!desc) 4492 return false; 4493 4494 return range_is_subset(desc->start, desc->end, start, end); 4495 } 4496 4497 #ifdef CONFIG_MMU 4498 pgprot_t vm_get_page_prot(vm_flags_t vm_flags); 4499 4500 static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) 4501 { 4502 const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); 4503 4504 return vm_get_page_prot(vm_flags); 4505 } 4506 4507 void vma_set_page_prot(struct vm_area_struct *vma); 4508 #else 4509 static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) 4510 { 4511 return __pgprot(0); 4512 } 4513 static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) 4514 { 4515 return __pgprot(0); 4516 } 4517 static inline void vma_set_page_prot(struct vm_area_struct *vma) 4518 { 4519 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 4520 } 4521 #endif 4522 4523 void vma_set_file(struct vm_area_struct *vma, struct file *file); 4524 4525 #ifdef CONFIG_NUMA_BALANCING 4526 unsigned long change_prot_numa(struct vm_area_struct *vma, 4527 unsigned long start, unsigned long end); 4528 #endif 4529 4530 struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, 4531 unsigned long addr); 4532 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 4533 unsigned long pfn, unsigned long size, pgprot_t pgprot); 4534 4535 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 4536 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, 4537 struct page **pages, unsigned long *num); 4538 int map_kernel_pages_prepare(struct vm_area_desc *desc); 4539 int map_kernel_pages_complete(struct vm_area_struct *vma, 4540 struct mmap_action *action); 4541 int vm_map_pages(struct vm_area_struct *vma, struct page **pages, 4542 unsigned long num); 4543 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, 4544 unsigned long 
num); 4545 vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, 4546 bool write); 4547 vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 4548 unsigned long pfn); 4549 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, 4550 unsigned long pfn, pgprot_t pgprot); 4551 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 4552 unsigned long pfn); 4553 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, 4554 unsigned long addr, unsigned long pfn); 4555 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 4556 4557 static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, 4558 unsigned long addr, struct page *page) 4559 { 4560 int err = vm_insert_page(vma, addr, page); 4561 4562 if (err == -ENOMEM) 4563 return VM_FAULT_OOM; 4564 if (err < 0 && err != -EBUSY) 4565 return VM_FAULT_SIGBUS; 4566 4567 return VM_FAULT_NOPAGE; 4568 } 4569 4570 #ifndef io_remap_pfn_range_pfn 4571 static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, 4572 unsigned long size) 4573 { 4574 return pfn; 4575 } 4576 #endif 4577 4578 static inline int io_remap_pfn_range(struct vm_area_struct *vma, 4579 unsigned long addr, unsigned long orig_pfn, 4580 unsigned long size, pgprot_t orig_prot) 4581 { 4582 const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); 4583 const pgprot_t prot = pgprot_decrypted(orig_prot); 4584 4585 return remap_pfn_range(vma, addr, pfn, size, prot); 4586 } 4587 4588 static inline vm_fault_t vmf_error(int err) 4589 { 4590 if (err == -ENOMEM) 4591 return VM_FAULT_OOM; 4592 else if (err == -EHWPOISON) 4593 return VM_FAULT_HWPOISON; 4594 return VM_FAULT_SIGBUS; 4595 } 4596 4597 /* 4598 * Convert errno to return value for ->page_mkwrite() calls. 4599 * 4600 * This should eventually be merged with vmf_error() above, but will need a 4601 * careful audit of all vmf_error() callers. 
4602 */ 4603 static inline vm_fault_t vmf_fs_error(int err) 4604 { 4605 if (err == 0) 4606 return VM_FAULT_LOCKED; 4607 if (err == -EFAULT || err == -EAGAIN) 4608 return VM_FAULT_NOPAGE; 4609 if (err == -ENOMEM) 4610 return VM_FAULT_OOM; 4611 /* -ENOSPC, -EDQUOT, -EIO ... */ 4612 return VM_FAULT_SIGBUS; 4613 } 4614 4615 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) 4616 { 4617 if (vm_fault & VM_FAULT_OOM) 4618 return -ENOMEM; 4619 if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) 4620 return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; 4621 if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) 4622 return -EFAULT; 4623 return 0; 4624 } 4625 4626 /* 4627 * Indicates whether GUP can follow a PROT_NONE mapped page, or whether 4628 * a (NUMA hinting) fault is required. 4629 */ 4630 static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, 4631 unsigned int flags) 4632 { 4633 /* 4634 * If callers don't want to honor NUMA hinting faults, no need to 4635 * determine if we would actually have to trigger a NUMA hinting fault. 4636 */ 4637 if (!(flags & FOLL_HONOR_NUMA_FAULT)) 4638 return true; 4639 4640 /* 4641 * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. 4642 * 4643 * Requiring a fault here even for inaccessible VMAs would mean that 4644 * FOLL_FORCE cannot make any progress, because handle_mm_fault() 4645 * refuses to process NUMA hinting faults in inaccessible VMAs. 
4646 */ 4647 return !vma_is_accessible(vma); 4648 } 4649 4650 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); 4651 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, 4652 unsigned long size, pte_fn_t fn, void *data); 4653 extern int apply_to_existing_page_range(struct mm_struct *mm, 4654 unsigned long address, unsigned long size, 4655 pte_fn_t fn, void *data); 4656 4657 #ifdef CONFIG_PAGE_POISONING 4658 extern void __kernel_poison_pages(struct page *page, int numpages); 4659 extern void __kernel_unpoison_pages(struct page *page, int numpages); 4660 extern bool _page_poisoning_enabled_early; 4661 DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); 4662 static inline bool page_poisoning_enabled(void) 4663 { 4664 return _page_poisoning_enabled_early; 4665 } 4666 /* 4667 * For use in fast paths after init_mem_debugging() has run, or when a 4668 * false negative result is not harmful when called too early. 4669 */ 4670 static inline bool page_poisoning_enabled_static(void) 4671 { 4672 return static_branch_unlikely(&_page_poisoning_enabled); 4673 } 4674 static inline void kernel_poison_pages(struct page *page, int numpages) 4675 { 4676 if (page_poisoning_enabled_static()) 4677 __kernel_poison_pages(page, numpages); 4678 } 4679 static inline void kernel_unpoison_pages(struct page *page, int numpages) 4680 { 4681 if (page_poisoning_enabled_static()) 4682 __kernel_unpoison_pages(page, numpages); 4683 } 4684 #else 4685 static inline bool page_poisoning_enabled(void) { return false; } 4686 static inline bool page_poisoning_enabled_static(void) { return false; } 4687 static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } 4688 static inline void kernel_poison_pages(struct page *page, int numpages) { } 4689 static inline void kernel_unpoison_pages(struct page *page, int numpages) { } 4690 #endif 4691 4692 DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); 4693 static inline bool 
want_init_on_alloc(gfp_t flags) 4694 { 4695 if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, 4696 &init_on_alloc)) 4697 return true; 4698 return flags & __GFP_ZERO; 4699 } 4700 4701 DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); 4702 static inline bool want_init_on_free(void) 4703 { 4704 return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, 4705 &init_on_free); 4706 } 4707 4708 extern bool _debug_pagealloc_enabled_early; 4709 DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); 4710 4711 static inline bool debug_pagealloc_enabled(void) 4712 { 4713 return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && 4714 _debug_pagealloc_enabled_early; 4715 } 4716 4717 /* 4718 * For use in fast paths after mem_debugging_and_hardening_init() has run, 4719 * or when a false negative result is not harmful when called too early. 4720 */ 4721 static inline bool debug_pagealloc_enabled_static(void) 4722 { 4723 if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) 4724 return false; 4725 4726 return static_branch_unlikely(&_debug_pagealloc_enabled); 4727 } 4728 4729 /* 4730 * To support DEBUG_PAGEALLOC architecture must ensure that 4731 * __kernel_map_pages() never fails 4732 */ 4733 extern void __kernel_map_pages(struct page *page, int numpages, int enable); 4734 #ifdef CONFIG_DEBUG_PAGEALLOC 4735 static inline void debug_pagealloc_map_pages(struct page *page, int numpages) 4736 { 4737 iommu_debug_check_unmapped(page, numpages); 4738 4739 if (debug_pagealloc_enabled_static()) 4740 __kernel_map_pages(page, numpages, 1); 4741 } 4742 4743 static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) 4744 { 4745 iommu_debug_check_unmapped(page, numpages); 4746 4747 if (debug_pagealloc_enabled_static()) 4748 __kernel_map_pages(page, numpages, 0); 4749 } 4750 4751 extern unsigned int _debug_guardpage_minorder; 4752 DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); 4753 4754 static inline unsigned int debug_guardpage_minorder(void) 4755 { 4756 return 
_debug_guardpage_minorder; 4757 } 4758 4759 static inline bool debug_guardpage_enabled(void) 4760 { 4761 return static_branch_unlikely(&_debug_guardpage_enabled); 4762 } 4763 4764 static inline bool page_is_guard(const struct page *page) 4765 { 4766 if (!debug_guardpage_enabled()) 4767 return false; 4768 4769 return PageGuard(page); 4770 } 4771 4772 bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); 4773 static inline bool set_page_guard(struct zone *zone, struct page *page, 4774 unsigned int order) 4775 { 4776 if (!debug_guardpage_enabled()) 4777 return false; 4778 return __set_page_guard(zone, page, order); 4779 } 4780 4781 void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); 4782 static inline void clear_page_guard(struct zone *zone, struct page *page, 4783 unsigned int order) 4784 { 4785 if (!debug_guardpage_enabled()) 4786 return; 4787 __clear_page_guard(zone, page, order); 4788 } 4789 4790 #else /* CONFIG_DEBUG_PAGEALLOC */ 4791 static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} 4792 static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} 4793 static inline unsigned int debug_guardpage_minorder(void) { return 0; } 4794 static inline bool debug_guardpage_enabled(void) { return false; } 4795 static inline bool page_is_guard(const struct page *page) { return false; } 4796 static inline bool set_page_guard(struct zone *zone, struct page *page, 4797 unsigned int order) { return false; } 4798 static inline void clear_page_guard(struct zone *zone, struct page *page, 4799 unsigned int order) {} 4800 #endif /* CONFIG_DEBUG_PAGEALLOC */ 4801 4802 #ifndef clear_pages 4803 /** 4804 * clear_pages() - clear a page range for kernel-internal use. 4805 * @addr: start address 4806 * @npages: number of pages 4807 * 4808 * Use clear_user_pages() instead when clearing a page range to be 4809 * mapped to user space. 
4810 * 4811 * Does absolutely no exception handling. 4812 * 4813 * Note that even though the clearing operation is preemptible, clear_pages() 4814 * does not (and on architectures where it reduces to a few long-running 4815 * instructions, might not be able to) call cond_resched() to check if 4816 * rescheduling is required. 4817 * 4818 * When running under preemptible models this is not a problem. Under 4819 * cooperatively scheduled models, however, the caller is expected to 4820 * limit @npages to no more than PROCESS_PAGES_NON_PREEMPT_BATCH. 4821 */ 4822 static inline void clear_pages(void *addr, unsigned int npages) 4823 { 4824 do { 4825 clear_page(addr); 4826 addr += PAGE_SIZE; 4827 } while (--npages); 4828 } 4829 #endif 4830 4831 #ifndef PROCESS_PAGES_NON_PREEMPT_BATCH 4832 #ifdef clear_pages 4833 /* 4834 * The architecture defines clear_pages(), and we assume that it is 4835 * generally "fast". So choose a batch size large enough to allow the processor 4836 * headroom for optimizing the operation and yet small enough that we see 4837 * reasonable preemption latency for when this optimization is not possible 4838 * (ex. slow microarchitectures, memory bandwidth saturation.) 4839 * 4840 * With a value of 32MB and assuming a memory bandwidth of ~10GBps, this should 4841 * result in worst case preemption latency of around 3ms when clearing pages. 4842 * 4843 * (See comment above clear_pages() for why preemption latency is a concern 4844 * here.) 4845 */ 4846 #define PROCESS_PAGES_NON_PREEMPT_BATCH (SZ_32M >> PAGE_SHIFT) 4847 #else /* !clear_pages */ 4848 /* 4849 * The architecture does not provide a clear_pages() implementation. Assume 4850 * that clear_page() -- which clear_pages() will fallback to -- is relatively 4851 * slow and choose a small value for PROCESS_PAGES_NON_PREEMPT_BATCH. 
4852 */ 4853 #define PROCESS_PAGES_NON_PREEMPT_BATCH 1 4854 #endif 4855 #endif 4856 4857 #ifdef __HAVE_ARCH_GATE_AREA 4858 extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); 4859 extern int in_gate_area_no_mm(unsigned long addr); 4860 extern int in_gate_area(struct mm_struct *mm, unsigned long addr); 4861 #else 4862 static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 4863 { 4864 return NULL; 4865 } 4866 static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } 4867 static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) 4868 { 4869 return 0; 4870 } 4871 #endif /* __HAVE_ARCH_GATE_AREA */ 4872 4873 bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); 4874 4875 void drop_slab(void); 4876 4877 #ifndef CONFIG_MMU 4878 #define randomize_va_space 0 4879 #else 4880 extern int randomize_va_space; 4881 #endif 4882 4883 const char * arch_vma_name(struct vm_area_struct *vma); 4884 #ifdef CONFIG_MMU 4885 void print_vma_addr(char *prefix, unsigned long rip); 4886 #else 4887 static inline void print_vma_addr(char *prefix, unsigned long rip) 4888 { 4889 } 4890 #endif 4891 4892 unsigned long section_map_size(void); 4893 struct page * __populate_section_memmap(unsigned long pfn, 4894 unsigned long nr_pages, int nid, struct vmem_altmap *altmap, 4895 struct dev_pagemap *pgmap); 4896 void *vmemmap_alloc_block(unsigned long size, int node); 4897 struct vmem_altmap; 4898 void *vmemmap_alloc_block_buf(unsigned long size, int node, 4899 struct vmem_altmap *altmap); 4900 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); 4901 void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, 4902 unsigned long addr, unsigned long next); 4903 int vmemmap_check_pmd(pmd_t *pmd, int node, 4904 unsigned long addr, unsigned long next); 4905 int vmemmap_populate_basepages(unsigned long start, unsigned long end, 4906 int node, struct vmem_altmap *altmap); 4907 int vmemmap_populate_hugepages(unsigned long 
start, unsigned long end, 4908 int node, struct vmem_altmap *altmap); 4909 int vmemmap_populate(unsigned long start, unsigned long end, int node, 4910 struct vmem_altmap *altmap); 4911 int vmemmap_populate_hvo(unsigned long start, unsigned long end, 4912 unsigned int order, struct zone *zone, 4913 unsigned long headsize); 4914 void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, 4915 unsigned long headsize); 4916 void vmemmap_populate_print_last(void); 4917 #ifdef CONFIG_MEMORY_HOTPLUG 4918 void vmemmap_free(unsigned long start, unsigned long end, 4919 struct vmem_altmap *altmap); 4920 #endif 4921 4922 #ifdef CONFIG_SPARSEMEM_VMEMMAP 4923 static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) 4924 { 4925 /* number of pfns from base where pfn_to_page() is valid */ 4926 if (altmap) 4927 return altmap->reserve + altmap->free; 4928 return 0; 4929 } 4930 4931 static inline void vmem_altmap_free(struct vmem_altmap *altmap, 4932 unsigned long nr_pfns) 4933 { 4934 altmap->alloc -= nr_pfns; 4935 } 4936 #else 4937 static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) 4938 { 4939 return 0; 4940 } 4941 4942 static inline void vmem_altmap_free(struct vmem_altmap *altmap, 4943 unsigned long nr_pfns) 4944 { 4945 } 4946 #endif 4947 4948 #define VMEMMAP_RESERVE_NR 2 4949 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP 4950 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, 4951 struct dev_pagemap *pgmap) 4952 { 4953 unsigned long nr_pages; 4954 unsigned long nr_vmemmap_pages; 4955 4956 if (!pgmap || !is_power_of_2(sizeof(struct page))) 4957 return false; 4958 4959 nr_pages = pgmap_vmemmap_nr(pgmap); 4960 nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); 4961 /* 4962 * For vmemmap optimization with DAX we need minimum 2 vmemmap 4963 * pages. 
See layout diagram in Documentation/mm/vmemmap_dedup.rst 4964 */ 4965 return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); 4966 } 4967 /* 4968 * If we don't have an architecture override, use the generic rule 4969 */ 4970 #ifndef vmemmap_can_optimize 4971 #define vmemmap_can_optimize __vmemmap_can_optimize 4972 #endif 4973 4974 #else 4975 static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, 4976 struct dev_pagemap *pgmap) 4977 { 4978 return false; 4979 } 4980 #endif 4981 4982 enum mf_flags { 4983 MF_COUNT_INCREASED = 1 << 0, 4984 MF_ACTION_REQUIRED = 1 << 1, 4985 MF_MUST_KILL = 1 << 2, 4986 MF_SOFT_OFFLINE = 1 << 3, 4987 MF_UNPOISON = 1 << 4, 4988 MF_SW_SIMULATED = 1 << 5, 4989 MF_NO_RETRY = 1 << 6, 4990 MF_MEM_PRE_REMOVE = 1 << 7, 4991 }; 4992 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, 4993 unsigned long count, int mf_flags); 4994 extern int memory_failure(unsigned long pfn, int flags); 4995 extern int unpoison_memory(unsigned long pfn); 4996 extern atomic_long_t num_poisoned_pages __read_mostly; 4997 extern int soft_offline_page(unsigned long pfn, int flags); 4998 #ifdef CONFIG_MEMORY_FAILURE 4999 /* 5000 * Sysfs entries for memory failure handling statistics. 
5001 */ 5002 extern const struct attribute_group memory_failure_attr_group; 5003 extern void memory_failure_queue(unsigned long pfn, int flags); 5004 void num_poisoned_pages_inc(unsigned long pfn); 5005 void num_poisoned_pages_sub(unsigned long pfn, long i); 5006 #else 5007 static inline void memory_failure_queue(unsigned long pfn, int flags) 5008 { 5009 } 5010 5011 static inline void num_poisoned_pages_inc(unsigned long pfn) 5012 { 5013 } 5014 5015 static inline void num_poisoned_pages_sub(unsigned long pfn, long i) 5016 { 5017 } 5018 #endif 5019 5020 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) 5021 extern void memblk_nr_poison_inc(unsigned long pfn); 5022 extern void memblk_nr_poison_sub(unsigned long pfn, long i); 5023 #else 5024 static inline void memblk_nr_poison_inc(unsigned long pfn) 5025 { 5026 } 5027 5028 static inline void memblk_nr_poison_sub(unsigned long pfn, long i) 5029 { 5030 } 5031 #endif 5032 5033 #ifndef arch_memory_failure 5034 static inline int arch_memory_failure(unsigned long pfn, int flags) 5035 { 5036 return -ENXIO; 5037 } 5038 #endif 5039 5040 #ifndef arch_is_platform_page 5041 static inline bool arch_is_platform_page(u64 paddr) 5042 { 5043 return false; 5044 } 5045 #endif 5046 5047 /* 5048 * Error handlers for various types of pages. 
5049 */ 5050 enum mf_result { 5051 MF_IGNORED, /* Error: cannot be handled */ 5052 MF_FAILED, /* Error: handling failed */ 5053 MF_DELAYED, /* Will be handled later */ 5054 MF_RECOVERED, /* Successfully recovered */ 5055 }; 5056 5057 enum mf_action_page_type { 5058 MF_MSG_KERNEL, 5059 MF_MSG_KERNEL_HIGH_ORDER, 5060 MF_MSG_DIFFERENT_COMPOUND, 5061 MF_MSG_HUGE, 5062 MF_MSG_FREE_HUGE, 5063 MF_MSG_GET_HWPOISON, 5064 MF_MSG_UNMAP_FAILED, 5065 MF_MSG_DIRTY_SWAPCACHE, 5066 MF_MSG_CLEAN_SWAPCACHE, 5067 MF_MSG_DIRTY_MLOCKED_LRU, 5068 MF_MSG_CLEAN_MLOCKED_LRU, 5069 MF_MSG_DIRTY_UNEVICTABLE_LRU, 5070 MF_MSG_CLEAN_UNEVICTABLE_LRU, 5071 MF_MSG_DIRTY_LRU, 5072 MF_MSG_CLEAN_LRU, 5073 MF_MSG_TRUNCATED_LRU, 5074 MF_MSG_BUDDY, 5075 MF_MSG_DAX, 5076 MF_MSG_UNSPLIT_THP, 5077 MF_MSG_ALREADY_POISONED, 5078 MF_MSG_PFN_MAP, 5079 MF_MSG_UNKNOWN, 5080 }; 5081 5082 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 5083 void folio_zero_user(struct folio *folio, unsigned long addr_hint); 5084 int copy_user_large_folio(struct folio *dst, struct folio *src, 5085 unsigned long addr_hint, 5086 struct vm_area_struct *vma); 5087 long copy_folio_from_user(struct folio *dst_folio, 5088 const void __user *usr_src, 5089 bool allow_pagefault); 5090 5091 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 5092 5093 #if MAX_NUMNODES > 1 5094 void __init setup_nr_node_ids(void); 5095 #else 5096 static inline void setup_nr_node_ids(void) {} 5097 #endif 5098 5099 extern int memcmp_pages(struct page *page1, struct page *page2); 5100 5101 static inline int pages_identical(struct page *page1, struct page *page2) 5102 { 5103 return !memcmp_pages(page1, page2); 5104 } 5105 5106 #ifdef CONFIG_MAPPING_DIRTY_HELPERS 5107 unsigned long clean_record_shared_mapping_range(struct address_space *mapping, 5108 pgoff_t first_index, pgoff_t nr, 5109 pgoff_t bitmap_pgoff, 5110 unsigned long *bitmap, 5111 pgoff_t *start, 5112 pgoff_t *end); 5113 5114 unsigned long wp_shared_mapping_range(struct 
address_space *mapping, 5115 pgoff_t first_index, pgoff_t nr); 5116 #endif 5117 5118 #ifdef CONFIG_ANON_VMA_NAME 5119 int set_anon_vma_name(unsigned long addr, unsigned long size, 5120 const char __user *uname); 5121 #else 5122 static inline 5123 int set_anon_vma_name(unsigned long addr, unsigned long size, 5124 const char __user *uname) 5125 { 5126 return -EINVAL; 5127 } 5128 #endif 5129 5130 #ifdef CONFIG_UNACCEPTED_MEMORY 5131 5132 bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); 5133 void accept_memory(phys_addr_t start, unsigned long size); 5134 5135 #else 5136 5137 static inline bool range_contains_unaccepted_memory(phys_addr_t start, 5138 unsigned long size) 5139 { 5140 return false; 5141 } 5142 5143 static inline void accept_memory(phys_addr_t start, unsigned long size) 5144 { 5145 } 5146 5147 #endif 5148 5149 static inline bool pfn_is_unaccepted_memory(unsigned long pfn) 5150 { 5151 return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); 5152 } 5153 5154 void vma_pgtable_walk_begin(struct vm_area_struct *vma); 5155 void vma_pgtable_walk_end(struct vm_area_struct *vma); 5156 5157 int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); 5158 int reserve_mem_release_by_name(const char *name); 5159 5160 #ifdef CONFIG_64BIT 5161 int do_mseal(unsigned long start, size_t len_in, unsigned long flags); 5162 #else 5163 static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) 5164 { 5165 /* noop on 32 bit */ 5166 return 0; 5167 } 5168 #endif 5169 5170 /* 5171 * user_alloc_needs_zeroing checks if a user folio from page allocator needs to 5172 * be zeroed or not. 
5173 */ 5174 static inline bool user_alloc_needs_zeroing(void) 5175 { 5176 /* 5177 * for user folios, arch with cache aliasing requires cache flush and 5178 * arc changes folio->flags to make icache coherent with dcache, so 5179 * always return false to make caller use 5180 * clear_user_page()/clear_user_highpage(). 5181 */ 5182 return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || 5183 !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, 5184 &init_on_alloc); 5185 } 5186 5187 int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); 5188 int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); 5189 int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); 5190 5191 /* 5192 * DMA mapping IDs for page_pool 5193 * 5194 * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and 5195 * stashes it in the upper bits of page->pp_magic. We always want to be able to 5196 * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP 5197 * pages can have arbitrary kernel pointers stored in the same field as pp_magic 5198 * (since it overlaps with page->lru.next), so we must ensure that we cannot 5199 * mistake a valid kernel pointer with any of the values we write into this 5200 * field. 5201 * 5202 * On architectures that set POISON_POINTER_DELTA, this is already ensured, 5203 * since this value becomes part of PP_SIGNATURE; meaning we can just use the 5204 * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the 5205 * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is 5206 * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is 5207 * known at compile-time. 
*
 * If the value of PAGE_OFFSET is not known at compile time, or if it is too
 * small to leave at least 8 bits available above PP_SIGNATURE, we define the
 * number of bits to be 0, which turns off the DMA index tracking altogether
 * (see page_pool_register_dma_index()).
 */
/* Lowest bit of the DMA index: one above __fls() of the delta-free signature. */
#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA))
#if POISON_POINTER_DELTA > 0
/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA
 * index to not overlap with that if set
 */
#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT)
#else
/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */
#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8))
/*
 * Evaluates to 0 unless PAGE_OFFSET is a compile-time constant, is at least
 * PP_DMA_INDEX_MIN_OFFSET, and has no bits set below PP_DMA_INDEX_MIN_OFFSET.
 */
#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \
			    PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \
			    !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \
			      MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0)

#endif

/* PP_DMA_INDEX_BITS contiguous bits starting at bit PP_DMA_INDEX_SHIFT. */
#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \
				  PP_DMA_INDEX_SHIFT)

/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is
 * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for
 * the head page of compound page and bit 1 for pfmemalloc page, as well as the
 * bits used for the DMA index. page_is_pfmemalloc() is checked in
 * __page_pool_put_page() to avoid recycling the pfmemalloc page.
5238 */ 5239 #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) 5240 5241 #ifdef CONFIG_PAGE_POOL 5242 static inline bool page_pool_page_is_pp(const struct page *page) 5243 { 5244 return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; 5245 } 5246 #else 5247 static inline bool page_pool_page_is_pp(const struct page *page) 5248 { 5249 return false; 5250 } 5251 #endif 5252 5253 #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) 5254 #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) 5255 #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) 5256 5257 struct page_snapshot { 5258 struct folio folio_snapshot; 5259 struct page page_snapshot; 5260 unsigned long pfn; 5261 unsigned long idx; 5262 unsigned long flags; 5263 }; 5264 5265 static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) 5266 { 5267 return ps->flags & PAGE_SNAPSHOT_FAITHFUL; 5268 } 5269 5270 void snapshot_page(struct page_snapshot *ps, const struct page *page); 5271 5272 void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, 5273 struct vm_area_struct *vma, unsigned long addr, 5274 bool uffd_wp); 5275 5276 #endif /* _LINUX_MM_H */ 5277