// SPDX-License-Identifier: GPL-2.0-only
/*
 * Miscellaneous mm helpers: string/memory duplication, mmap layout
 * selection, locked-VM accounting and overcommit policy plumbing.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
#include <linux/page_idle.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 *
 * This is the counterpart to kstrdup_const(), which may hand back a
 * .rodata pointer without allocating anything.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
 * @s: The data to copy
 * @len: The size of the data, not including the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Kept __always_inline so that kmalloc_track_caller() attributes the
 * allocation to the exported wrapper's caller, not to this helper.
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	/* '+1' for the NUL terminator */
	buf = kmalloc_track_caller(len + 1, gfp);
	if (!buf)
		return NULL;

	memcpy(buf, s, len);
	/* Ensure the buf is always NUL-terminated, regardless of @s. */
	buf[len] = '\0';
	return buf;
}

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	/* _RET_IP_ attributes the allocation to kmemdup()'s caller. */
	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * size_mul() saturates on multiplication overflow, so an oversized
 * @count * @element_size fails the allocation rather than wrapping.
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
	return kmemdup(src, size_mul(element_size, count), gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, len, gfp) : NULL;
}
EXPORT_SYMBOL(kmemdup_nul);

/* Dedicated kmalloc buckets for user-controlled sizes (heap-spray hardening). */
static kmem_buckets *user_buckets __ro_after_init;

static int __init init_user_buckets(void)
{
	user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

	return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result may be not
 * physically contiguous. Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	/* strnlen_user() counts the terminating NUL; 0 means a fault. */
	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	/*
	 * Re-terminate: the string may have changed between strnlen_user()
	 * and memdup_user(), so don't trust the copied byte.
	 */
	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/* '+1' for the appended NUL terminator */
	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	/* swap + fput drops the old file's reference after the exchange. */
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:	The smallest acceptable address the caller will take.
 * @range:	The size of the area, starting at @start, within which the
 *		random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (!PAGE_ALIGNED(start)) {
		/* Shrink the range by however much alignment consumed. */
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ?
 */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	/* Random offset is returned in bytes, page aligned. */
	return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(const struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so a unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
		!IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	/* Clamp the gap into [MIN_GAP, MAX_GAP] where that range is sane. */
	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		/* Legacy layout: bottom-up allocation from TASK_UNMAPPED_BASE. */
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm_flags_clear(MMF_TOPDOWN, mm);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm_flags_set(MMF_TOPDOWN, mm);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			const struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		/* Decrement below zero indicates an accounting bug elsewhere. */
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	/* Take mmap_lock ourselves; __account_locked_vm() requires it held. */
	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	/* LSM and fsnotify checks run before taking mmap_lock. */
	ret = security_mmap_file(file, prot, flag);
	if (!ret)
		ret = fsnotify_mmap_perm(file, prot, off, len);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		/* Both of these run without mmap_lock held. */
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

/*
 * Perform a userland memory mapping into the current process address space. See
 * the comment for do_mmap() for more details on this operation in general.
 *
 * This differs from do_mmap() in that:
 *
 * a. An offset parameter is provided rather than pgoff, which is both checked
 *    for overflow and page alignment.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * This means that this function performs essentially the same work as if
 * userland were invoking mmap (2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	/* Reject offsets whose page-aligned end would wrap the address space. */
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
/*
 * Perform a userland memory mapping for a shadow stack into the current
 * process address space.  This is intended to be used by architectures that
 * support user shadow stacks.
 */
unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len,
				   unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long ret, unused;
	vm_flags_t vm_flags = VM_SHADOW_STACK;

	flags |= MAP_ANONYMOUS | MAP_PRIVATE;
	if (addr)
		flags |= MAP_FIXED_NOREPLACE;

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		vm_flags |= VM_NOHUGEPAGE;

	mmap_write_lock(mm);
	ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags,
		      vm_flags, 0, &unused, NULL);
	mmap_write_unlock(mm);

	return ret;
}
#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	/* check_mul_overflow() guards @n * @size against size_t wraparound. */
	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc_noprof(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);

/* Return the folio's anon_vma, or NULL if it is not anonymously mapped. */
struct anon_vma *folio_anon_vma(const struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
		return NULL;
	/* Strip the type tag to recover the anon_vma pointer. */
	return (void *)(mapping - FOLIO_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(const struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	/* Tagged mapping pointers (anon/movable) are not address_spaces. */
	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		/* Only reached between pages, i.e. for large folios. */
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);

/* Like folio_copy(), but bails with -EHWPOISON on a machine-check error. */
int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

static int overcommit_ratio_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/* ratio and kbytes are mutually exclusive; writing one clears the other */
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

static int overcommit_policy_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid possible race condition (even
	 * though user usually won't too frequently do the switching to policy
	 * OVERCOMMIT_NEVER), the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	/* see overcommit_ratio_handler(): the two settings are exclusive */
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

static const struct ctl_table util_sysctl_table[] = {
	{
		.procname	= "overcommit_memory",
		.data		= &sysctl_overcommit_memory,
		.maxlen		= sizeof(sysctl_overcommit_memory),
		.mode		= 0644,
		.proc_handler	= overcommit_policy_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	{
		.procname	= "user_reserve_kbytes",
		.data		= &sysctl_user_reserve_kbytes,
		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "admin_reserve_kbytes",
		.data		= &sysctl_admin_reserve_kbytes,
		.maxlen		= sizeof(sysctl_admin_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);
	return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}

/*
 * Make sure vm_committed_as in one cacheline and not cacheline shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	/* Charge first; unaccounted again on the error path below. */
	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	/* Snapshot the arg/env boundaries under arg_lock to get a
	 * consistent view; they can be changed via prctl(). */
	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			/* Title may spill into the environment area; append it
			 * and trim at the first NUL. */
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

/* Byte-wise compare two pages; arches may override with a faster version. */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_local_page(page1);
	addr2 = kmap_local_page(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_local(addr2);
	kunmap_local(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	/* Slab and vmalloc dumpers print full provenance themselves. */
	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
1130 */ 1131 static DECLARE_RWSEM(page_offline_rwsem); 1132 1133 void page_offline_freeze(void) 1134 { 1135 down_read(&page_offline_rwsem); 1136 } 1137 1138 void page_offline_thaw(void) 1139 { 1140 up_read(&page_offline_rwsem); 1141 } 1142 1143 void page_offline_begin(void) 1144 { 1145 down_write(&page_offline_rwsem); 1146 } 1147 EXPORT_SYMBOL(page_offline_begin); 1148 1149 void page_offline_end(void) 1150 { 1151 up_write(&page_offline_rwsem); 1152 } 1153 EXPORT_SYMBOL(page_offline_end); 1154 1155 #ifndef flush_dcache_folio 1156 void flush_dcache_folio(struct folio *folio) 1157 { 1158 long i, nr = folio_nr_pages(folio); 1159 1160 for (i = 0; i < nr; i++) 1161 flush_dcache_page(folio_page(folio, i)); 1162 } 1163 EXPORT_SYMBOL(flush_dcache_folio); 1164 #endif 1165 1166 /** 1167 * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA. 1168 * @desc: A VMA descriptor whose fields need to be set. 1169 * @file: The file object describing the file being mmap()'d. 1170 * @vma: The VMA whose fields we wish to assign to @desc. 1171 * 1172 * This is a compatibility function to allow an mmap() hook to call 1173 * mmap_prepare() hooks when drivers nest these. This function specifically 1174 * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for 1175 * the purposes of doing this. 1176 * 1177 * Once the conversion of drivers is complete this function will no longer be 1178 * required and will be removed. 1179 */ 1180 void compat_set_desc_from_vma(struct vm_area_desc *desc, 1181 const struct file *file, 1182 const struct vm_area_struct *vma) 1183 { 1184 memset(desc, 0, sizeof(*desc)); 1185 1186 desc->mm = vma->vm_mm; 1187 desc->file = (struct file *)file; 1188 desc->start = vma->vm_start; 1189 desc->end = vma->vm_end; 1190 1191 desc->pgoff = vma->vm_pgoff; 1192 desc->vm_file = vma->vm_file; 1193 desc->vma_flags = vma->flags; 1194 desc->page_prot = vma->vm_page_prot; 1195 1196 /* Default. 
*/ 1197 desc->action.type = MMAP_NOTHING; 1198 } 1199 EXPORT_SYMBOL(compat_set_desc_from_vma); 1200 1201 /** 1202 * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows 1203 * flexibility as to how the mmap_prepare callback is invoked, which is useful 1204 * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook. 1205 * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been 1206 * executed. 1207 * @vma: The VMA to which @desc should be applied. 1208 * 1209 * The function assumes that you have obtained a VMA descriptor @desc from 1210 * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon 1211 * it. 1212 * 1213 * It then performs any specified mmap actions, and invokes the vm_ops->mapped() 1214 * hook if one is present. 1215 * 1216 * See the description of compat_vma_mmap() for more details. 1217 * 1218 * Once the conversion of drivers is complete this function will no longer be 1219 * required and will be removed. 1220 * 1221 * Returns: 0 on success or error. 1222 */ 1223 int __compat_vma_mmap(struct vm_area_desc *desc, 1224 struct vm_area_struct *vma) 1225 { 1226 int err; 1227 1228 /* Perform any preparatory tasks for mmap action. */ 1229 err = mmap_action_prepare(desc); 1230 if (err) 1231 return err; 1232 /* Update the VMA from the descriptor. */ 1233 compat_set_vma_from_desc(vma, desc); 1234 /* Complete any specified mmap actions. */ 1235 return mmap_action_complete(vma, &desc->action); 1236 } 1237 EXPORT_SYMBOL(__compat_vma_mmap); 1238 1239 /** 1240 * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an 1241 * existing VMA and execute any requested actions. 1242 * @file: The file which possesss an f_op->mmap_prepare() hook. 1243 * @vma: The VMA to apply the .mmap_prepare() hook to. 1244 * 1245 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain 1246 * stacked drivers invoke a nested mmap hook of an underlying file. 
1247 * 1248 * Until all drivers are converted to use .mmap_prepare(), we must be 1249 * conservative and continue to invoke these stacked drivers using the 1250 * deprecated .mmap() hook. 1251 * 1252 * However we have a problem if the underlying file system possesses an 1253 * .mmap_prepare() hook, as we are in a different context when we invoke the 1254 * .mmap() hook, already having a VMA to deal with. 1255 * 1256 * compat_vma_mmap() is a compatibility function that takes VMA state, 1257 * establishes a struct vm_area_desc descriptor, passes to the underlying 1258 * .mmap_prepare() hook and applies any changes performed by it. 1259 * 1260 * Once the conversion of drivers is complete this function will no longer be 1261 * required and will be removed. 1262 * 1263 * Returns: 0 on success or error. 1264 */ 1265 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) 1266 { 1267 struct vm_area_desc desc; 1268 struct mmap_action *action; 1269 int err; 1270 1271 compat_set_desc_from_vma(&desc, file, vma); 1272 err = vfs_mmap_prepare(file, &desc); 1273 if (err) 1274 return err; 1275 action = &desc.action; 1276 1277 /* being invoked from .mmmap means we don't have to enforce this. */ 1278 action->hide_from_rmap_until_complete = false; 1279 1280 return __compat_vma_mmap(&desc, vma); 1281 } 1282 EXPORT_SYMBOL(compat_vma_mmap); 1283 1284 int __vma_check_mmap_hook(struct vm_area_struct *vma) 1285 { 1286 /* vm_ops->mapped is not valid if mmap() is specified. */ 1287 if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) 1288 return -EINVAL; 1289 1290 return 0; 1291 } 1292 EXPORT_SYMBOL(__vma_check_mmap_hook); 1293 1294 static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, 1295 const struct page *page) 1296 { 1297 /* 1298 * Only the first page of a high-order buddy page has PageBuddy() set. 1299 * So we have to check manually whether this page is part of a high- 1300 * order buddy page. 
1301 */ 1302 if (PageBuddy(page)) 1303 ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; 1304 else if (page_count(page) == 0 && is_free_buddy_page(page)) 1305 ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; 1306 1307 if (folio_test_idle(folio)) 1308 ps->flags |= PAGE_SNAPSHOT_PG_IDLE; 1309 } 1310 1311 /** 1312 * snapshot_page() - Create a snapshot of a struct page 1313 * @ps: Pointer to a struct page_snapshot to store the page snapshot 1314 * @page: The page to snapshot 1315 * 1316 * Create a snapshot of the page and store both its struct page and struct 1317 * folio representations in @ps. 1318 * 1319 * A snapshot is marked as "faithful" if the compound state of @page was 1320 * stable and allowed safe reconstruction of the folio representation. In 1321 * rare cases where this is not possible (e.g. due to folio splitting), 1322 * snapshot_page() falls back to treating @page as a single page and the 1323 * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() 1324 * helper can be used to check for this condition. 
1325 */ 1326 void snapshot_page(struct page_snapshot *ps, const struct page *page) 1327 { 1328 unsigned long info, nr_pages = 1; 1329 struct folio *foliop; 1330 int loops = 5; 1331 1332 ps->pfn = page_to_pfn(page); 1333 ps->flags = PAGE_SNAPSHOT_FAITHFUL; 1334 1335 again: 1336 memset(&ps->folio_snapshot, 0, sizeof(struct folio)); 1337 memcpy(&ps->page_snapshot, page, sizeof(*page)); 1338 info = ps->page_snapshot.compound_info; 1339 if (!(info & 1)) { 1340 ps->idx = 0; 1341 foliop = (struct folio *)&ps->page_snapshot; 1342 if (!folio_test_large(foliop)) { 1343 set_ps_flags(ps, page_folio(page), page); 1344 memcpy(&ps->folio_snapshot, foliop, 1345 sizeof(struct page)); 1346 return; 1347 } 1348 foliop = (struct folio *)page; 1349 } else { 1350 /* See compound_head() */ 1351 if (compound_info_has_mask()) { 1352 unsigned long p = (unsigned long)page; 1353 1354 foliop = (struct folio *)(p & info); 1355 } else { 1356 foliop = (struct folio *)(info - 1); 1357 } 1358 1359 ps->idx = folio_page_idx(foliop, page); 1360 } 1361 1362 if (ps->idx < MAX_FOLIO_NR_PAGES) { 1363 memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); 1364 nr_pages = folio_nr_pages(&ps->folio_snapshot); 1365 if (nr_pages > 1) 1366 memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, 1367 sizeof(struct page)); 1368 set_ps_flags(ps, foliop, page); 1369 } 1370 1371 if (ps->idx > nr_pages) { 1372 if (loops-- > 0) 1373 goto again; 1374 clear_compound_head(&ps->page_snapshot); 1375 foliop = (struct folio *)&ps->page_snapshot; 1376 memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); 1377 ps->flags = 0; 1378 ps->idx = 0; 1379 } 1380 } 1381 1382 static int call_vma_mapped(struct vm_area_struct *vma) 1383 { 1384 const struct vm_operations_struct *vm_ops = vma->vm_ops; 1385 void *vm_private_data = vma->vm_private_data; 1386 int err; 1387 1388 if (!vm_ops || !vm_ops->mapped) 1389 return 0; 1390 1391 err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, 1392 vma->vm_file, &vm_private_data); 
1393 if (err) 1394 return err; 1395 1396 if (vm_private_data != vma->vm_private_data) 1397 vma->vm_private_data = vm_private_data; 1398 return 0; 1399 } 1400 1401 static int mmap_action_finish(struct vm_area_struct *vma, 1402 struct mmap_action *action, int err) 1403 { 1404 size_t len; 1405 1406 if (!err) 1407 err = call_vma_mapped(vma); 1408 if (!err && action->success_hook) 1409 err = action->success_hook(vma); 1410 1411 /* do_munmap() might take rmap lock, so release if held. */ 1412 maybe_rmap_unlock_action(vma, action); 1413 if (!err) 1414 return 0; 1415 1416 /* 1417 * If an error occurs, unmap the VMA altogether and return an error. We 1418 * only clear the newly allocated VMA, since this function is only 1419 * invoked if we do NOT merge, so we only clean up the VMA we created. 1420 */ 1421 len = vma_pages(vma) << PAGE_SHIFT; 1422 do_munmap(current->mm, vma->vm_start, len, NULL); 1423 if (action->error_hook) { 1424 /* We may want to filter the error. */ 1425 err = action->error_hook(err); 1426 /* The caller should not clear the error. */ 1427 VM_WARN_ON_ONCE(!err); 1428 } 1429 return err; 1430 } 1431 1432 #ifdef CONFIG_MMU 1433 /** 1434 * mmap_action_prepare - Perform preparatory setup for an VMA descriptor 1435 * action which need to be performed. 1436 * @desc: The VMA descriptor to prepare for its @desc->action. 1437 * 1438 * Returns: %0 on success, otherwise error. 
1439 */ 1440 int mmap_action_prepare(struct vm_area_desc *desc) 1441 { 1442 switch (desc->action.type) { 1443 case MMAP_NOTHING: 1444 return 0; 1445 case MMAP_REMAP_PFN: 1446 return remap_pfn_range_prepare(desc); 1447 case MMAP_IO_REMAP_PFN: 1448 return io_remap_pfn_range_prepare(desc); 1449 case MMAP_SIMPLE_IO_REMAP: 1450 return simple_ioremap_prepare(desc); 1451 case MMAP_MAP_KERNEL_PAGES: 1452 return map_kernel_pages_prepare(desc); 1453 } 1454 1455 WARN_ON_ONCE(1); 1456 return -EINVAL; 1457 } 1458 EXPORT_SYMBOL(mmap_action_prepare); 1459 1460 /** 1461 * mmap_action_complete - Execute VMA descriptor action. 1462 * @vma: The VMA to perform the action upon. 1463 * @action: The action to perform. 1464 * 1465 * Similar to mmap_action_prepare(). 1466 * 1467 * Return: 0 on success, or error, at which point the VMA will be unmapped. 1468 */ 1469 int mmap_action_complete(struct vm_area_struct *vma, 1470 struct mmap_action *action) 1471 { 1472 int err = 0; 1473 1474 switch (action->type) { 1475 case MMAP_NOTHING: 1476 break; 1477 case MMAP_REMAP_PFN: 1478 err = remap_pfn_range_complete(vma, action); 1479 break; 1480 case MMAP_MAP_KERNEL_PAGES: 1481 err = map_kernel_pages_complete(vma, action); 1482 break; 1483 case MMAP_IO_REMAP_PFN: 1484 case MMAP_SIMPLE_IO_REMAP: 1485 /* Should have been delegated. */ 1486 WARN_ON_ONCE(1); 1487 err = -EINVAL; 1488 break; 1489 } 1490 1491 return mmap_action_finish(vma, action, err); 1492 } 1493 EXPORT_SYMBOL(mmap_action_complete); 1494 #else 1495 int mmap_action_prepare(struct vm_area_desc *desc) 1496 { 1497 switch (desc->action.type) { 1498 case MMAP_NOTHING: 1499 break; 1500 case MMAP_REMAP_PFN: 1501 case MMAP_IO_REMAP_PFN: 1502 case MMAP_SIMPLE_IO_REMAP: 1503 case MMAP_MAP_KERNEL_PAGES: 1504 WARN_ON_ONCE(1); /* nommu cannot handle these. 
*/ 1505 break; 1506 } 1507 1508 return 0; 1509 } 1510 EXPORT_SYMBOL(mmap_action_prepare); 1511 1512 int mmap_action_complete(struct vm_area_struct *vma, 1513 struct mmap_action *action) 1514 { 1515 int err = 0; 1516 1517 switch (action->type) { 1518 case MMAP_NOTHING: 1519 break; 1520 case MMAP_REMAP_PFN: 1521 case MMAP_IO_REMAP_PFN: 1522 case MMAP_SIMPLE_IO_REMAP: 1523 case MMAP_MAP_KERNEL_PAGES: 1524 WARN_ON_ONCE(1); /* nommu cannot handle this. */ 1525 1526 err = -EINVAL; 1527 break; 1528 } 1529 1530 return mmap_action_finish(vma, action, err); 1531 } 1532 EXPORT_SYMBOL(mmap_action_complete); 1533 #endif 1534 1535 #ifdef CONFIG_MMU 1536 /** 1537 * folio_pte_batch - detect a PTE batch for a large folio 1538 * @folio: The large folio to detect a PTE batch for. 1539 * @ptep: Page table pointer for the first entry. 1540 * @pte: Page table entry for the first page. 1541 * @max_nr: The maximum number of table entries to consider. 1542 * 1543 * This is a simplified variant of folio_pte_batch_flags(). 1544 * 1545 * Detect a PTE batch: consecutive (present) PTEs that map consecutive 1546 * pages of the same large folio in a single VMA and a single page table. 1547 * 1548 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, 1549 * the accessed bit, writable bit, dirt-bit and soft-dirty bit. 1550 * 1551 * ptep must map any page of the folio. max_nr must be at least one and 1552 * must be limited by the caller so scanning cannot exceed a single VMA and 1553 * a single page table. 1554 * 1555 * Return: the number of table entries in the batch. 
1556 */ 1557 unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, 1558 unsigned int max_nr) 1559 { 1560 return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); 1561 } 1562 #endif /* CONFIG_MMU */ 1563 1564 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 1565 /** 1566 * page_range_contiguous - test whether the page range is contiguous 1567 * @page: the start of the page range. 1568 * @nr_pages: the number of pages in the range. 1569 * 1570 * Test whether the page range is contiguous, such that they can be iterated 1571 * naively, corresponding to iterating a contiguous PFN range. 1572 * 1573 * This function should primarily only be used for debug checks, or when 1574 * working with page ranges that are not naturally contiguous (e.g., pages 1575 * within a folio are). 1576 * 1577 * Returns true if contiguous, otherwise false. 1578 */ 1579 bool page_range_contiguous(const struct page *page, unsigned long nr_pages) 1580 { 1581 const unsigned long start_pfn = page_to_pfn(page); 1582 const unsigned long end_pfn = start_pfn + nr_pages; 1583 unsigned long pfn; 1584 1585 /* 1586 * The memmap is allocated per memory section, so no need to check 1587 * within the first section. However, we need to check each other 1588 * spanned memory section once, making sure the first page in a 1589 * section could similarly be reached by just iterating pages. 1590 */ 1591 for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); 1592 pfn < end_pfn; pfn += PAGES_PER_SECTION) 1593 if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) 1594 return false; 1595 return true; 1596 } 1597 EXPORT_SYMBOL(page_range_contiguous); 1598 #endif 1599