// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
#include <linux/page_idle.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
 * @s: The data to copy
 * @len: The size of the data, not including the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	/* '+1' for the NUL terminator */
	buf = kmalloc_track_caller(len + 1, gfp);
	if (!buf)
		return NULL;

	memcpy(buf, s, len);
	/* Ensure the buf is always NUL-terminated, regardless of @s. */
	buf[len] = '\0';
	return buf;
}

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);
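
/*
 * Illustrative sketch (not part of the original file): a typical consumer of
 * kstrdup_const()/kfree_const() is code that stores a name which is often a
 * string literal, in the style of kobject naming. The example_obj structure
 * and its helpers below are hypothetical.
 *
 *	struct example_obj {
 *		const char *name;
 *	};
 *
 *	static int example_set_name(struct example_obj *obj, const char *name)
 *	{
 *		obj->name = kstrdup_const(name, GFP_KERNEL);
 *		return obj->name ? 0 : -ENOMEM;
 *	}
 *
 *	static void example_release(struct example_obj *obj)
 *	{
 *		kfree_const(obj->name);	// never plain kfree() or krealloc()
 *	}
 */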

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
	return kmemdup(src, size_mul(element_size, count), gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, len, gfp) : NULL;
}
EXPORT_SYMBOL(kmemdup_nul);

static kmem_buckets *user_buckets __ro_after_init;

static int __init init_user_buckets(void)
{
	user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

	return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);
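
/*
 * Illustrative sketch (not part of the original file): a driver copying a
 * variable-sized buffer from an ioctl argument might pair memdup_user() with
 * kfree() as below. The example_handle_ioctl() helper and its 'ubuf'/'len'
 * parameters are hypothetical.
 *
 *	static int example_handle_ioctl(const void __user *ubuf, size_t len)
 *	{
 *		void *kbuf;
 *
 *		kbuf = memdup_user(ubuf, len);
 *		if (IS_ERR(kbuf))
 *			return PTR_ERR(kbuf);
 *
 *		// ... operate on kbuf ...
 *
 *		kfree(kbuf);
 *		return 0;
 *	}
 *
 * For copies that may be large, vmemdup_user()/kvfree() below is the
 * variant that can fall back to vmalloc memory.
 */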

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result may be not
 * physically contiguous. Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);
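
/*
 * Illustrative sketch (not part of the original file): memdup_user_nul() is
 * commonly used by procfs/debugfs write handlers that want a NUL-terminated
 * copy of the user buffer before parsing it. The example_write() handler
 * below is hypothetical.
 *
 *	static ssize_t example_write(struct file *file, const char __user *ubuf,
 *				     size_t count, loff_t *ppos)
 *	{
 *		char *kbuf;
 *
 *		kbuf = memdup_user_nul(ubuf, count);
 *		if (IS_ERR(kbuf))
 *			return PTR_ERR(kbuf);
 *
 *		// ... parse kbuf, e.g. with kstrtoul() ...
 *
 *		kfree(kbuf);
 *		return count;
 *	}
 */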

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(const struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start: The smallest acceptable address the caller will take.
 * @range: The size of the area, starting at @start, within which the
 *	random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned. We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range). On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (!PAGE_ALIGNED(start)) {
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(const struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
	    !IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)

static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm_flags_clear(MMF_TOPDOWN, mm);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm_flags_set(MMF_TOPDOWN, mm);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm_flags_clear(MMF_TOPDOWN, mm);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 * @task: task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0 on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			const struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against, may be NULL
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0 on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
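
/*
 * Illustrative sketch (not part of the original file): a driver that pins
 * user pages for DMA might charge them against RLIMIT_MEMLOCK as below and
 * undo the charge on teardown. The example helpers and 'npages' argument are
 * hypothetical.
 *
 *	static int example_charge_pinned(struct mm_struct *mm, unsigned long npages)
 *	{
 *		return account_locked_vm(mm, npages, true);
 *	}
 *
 *	static void example_uncharge_pinned(struct mm_struct *mm, unsigned long npages)
 *	{
 *		account_locked_vm(mm, npages, false);
 *	}
 *
 * Callers that already hold mmap_lock for writing would use
 * __account_locked_vm() directly instead.
 */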

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret)
		ret = fsnotify_mmap_perm(file, prot, off, len);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

/*
 * Perform a userland memory mapping into the current process address space. See
 * the comment for do_mmap() for more details on this operation in general.
 *
 * This differs from do_mmap() in that:
 *
 * a. An offset parameter is provided rather than pgoff, which is both checked
 *    for overflow and page alignment.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * This means that this function performs essentially the same work as if
 * userland were invoking mmap (2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc_noprof(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);
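
/*
 * Illustrative sketch (not part of the original file): vcalloc()/vmalloc_array()
 * are the overflow-checked way to allocate a large table whose size is the
 * product of an element count and an element size; the result is freed with
 * vfree(). The 'nr_entries' value and struct example_entry are hypothetical.
 *
 *	struct example_entry *table;
 *
 *	table = vcalloc(nr_entries, sizeof(*table));
 *	if (!table)
 *		return -ENOMEM;
 *	// ... use table ...
 *	vfree(table);
 */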

struct anon_vma *folio_anon_vma(const struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
		return NULL;
	return (void *)(mapping - FOLIO_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to. Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(const struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);

int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);
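
/*
 * Illustrative sketch (not part of the original file): unlike folio_copy(),
 * folio_mc_copy() reports machine-check (memory failure) errors instead of
 * letting the copy crash, so callers are expected to check its return value.
 * The labels below are hypothetical.
 *
 *	int rc;
 *
 *	rc = folio_mc_copy(dst_folio, src_folio);
 *	if (rc == -EHWPOISON)
 *		goto abort_copy;	// source memory is poisoned, bail out
 */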

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

static int overcommit_ratio_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

static int overcommit_policy_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid a possible race condition
	 * (even though the switch to OVERCOMMIT_NEVER is usually infrequent),
	 * the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

static const struct ctl_table util_sysctl_table[] = {
	{
		.procname = "overcommit_memory",
		.data = &sysctl_overcommit_memory,
		.maxlen = sizeof(sysctl_overcommit_memory),
		.mode = 0644,
		.proc_handler = overcommit_policy_handler,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_TWO,
	},
	{
		.procname = "overcommit_ratio",
		.data = &sysctl_overcommit_ratio,
		.maxlen = sizeof(sysctl_overcommit_ratio),
		.mode = 0644,
		.proc_handler = overcommit_ratio_handler,
	},
	{
		.procname = "overcommit_kbytes",
		.data = &sysctl_overcommit_kbytes,
		.maxlen = sizeof(sysctl_overcommit_kbytes),
		.mode = 0644,
		.proc_handler = overcommit_kbytes_handler,
	},
	{
		.procname = "user_reserve_kbytes",
		.data = &sysctl_user_reserve_kbytes,
		.maxlen = sizeof(sysctl_user_reserve_kbytes),
		.mode = 0644,
		.proc_handler = proc_doulongvec_minmax,
	},
	{
		.procname = "admin_reserve_kbytes",
		.data = &sysctl_admin_reserve_kbytes,
		.maxlen = sizeof(sysctl_admin_reserve_kbytes),
		.mode = 0644,
		.proc_handler = proc_doulongvec_minmax,
	},
};

static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);
	return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}
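
/*
 * Worked example (illustrative, hypothetical machine, 4KiB pages): with
 * 16 GiB of RAM (4194304 pages), no hugetlb pages, 4 GiB of swap
 * (1048576 pages), the default vm.overcommit_ratio of 50 and
 * vm.overcommit_kbytes unset:
 *
 *	allowed = 4194304 * 50 / 100 + 1048576 = 3145728 pages = 12 GiB
 *
 * which matches the CommitLimit value shown in /proc/meminfo; the limit is
 * only enforced under OVERCOMMIT_NEVER.
 */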

/*
 * Make sure vm_committed_as is in one cacheline and not a cacheline shared
 * with other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer. Larger cmdline values are truncated
 *	to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;

	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_local_page(page1);
	addr2 = kmap_local_page(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_local(addr2);
	kunmap_local(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate. The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
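
/*
 * Illustrative sketch (not part of the original file): because mem_dump_obj()
 * uses pr_cont(), a caller debugging a suspicious pointer prints its own
 * preamble first, e.g.:
 *
 *	pr_info("unexpected object %px:", obj);
 *	mem_dump_obj(obj);
 *
 * which appends something like a slab cache name or " vmalloc memory" to
 * that line, depending on where @obj came from.
 */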

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
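
/*
 * Illustrative sketch (not part of the original file): a PFN walker in the
 * style of /proc/kcore would bracket its PageOffline() checks and page reads
 * as below, so that no page can newly become PageOffline() while it reads.
 * The loop bounds and example_read_page() helper are hypothetical.
 *
 *	page_offline_freeze();
 *	for (pfn = start; pfn < end; pfn++) {
 *		struct page *page = pfn_to_online_page(pfn);
 *
 *		if (page && !PageOffline(page))
 *			example_read_page(page);
 *	}
 *	page_offline_thaw();
 *
 * A driver that sets pages PageOffline() wraps that operation in
 * page_offline_begin()/page_offline_end() for the same reason.
 */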

#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif

/**
 * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
 * for details. This is the same operation, only with a specific file operations
 * struct which may or may not be the same as vma->vm_file->f_op.
 * @f_op: The file operations whose .mmap_prepare() hook is specified.
 * @file: The file which backs or will back the mapping.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 * Returns: 0 on success or error.
 */
int __compat_vma_mmap_prepare(const struct file_operations *f_op,
		struct file *file, struct vm_area_struct *vma)
{
	struct vm_area_desc desc = {
		.mm = vma->vm_mm,
		.file = file,
		.start = vma->vm_start,
		.end = vma->vm_end,

		.pgoff = vma->vm_pgoff,
		.vm_file = vma->vm_file,
		.vm_flags = vma->vm_flags,
		.page_prot = vma->vm_page_prot,
	};
	int err;

	err = f_op->mmap_prepare(&desc);
	if (err)
		return err;
	set_vma_from_desc(vma, &desc);

	return 0;
}
EXPORT_SYMBOL(__compat_vma_mmap_prepare);

/**
 * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
 * existing VMA.
 * @file: The file which possesses an f_op->mmap_prepare() hook.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 *
 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
 * stacked filesystems invoke a nested mmap hook of an underlying file.
 *
 * Until all filesystems are converted to use .mmap_prepare(), we must be
 * conservative and continue to invoke these stacked filesystems using the
 * deprecated .mmap() hook.
 *
 * However we have a problem if the underlying file system possesses an
 * .mmap_prepare() hook, as we are in a different context when we invoke the
 * .mmap() hook, already having a VMA to deal with.
 *
 * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
 * establishes a struct vm_area_desc descriptor, passes to the underlying
 * .mmap_prepare() hook and applies any changes performed by it.
 *
 * Once the conversion of filesystems is complete this function will no longer
 * be required and will be removed.
 *
 * Returns: 0 on success or error.
 */
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
{
	return __compat_vma_mmap_prepare(file->f_op, file, vma);
}
EXPORT_SYMBOL(compat_vma_mmap_prepare);

static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
		const struct page *page)
{
	/*
	 * Only the first page of a high-order buddy page has PageBuddy() set.
	 * So we have to check manually whether this page is part of a high-
	 * order buddy page.
	 */
	if (PageBuddy(page))
		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
	else if (page_count(page) == 0 && is_free_buddy_page(page))
		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;

	if (folio_test_idle(folio))
		ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
}

/**
 * snapshot_page() - Create a snapshot of a struct page
 * @ps: Pointer to a struct page_snapshot to store the page snapshot
 * @page: The page to snapshot
 *
 * Create a snapshot of the page and store both its struct page and struct
 * folio representations in @ps.
 *
 * A snapshot is marked as "faithful" if the compound state of @page was
 * stable and allowed safe reconstruction of the folio representation. In
 * rare cases where this is not possible (e.g. due to folio splitting),
 * snapshot_page() falls back to treating @page as a single page and the
 * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
 * helper can be used to check for this condition.
 */
void snapshot_page(struct page_snapshot *ps, const struct page *page)
{
	unsigned long head, nr_pages = 1;
	struct folio *foliop;
	int loops = 5;

	ps->pfn = page_to_pfn(page);
	ps->flags = PAGE_SNAPSHOT_FAITHFUL;

again:
	memset(&ps->folio_snapshot, 0, sizeof(struct folio));
	memcpy(&ps->page_snapshot, page, sizeof(*page));
	head = ps->page_snapshot.compound_head;
	if ((head & 1) == 0) {
		ps->idx = 0;
		foliop = (struct folio *)&ps->page_snapshot;
		if (!folio_test_large(foliop)) {
			set_ps_flags(ps, page_folio(page), page);
			memcpy(&ps->folio_snapshot, foliop,
			       sizeof(struct page));
			return;
		}
		foliop = (struct folio *)page;
	} else {
		foliop = (struct folio *)(head - 1);
		ps->idx = folio_page_idx(foliop, page);
	}

	if (ps->idx < MAX_FOLIO_NR_PAGES) {
		memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
		nr_pages = folio_nr_pages(&ps->folio_snapshot);
		if (nr_pages > 1)
			memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
			       sizeof(struct page));
		set_ps_flags(ps, foliop, page);
	}

	if (ps->idx > nr_pages) {
		if (loops-- > 0)
			goto again;
		clear_compound_head(&ps->page_snapshot);
		foliop = (struct folio *)&ps->page_snapshot;
		memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
		ps->flags = 0;
		ps->idx = 0;
	}
}
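
/*
 * Illustrative sketch (not part of the original file): diagnostic code in the
 * style of dump_page() can take a snapshot and then inspect the copies
 * without holding a reference on the (possibly concurrently changing) page:
 *
 *	struct page_snapshot ps;
 *
 *	snapshot_page(&ps, page);
 *	if (snapshot_page_is_faithful(&ps))
 *		pr_info("pfn %lx belongs to a folio of %ld pages\n",
 *			ps.pfn, folio_nr_pages(&ps.folio_snapshot));
 */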

#ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * This is a simplified variant of folio_pte_batch_flags().
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/**
 * page_range_contiguous - test whether the page range is contiguous
 * @page: the start of the page range.
 * @nr_pages: the number of pages in the range.
 *
 * Test whether the page range is contiguous, such that they can be iterated
 * naively, corresponding to iterating a contiguous PFN range.
 *
 * This function should primarily only be used for debug checks, or when
 * working with page ranges that are not naturally contiguous (e.g., pages
 * within a folio are).
 *
 * Returns true if contiguous, otherwise false.
 */
bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
{
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;

	/*
	 * The memmap is allocated per memory section, so no need to check
	 * within the first section. However, we need to check each other
	 * spanned memory section once, making sure the first page in a
	 * section could similarly be reached by just iterating pages.
	 */
	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
			return false;
	return true;
}
EXPORT_SYMBOL(page_range_contiguous);
#endif