// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>
#include <linux/page_idle.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
 * @s: The data to copy
 * @len: The size of the data, not including the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	/* '+1' for the NUL terminator */
	buf = kmalloc_track_caller(len + 1, gfp);
	if (!buf)
		return NULL;

	memcpy(buf, s, len);
	/* Ensure the buf is always NUL-terminated, regardless of @s. */
	buf[len] = '\0';
	return buf;
}

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
}
EXPORT_SYMBOL(kstrndup);
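
/*
 * Illustrative sketch (not kernel API documentation): duplicating a
 * caller-supplied name with kstrdup() and releasing it with kfree().  The
 * struct and function names below are hypothetical.
 *
 *	struct foo_dev {
 *		const char *label;
 *	};
 *
 *	static int foo_set_label(struct foo_dev *foo, const char *name)
 *	{
 *		char *label = kstrdup(name, GFP_KERNEL);
 *
 *		if (!label)
 *			return -ENOMEM;
 *		kfree(foo->label);
 *		foo->label = label;
 *		return 0;
 *	}
 *
 * kstrndup() follows the same pattern when only the first @max characters of
 * a possibly unterminated source should be copied.
 */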

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
	return kmemdup(src, size_mul(element_size, count), gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may not be physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, len, gfp) : NULL;
}
EXPORT_SYMBOL(kmemdup_nul);

static kmem_buckets *user_buckets __ro_after_init;

static int __init init_user_buckets(void)
{
	user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

	return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure. Result may not be
 * physically contiguous. Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);
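
/*
 * Illustrative sketch: the usual pattern for copying a fixed-size argument
 * from user space with memdup_user().  The ioctl context and the struct name
 * are hypothetical; vmemdup_user()/kvfree() would be used instead for
 * potentially large buffers.
 *
 *	struct foo_args *args;
 *
 *	args = memdup_user(uarg, sizeof(*args));
 *	if (IS_ERR(args))
 *		return PTR_ERR(args);
 *	...
 *	kfree(args);
 */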

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start: The smallest acceptable address the caller will take.
 * @range: The size of the area, starting at @start, within which the
 *         random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (!PAGE_ALIGNED(start)) {
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/*
	 * On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout.
	 */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
	    !IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task.  mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		clear_bit(MMF_TOPDOWN, &mm->flags);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		set_bit(MMF_TOPDOWN, &mm->flags);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 * @task: task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against, may be NULL
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
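
/*
 * Illustrative sketch: a driver performing a long-term pin typically charges
 * the pages against RLIMIT_MEMLOCK up front and reverses the charge on
 * teardown.  "npages" and the error label are hypothetical.
 *
 *	ret = account_locked_vm(current->mm, npages, true);
 *	if (ret)
 *		goto err;
 *	...
 *	account_locked_vm(current->mm, npages, false);
 */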

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret)
		ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

/*
 * Perform a userland memory mapping into the current process address space.
 * See the comment for do_mmap() for more details on this operation in general.
 *
 * This differs from do_mmap() in that:
 *
 * a. An offset parameter is provided rather than pgoff, which is both checked
 *    for overflow and page alignment.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * This means that this function performs essentially the same work as if
 * userland were invoking mmap(2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc_noprof(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);
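
/*
 * Illustrative sketch: allocating a zeroed, virtually contiguous table with
 * vcalloc() and releasing it with vfree().  The element type and count are
 * hypothetical.
 *
 *	struct foo_entry *table;
 *
 *	table = vcalloc(nr_entries, sizeof(*table));
 *	if (!table)
 *		return -ENOMEM;
 *	...
 *	vfree(table);
 */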

struct anon_vma *folio_anon_vma(const struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
		return NULL;
	return (void *)(mapping - FOLIO_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);

int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);

int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

static int overcommit_ratio_handler(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

static int overcommit_policy_handler(const struct ctl_table *table, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policies
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to the
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid a possible race condition
	 * (even though users usually won't switch to OVERCOMMIT_NEVER very
	 * frequently), the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

static const struct ctl_table util_sysctl_table[] = {
	{
		.procname	= "overcommit_memory",
		.data		= &sysctl_overcommit_memory,
		.maxlen		= sizeof(sysctl_overcommit_memory),
		.mode		= 0644,
		.proc_handler	= overcommit_policy_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	{
		.procname	= "user_reserve_kbytes",
		.data		= &sysctl_user_reserve_kbytes,
		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "admin_reserve_kbytes",
		.data		= &sysctl_admin_reserve_kbytes,
		.maxlen		= sizeof(sysctl_admin_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);
	return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}
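
/*
 * Illustrative numbers for vm_commit_limit(): on a machine with 16 GiB of
 * RAM, no hugetlb pages, 4 GiB of swap and the default overcommit_ratio of
 * 50, OVERCOMMIT_NEVER allows roughly 16 GiB * 50% + 4 GiB = 12 GiB of
 * committed address space; a non-zero overcommit_kbytes replaces the
 * ratio-based term entirely.
 */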

/*
 * Make sure vm_committed_as is in one cacheline and does not share a
 * cacheline with other variables. It can be updated by several CPUs
 * frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer. Larger cmdline values are truncated
 *          to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;

	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_local_page(page1);
	addr2 = kmap_local_page(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_local(addr2);
	kunmap_local(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
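
/*
 * Illustrative sketch: a PFN walker excluding logically offline pages while
 * preventing concurrent PageOffline() transitions.  The surrounding loop and
 * read_page_content() are hypothetical.
 *
 *	page_offline_freeze();
 *	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 *		struct page *page = pfn_to_online_page(pfn);
 *
 *		if (page && !PageOffline(page))
 *			read_page_content(page);
 *	}
 *	page_offline_thaw();
 */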

#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif

/**
 * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
 * existing VMA.
 * @file: The file which possesses an f_op->mmap_prepare() hook.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 *
 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
 * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
 *
 * Until all filesystems are converted to use .mmap_prepare(), we must be
 * conservative and continue to invoke these 'wrapper' filesystems using the
 * deprecated .mmap() hook.
 *
 * However we have a problem if the underlying file system possesses an
 * .mmap_prepare() hook, as we are in a different context when we invoke the
 * .mmap() hook, already having a VMA to deal with.
 *
 * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
 * establishes a struct vm_area_desc descriptor, passes it to the underlying
 * .mmap_prepare() hook and applies any changes performed by it.
 *
 * Once the conversion of filesystems is complete this function will no longer
 * be required and will be removed.
 *
 * Returns: 0 on success or error.
 */
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
{
	struct vm_area_desc desc;
	int err;

	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
	if (err)
		return err;
	set_vma_from_desc(vma, &desc);

	return 0;
}
EXPORT_SYMBOL(compat_vma_mmap_prepare);

static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
			 const struct page *page)
{
	/*
	 * Only the first page of a high-order buddy page has PageBuddy() set.
	 * So we have to check manually whether this page is part of a high-
	 * order buddy page.
	 */
	if (PageBuddy(page))
		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
	else if (page_count(page) == 0 && is_free_buddy_page(page))
		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;

	if (folio_test_idle(folio))
		ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
}

/**
 * snapshot_page() - Create a snapshot of a struct page
 * @ps: Pointer to a struct page_snapshot to store the page snapshot
 * @page: The page to snapshot
 *
 * Create a snapshot of the page and store both its struct page and struct
 * folio representations in @ps.
 *
 * A snapshot is marked as "faithful" if the compound state of @page was
 * stable and allowed safe reconstruction of the folio representation. In
 * rare cases where this is not possible (e.g. due to folio splitting),
 * snapshot_page() falls back to treating @page as a single page and the
 * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
 * helper can be used to check for this condition.
 */
void snapshot_page(struct page_snapshot *ps, const struct page *page)
{
	unsigned long head, nr_pages = 1;
	struct folio *foliop;
	int loops = 5;

	ps->pfn = page_to_pfn(page);
	ps->flags = PAGE_SNAPSHOT_FAITHFUL;

again:
	memset(&ps->folio_snapshot, 0, sizeof(struct folio));
	memcpy(&ps->page_snapshot, page, sizeof(*page));
	head = ps->page_snapshot.compound_head;
	if ((head & 1) == 0) {
		ps->idx = 0;
		foliop = (struct folio *)&ps->page_snapshot;
		if (!folio_test_large(foliop)) {
			set_ps_flags(ps, page_folio(page), page);
			memcpy(&ps->folio_snapshot, foliop,
			       sizeof(struct page));
			return;
		}
		foliop = (struct folio *)page;
	} else {
		foliop = (struct folio *)(head - 1);
		ps->idx = folio_page_idx(foliop, page);
	}

	if (ps->idx < MAX_FOLIO_NR_PAGES) {
		memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
		nr_pages = folio_nr_pages(&ps->folio_snapshot);
		if (nr_pages > 1)
			memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
			       sizeof(struct page));
		set_ps_flags(ps, foliop, page);
	}

	if (ps->idx > nr_pages) {
		if (loops-- > 0)
			goto again;
		clear_compound_head(&ps->page_snapshot);
		foliop = (struct folio *)&ps->page_snapshot;
		memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
		ps->flags = 0;
		ps->idx = 0;
	}
}

#ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * This is a simplified variant of folio_pte_batch_flags().
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */
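
/*
 * Illustrative sketch: consuming a PTE batch while scanning a page table
 * range that maps a large folio.  Locking, the enclosing loop and
 * handle_ptes() are hypothetical; max_nr must already be clamped to the
 * current VMA and page table.
 *
 *	nr = folio_pte_batch(folio, ptep, pte, max_nr);
 *	handle_ptes(ptep, nr);
 *	ptep += nr;
 *	addr += nr * PAGE_SIZE;
 */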