// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
 * @s: The data to copy
 * @len: The size of the data, not including the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	char *buf;

	/* '+1' for the NUL terminator */
	buf = kmalloc_track_caller(len + 1, gfp);
	if (!buf)
		return NULL;

	memcpy(buf, s, len);
	/* Ensure the buf is always NUL-terminated, regardless of @s. */
	buf[len] = '\0';
	return buf;
}

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;

	return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);
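
/*
 * Illustrative sketch (not part of this file): the intended pairing is
 * kstrdup_const()/kfree_const(), so that strings living in .rodata are
 * never copied or freed.  The struct and function names below are made
 * up for the example.
 *
 *	struct widget {
 *		const char *name;
 *	};
 *
 *	static int widget_set_name(struct widget *w, const char *name)
 *	{
 *		const char *tmp = kstrdup_const(name, GFP_KERNEL);
 *
 *		if (!tmp)
 *			return -ENOMEM;
 *		kfree_const(w->name);
 *		w->name = tmp;
 *		return 0;
 *	}
 */
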
/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
	return kmemdup(src, size_mul(element_size, count), gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may not be physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

	p = kvmalloc(len, gfp);
	if (p)
		memcpy(p, src, len);
	return p;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
	return s ? __kmemdup_nul(s, len, gfp) : NULL;
}
EXPORT_SYMBOL(kmemdup_nul);

static kmem_buckets *user_buckets __ro_after_init;

static int __init init_user_buckets(void)
{
	user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

	return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(memdup_user);
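
/*
 * Illustrative sketch (hypothetical ioctl helper, not from this file):
 * unlike the k*dup() helpers above, memdup_user() never returns NULL, so
 * callers must check with IS_ERR()/PTR_ERR() rather than for a NULL
 * pointer.
 *
 *	static int frob_ioctl(void __user *uarg, size_t len)
 *	{
 *		struct frob_args *args = memdup_user(uarg, len);
 *
 *		if (IS_ERR(args))
 *			return PTR_ERR(args);
 *		// ... use args ...
 *		kfree(args);
 *		return 0;
 *	}
 */
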
/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result may not be
 * physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
	void *p;

	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
	}

	return p;
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
	char *p;
	long length;

	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
		return p;

	p[length - 1] = '\0';

	return p;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}
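
/*
 * Worked example (not from the original source): with 4 KiB pages
 * (PAGE_SHIFT == 12) the default STACK_RND_MASK above is 0x7ff, so the
 * random offset applied by randomize_stack_top() spans at most
 * 0x7ff << 12 bytes, i.e. just under 8 MiB of virtual address space,
 * matching the "8MB of VA" note on the #define.
 */
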
/**
 * randomize_page - Generate a random, page aligned address
 * @start: The smallest acceptable address the caller will take.
 * @range: The size of the area, starting at @start, within which the
 *	random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	if (!PAGE_ALIGNED(start)) {
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}
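
/*
 * Worked example (not from the original source, and configuration
 * dependent): assuming mmap_rnd_bits == 28 and 4 KiB pages,
 * arch_mmap_rnd() returns an offset of up to ((1UL << 28) - 1) << 12
 * bytes, i.e. roughly 1 TiB over which the mmap base can be placed.
 * A 32-bit compat task uses the (smaller) mmap_rnd_compat_bits value
 * instead.
 */
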
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
	    !IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task.  mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		clear_bit(MMF_TOPDOWN, &mm->flags);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		set_bit(MMF_TOPDOWN, &mm->flags);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 * @task: task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0 on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
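
/*
 * Illustrative sketch (hypothetical driver code, not from this file): the
 * caller is expected to hold the mmap lock as writer, as the kernel-doc
 * above states.
 *
 *	mmap_write_lock(mm);
 *	ret = __account_locked_vm(mm, npages, true, current,
 *				  capable(CAP_IPC_LOCK));
 *	mmap_write_unlock(mm);
 *	if (ret)
 *		return ret;	// RLIMIT_MEMLOCK would be exceeded
 */
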
/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against, may be NULL
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0 on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
			    unsigned long len, unsigned long prot,
			    unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret)
		ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

/*
 * Perform a userland memory mapping into the current process address space.
 * See the comment for do_mmap() for more details on this operation in general.
 *
 * This differs from do_mmap() in that:
 *
 * a. An offset parameter is provided rather than pgoff; it is checked both
 *    for overflow and for page alignment.
 * b. mmap locking is performed on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are handled.
 *
 * This means that this function performs essentially the same work as if
 * userland were invoking mmap(2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
		      unsigned long len, unsigned long prot,
		      unsigned long flag, unsigned long offset)
{
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	if (unlikely(check_mul_overflow(n, size, &bytes)))
		return NULL;
	return __vmalloc_noprof(bytes, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
	return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);
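
/*
 * Illustrative sketch (hypothetical caller, not from this file): the array
 * helpers exist so that callers pass the element count and size separately
 * and get the multiplication overflow check for free.
 *
 *	struct item *items;
 *
 *	items = vmalloc_array(nr_items, sizeof(*items));
 *	if (!items)
 *		return -ENOMEM;
 *	// ... use items[0..nr_items-1] ...
 *	vfree(items);
 */
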
/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
	return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);

struct anon_vma *folio_anon_vma(const struct folio *folio)
{
	unsigned long mapping = (unsigned long)folio->mapping;

	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	return (void *)(mapping - PAGE_MAPPING_ANON);
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.
 */
struct address_space *folio_mapping(struct folio *folio)
{
	struct address_space *mapping;

	/* This happens if someone calls flush_dcache_page on slab page */
	if (unlikely(folio_test_slab(folio)))
		return NULL;

	if (unlikely(folio_test_swapcache(folio)))
		return swap_address_space(folio->swap);

	mapping = folio->mapping;
	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
		return NULL;

	return mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);

int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);
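
/*
 * Illustrative sketch (hypothetical caller, not from this file): unlike
 * folio_copy(), folio_mc_copy() can fail, so callers that may read from
 * hardware-poisoned memory check the return value instead of consuming
 * the poison.
 *
 *	rc = folio_mc_copy(dst, src);
 *	if (rc == -EHWPOISON)
 *		goto abort;	// leave @dst unused and report the error
 */
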
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

static int overcommit_ratio_handler(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

static int overcommit_policy_handler(const struct ctl_table *table, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid a possible race condition
	 * (even though users usually won't switch to OVERCOMMIT_NEVER very
	 * frequently), the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

static const struct ctl_table util_sysctl_table[] = {
	{
		.procname	= "overcommit_memory",
		.data		= &sysctl_overcommit_memory,
		.maxlen		= sizeof(sysctl_overcommit_memory),
		.mode		= 0644,
		.proc_handler	= overcommit_policy_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "overcommit_ratio",
		.data		= &sysctl_overcommit_ratio,
		.maxlen		= sizeof(sysctl_overcommit_ratio),
		.mode		= 0644,
		.proc_handler	= overcommit_ratio_handler,
	},
	{
		.procname	= "overcommit_kbytes",
		.data		= &sysctl_overcommit_kbytes,
		.maxlen		= sizeof(sysctl_overcommit_kbytes),
		.mode		= 0644,
		.proc_handler	= overcommit_kbytes_handler,
	},
	{
		.procname	= "user_reserve_kbytes",
		.data		= &sysctl_user_reserve_kbytes,
		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "admin_reserve_kbytes",
		.data		= &sysctl_admin_reserve_kbytes,
		.maxlen		= sizeof(sysctl_admin_reserve_kbytes),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

static int __init init_vm_util_sysctls(void)
{
	register_sysctl_init("vm", util_sysctl_table);
	return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}
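
/*
 * Worked example (not from the original source): with the default
 * vm.overcommit_ratio of 50, no hugetlb pages, 8 GiB of RAM and 2 GiB of
 * swap, the ratio-based branch above yields a commit limit of
 * 8 GiB * 50 / 100 + 2 GiB = 6 GiB (expressed in pages).  Setting
 * vm.overcommit_kbytes switches to the absolute limit instead and zeroes
 * the ratio, as the sysctl handlers above enforce.
 */
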
/*
 * Make sure vm_committed_as is in one cacheline and not shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer. Larger cmdline values are truncated
 *	to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			res = len;
		} else {
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_local_page(page1);
	addr2 = kmap_local_page(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_local(addr2);
	kunmap_local(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, ensuring that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
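
/*
 * Illustrative sketch (hypothetical PFN walker, not from this file): the
 * freeze/thaw pair above lets a reader check PageOffline() without racing
 * against a driver marking the page offline concurrently.
 *
 *	page_offline_freeze();
 *	if (!PageOffline(page))
 *		; // safe to read the page content here
 *	page_offline_thaw();
 */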