1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 216d69265SAndrew Morton #include <linux/mm.h> 330992c97SMatt Mackall #include <linux/slab.h> 430992c97SMatt Mackall #include <linux/string.h> 53b32123dSGideon Israel Dsouza #include <linux/compiler.h> 6b95f1b31SPaul Gortmaker #include <linux/export.h> 796840aa0SDavi Arnaut #include <linux/err.h> 83b8f14b4SAdrian Bunk #include <linux/sched.h> 96e84f315SIngo Molnar #include <linux/sched/mm.h> 1079eb597cSDaniel Jordan #include <linux/sched/signal.h> 1168db0cf1SIngo Molnar #include <linux/sched/task_stack.h> 12eb36c587SAl Viro #include <linux/security.h> 139800339bSShaohua Li #include <linux/swap.h> 1433806f06SShaohua Li #include <linux/swapops.h> 1500619bccSJerome Marchand #include <linux/mman.h> 1600619bccSJerome Marchand #include <linux/hugetlb.h> 1739f1f78dSAl Viro #include <linux/vmalloc.h> 18897ab3e0SMike Rapoport #include <linux/userfaultfd_k.h> 19649775beSAlexandre Ghiti #include <linux/elf.h> 2067f3977fSAlexandre Ghiti #include <linux/elf-randomize.h> 2167f3977fSAlexandre Ghiti #include <linux/personality.h> 22649775beSAlexandre Ghiti #include <linux/random.h> 2367f3977fSAlexandre Ghiti #include <linux/processor.h> 2467f3977fSAlexandre Ghiti #include <linux/sizes.h> 2567f3977fSAlexandre Ghiti #include <linux/compat.h> 2600619bccSJerome Marchand 277c0f6ba6SLinus Torvalds #include <linux/uaccess.h> 2830992c97SMatt Mackall 296038def0SNamhyung Kim #include "internal.h" 306038def0SNamhyung Kim 31a4bb1e43SAndrzej Hajda /** 32a4bb1e43SAndrzej Hajda * kfree_const - conditionally free memory 33a4bb1e43SAndrzej Hajda * @x: pointer to the memory 34a4bb1e43SAndrzej Hajda * 35a4bb1e43SAndrzej Hajda * Function calls kfree only if @x is not in .rodata section. 
36a4bb1e43SAndrzej Hajda */ 37a4bb1e43SAndrzej Hajda void kfree_const(const void *x) 38a4bb1e43SAndrzej Hajda { 39a4bb1e43SAndrzej Hajda if (!is_kernel_rodata((unsigned long)x)) 40a4bb1e43SAndrzej Hajda kfree(x); 41a4bb1e43SAndrzej Hajda } 42a4bb1e43SAndrzej Hajda EXPORT_SYMBOL(kfree_const); 43a4bb1e43SAndrzej Hajda 4430992c97SMatt Mackall /** 4530992c97SMatt Mackall * kstrdup - allocate space for and copy an existing string 4630992c97SMatt Mackall * @s: the string to duplicate 4730992c97SMatt Mackall * @gfp: the GFP mask used in the kmalloc() call when allocating memory 48a862f68aSMike Rapoport * 49a862f68aSMike Rapoport * Return: newly allocated copy of @s or %NULL in case of error 5030992c97SMatt Mackall */ 5130992c97SMatt Mackall char *kstrdup(const char *s, gfp_t gfp) 5230992c97SMatt Mackall { 5330992c97SMatt Mackall size_t len; 5430992c97SMatt Mackall char *buf; 5530992c97SMatt Mackall 5630992c97SMatt Mackall if (!s) 5730992c97SMatt Mackall return NULL; 5830992c97SMatt Mackall 5930992c97SMatt Mackall len = strlen(s) + 1; 601d2c8eeaSChristoph Hellwig buf = kmalloc_track_caller(len, gfp); 6130992c97SMatt Mackall if (buf) 6230992c97SMatt Mackall memcpy(buf, s, len); 6330992c97SMatt Mackall return buf; 6430992c97SMatt Mackall } 6530992c97SMatt Mackall EXPORT_SYMBOL(kstrdup); 6696840aa0SDavi Arnaut 671a2f67b4SAlexey Dobriyan /** 68a4bb1e43SAndrzej Hajda * kstrdup_const - conditionally duplicate an existing const string 69a4bb1e43SAndrzej Hajda * @s: the string to duplicate 70a4bb1e43SAndrzej Hajda * @gfp: the GFP mask used in the kmalloc() call when allocating memory 71a4bb1e43SAndrzej Hajda * 72295a1730SBartosz Golaszewski * Note: Strings allocated by kstrdup_const should be freed by kfree_const and 73295a1730SBartosz Golaszewski * must not be passed to krealloc(). 74a862f68aSMike Rapoport * 75a862f68aSMike Rapoport * Return: source string if it is in .rodata section otherwise 76a862f68aSMike Rapoport * fallback to kstrdup. 
77a4bb1e43SAndrzej Hajda */ 78a4bb1e43SAndrzej Hajda const char *kstrdup_const(const char *s, gfp_t gfp) 79a4bb1e43SAndrzej Hajda { 80a4bb1e43SAndrzej Hajda if (is_kernel_rodata((unsigned long)s)) 81a4bb1e43SAndrzej Hajda return s; 82a4bb1e43SAndrzej Hajda 83a4bb1e43SAndrzej Hajda return kstrdup(s, gfp); 84a4bb1e43SAndrzej Hajda } 85a4bb1e43SAndrzej Hajda EXPORT_SYMBOL(kstrdup_const); 86a4bb1e43SAndrzej Hajda 87a4bb1e43SAndrzej Hajda /** 881e66df3eSJeremy Fitzhardinge * kstrndup - allocate space for and copy an existing string 891e66df3eSJeremy Fitzhardinge * @s: the string to duplicate 901e66df3eSJeremy Fitzhardinge * @max: read at most @max chars from @s 911e66df3eSJeremy Fitzhardinge * @gfp: the GFP mask used in the kmalloc() call when allocating memory 92f3515741SDavid Howells * 93f3515741SDavid Howells * Note: Use kmemdup_nul() instead if the size is known exactly. 94a862f68aSMike Rapoport * 95a862f68aSMike Rapoport * Return: newly allocated copy of @s or %NULL in case of error 961e66df3eSJeremy Fitzhardinge */ 971e66df3eSJeremy Fitzhardinge char *kstrndup(const char *s, size_t max, gfp_t gfp) 981e66df3eSJeremy Fitzhardinge { 991e66df3eSJeremy Fitzhardinge size_t len; 1001e66df3eSJeremy Fitzhardinge char *buf; 1011e66df3eSJeremy Fitzhardinge 1021e66df3eSJeremy Fitzhardinge if (!s) 1031e66df3eSJeremy Fitzhardinge return NULL; 1041e66df3eSJeremy Fitzhardinge 1051e66df3eSJeremy Fitzhardinge len = strnlen(s, max); 1061e66df3eSJeremy Fitzhardinge buf = kmalloc_track_caller(len+1, gfp); 1071e66df3eSJeremy Fitzhardinge if (buf) { 1081e66df3eSJeremy Fitzhardinge memcpy(buf, s, len); 1091e66df3eSJeremy Fitzhardinge buf[len] = '\0'; 1101e66df3eSJeremy Fitzhardinge } 1111e66df3eSJeremy Fitzhardinge return buf; 1121e66df3eSJeremy Fitzhardinge } 1131e66df3eSJeremy Fitzhardinge EXPORT_SYMBOL(kstrndup); 1141e66df3eSJeremy Fitzhardinge 1151e66df3eSJeremy Fitzhardinge /** 1161a2f67b4SAlexey Dobriyan * kmemdup - duplicate region of memory 1171a2f67b4SAlexey Dobriyan * 
1181a2f67b4SAlexey Dobriyan * @src: memory region to duplicate 1191a2f67b4SAlexey Dobriyan * @len: memory region length 1201a2f67b4SAlexey Dobriyan * @gfp: GFP mask to use 121a862f68aSMike Rapoport * 122a862f68aSMike Rapoport * Return: newly allocated copy of @src or %NULL in case of error 1231a2f67b4SAlexey Dobriyan */ 1241a2f67b4SAlexey Dobriyan void *kmemdup(const void *src, size_t len, gfp_t gfp) 1251a2f67b4SAlexey Dobriyan { 1261a2f67b4SAlexey Dobriyan void *p; 1271a2f67b4SAlexey Dobriyan 1281d2c8eeaSChristoph Hellwig p = kmalloc_track_caller(len, gfp); 1291a2f67b4SAlexey Dobriyan if (p) 1301a2f67b4SAlexey Dobriyan memcpy(p, src, len); 1311a2f67b4SAlexey Dobriyan return p; 1321a2f67b4SAlexey Dobriyan } 1331a2f67b4SAlexey Dobriyan EXPORT_SYMBOL(kmemdup); 1341a2f67b4SAlexey Dobriyan 135ef2ad80cSChristoph Lameter /** 136f3515741SDavid Howells * kmemdup_nul - Create a NUL-terminated string from unterminated data 137f3515741SDavid Howells * @s: The data to stringify 138f3515741SDavid Howells * @len: The size of the data 139f3515741SDavid Howells * @gfp: the GFP mask used in the kmalloc() call when allocating memory 140a862f68aSMike Rapoport * 141a862f68aSMike Rapoport * Return: newly allocated copy of @s with NUL-termination or %NULL in 142a862f68aSMike Rapoport * case of error 143f3515741SDavid Howells */ 144f3515741SDavid Howells char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) 145f3515741SDavid Howells { 146f3515741SDavid Howells char *buf; 147f3515741SDavid Howells 148f3515741SDavid Howells if (!s) 149f3515741SDavid Howells return NULL; 150f3515741SDavid Howells 151f3515741SDavid Howells buf = kmalloc_track_caller(len + 1, gfp); 152f3515741SDavid Howells if (buf) { 153f3515741SDavid Howells memcpy(buf, s, len); 154f3515741SDavid Howells buf[len] = '\0'; 155f3515741SDavid Howells } 156f3515741SDavid Howells return buf; 157f3515741SDavid Howells } 158f3515741SDavid Howells EXPORT_SYMBOL(kmemdup_nul); 159f3515741SDavid Howells 160f3515741SDavid Howells /** 
161610a77e0SLi Zefan * memdup_user - duplicate memory region from user space 162610a77e0SLi Zefan * 163610a77e0SLi Zefan * @src: source address in user space 164610a77e0SLi Zefan * @len: number of bytes to copy 165610a77e0SLi Zefan * 166a862f68aSMike Rapoport * Return: an ERR_PTR() on failure. Result is physically 16750fd2f29SAl Viro * contiguous, to be freed by kfree(). 168610a77e0SLi Zefan */ 169610a77e0SLi Zefan void *memdup_user(const void __user *src, size_t len) 170610a77e0SLi Zefan { 171610a77e0SLi Zefan void *p; 172610a77e0SLi Zefan 1736c8fcc09SDaniel Vetter p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); 174610a77e0SLi Zefan if (!p) 175610a77e0SLi Zefan return ERR_PTR(-ENOMEM); 176610a77e0SLi Zefan 177610a77e0SLi Zefan if (copy_from_user(p, src, len)) { 178610a77e0SLi Zefan kfree(p); 179610a77e0SLi Zefan return ERR_PTR(-EFAULT); 180610a77e0SLi Zefan } 181610a77e0SLi Zefan 182610a77e0SLi Zefan return p; 183610a77e0SLi Zefan } 184610a77e0SLi Zefan EXPORT_SYMBOL(memdup_user); 185610a77e0SLi Zefan 18650fd2f29SAl Viro /** 18750fd2f29SAl Viro * vmemdup_user - duplicate memory region from user space 18850fd2f29SAl Viro * 18950fd2f29SAl Viro * @src: source address in user space 19050fd2f29SAl Viro * @len: number of bytes to copy 19150fd2f29SAl Viro * 192a862f68aSMike Rapoport * Return: an ERR_PTR() on failure. Result may be not 19350fd2f29SAl Viro * physically contiguous. Use kvfree() to free. 
19450fd2f29SAl Viro */ 19550fd2f29SAl Viro void *vmemdup_user(const void __user *src, size_t len) 19650fd2f29SAl Viro { 19750fd2f29SAl Viro void *p; 19850fd2f29SAl Viro 19950fd2f29SAl Viro p = kvmalloc(len, GFP_USER); 20050fd2f29SAl Viro if (!p) 20150fd2f29SAl Viro return ERR_PTR(-ENOMEM); 20250fd2f29SAl Viro 20350fd2f29SAl Viro if (copy_from_user(p, src, len)) { 20450fd2f29SAl Viro kvfree(p); 20550fd2f29SAl Viro return ERR_PTR(-EFAULT); 20650fd2f29SAl Viro } 20750fd2f29SAl Viro 20850fd2f29SAl Viro return p; 20950fd2f29SAl Viro } 21050fd2f29SAl Viro EXPORT_SYMBOL(vmemdup_user); 21150fd2f29SAl Viro 212b86181f1SMike Rapoport /** 21396840aa0SDavi Arnaut * strndup_user - duplicate an existing string from user space 21496840aa0SDavi Arnaut * @s: The string to duplicate 21596840aa0SDavi Arnaut * @n: Maximum number of bytes to copy, including the trailing NUL. 216a862f68aSMike Rapoport * 217e9145521SAndrew Morton * Return: newly allocated copy of @s or an ERR_PTR() in case of error 21896840aa0SDavi Arnaut */ 21996840aa0SDavi Arnaut char *strndup_user(const char __user *s, long n) 22096840aa0SDavi Arnaut { 22196840aa0SDavi Arnaut char *p; 22296840aa0SDavi Arnaut long length; 22396840aa0SDavi Arnaut 22496840aa0SDavi Arnaut length = strnlen_user(s, n); 22596840aa0SDavi Arnaut 22696840aa0SDavi Arnaut if (!length) 22796840aa0SDavi Arnaut return ERR_PTR(-EFAULT); 22896840aa0SDavi Arnaut 22996840aa0SDavi Arnaut if (length > n) 23096840aa0SDavi Arnaut return ERR_PTR(-EINVAL); 23196840aa0SDavi Arnaut 23290d74045SJulia Lawall p = memdup_user(s, length); 23396840aa0SDavi Arnaut 23490d74045SJulia Lawall if (IS_ERR(p)) 23590d74045SJulia Lawall return p; 23696840aa0SDavi Arnaut 23796840aa0SDavi Arnaut p[length - 1] = '\0'; 23896840aa0SDavi Arnaut 23996840aa0SDavi Arnaut return p; 24096840aa0SDavi Arnaut } 24196840aa0SDavi Arnaut EXPORT_SYMBOL(strndup_user); 24216d69265SAndrew Morton 243e9d408e1SAl Viro /** 244e9d408e1SAl Viro * memdup_user_nul - duplicate memory region from user space 
 * and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
	char *p;

	/*
	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
	 * cause pagefault, which makes it pointless to use GFP_NOFS
	 * or GFP_ATOMIC.
	 */
	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	/* Unconditionally terminate: the source need not contain a NUL. */
	p[len] = '\0';

	return p;
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Insert @vma into @mm's linked VMA list immediately after @prev, or at the
 * list head when @prev is NULL.  Both forward (vm_next) and backward
 * (vm_prev) links are fixed up.  Callers are expected to hold the
 * appropriate mm locks — NOTE(review): not asserted here; confirm at call
 * sites.
 */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
		struct vm_area_struct *prev)
{
	struct vm_area_struct *next;

	vma->vm_prev = prev;
	if (prev) {
		next = prev->vm_next;
		prev->vm_next = vma;
	} else {
		/* No predecessor: @vma becomes the new list head. */
		next = mm->mmap;
		mm->mmap = vma;
	}
	vma->vm_next = next;
	if (next)
		next->vm_prev = vma;
}

/*
 * Remove @vma from @mm's linked VMA list, splicing its neighbours together.
 * @vma's own vm_next/vm_prev pointers are left untouched (stale).
 */
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *prev, *next;

	next = vma->vm_next;
	prev = vma->vm_prev;
	if (prev)
		prev->vm_next = next;
	else
		/* @vma was the head: promote its successor. */
		mm->mmap = next;
	if (next)
		next->vm_prev = prev;
}

/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
	struct task_struct * __maybe_unused t = current;

	/* True when the current stack pointer falls inside [vm_start, vm_end]. */
	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	/* swap() then fput() releases the reference on the old vm_file. */
	swap(vma->vm_file, file);
	fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
#endif

/*
 * Apply up to STACK_RND_MASK pages of randomisation to @stack_top when the
 * task has PF_RANDOMIZE set; always returns a page-aligned address.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
	unsigned long random_variable = 0;

	if (current->flags & PF_RANDOMIZE) {
		random_variable = get_random_long();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
#ifdef CONFIG_STACK_GROWSUP
	/* Growing-up stacks randomise upward, growing-down stacks downward. */
	return PAGE_ALIGN(stack_top) + random_variable;
#else
	return PAGE_ALIGN(stack_top) - random_variable;
#endif
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/* Randomise the brk within SZ_32M (32-bit tasks) or SZ_1G (64-bit tasks). */
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

/* Random page-aligned offset used to shift the mmap base. */
unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

/*
 * Decide whether to use the legacy bottom-up mmap layout: forced by the
 * ADDR_COMPAT_LAYOUT personality, an unlimited stack rlimit, or the
 * legacy_va_layout sysctl.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	if (rlim_stack->rlim_cur == RLIM_INFINITY)
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)

/*
 * Compute the top-down mmap base: STACK_TOP minus the stack gap (stack
 * rlimit plus guard gap plus worst-case stack randomisation), minus the
 * random factor @rnd, clamped to [MIN_GAP, MAX_GAP] and page-aligned.
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
}

/*
 * Choose this mm's mmap layout: legacy bottom-up from TASK_UNMAPPED_BASE,
 * or the default top-down layout below the stack gap.  Randomisation is
 * applied only when the task has PF_RANDOMIZE.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm->get_unmapped_area = arch_get_unmapped_area;
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/* Fallback for MMU architectures without their own layout hook: fixed
 * bottom-up layout, no randomisation. */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 * @task: task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0 on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	/* Caller must hold mmap_lock for write; locked_vm is protected by it. */
	mmap_assert_write_locked(mm);

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		/* Only commit the increase when the limit check passed. */
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		/* Decrement below zero indicates an accounting bug upstream. */
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm: mm to account against, may be NULL
 * @pages: number of pages to account
 * @inc: %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0 on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	/* Convenience wrapper: takes mmap_lock itself, checks against
	 * the current task and honours CAP_IPC_LOCK as a limit bypass. */
	mmap_write_lock(mm);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Core of vm_mmap(): security check, do_mmap() under mmap_lock, then any
 * deferred userfaultfd unmaps and mlock-style population outside the lock.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		/* Both completion steps must run after dropping mmap_lock. */
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

unsigned long vm_mmap(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long offset)
{
	/* Reject byte offsets whose page-aligned end wraps around. */
	if (unlikely(offset + PAGE_ALIGN(len) < offset))
		return -EINVAL;
	/* Offset must be page-aligned to convert to a page offset. */
	if (unlikely(offset_in_page(offset)))
		return -EINVAL;

	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * Uses kmalloc to get the memory but if the allocation fails then falls back
 * to the vmalloc allocator.
 * Use kvfree for freeing the memory.
 *
 * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
 * fall back to vmalloc.
 *
 * Return: pointer to the allocated memory of %NULL in case of failure
 */
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;

	/*
	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
	 * so the given set of flags has to be compatible.
	 */
	if ((flags & GFP_KERNEL) != GFP_KERNEL)
		return kmalloc_node(size, flags, node);

	/*
	 * We want to attempt a large physically contiguous block first because
	 * it is less likely to fragment multiple larger blocks and therefore
	 * contribute to a long term fragmentation less than vmalloc fallback.
	 * However make sure that larger requests are not too disruptive - no
	 * OOM killer and no allocation failure warnings as we have a fallback.
	 */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;

		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;
	}

	ret = kmalloc_node(size, kmalloc_flags, node);

	/*
	 * It doesn't really make sense to fallback to vmalloc for sub page
	 * requests
	 */
	if (ret || size <= PAGE_SIZE)
		return ret;

	/* align=1: no special alignment needed for the vmalloc fallback. */
	return __vmalloc_node(size, 1, flags, node,
			__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
	/* Dispatch on the address range: vmalloc area vs. slab/kmalloc. */
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * Use the special memzero_explicit() function to clear the content of a
 * kvmalloc'ed object containing sensitive data to make sure that the
 * compiler won't optimize out the data clearing.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
	/* ZERO_OR_NULL_PTR: nothing to scrub or free for NULL/zero-size
	 * sentinel pointers. */
	if (likely(!ZERO_OR_NULL_PTR(addr))) {
		memzero_explicit((void *)addr, len);
		kvfree(addr);
	}
}
EXPORT_SYMBOL(kvfree_sensitive);

/*
 * Strip the PAGE_MAPPING_FLAGS type bits from page->mapping, leaving the
 * bare pointer value (address_space, anon_vma, or other).
 */
static inline void *__page_rmapping(struct page *page)
{
	unsigned long mapping;

	mapping = (unsigned long)page->mapping;
	mapping &= ~PAGE_MAPPING_FLAGS;

	return (void *)mapping;
}

/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
	/* Tail pages share the head page's mapping. */
	page = compound_head(page);
	return __page_rmapping(page);
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any subpage of compound page is mapped.
6581aa8aea5SAndrew Morton */ 6591aa8aea5SAndrew Morton bool page_mapped(struct page *page) 6601aa8aea5SAndrew Morton { 6611aa8aea5SAndrew Morton int i; 6621aa8aea5SAndrew Morton 6631aa8aea5SAndrew Morton if (likely(!PageCompound(page))) 6641aa8aea5SAndrew Morton return atomic_read(&page->_mapcount) >= 0; 6651aa8aea5SAndrew Morton page = compound_head(page); 6661aa8aea5SAndrew Morton if (atomic_read(compound_mapcount_ptr(page)) >= 0) 6671aa8aea5SAndrew Morton return true; 6681aa8aea5SAndrew Morton if (PageHuge(page)) 6691aa8aea5SAndrew Morton return false; 670d8c6546bSMatthew Wilcox (Oracle) for (i = 0; i < compound_nr(page); i++) { 6711aa8aea5SAndrew Morton if (atomic_read(&page[i]._mapcount) >= 0) 6721aa8aea5SAndrew Morton return true; 6731aa8aea5SAndrew Morton } 6741aa8aea5SAndrew Morton return false; 6751aa8aea5SAndrew Morton } 6761aa8aea5SAndrew Morton EXPORT_SYMBOL(page_mapped); 6771aa8aea5SAndrew Morton 678e39155eaSKirill A. Shutemov struct anon_vma *page_anon_vma(struct page *page) 679e39155eaSKirill A. Shutemov { 680e39155eaSKirill A. Shutemov unsigned long mapping; 681e39155eaSKirill A. Shutemov 682e39155eaSKirill A. Shutemov page = compound_head(page); 683e39155eaSKirill A. Shutemov mapping = (unsigned long)page->mapping; 684e39155eaSKirill A. Shutemov if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 685e39155eaSKirill A. Shutemov return NULL; 686e39155eaSKirill A. Shutemov return __page_rmapping(page); 687e39155eaSKirill A. Shutemov } 688e39155eaSKirill A. Shutemov 6899800339bSShaohua Li struct address_space *page_mapping(struct page *page) 6909800339bSShaohua Li { 6911c290f64SKirill A. Shutemov struct address_space *mapping; 6921c290f64SKirill A. Shutemov 6931c290f64SKirill A. 
Shutemov page = compound_head(page); 6949800339bSShaohua Li 69503e5ac2fSMikulas Patocka /* This happens if someone calls flush_dcache_page on slab page */ 69603e5ac2fSMikulas Patocka if (unlikely(PageSlab(page))) 69703e5ac2fSMikulas Patocka return NULL; 69803e5ac2fSMikulas Patocka 69933806f06SShaohua Li if (unlikely(PageSwapCache(page))) { 70033806f06SShaohua Li swp_entry_t entry; 70133806f06SShaohua Li 70233806f06SShaohua Li entry.val = page_private(page); 703e39155eaSKirill A. Shutemov return swap_address_space(entry); 704e39155eaSKirill A. Shutemov } 705e39155eaSKirill A. Shutemov 7061c290f64SKirill A. Shutemov mapping = page->mapping; 707bda807d4SMinchan Kim if ((unsigned long)mapping & PAGE_MAPPING_ANON) 708e39155eaSKirill A. Shutemov return NULL; 709bda807d4SMinchan Kim 710bda807d4SMinchan Kim return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); 7119800339bSShaohua Li } 712bda807d4SMinchan Kim EXPORT_SYMBOL(page_mapping); 7139800339bSShaohua Li 714cb9f753aSHuang Ying /* 715cb9f753aSHuang Ying * For file cache pages, return the address_space, otherwise return NULL 716cb9f753aSHuang Ying */ 717cb9f753aSHuang Ying struct address_space *page_mapping_file(struct page *page) 718cb9f753aSHuang Ying { 719cb9f753aSHuang Ying if (unlikely(PageSwapCache(page))) 720cb9f753aSHuang Ying return NULL; 721cb9f753aSHuang Ying return page_mapping(page); 722cb9f753aSHuang Ying } 723cb9f753aSHuang Ying 724b20ce5e0SKirill A. Shutemov /* Slow path of page_mapcount() for compound pages */ 725b20ce5e0SKirill A. Shutemov int __page_mapcount(struct page *page) 726b20ce5e0SKirill A. Shutemov { 727b20ce5e0SKirill A. Shutemov int ret; 728b20ce5e0SKirill A. Shutemov 729b20ce5e0SKirill A. Shutemov ret = atomic_read(&page->_mapcount) + 1; 730dd78feddSKirill A. Shutemov /* 731dd78feddSKirill A. Shutemov * For file THP page->_mapcount contains total number of mapping 732dd78feddSKirill A. Shutemov * of the page: no need to look into compound_mapcount. 733dd78feddSKirill A. 
Shutemov */ 734dd78feddSKirill A. Shutemov if (!PageAnon(page) && !PageHuge(page)) 735dd78feddSKirill A. Shutemov return ret; 736b20ce5e0SKirill A. Shutemov page = compound_head(page); 737b20ce5e0SKirill A. Shutemov ret += atomic_read(compound_mapcount_ptr(page)) + 1; 738b20ce5e0SKirill A. Shutemov if (PageDoubleMap(page)) 739b20ce5e0SKirill A. Shutemov ret--; 740b20ce5e0SKirill A. Shutemov return ret; 741b20ce5e0SKirill A. Shutemov } 742b20ce5e0SKirill A. Shutemov EXPORT_SYMBOL_GPL(__page_mapcount); 743b20ce5e0SKirill A. Shutemov 74439a1aa8eSAndrey Ryabinin int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; 74539a1aa8eSAndrey Ryabinin int sysctl_overcommit_ratio __read_mostly = 50; 74639a1aa8eSAndrey Ryabinin unsigned long sysctl_overcommit_kbytes __read_mostly; 74739a1aa8eSAndrey Ryabinin int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 74839a1aa8eSAndrey Ryabinin unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 74939a1aa8eSAndrey Ryabinin unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 75039a1aa8eSAndrey Ryabinin 75132927393SChristoph Hellwig int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, 75232927393SChristoph Hellwig size_t *lenp, loff_t *ppos) 75349f0ce5fSJerome Marchand { 75449f0ce5fSJerome Marchand int ret; 75549f0ce5fSJerome Marchand 75649f0ce5fSJerome Marchand ret = proc_dointvec(table, write, buffer, lenp, ppos); 75749f0ce5fSJerome Marchand if (ret == 0 && write) 75849f0ce5fSJerome Marchand sysctl_overcommit_kbytes = 0; 75949f0ce5fSJerome Marchand return ret; 76049f0ce5fSJerome Marchand } 76149f0ce5fSJerome Marchand 76256f3547bSFeng Tang static void sync_overcommit_as(struct work_struct *dummy) 76356f3547bSFeng Tang { 76456f3547bSFeng Tang percpu_counter_sync(&vm_committed_as); 76556f3547bSFeng Tang } 76656f3547bSFeng Tang 76756f3547bSFeng Tang int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, 
76856f3547bSFeng Tang size_t *lenp, loff_t *ppos) 76956f3547bSFeng Tang { 77056f3547bSFeng Tang struct ctl_table t; 77156f3547bSFeng Tang int new_policy; 77256f3547bSFeng Tang int ret; 77356f3547bSFeng Tang 77456f3547bSFeng Tang /* 77556f3547bSFeng Tang * The deviation of sync_overcommit_as could be big with loose policy 77656f3547bSFeng Tang * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to 77756f3547bSFeng Tang * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply 77856f3547bSFeng Tang * with the strict "NEVER", and to avoid possible race condtion (even 77956f3547bSFeng Tang * though user usually won't too frequently do the switching to policy 78056f3547bSFeng Tang * OVERCOMMIT_NEVER), the switch is done in the following order: 78156f3547bSFeng Tang * 1. changing the batch 78256f3547bSFeng Tang * 2. sync percpu count on each CPU 78356f3547bSFeng Tang * 3. switch the policy 78456f3547bSFeng Tang */ 78556f3547bSFeng Tang if (write) { 78656f3547bSFeng Tang t = *table; 78756f3547bSFeng Tang t.data = &new_policy; 78856f3547bSFeng Tang ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 78956f3547bSFeng Tang if (ret) 79056f3547bSFeng Tang return ret; 79156f3547bSFeng Tang 79256f3547bSFeng Tang mm_compute_batch(new_policy); 79356f3547bSFeng Tang if (new_policy == OVERCOMMIT_NEVER) 79456f3547bSFeng Tang schedule_on_each_cpu(sync_overcommit_as); 79556f3547bSFeng Tang sysctl_overcommit_memory = new_policy; 79656f3547bSFeng Tang } else { 79756f3547bSFeng Tang ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 79856f3547bSFeng Tang } 79956f3547bSFeng Tang 80056f3547bSFeng Tang return ret; 80156f3547bSFeng Tang } 80256f3547bSFeng Tang 80332927393SChristoph Hellwig int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, 80432927393SChristoph Hellwig size_t *lenp, loff_t *ppos) 80549f0ce5fSJerome Marchand { 80649f0ce5fSJerome Marchand int ret; 80749f0ce5fSJerome Marchand 80849f0ce5fSJerome Marchand ret = 
proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 80949f0ce5fSJerome Marchand if (ret == 0 && write) 81049f0ce5fSJerome Marchand sysctl_overcommit_ratio = 0; 81149f0ce5fSJerome Marchand return ret; 81249f0ce5fSJerome Marchand } 81349f0ce5fSJerome Marchand 81400619bccSJerome Marchand /* 81500619bccSJerome Marchand * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used 81600619bccSJerome Marchand */ 81700619bccSJerome Marchand unsigned long vm_commit_limit(void) 81800619bccSJerome Marchand { 81949f0ce5fSJerome Marchand unsigned long allowed; 82049f0ce5fSJerome Marchand 82149f0ce5fSJerome Marchand if (sysctl_overcommit_kbytes) 82249f0ce5fSJerome Marchand allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); 82349f0ce5fSJerome Marchand else 824ca79b0c2SArun KS allowed = ((totalram_pages() - hugetlb_total_pages()) 82549f0ce5fSJerome Marchand * sysctl_overcommit_ratio / 100); 82649f0ce5fSJerome Marchand allowed += total_swap_pages; 82749f0ce5fSJerome Marchand 82849f0ce5fSJerome Marchand return allowed; 82900619bccSJerome Marchand } 83000619bccSJerome Marchand 83139a1aa8eSAndrey Ryabinin /* 83239a1aa8eSAndrey Ryabinin * Make sure vm_committed_as in one cacheline and not cacheline shared with 83339a1aa8eSAndrey Ryabinin * other variables. It can be updated by several CPUs frequently. 83439a1aa8eSAndrey Ryabinin */ 83539a1aa8eSAndrey Ryabinin struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 83639a1aa8eSAndrey Ryabinin 83739a1aa8eSAndrey Ryabinin /* 83839a1aa8eSAndrey Ryabinin * The global memory commitment made in the system can be a metric 83939a1aa8eSAndrey Ryabinin * that can be used to drive ballooning decisions when Linux is hosted 84039a1aa8eSAndrey Ryabinin * as a guest. On Hyper-V, the host implements a policy engine for dynamically 84139a1aa8eSAndrey Ryabinin * balancing memory across competing virtual machines that are hosted. 
84239a1aa8eSAndrey Ryabinin * Several metrics drive this policy engine including the guest reported 84339a1aa8eSAndrey Ryabinin * memory commitment. 8444e2ee51eSFeng Tang * 8454e2ee51eSFeng Tang * The time cost of this is very low for small platforms, and for big 8464e2ee51eSFeng Tang * platform like a 2S/36C/72T Skylake server, in worst case where 8474e2ee51eSFeng Tang * vm_committed_as's spinlock is under severe contention, the time cost 8484e2ee51eSFeng Tang * could be about 30~40 microseconds. 84939a1aa8eSAndrey Ryabinin */ 85039a1aa8eSAndrey Ryabinin unsigned long vm_memory_committed(void) 85139a1aa8eSAndrey Ryabinin { 8524e2ee51eSFeng Tang return percpu_counter_sum_positive(&vm_committed_as); 85339a1aa8eSAndrey Ryabinin } 85439a1aa8eSAndrey Ryabinin EXPORT_SYMBOL_GPL(vm_memory_committed); 85539a1aa8eSAndrey Ryabinin 85639a1aa8eSAndrey Ryabinin /* 85739a1aa8eSAndrey Ryabinin * Check that a process has enough memory to allocate a new virtual 85839a1aa8eSAndrey Ryabinin * mapping. 0 means there is enough memory for the allocation to 85939a1aa8eSAndrey Ryabinin * succeed and -ENOMEM implies there is not. 86039a1aa8eSAndrey Ryabinin * 86139a1aa8eSAndrey Ryabinin * We currently support three overcommit policies, which are set via the 862ad56b738SMike Rapoport * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst 86339a1aa8eSAndrey Ryabinin * 86439a1aa8eSAndrey Ryabinin * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 86539a1aa8eSAndrey Ryabinin * Additional code 2002 Jul 20 by Robert Love. 86639a1aa8eSAndrey Ryabinin * 86739a1aa8eSAndrey Ryabinin * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 86839a1aa8eSAndrey Ryabinin * 86939a1aa8eSAndrey Ryabinin * Note this is a helper function intended to be used by LSMs which 87039a1aa8eSAndrey Ryabinin * wish to use this logic. 
87139a1aa8eSAndrey Ryabinin */ 87239a1aa8eSAndrey Ryabinin int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 87339a1aa8eSAndrey Ryabinin { 8748c7829b0SJohannes Weiner long allowed; 87539a1aa8eSAndrey Ryabinin 87639a1aa8eSAndrey Ryabinin vm_acct_memory(pages); 87739a1aa8eSAndrey Ryabinin 87839a1aa8eSAndrey Ryabinin /* 87939a1aa8eSAndrey Ryabinin * Sometimes we want to use more memory than we have 88039a1aa8eSAndrey Ryabinin */ 88139a1aa8eSAndrey Ryabinin if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 88239a1aa8eSAndrey Ryabinin return 0; 88339a1aa8eSAndrey Ryabinin 88439a1aa8eSAndrey Ryabinin if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 8858c7829b0SJohannes Weiner if (pages > totalram_pages() + total_swap_pages) 88639a1aa8eSAndrey Ryabinin goto error; 88739a1aa8eSAndrey Ryabinin return 0; 88839a1aa8eSAndrey Ryabinin } 88939a1aa8eSAndrey Ryabinin 89039a1aa8eSAndrey Ryabinin allowed = vm_commit_limit(); 89139a1aa8eSAndrey Ryabinin /* 89239a1aa8eSAndrey Ryabinin * Reserve some for root 89339a1aa8eSAndrey Ryabinin */ 89439a1aa8eSAndrey Ryabinin if (!cap_sys_admin) 89539a1aa8eSAndrey Ryabinin allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 89639a1aa8eSAndrey Ryabinin 89739a1aa8eSAndrey Ryabinin /* 89839a1aa8eSAndrey Ryabinin * Don't let a single process grow so big a user can't recover 89939a1aa8eSAndrey Ryabinin */ 90039a1aa8eSAndrey Ryabinin if (mm) { 9018c7829b0SJohannes Weiner long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 9028c7829b0SJohannes Weiner 90339a1aa8eSAndrey Ryabinin allowed -= min_t(long, mm->total_vm / 32, reserve); 90439a1aa8eSAndrey Ryabinin } 90539a1aa8eSAndrey Ryabinin 90639a1aa8eSAndrey Ryabinin if (percpu_counter_read_positive(&vm_committed_as) < allowed) 90739a1aa8eSAndrey Ryabinin return 0; 90839a1aa8eSAndrey Ryabinin error: 90939a1aa8eSAndrey Ryabinin vm_unacct_memory(pages); 91039a1aa8eSAndrey Ryabinin 91139a1aa8eSAndrey Ryabinin return -ENOMEM; 91239a1aa8eSAndrey Ryabinin } 
91339a1aa8eSAndrey Ryabinin 914a9090253SWilliam Roberts /** 915a9090253SWilliam Roberts * get_cmdline() - copy the cmdline value to a buffer. 916a9090253SWilliam Roberts * @task: the task whose cmdline value to copy. 917a9090253SWilliam Roberts * @buffer: the buffer to copy to. 918a9090253SWilliam Roberts * @buflen: the length of the buffer. Larger cmdline values are truncated 919a9090253SWilliam Roberts * to this length. 920a862f68aSMike Rapoport * 921a862f68aSMike Rapoport * Return: the size of the cmdline field copied. Note that the copy does 922a9090253SWilliam Roberts * not guarantee an ending NULL byte. 923a9090253SWilliam Roberts */ 924a9090253SWilliam Roberts int get_cmdline(struct task_struct *task, char *buffer, int buflen) 925a9090253SWilliam Roberts { 926a9090253SWilliam Roberts int res = 0; 927a9090253SWilliam Roberts unsigned int len; 928a9090253SWilliam Roberts struct mm_struct *mm = get_task_mm(task); 929a3b609efSMateusz Guzik unsigned long arg_start, arg_end, env_start, env_end; 930a9090253SWilliam Roberts if (!mm) 931a9090253SWilliam Roberts goto out; 932a9090253SWilliam Roberts if (!mm->arg_end) 933a9090253SWilliam Roberts goto out_mm; /* Shh! 
No looking before we're done */ 934a9090253SWilliam Roberts 935bc81426fSMichal Koutný spin_lock(&mm->arg_lock); 936a3b609efSMateusz Guzik arg_start = mm->arg_start; 937a3b609efSMateusz Guzik arg_end = mm->arg_end; 938a3b609efSMateusz Guzik env_start = mm->env_start; 939a3b609efSMateusz Guzik env_end = mm->env_end; 940bc81426fSMichal Koutný spin_unlock(&mm->arg_lock); 941a3b609efSMateusz Guzik 942a3b609efSMateusz Guzik len = arg_end - arg_start; 943a9090253SWilliam Roberts 944a9090253SWilliam Roberts if (len > buflen) 945a9090253SWilliam Roberts len = buflen; 946a9090253SWilliam Roberts 947f307ab6dSLorenzo Stoakes res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); 948a9090253SWilliam Roberts 949a9090253SWilliam Roberts /* 950a9090253SWilliam Roberts * If the nul at the end of args has been overwritten, then 951a9090253SWilliam Roberts * assume application is using setproctitle(3). 952a9090253SWilliam Roberts */ 953a9090253SWilliam Roberts if (res > 0 && buffer[res-1] != '\0' && len < buflen) { 954a9090253SWilliam Roberts len = strnlen(buffer, res); 955a9090253SWilliam Roberts if (len < res) { 956a9090253SWilliam Roberts res = len; 957a9090253SWilliam Roberts } else { 958a3b609efSMateusz Guzik len = env_end - env_start; 959a9090253SWilliam Roberts if (len > buflen - res) 960a9090253SWilliam Roberts len = buflen - res; 961a3b609efSMateusz Guzik res += access_process_vm(task, env_start, 962f307ab6dSLorenzo Stoakes buffer+res, len, 963f307ab6dSLorenzo Stoakes FOLL_FORCE); 964a9090253SWilliam Roberts res = strnlen(buffer, res); 965a9090253SWilliam Roberts } 966a9090253SWilliam Roberts } 967a9090253SWilliam Roberts out_mm: 968a9090253SWilliam Roberts mmput(mm); 969a9090253SWilliam Roberts out: 970a9090253SWilliam Roberts return res; 971a9090253SWilliam Roberts } 972010c164aSSong Liu 9734d1a8a2dSCatalin Marinas int __weak memcmp_pages(struct page *page1, struct page *page2) 974010c164aSSong Liu { 975010c164aSSong Liu char *addr1, *addr2; 976010c164aSSong 
Liu int ret; 977010c164aSSong Liu 978010c164aSSong Liu addr1 = kmap_atomic(page1); 979010c164aSSong Liu addr2 = kmap_atomic(page2); 980010c164aSSong Liu ret = memcmp(addr1, addr2, PAGE_SIZE); 981010c164aSSong Liu kunmap_atomic(addr2); 982010c164aSSong Liu kunmap_atomic(addr1); 983010c164aSSong Liu return ret; 984010c164aSSong Liu } 985*8e7f37f2SPaul E. McKenney 986*8e7f37f2SPaul E. McKenney /** 987*8e7f37f2SPaul E. McKenney * mem_dump_obj - Print available provenance information 988*8e7f37f2SPaul E. McKenney * @object: object for which to find provenance information. 989*8e7f37f2SPaul E. McKenney * 990*8e7f37f2SPaul E. McKenney * This function uses pr_cont(), so that the caller is expected to have 991*8e7f37f2SPaul E. McKenney * printed out whatever preamble is appropriate. The provenance information 992*8e7f37f2SPaul E. McKenney * depends on the type of object and on how much debugging is enabled. 993*8e7f37f2SPaul E. McKenney * For example, for a slab-cache object, the slab name is printed, and, 994*8e7f37f2SPaul E. McKenney * if available, the return address and stack trace from the allocation 995*8e7f37f2SPaul E. McKenney * of that object. 996*8e7f37f2SPaul E. McKenney */ 997*8e7f37f2SPaul E. McKenney void mem_dump_obj(void *object) 998*8e7f37f2SPaul E. McKenney { 999*8e7f37f2SPaul E. McKenney if (!virt_addr_valid(object)) { 1000*8e7f37f2SPaul E. McKenney pr_cont(" non-paged (local) memory.\n"); 1001*8e7f37f2SPaul E. McKenney return; 1002*8e7f37f2SPaul E. McKenney } 1003*8e7f37f2SPaul E. McKenney if (kmem_valid_obj(object)) { 1004*8e7f37f2SPaul E. McKenney kmem_dump_obj(object); 1005*8e7f37f2SPaul E. McKenney return; 1006*8e7f37f2SPaul E. McKenney } 1007*8e7f37f2SPaul E. McKenney pr_cont(" non-slab memory.\n"); 1008*8e7f37f2SPaul E. McKenney } 1009