1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/fs/exec.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 /* 9 * #!-checking implemented by tytso. 10 */ 11 /* 12 * Demand-loading implemented 01.12.91 - no need to read anything but 13 * the header into memory. The inode of the executable is put into 14 * "current->executable", and page faults do the actual loading. Clean. 15 * 16 * Once more I can proudly say that linux stood up to being changed: it 17 * was less than 2 hours work to get demand-loading completely implemented. 18 * 19 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, 20 * current->executable is only used by the procfs. This allows a dispatch 21 * table to check for several different types of binary formats. We keep 22 * trying until we recognize the file or we run out of supported binary 23 * formats. 24 */ 25 26 #include <linux/kernel_read_file.h> 27 #include <linux/slab.h> 28 #include <linux/file.h> 29 #include <linux/fdtable.h> 30 #include <linux/mm.h> 31 #include <linux/stat.h> 32 #include <linux/fcntl.h> 33 #include <linux/swap.h> 34 #include <linux/string.h> 35 #include <linux/init.h> 36 #include <linux/sched/mm.h> 37 #include <linux/sched/coredump.h> 38 #include <linux/sched/signal.h> 39 #include <linux/sched/numa_balancing.h> 40 #include <linux/sched/task.h> 41 #include <linux/pagemap.h> 42 #include <linux/perf_event.h> 43 #include <linux/highmem.h> 44 #include <linux/spinlock.h> 45 #include <linux/key.h> 46 #include <linux/personality.h> 47 #include <linux/binfmts.h> 48 #include <linux/utsname.h> 49 #include <linux/pid_namespace.h> 50 #include <linux/module.h> 51 #include <linux/namei.h> 52 #include <linux/mount.h> 53 #include <linux/security.h> 54 #include <linux/syscalls.h> 55 #include <linux/tsacct_kern.h> 56 #include <linux/cn_proc.h> 57 #include <linux/audit.h> 58 #include <linux/kmod.h> 59 #include <linux/fsnotify.h> 60 #include <linux/fs_struct.h> 61 #include <linux/oom.h> 62 #include 
<linux/compat.h> 63 #include <linux/vmalloc.h> 64 #include <linux/io_uring.h> 65 #include <linux/syscall_user_dispatch.h> 66 #include <linux/coredump.h> 67 #include <linux/time_namespace.h> 68 #include <linux/user_events.h> 69 #include <linux/rseq.h> 70 #include <linux/ksm.h> 71 72 #include <linux/uaccess.h> 73 #include <asm/mmu_context.h> 74 #include <asm/tlb.h> 75 76 #include <trace/events/task.h> 77 #include "internal.h" 78 79 #include <trace/events/sched.h> 80 81 static int bprm_creds_from_file(struct linux_binprm *bprm); 82 83 int suid_dumpable = 0; 84 85 static LIST_HEAD(formats); 86 static DEFINE_RWLOCK(binfmt_lock); 87 88 void __register_binfmt(struct linux_binfmt * fmt, int insert) 89 { 90 write_lock(&binfmt_lock); 91 insert ? list_add(&fmt->lh, &formats) : 92 list_add_tail(&fmt->lh, &formats); 93 write_unlock(&binfmt_lock); 94 } 95 96 EXPORT_SYMBOL(__register_binfmt); 97 98 void unregister_binfmt(struct linux_binfmt * fmt) 99 { 100 write_lock(&binfmt_lock); 101 list_del(&fmt->lh); 102 write_unlock(&binfmt_lock); 103 } 104 105 EXPORT_SYMBOL(unregister_binfmt); 106 107 static inline void put_binfmt(struct linux_binfmt * fmt) 108 { 109 module_put(fmt->module); 110 } 111 112 bool path_noexec(const struct path *path) 113 { 114 return (path->mnt->mnt_flags & MNT_NOEXEC) || 115 (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); 116 } 117 118 #ifdef CONFIG_MMU 119 /* 120 * The nascent bprm->mm is not visible until exec_mmap() but it can 121 * use a lot of memory, account these pages in current->mm temporary 122 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we 123 * change the counter back via acct_arg_size(0). 
*/
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
	struct mm_struct *mm = current->mm;
	/* Signed change relative to what was accounted so far. */
	long diff = (long)(pages - bprm->vma_pages);

	/* Nothing to do without a current mm or when the count is unchanged. */
	if (!mm || !diff)
		return;

	bprm->vma_pages = pages;
	add_mm_counter(mm, MM_ANONPAGES, diff);
}

/*
 * MMU variant: return the page of bprm->mm that backs address @pos, or
 * NULL on failure.  A non-zero @write requests the page with FOLL_WRITE
 * and refreshes the argument-page accounting afterwards.  Callers in this
 * file hand the page back with put_arg_page().
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
		int write)
{
	struct page *page;
	struct vm_area_struct *vma = bprm->vma;
	struct mm_struct *mm = bprm->mm;
	int ret;

	/*
	 * Avoid relying on expanding the stack down in GUP (which
	 * does not work for STACK_GROWSUP anyway), and just do it
	 * ahead of time.
	 */
	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
		return NULL;

	/*
	 * We are doing an exec().  'current' is the process
	 * doing the exec and 'mm' is the new process's mm.
	 */
	ret = get_user_pages_remote(mm, pos, 1,
			write ? FOLL_WRITE : 0,
			&page, NULL);
	/* NOTE(review): presumably pairs with the lock taken by the helper above. */
	mmap_read_unlock(mm);
	if (ret <= 0)
		return NULL;

	/* Re-account using the current size of the stack vma. */
	if (write)
		acct_arg_size(bprm, vma_pages(vma));

	return page;
}

/* MMU variant: release a page obtained from get_arg_page(). */
static void put_arg_page(struct page *page)
{
	put_page(page);
}

/* MMU variant: intentionally empty. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/* MMU variant: flush the cache for @page mapped at @pos in the stack vma. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

/*
 * MMU variant: allocate the temporary one-page stack vma for bprm->mm,
 * insert it at STACK_TOP_MAX and point bprm->p just below its end.
 * Returns 0 on success or a negative errno; on failure bprm->vma is reset
 * to NULL and the vma is freed.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
	int err;
	struct vm_area_struct *vma = NULL;
	struct mm_struct *mm = bprm->mm;

	bprm->vma = vma = vm_area_alloc(mm);
	if (!vma)
		return -ENOMEM;
	vma_set_anonymous(vma);

	if (mmap_write_lock_killable(mm)) {
		err = -EINTR;
		goto err_free;
	}

	/*
	 * Need to be called with mmap write lock
	 * held, to avoid race with ksmd.
	 */
	err = ksm_execve(mm);
	if (err)
		goto err_ksm;

	/*
	 * Place the stack at the largest stack address the architecture
	 * supports. Later, we'll move this to an appropriate place. We don't
	 * use STACK_TOP because that can depend on attributes which aren't
	 * configured yet.
	 */
	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
	vma->vm_end = STACK_TOP_MAX;
	vma->vm_start = vma->vm_end - PAGE_SIZE;
	vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	err = insert_vm_struct(mm, vma);
	if (err)
		goto err;

	/* The mm now consists of exactly this one stack page. */
	mm->stack_vm = mm->total_vm = 1;
	mmap_write_unlock(mm);
	bprm->p = vma->vm_end - sizeof(void *);
	return 0;
	/* Unwind in reverse order of setup. */
err:
	ksm_exit(mm);
err_ksm:
	mmap_write_unlock(mm);
err_free:
	bprm->vma = NULL;
	vm_area_free(vma);
	return err;
}

/* MMU variant: a single string may be at most MAX_ARG_STRLEN bytes long. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= MAX_ARG_STRLEN;
}

#else

/* !MMU variant: no accounting is performed. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

/*
 * !MMU variant: return the entry of bprm->page[] covering offset @pos.
 * When the slot is empty and @write is set, a zeroed page is allocated
 * and stored in the slot first; NULL is returned if that allocation fails
 * (or if the slot is empty and @write is zero).
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
		int write)
{
	struct page *page;

	page = bprm->page[pos / PAGE_SIZE];
	if (!page && write) {
		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
		if (!page)
			return NULL;
		bprm->page[pos / PAGE_SIZE] = page;
	}

	return page;
}

/* !MMU variant: intentionally empty; pages stay in bprm->page[]. */
static void put_arg_page(struct page *page)
{
}

/* Free slot @i of bprm->page[] if it is populated and clear the slot. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
	if (bprm->page[i]) {
		__free_page(bprm->page[i]);
		bprm->page[i] = NULL;
	}
}

/* !MMU variant: free every populated slot of bprm->page[]. */
static void free_arg_pages(struct linux_binprm *bprm)
{
	int i;

	for (i = 0; i < MAX_ARG_PAGES; i++)
		free_arg_page(bprm, i);
}

/* !MMU variant: intentionally empty. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}

/* !MMU variant: start bprm->p just below the top of the MAX_ARG_PAGES area. */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
	return 0;
}

/* !MMU variant: a string is acceptable while it is no longer than bprm->p. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 *
 * Returns 0 on success or a negative errno.  On failure bprm->mm is reset
 * to NULL and the mm reference is dropped.
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
	int err;
	struct mm_struct *mm = NULL;

	bprm->mm = mm = mm_alloc();
	err = -ENOMEM;
	if (!mm)
		goto err;

	/* Save current stack limit for all calculations made during exec. */
	task_lock(current->group_leader);
	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
	task_unlock(current->group_leader);

	err = __bprm_mm_init(bprm);
	if (err)
		goto err;

	return 0;

err:
	/* mm is still NULL when mm_alloc() itself failed. */
	if (mm) {
		bprm->mm = NULL;
		mmdrop(mm);
	}

	return err;
}

/*
 * A user-space array of string pointers, in either native or (with
 * CONFIG_COMPAT) compat layout; is_compat selects which union member
 * is valid.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
	bool is_compat;
#endif
	union {
		const char __user *const __user *native;
#ifdef CONFIG_COMPAT
		const compat_uptr_t __user *compat;
#endif
	} ptr;
};

/*
 * Fetch entry @nr of the user-space pointer array @argv.  Returns the
 * user pointer stored there, or ERR_PTR(-EFAULT) if reading the entry
 * faults.
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
	const char __user *native;

#ifdef CONFIG_COMPAT
	if (unlikely(argv.is_compat)) {
		compat_uptr_t compat;

		if (get_user(compat, argv.ptr.compat + nr))
			return ERR_PTR(-EFAULT);

		return compat_ptr(compat);
	}
#endif

	if (get_user(native, argv.ptr.native + nr))
		return ERR_PTR(-EFAULT);

	return native;
}

/*
 * count() counts the number of strings in array ARGV.
375 */ 376 static int count(struct user_arg_ptr argv, int max) 377 { 378 int i = 0; 379 380 if (argv.ptr.native != NULL) { 381 for (;;) { 382 const char __user *p = get_user_arg_ptr(argv, i); 383 384 if (!p) 385 break; 386 387 if (IS_ERR(p)) 388 return -EFAULT; 389 390 if (i >= max) 391 return -E2BIG; 392 ++i; 393 394 if (fatal_signal_pending(current)) 395 return -ERESTARTNOHAND; 396 cond_resched(); 397 } 398 } 399 return i; 400 } 401 402 static int count_strings_kernel(const char *const *argv) 403 { 404 int i; 405 406 if (!argv) 407 return 0; 408 409 for (i = 0; argv[i]; ++i) { 410 if (i >= MAX_ARG_STRINGS) 411 return -E2BIG; 412 if (fatal_signal_pending(current)) 413 return -ERESTARTNOHAND; 414 cond_resched(); 415 } 416 return i; 417 } 418 419 static inline int bprm_set_stack_limit(struct linux_binprm *bprm, 420 unsigned long limit) 421 { 422 #ifdef CONFIG_MMU 423 /* Avoid a pathological bprm->p. */ 424 if (bprm->p < limit) 425 return -E2BIG; 426 bprm->argmin = bprm->p - limit; 427 #endif 428 return 0; 429 } 430 static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) 431 { 432 #ifdef CONFIG_MMU 433 return bprm->p < bprm->argmin; 434 #else 435 return false; 436 #endif 437 } 438 439 /* 440 * Calculate bprm->argmin from: 441 * - _STK_LIM 442 * - ARG_MAX 443 * - bprm->rlim_stack.rlim_cur 444 * - bprm->argc 445 * - bprm->envc 446 * - bprm->p 447 */ 448 static int bprm_stack_limits(struct linux_binprm *bprm) 449 { 450 unsigned long limit, ptr_size; 451 452 /* 453 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM 454 * (whichever is smaller) for the argv+env strings. 455 * This ensures that: 456 * - the remaining binfmt code will not run out of stack space, 457 * - the program will have a reasonable amount of stack left 458 * to work from. 
459 */ 460 limit = _STK_LIM / 4 * 3; 461 limit = min(limit, bprm->rlim_stack.rlim_cur / 4); 462 /* 463 * We've historically supported up to 32 pages (ARG_MAX) 464 * of argument strings even with small stacks 465 */ 466 limit = max_t(unsigned long, limit, ARG_MAX); 467 /* Reject totally pathological counts. */ 468 if (bprm->argc < 0 || bprm->envc < 0) 469 return -E2BIG; 470 /* 471 * We must account for the size of all the argv and envp pointers to 472 * the argv and envp strings, since they will also take up space in 473 * the stack. They aren't stored until much later when we can't 474 * signal to the parent that the child has run out of stack space. 475 * Instead, calculate it here so it's possible to fail gracefully. 476 * 477 * In the case of argc = 0, make sure there is space for adding a 478 * empty string (which will bump argc to 1), to ensure confused 479 * userspace programs don't start processing from argv[1], thinking 480 * argc can never be 0, to keep them from walking envp by accident. 481 * See do_execveat_common(). 482 */ 483 if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || 484 check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) 485 return -E2BIG; 486 if (limit <= ptr_size) 487 return -E2BIG; 488 limit -= ptr_size; 489 490 return bprm_set_stack_limit(bprm, limit); 491 } 492 493 /* 494 * 'copy_strings()' copies argument/environment strings from the old 495 * processes's memory to the new process's stack. The call to get_user_pages() 496 * ensures the destination page is created and not swapped out. 
*/
static int copy_strings(int argc, struct user_arg_ptr argv,
			struct linux_binprm *bprm)
{
	/* Destination page currently mapped, its mapping and its base address. */
	struct page *kmapped_page = NULL;
	char *kaddr = NULL;
	unsigned long kpos = 0;
	int ret;

	/* Strings are taken from the last array entry down to the first. */
	while (argc-- > 0) {
		const char __user *str;
		int len;
		unsigned long pos;

		ret = -EFAULT;
		str = get_user_arg_ptr(argv, argc);
		if (IS_ERR(str))
			goto out;

		/* A zero length is reported as -EFAULT. */
		len = strnlen_user(str, MAX_ARG_STRLEN);
		if (!len)
			goto out;

		ret = -E2BIG;
		if (!valid_arg_len(bprm, len))
			goto out;

		/* We're going to work our way backwards. */
		pos = bprm->p;
		str += len;
		bprm->p -= len;
		if (bprm_hit_stack_limit(bprm))
			goto out;

		/* Copy the string tail-first, at most one destination page per pass. */
		while (len > 0) {
			int offset, bytes_to_copy;

			if (fatal_signal_pending(current)) {
				ret = -ERESTARTNOHAND;
				goto out;
			}
			cond_resched();

			/* Bytes available below pos within its page (a full page if aligned). */
			offset = pos % PAGE_SIZE;
			if (offset == 0)
				offset = PAGE_SIZE;

			bytes_to_copy = offset;
			if (bytes_to_copy > len)
				bytes_to_copy = len;

			/* offset now becomes the in-page start of this chunk. */
			offset -= bytes_to_copy;
			pos -= bytes_to_copy;
			str -= bytes_to_copy;
			len -= bytes_to_copy;

			/* Switch the mapping only when the chunk lands in a different page. */
			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
				struct page *page;

				page = get_arg_page(bprm, pos, 1);
				if (!page) {
					ret = -E2BIG;
					goto out;
				}

				/* Release the previously mapped page, if any. */
				if (kmapped_page) {
					flush_dcache_page(kmapped_page);
					kunmap_local(kaddr);
					put_arg_page(kmapped_page);
				}
				kmapped_page = page;
				kaddr = kmap_local_page(kmapped_page);
				kpos = pos & PAGE_MASK;
				flush_arg_page(bprm, kpos, kmapped_page);
			}
			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
				ret = -EFAULT;
				goto out;
			}
		}
	}
	ret = 0;
out:
	/* Common exit: drop whatever page is still mapped. */
	if (kmapped_page) {
		flush_dcache_page(kmapped_page);
		kunmap_local(kaddr);
		put_arg_page(kmapped_page);
	}
	return ret;
}

/*
 * Copy an argument/environment string from the kernel to the process's stack.
590 */ 591 int copy_string_kernel(const char *arg, struct linux_binprm *bprm) 592 { 593 int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; 594 unsigned long pos = bprm->p; 595 596 if (len == 0) 597 return -EFAULT; 598 if (!valid_arg_len(bprm, len)) 599 return -E2BIG; 600 601 /* We're going to work our way backwards. */ 602 arg += len; 603 bprm->p -= len; 604 if (bprm_hit_stack_limit(bprm)) 605 return -E2BIG; 606 607 while (len > 0) { 608 unsigned int bytes_to_copy = min_t(unsigned int, len, 609 min_not_zero(offset_in_page(pos), PAGE_SIZE)); 610 struct page *page; 611 612 pos -= bytes_to_copy; 613 arg -= bytes_to_copy; 614 len -= bytes_to_copy; 615 616 page = get_arg_page(bprm, pos, 1); 617 if (!page) 618 return -E2BIG; 619 flush_arg_page(bprm, pos & PAGE_MASK, page); 620 memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); 621 put_arg_page(page); 622 } 623 624 return 0; 625 } 626 EXPORT_SYMBOL(copy_string_kernel); 627 628 static int copy_strings_kernel(int argc, const char *const *argv, 629 struct linux_binprm *bprm) 630 { 631 while (argc-- > 0) { 632 int ret = copy_string_kernel(argv[argc], bprm); 633 if (ret < 0) 634 return ret; 635 if (fatal_signal_pending(current)) 636 return -ERESTARTNOHAND; 637 cond_resched(); 638 } 639 return 0; 640 } 641 642 #ifdef CONFIG_MMU 643 644 /* 645 * Finalizes the stack vm_area_struct. The flags and permissions are updated, 646 * the stack is optionally relocated, and some extra space is added. 
*/
int setup_arg_pages(struct linux_binprm *bprm,
		    unsigned long stack_top,
		    int executable_stack)
{
	/* NOTE(review): declared unsigned long although the function returns int. */
	unsigned long ret;
	unsigned long stack_shift;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = bprm->vma;
	struct vm_area_struct *prev = NULL;
	unsigned long vm_flags;
	unsigned long stack_base;
	unsigned long stack_size;
	unsigned long stack_expand;
	unsigned long rlim_stack;
	struct mmu_gather tlb;
	struct vma_iterator vmi;

#ifdef CONFIG_STACK_GROWSUP
	/* Limit stack size */
	stack_base = bprm->rlim_stack.rlim_max;

	stack_base = calc_max_stack_size(stack_base);

	/* Add space for stack randomization. */
	if (current->flags & PF_RANDOMIZE)
		stack_base += (STACK_RND_MASK << PAGE_SHIFT);

	/* Make sure we didn't let the argument array grow too large. */
	if (vma->vm_end - vma->vm_start > stack_base)
		return -ENOMEM;

	stack_base = PAGE_ALIGN(stack_top - stack_base);

	/* Distance the temporary vma has to move down to start at stack_base. */
	stack_shift = vma->vm_start - stack_base;
	mm->arg_start = bprm->p - stack_shift;
	bprm->p = vma->vm_end - stack_shift;
#else
	stack_top = arch_align_stack(stack_top);
	stack_top = PAGE_ALIGN(stack_top);

	/* The relocated vma must stay above mmap_min_addr. */
	if (unlikely(stack_top < mmap_min_addr) ||
	    unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
		return -ENOMEM;

	/* Distance the temporary vma has to move down to end at stack_top. */
	stack_shift = vma->vm_end - stack_top;

	bprm->p -= stack_shift;
	mm->arg_start = bprm->p;
#endif

	/* bprm->exec is shifted by the same amount as the stack. */
	bprm->exec -= stack_shift;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	vm_flags = VM_STACK_FLAGS;

	/*
	 * Adjust stack execute permissions; explicitly enable for
	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
	 * (arch default) otherwise.
	 */
	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
		vm_flags |= VM_EXEC;
	else if (executable_stack == EXSTACK_DISABLE_X)
		vm_flags &= ~VM_EXEC;
	vm_flags |= mm->def_flags;
	/* Keep the temporary marker set until the relocation below is done. */
	vm_flags |= VM_STACK_INCOMPLETE_SETUP;

	vma_iter_init(&vmi, mm, vma->vm_start);

	tlb_gather_mmu(&tlb, mm);
	ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end,
			vm_flags);
	tlb_finish_mmu(&tlb);

	if (ret)
		goto out_unlock;
	BUG_ON(prev != vma);

	if (unlikely(vm_flags & VM_EXEC)) {
		pr_warn_once("process '%pD4' started with executable stack\n",
			     bprm->file);
	}

	/* Move stack pages down in memory. */
	if (stack_shift) {
		/*
		 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
		 * the binfmt code determines where the new stack should reside, we shift it to
		 * its final location.
		 */
		ret = relocate_vma_down(vma, stack_shift);
		if (ret)
			goto out_unlock;
	}

	/* mprotect_fixup is overkill to remove the temporary stack flags */
	vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);

	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
	stack_size = vma->vm_end - vma->vm_start;
	/*
	 * Align this down to a page boundary as expand_stack
	 * will align it up.
	 */
	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;

	/* Grow by the extra space, but never beyond the stack rlimit. */
	stack_expand = min(rlim_stack, stack_size + stack_expand);

#ifdef CONFIG_STACK_GROWSUP
	stack_base = vma->vm_start + stack_expand;
#else
	stack_base = vma->vm_end - stack_expand;
#endif
	current->mm->start_stack = bprm->p;
	ret = expand_stack_locked(vma, stack_base);
	/* Any expansion failure is reported as -EFAULT. */
	if (ret)
		ret = -EFAULT;

out_unlock:
	mmap_write_unlock(mm);
	return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#else

/*
 * Transfer the program arguments and environment from the holding pages
 * onto the stack.
The provided stack pointer is adjusted accordingly. 779 */ 780 int transfer_args_to_stack(struct linux_binprm *bprm, 781 unsigned long *sp_location) 782 { 783 unsigned long index, stop, sp; 784 int ret = 0; 785 786 stop = bprm->p >> PAGE_SHIFT; 787 sp = *sp_location; 788 789 for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { 790 unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; 791 char *src = kmap_local_page(bprm->page[index]) + offset; 792 sp -= PAGE_SIZE - offset; 793 if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) 794 ret = -EFAULT; 795 kunmap_local(src); 796 if (ret) 797 goto out; 798 } 799 800 bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; 801 *sp_location = sp; 802 803 out: 804 return ret; 805 } 806 EXPORT_SYMBOL(transfer_args_to_stack); 807 808 #endif /* CONFIG_MMU */ 809 810 /* 811 * On success, caller must call do_close_execat() on the returned 812 * struct file to close it. 813 */ 814 static struct file *do_open_execat(int fd, struct filename *name, int flags) 815 { 816 int err; 817 struct file *file __free(fput) = NULL; 818 struct open_flags open_exec_flags = { 819 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 820 .acc_mode = MAY_EXEC, 821 .intent = LOOKUP_OPEN, 822 .lookup_flags = LOOKUP_FOLLOW, 823 }; 824 825 if ((flags & 826 ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) 827 return ERR_PTR(-EINVAL); 828 if (flags & AT_SYMLINK_NOFOLLOW) 829 open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; 830 if (flags & AT_EMPTY_PATH) 831 open_exec_flags.lookup_flags |= LOOKUP_EMPTY; 832 833 file = do_filp_open(fd, name, &open_exec_flags); 834 if (IS_ERR(file)) 835 return file; 836 837 /* 838 * In the past the regular type check was here. It moved to may_open() in 839 * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is 840 * an invariant that all non-regular files error out before we get here. 
841 */ 842 if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || 843 path_noexec(&file->f_path)) 844 return ERR_PTR(-EACCES); 845 846 err = exe_file_deny_write_access(file); 847 if (err) 848 return ERR_PTR(err); 849 850 return no_free_ptr(file); 851 } 852 853 /** 854 * open_exec - Open a path name for execution 855 * 856 * @name: path name to open with the intent of executing it. 857 * 858 * Returns ERR_PTR on failure or allocated struct file on success. 859 * 860 * As this is a wrapper for the internal do_open_execat(), callers 861 * must call exe_file_allow_write_access() before fput() on release. Also see 862 * do_close_execat(). 863 */ 864 struct file *open_exec(const char *name) 865 { 866 struct filename *filename = getname_kernel(name); 867 struct file *f = ERR_CAST(filename); 868 869 if (!IS_ERR(filename)) { 870 f = do_open_execat(AT_FDCWD, filename, 0); 871 putname(filename); 872 } 873 return f; 874 } 875 EXPORT_SYMBOL(open_exec); 876 877 #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) 878 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) 879 { 880 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); 881 if (res > 0) 882 flush_icache_user_range(addr, addr + len); 883 return res; 884 } 885 EXPORT_SYMBOL(read_code); 886 #endif 887 888 /* 889 * Maps the mm_struct mm into the current task struct. 890 * On success, this function returns with exec_update_lock 891 * held for writing. 
*/
static int exec_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk;
	struct mm_struct *old_mm, *active_mm;
	int ret;

	/* Notify parent that we're no longer interested in the old VM */
	tsk = current;
	old_mm = current->mm;
	exec_mm_release(tsk, old_mm);

	ret = down_write_killable(&tsk->signal->exec_update_lock);
	if (ret)
		return ret;

	if (old_mm) {
		/*
		 * If there is a pending fatal signal perhaps a signal
		 * whose default action is to create a coredump get
		 * out and die instead of going through with the exec.
		 */
		ret = mmap_read_lock_killable(old_mm);
		if (ret) {
			/* Failure path: do not keep exec_update_lock. */
			up_write(&tsk->signal->exec_update_lock);
			return ret;
		}
	}

	task_lock(tsk);
	membarrier_exec_mmap(mm);

	local_irq_disable();
	/* Remember the previous active_mm; it is dropped or checked below. */
	active_mm = tsk->active_mm;
	tsk->active_mm = mm;
	tsk->mm = mm;
	mm_init_cid(mm, tsk);
	/*
	 * This prevents preemption while active_mm is being loaded and
	 * it and mm are being updated, which could cause problems for
	 * lazy tlb mm refcounting when these are updated by context
	 * switches. Not all architectures can handle irqs off over
	 * activate_mm yet.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	activate_mm(active_mm, mm);
	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	lru_gen_add_mm(mm);
	task_unlock(tsk);
	lru_gen_use_mm(mm);
	if (old_mm) {
		/* There was a previous user mm: release it. */
		mmap_read_unlock(old_mm);
		BUG_ON(active_mm != old_mm);
		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
		mm_update_next_owner(old_mm);
		mmput(old_mm);
		return 0;
	}
	/* No previous user mm: only the lazy-TLB reference on active_mm is dropped. */
	mmdrop_lazy_tlb(active_mm);
	return 0;
}

/*
 * Make @tsk the only thread of its thread group and, if it was not the
 * group leader, let it take over the leader's identity.  Returns 0 on
 * success, or -EAGAIN when another group action is already in progress
 * or a fatal signal arrives while waiting.
 */
static int de_thread(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	struct sighand_struct *oldsighand = tsk->sighand;
	spinlock_t *lock = &oldsighand->siglock;

	if (thread_group_empty(tsk))
		goto no_thread_group;

	/*
	 * Kill all other threads in the thread group.
	 */
	spin_lock_irq(lock);
	if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
		/*
		 * Another group action in progress, just
		 * return so that the signal is processed.
		 */
		spin_unlock_irq(lock);
		return -EAGAIN;
	}

	sig->group_exec_task = tsk;
	sig->notify_count = zap_other_threads(tsk);
	/* A non-leader caller waits for one thread less here. */
	if (!thread_group_leader(tsk))
		sig->notify_count--;

	/* Sleep (killable) until notify_count has dropped to zero. */
	while (sig->notify_count) {
		__set_current_state(TASK_KILLABLE);
		spin_unlock_irq(lock);
		schedule();
		if (__fatal_signal_pending(tsk))
			goto killed;
		spin_lock_irq(lock);
	}
	spin_unlock_irq(lock);

	/*
	 * At this point all other threads have exited, all we have to
	 * do is to wait for the thread group leader to become inactive,
	 * and to assume its PID:
	 */
	if (!thread_group_leader(tsk)) {
		struct task_struct *leader = tsk->group_leader;

		/* The loop is left with tasklist_lock write-held. */
		for (;;) {
			cgroup_threadgroup_change_begin(tsk);
			write_lock_irq(&tasklist_lock);
			/*
			 * Do this under tasklist_lock to ensure that
			 * exit_notify() can't miss ->group_exec_task
			 */
			sig->notify_count = -1;
			if (likely(leader->exit_state))
				break;
			__set_current_state(TASK_KILLABLE);
			write_unlock_irq(&tasklist_lock);
			cgroup_threadgroup_change_end(tsk);
			schedule();
			if (__fatal_signal_pending(tsk))
				goto killed;
		}

		/*
		 * The only record we have of the real-time age of a
		 * process, regardless of execs it's done, is start_time.
		 * All the past CPU time is accumulated in signal_struct
		 * from sister threads now dead.  But in this non-leader
		 * exec, nothing survives from the original leader thread,
		 * whose birth marks the true age of this process now.
		 * When we take on its identity by switching to its PID, we
		 * also take its birthdate (always earlier than our own).
		 */
		tsk->start_time = leader->start_time;
		tsk->start_boottime = leader->start_boottime;

		BUG_ON(!same_thread_group(leader, tsk));
		/*
		 * An exec() starts a new thread group with the
		 * TGID of the previous thread group. Rehash the
		 * two threads with a switched PID, and release
		 * the former thread group leader:
		 */

		/* Become a process group leader with the old leader's pid.
		 * The old leader becomes a thread of this thread group.
		 */
		exchange_tids(tsk, leader);
		transfer_pid(leader, tsk, PIDTYPE_TGID);
		transfer_pid(leader, tsk, PIDTYPE_PGID);
		transfer_pid(leader, tsk, PIDTYPE_SID);

		/* tsk takes the leader's place on the tasks and sibling lists. */
		list_replace_rcu(&leader->tasks, &tsk->tasks);
		list_replace_init(&leader->sibling, &tsk->sibling);

		tsk->group_leader = tsk;
		leader->group_leader = tsk;

		tsk->exit_signal = SIGCHLD;
		leader->exit_signal = -1;

		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
		leader->exit_state = EXIT_DEAD;
		/*
		 * We are going to release_task()->ptrace_unlink() silently,
		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
		 * the tracer won't block again waiting for this thread.
		 */
		if (unlikely(leader->ptrace))
			__wake_up_parent(leader, leader->parent);
		write_unlock_irq(&tasklist_lock);
		cgroup_threadgroup_change_end(tsk);

		release_task(leader);
	}

	sig->group_exec_task = NULL;
	sig->notify_count = 0;

no_thread_group:
	/* we have changed execution domain */
	tsk->exit_signal = SIGCHLD;

	BUG_ON(!thread_group_leader(tsk));
	return 0;

killed:
	/* protects against exit_notify() and __exit_signal() */
	read_lock(&tasklist_lock);
	sig->group_exec_task = NULL;
	sig->notify_count = 0;
	read_unlock(&tasklist_lock);
	return -EAGAIN;
}


/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
1097 */ 1098 static int unshare_sighand(struct task_struct *me) 1099 { 1100 struct sighand_struct *oldsighand = me->sighand; 1101 1102 if (refcount_read(&oldsighand->count) != 1) { 1103 struct sighand_struct *newsighand; 1104 /* 1105 * This ->sighand is shared with the CLONE_SIGHAND 1106 * but not CLONE_THREAD task, switch to the new one. 1107 */ 1108 newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 1109 if (!newsighand) 1110 return -ENOMEM; 1111 1112 refcount_set(&newsighand->count, 1); 1113 1114 write_lock_irq(&tasklist_lock); 1115 spin_lock(&oldsighand->siglock); 1116 memcpy(newsighand->action, oldsighand->action, 1117 sizeof(newsighand->action)); 1118 rcu_assign_pointer(me->sighand, newsighand); 1119 spin_unlock(&oldsighand->siglock); 1120 write_unlock_irq(&tasklist_lock); 1121 1122 __cleanup_sighand(oldsighand); 1123 } 1124 return 0; 1125 } 1126 1127 /* 1128 * This is unlocked -- the string will always be NUL-terminated, but 1129 * may show overlapping contents if racing concurrent reads. 1130 */ 1131 void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) 1132 { 1133 size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); 1134 1135 trace_task_rename(tsk, buf); 1136 memcpy(tsk->comm, buf, len); 1137 memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); 1138 perf_event_comm(tsk, exec); 1139 } 1140 1141 /* 1142 * Calling this is the point of no return. None of the failures will be 1143 * seen by userspace since either the process is already taking a fatal 1144 * signal (via de_thread() or coredump), or will have SEGV raised 1145 * (after exec_mmap()) by search_binary_handler (see below). 
1146 */ 1147 int begin_new_exec(struct linux_binprm * bprm) 1148 { 1149 struct task_struct *me = current; 1150 int retval; 1151 1152 /* Once we are committed compute the creds */ 1153 retval = bprm_creds_from_file(bprm); 1154 if (retval) 1155 return retval; 1156 1157 /* 1158 * This tracepoint marks the point before flushing the old exec where 1159 * the current task is still unchanged, but errors are fatal (point of 1160 * no return). The later "sched_process_exec" tracepoint is called after 1161 * the current task has successfully switched to the new exec. 1162 */ 1163 trace_sched_prepare_exec(current, bprm); 1164 1165 /* 1166 * Ensure all future errors are fatal. 1167 */ 1168 bprm->point_of_no_return = true; 1169 1170 /* Make this the only thread in the thread group */ 1171 retval = de_thread(me); 1172 if (retval) 1173 goto out; 1174 /* see the comment in check_unsafe_exec() */ 1175 current->fs->in_exec = 0; 1176 /* 1177 * Cancel any io_uring activity across execve 1178 */ 1179 io_uring_task_cancel(); 1180 1181 /* Ensure the files table is not shared. */ 1182 retval = unshare_files(); 1183 if (retval) 1184 goto out; 1185 1186 /* 1187 * Must be called _before_ exec_mmap() as bprm->mm is 1188 * not visible until then. Doing it here also ensures 1189 * we don't race against replace_mm_exe_file(). 
1190 */ 1191 retval = set_mm_exe_file(bprm->mm, bprm->file); 1192 if (retval) 1193 goto out; 1194 1195 /* If the binary is not readable then enforce mm->dumpable=0 */ 1196 would_dump(bprm, bprm->file); 1197 if (bprm->have_execfd) 1198 would_dump(bprm, bprm->executable); 1199 1200 /* 1201 * Release all of the old mmap stuff 1202 */ 1203 acct_arg_size(bprm, 0); 1204 retval = exec_mmap(bprm->mm); 1205 if (retval) 1206 goto out; 1207 1208 bprm->mm = NULL; 1209 1210 retval = exec_task_namespaces(); 1211 if (retval) 1212 goto out_unlock; 1213 1214 #ifdef CONFIG_POSIX_TIMERS 1215 spin_lock_irq(&me->sighand->siglock); 1216 posix_cpu_timers_exit(me); 1217 spin_unlock_irq(&me->sighand->siglock); 1218 exit_itimers(me); 1219 flush_itimer_signals(); 1220 #endif 1221 1222 /* 1223 * Make the signal table private. 1224 */ 1225 retval = unshare_sighand(me); 1226 if (retval) 1227 goto out_unlock; 1228 1229 me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | 1230 PF_NOFREEZE | PF_NO_SETAFFINITY); 1231 flush_thread(); 1232 me->personality &= ~bprm->per_clear; 1233 1234 clear_syscall_work_syscall_user_dispatch(me); 1235 1236 /* 1237 * We have to apply CLOEXEC before we change whether the process is 1238 * dumpable (in setup_new_exec) to avoid a race with a process in userspace 1239 * trying to access the should-be-closed file descriptors of a process 1240 * undergoing exec(2). 1241 */ 1242 do_close_on_exec(me->files); 1243 1244 if (bprm->secureexec) { 1245 /* Make sure parent cannot signal privileged process. */ 1246 me->pdeath_signal = 0; 1247 1248 /* 1249 * For secureexec, reset the stack limit to sane default to 1250 * avoid bad behavior from the prior rlimits. This has to 1251 * happen before arch_pick_mmap_layout(), which examines 1252 * RLIMIT_STACK, but after the point of no return to avoid 1253 * needing to clean up the change on failure. 
1254 */ 1255 if (bprm->rlim_stack.rlim_cur > _STK_LIM) 1256 bprm->rlim_stack.rlim_cur = _STK_LIM; 1257 } 1258 1259 me->sas_ss_sp = me->sas_ss_size = 0; 1260 1261 /* 1262 * Figure out dumpability. Note that this checking only of current 1263 * is wrong, but userspace depends on it. This should be testing 1264 * bprm->secureexec instead. 1265 */ 1266 if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || 1267 !(uid_eq(current_euid(), current_uid()) && 1268 gid_eq(current_egid(), current_gid()))) 1269 set_dumpable(current->mm, suid_dumpable); 1270 else 1271 set_dumpable(current->mm, SUID_DUMP_USER); 1272 1273 perf_event_exec(); 1274 1275 /* 1276 * If the original filename was empty, alloc_bprm() made up a path 1277 * that will probably not be useful to admins running ps or similar. 1278 * Let's fix it up to be something reasonable. 1279 */ 1280 if (bprm->comm_from_dentry) { 1281 /* 1282 * Hold RCU lock to keep the name from being freed behind our back. 1283 * Use acquire semantics to make sure the terminating NUL from 1284 * __d_alloc() is seen. 1285 * 1286 * Note, we're deliberately sloppy here. We don't need to care about 1287 * detecting a concurrent rename and just want a terminated name. 1288 */ 1289 rcu_read_lock(); 1290 __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), 1291 true); 1292 rcu_read_unlock(); 1293 } else { 1294 __set_task_comm(me, kbasename(bprm->filename), true); 1295 } 1296 1297 /* An exec changes our domain. We are no longer part of the thread 1298 group */ 1299 WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); 1300 flush_signal_handlers(me, 0); 1301 1302 retval = set_cred_ucounts(bprm->cred); 1303 if (retval < 0) 1304 goto out_unlock; 1305 1306 /* 1307 * install the new credentials for this executable 1308 */ 1309 security_bprm_committing_creds(bprm); 1310 1311 commit_creds(bprm->cred); 1312 bprm->cred = NULL; 1313 1314 /* 1315 * Disable monitoring for regular users 1316 * when executing setuid binaries. 
Must 1317 * wait until new credentials are committed 1318 * by commit_creds() above 1319 */ 1320 if (get_dumpable(me->mm) != SUID_DUMP_USER) 1321 perf_event_exit_task(me); 1322 /* 1323 * cred_guard_mutex must be held at least to this point to prevent 1324 * ptrace_attach() from altering our determination of the task's 1325 * credentials; any time after this it may be unlocked. 1326 */ 1327 security_bprm_committed_creds(bprm); 1328 1329 /* Pass the opened binary to the interpreter. */ 1330 if (bprm->have_execfd) { 1331 retval = get_unused_fd_flags(0); 1332 if (retval < 0) 1333 goto out_unlock; 1334 fd_install(retval, bprm->executable); 1335 bprm->executable = NULL; 1336 bprm->execfd = retval; 1337 } 1338 return 0; 1339 1340 out_unlock: 1341 up_write(&me->signal->exec_update_lock); 1342 if (!bprm->cred) 1343 mutex_unlock(&me->signal->cred_guard_mutex); 1344 1345 out: 1346 return retval; 1347 } 1348 EXPORT_SYMBOL(begin_new_exec); 1349 1350 void would_dump(struct linux_binprm *bprm, struct file *file) 1351 { 1352 struct inode *inode = file_inode(file); 1353 struct mnt_idmap *idmap = file_mnt_idmap(file); 1354 if (inode_permission(idmap, inode, MAY_READ) < 0) { 1355 struct user_namespace *old, *user_ns; 1356 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; 1357 1358 /* Ensure mm->user_ns contains the executable */ 1359 user_ns = old = bprm->mm->user_ns; 1360 while ((user_ns != &init_user_ns) && 1361 !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) 1362 user_ns = user_ns->parent; 1363 1364 if (old != user_ns) { 1365 bprm->mm->user_ns = get_user_ns(user_ns); 1366 put_user_ns(old); 1367 } 1368 } 1369 } 1370 EXPORT_SYMBOL(would_dump); 1371 1372 void setup_new_exec(struct linux_binprm * bprm) 1373 { 1374 /* Setup things that can depend upon the personality */ 1375 struct task_struct *me = current; 1376 1377 arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); 1378 1379 arch_setup_new_exec(); 1380 1381 /* Set the new mm task size. 
We have to do that late because it may 1382 * depend on TIF_32BIT which is only updated in flush_thread() on 1383 * some architectures like powerpc 1384 */ 1385 me->mm->task_size = TASK_SIZE; 1386 up_write(&me->signal->exec_update_lock); 1387 mutex_unlock(&me->signal->cred_guard_mutex); 1388 } 1389 EXPORT_SYMBOL(setup_new_exec); 1390 1391 /* Runs immediately before start_thread() takes over. */ 1392 void finalize_exec(struct linux_binprm *bprm) 1393 { 1394 /* Store any stack rlimit changes before starting thread. */ 1395 task_lock(current->group_leader); 1396 current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; 1397 task_unlock(current->group_leader); 1398 } 1399 EXPORT_SYMBOL(finalize_exec); 1400 1401 /* 1402 * Prepare credentials and lock ->cred_guard_mutex. 1403 * setup_new_exec() commits the new creds and drops the lock. 1404 * Or, if exec fails before, free_bprm() should release ->cred 1405 * and unlock. 1406 */ 1407 static int prepare_bprm_creds(struct linux_binprm *bprm) 1408 { 1409 if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) 1410 return -ERESTARTNOINTR; 1411 1412 bprm->cred = prepare_exec_creds(); 1413 if (likely(bprm->cred)) 1414 return 0; 1415 1416 mutex_unlock(¤t->signal->cred_guard_mutex); 1417 return -ENOMEM; 1418 } 1419 1420 /* Matches do_open_execat() */ 1421 static void do_close_execat(struct file *file) 1422 { 1423 if (!file) 1424 return; 1425 exe_file_allow_write_access(file); 1426 fput(file); 1427 } 1428 1429 static void free_bprm(struct linux_binprm *bprm) 1430 { 1431 if (bprm->mm) { 1432 acct_arg_size(bprm, 0); 1433 mmput(bprm->mm); 1434 } 1435 free_arg_pages(bprm); 1436 if (bprm->cred) { 1437 /* in case exec fails before de_thread() succeeds */ 1438 current->fs->in_exec = 0; 1439 mutex_unlock(¤t->signal->cred_guard_mutex); 1440 abort_creds(bprm->cred); 1441 } 1442 do_close_execat(bprm->file); 1443 if (bprm->executable) 1444 fput(bprm->executable); 1445 /* If a binfmt changed the interp, free it. 
*/ 1446 if (bprm->interp != bprm->filename) 1447 kfree(bprm->interp); 1448 kfree(bprm->fdpath); 1449 kfree(bprm); 1450 } 1451 1452 static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) 1453 { 1454 struct linux_binprm *bprm; 1455 struct file *file; 1456 int retval = -ENOMEM; 1457 1458 file = do_open_execat(fd, filename, flags); 1459 if (IS_ERR(file)) 1460 return ERR_CAST(file); 1461 1462 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 1463 if (!bprm) { 1464 do_close_execat(file); 1465 return ERR_PTR(-ENOMEM); 1466 } 1467 1468 bprm->file = file; 1469 1470 if (fd == AT_FDCWD || filename->name[0] == '/') { 1471 bprm->filename = filename->name; 1472 } else { 1473 if (filename->name[0] == '\0') { 1474 bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); 1475 bprm->comm_from_dentry = 1; 1476 } else { 1477 bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", 1478 fd, filename->name); 1479 } 1480 if (!bprm->fdpath) 1481 goto out_free; 1482 1483 /* 1484 * Record that a name derived from an O_CLOEXEC fd will be 1485 * inaccessible after exec. This allows the code in exec to 1486 * choose to fail when the executable is not mmaped into the 1487 * interpreter and an open file descriptor is not passed to 1488 * the interpreter. This makes for a better user experience 1489 * than having the interpreter start and then immediately fail 1490 * when it finds the executable is inaccessible. 1491 */ 1492 if (get_close_on_exec(fd)) 1493 bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; 1494 1495 bprm->filename = bprm->fdpath; 1496 } 1497 bprm->interp = bprm->filename; 1498 1499 /* 1500 * At this point, security_file_open() has already been called (with 1501 * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will 1502 * stop just after the security_bprm_creds_for_exec() call in 1503 * bprm_execve(). 
Indeed, the kernel should not try to parse the 1504 * content of the file with exec_binprm() nor change the calling 1505 * thread, which means that the following security functions will not 1506 * be called: 1507 * - security_bprm_check() 1508 * - security_bprm_creds_from_file() 1509 * - security_bprm_committing_creds() 1510 * - security_bprm_committed_creds() 1511 */ 1512 bprm->is_check = !!(flags & AT_EXECVE_CHECK); 1513 1514 retval = bprm_mm_init(bprm); 1515 if (!retval) 1516 return bprm; 1517 1518 out_free: 1519 free_bprm(bprm); 1520 return ERR_PTR(retval); 1521 } 1522 1523 int bprm_change_interp(const char *interp, struct linux_binprm *bprm) 1524 { 1525 /* If a binfmt changed the interp, free it first. */ 1526 if (bprm->interp != bprm->filename) 1527 kfree(bprm->interp); 1528 bprm->interp = kstrdup(interp, GFP_KERNEL); 1529 if (!bprm->interp) 1530 return -ENOMEM; 1531 return 0; 1532 } 1533 EXPORT_SYMBOL(bprm_change_interp); 1534 1535 /* 1536 * determine how safe it is to execute the proposed program 1537 * - the caller must hold ->cred_guard_mutex to protect against 1538 * PTRACE_ATTACH or seccomp thread-sync 1539 */ 1540 static void check_unsafe_exec(struct linux_binprm *bprm) 1541 { 1542 struct task_struct *p = current, *t; 1543 unsigned n_fs; 1544 1545 if (p->ptrace) 1546 bprm->unsafe |= LSM_UNSAFE_PTRACE; 1547 1548 /* 1549 * This isn't strictly necessary, but it makes it harder for LSMs to 1550 * mess up. 1551 */ 1552 if (task_no_new_privs(current)) 1553 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; 1554 1555 /* 1556 * If another task is sharing our fs, we cannot safely 1557 * suid exec because the differently privileged task 1558 * will be able to manipulate the current directory, etc. 1559 * It would be nice to force an unshare instead... 1560 * 1561 * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) 1562 * from another sub-thread until de_thread() succeeds, this 1563 * state is protected by cred_guard_mutex we hold. 
1564 */ 1565 n_fs = 1; 1566 spin_lock(&p->fs->lock); 1567 rcu_read_lock(); 1568 for_other_threads(p, t) { 1569 if (t->fs == p->fs) 1570 n_fs++; 1571 } 1572 rcu_read_unlock(); 1573 1574 /* "users" and "in_exec" locked for copy_fs() */ 1575 if (p->fs->users > n_fs) 1576 bprm->unsafe |= LSM_UNSAFE_SHARE; 1577 else 1578 p->fs->in_exec = 1; 1579 spin_unlock(&p->fs->lock); 1580 } 1581 1582 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) 1583 { 1584 /* Handle suid and sgid on files */ 1585 struct mnt_idmap *idmap; 1586 struct inode *inode = file_inode(file); 1587 unsigned int mode; 1588 vfsuid_t vfsuid; 1589 vfsgid_t vfsgid; 1590 int err; 1591 1592 if (!mnt_may_suid(file->f_path.mnt)) 1593 return; 1594 1595 if (task_no_new_privs(current)) 1596 return; 1597 1598 mode = READ_ONCE(inode->i_mode); 1599 if (!(mode & (S_ISUID|S_ISGID))) 1600 return; 1601 1602 idmap = file_mnt_idmap(file); 1603 1604 /* Be careful if suid/sgid is set */ 1605 inode_lock(inode); 1606 1607 /* Atomically reload and check mode/uid/gid now that lock held. */ 1608 mode = inode->i_mode; 1609 vfsuid = i_uid_into_vfsuid(idmap, inode); 1610 vfsgid = i_gid_into_vfsgid(idmap, inode); 1611 err = inode_permission(idmap, inode, MAY_EXEC); 1612 inode_unlock(inode); 1613 1614 /* Did the exec bit vanish out from under us? Give up. */ 1615 if (err) 1616 return; 1617 1618 /* We ignore suid/sgid if there are no mappings for them in the ns */ 1619 if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || 1620 !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) 1621 return; 1622 1623 if (mode & S_ISUID) { 1624 bprm->per_clear |= PER_CLEAR_ON_SETID; 1625 bprm->cred->euid = vfsuid_into_kuid(vfsuid); 1626 } 1627 1628 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1629 bprm->per_clear |= PER_CLEAR_ON_SETID; 1630 bprm->cred->egid = vfsgid_into_kgid(vfsgid); 1631 } 1632 } 1633 1634 /* 1635 * Compute brpm->cred based upon the final binary. 
*/
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
	/* Compute creds based on which file? */
	struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;

	/* Apply suid/sgid first, then let the security hook have its say. */
	bprm_fill_uid(bprm, file);
	return security_bprm_creds_from_file(bprm, file);
}

/*
 * Fill the binprm structure from the inode.
 * Read the first BINPRM_BUF_SIZE bytes
 *
 * This may be called multiple times for binary chains (scripts for example).
 *
 * Returns whatever kernel_read() returns.
 */
static int prepare_binprm(struct linux_binprm *bprm)
{
	loff_t pos = 0;

	/* Zero first so a short read leaves the rest of the buffer cleared. */
	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
}

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 *
 * Returns 0 on success (including when there is no argument to remove),
 * or -EFAULT if an argument page cannot be obtained.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
	unsigned long offset;
	char *kaddr;
	struct page *page;

	if (!bprm->argc)
		return 0;

	do {
		offset = bprm->p & ~PAGE_MASK;
		page = get_arg_page(bprm, bprm->p, 0);
		if (!page)
			return -EFAULT;
		kaddr = kmap_local_page(page);

		/* Advance to the NUL or to the end of this page. */
		for (; offset < PAGE_SIZE && kaddr[offset];
				offset++, bprm->p++)
			;

		kunmap_local(kaddr);
		put_arg_page(page);
	} while (offset == PAGE_SIZE);	/* string continues on the next page */

	/* Step over the terminating NUL itself. */
	bprm->p++;
	bprm->argc--;

	return 0;
}
EXPORT_SYMBOL(remove_arg_zero);

/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Returns the handler's result, a negative error from prepare_binprm() or
 * security_bprm_check(), or -ENOEXEC if every handler declined.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
	struct linux_binfmt *fmt;
	int retval;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		return retval;

	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		/* Skip formats whose module reference cannot be taken. */
		if (!try_module_get(fmt->module))
			continue;
		/* Do not hold binfmt_lock across the handler call. */
		read_unlock(&binfmt_lock);

		retval = fmt->load_binary(bprm);

		read_lock(&binfmt_lock);
		put_binfmt(fmt);
		/*
		 * Stop on anything but -ENOEXEC, and also once the point of
		 * no return has been passed.
		 */
		if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
			read_unlock(&binfmt_lock);
			return retval;
		}
	}
	read_unlock(&binfmt_lock);

	return -ENOEXEC;
}

/* binfmt handlers will call back into begin_new_exec() on success. */
static int exec_binprm(struct linux_binprm *bprm)
{
	pid_t old_pid, old_vpid;
	int ret, depth;

	/* Need to fetch pid before load_binary changes it */
	old_pid = current->pid;
	rcu_read_lock();
	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
	rcu_read_unlock();

	/*
	 * This allows 4 levels of binfmt rewrites before failing hard.
	 *
	 * NOTE(review): as written, the check below only trips once depth
	 * reaches 6, so the count in the sentence above does not match the
	 * code -- confirm which is intended.
	 */
	for (depth = 0;; depth++) {
		struct file *exec;
		if (depth > 5)
			return -ELOOP;

		ret = search_binary_handler(bprm);
		if (ret < 0)
			return ret;
		/* No interpreter requested: the final binary was loaded. */
		if (!bprm->interpreter)
			break;

		/* Restart the search with the interpreter as the file. */
		exec = bprm->file;
		bprm->file = bprm->interpreter;
		bprm->interpreter = NULL;

		exe_file_allow_write_access(exec);
		if (unlikely(bprm->have_execfd)) {
			/* Only one file can be kept as bprm->executable. */
			if (bprm->executable) {
				fput(exec);
				return -ENOEXEC;
			}
			bprm->executable = exec;
		} else
			fput(exec);
	}

	audit_bprm(bprm);
	trace_sched_process_exec(current, old_pid, bprm);
	ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
	proc_exec_connector(current);
	return 0;
}

/*
 * Run the exec described by a fully populated @bprm.
 *
 * Returns the result of the first failing step, or of exec_binprm() on
 * success. If the failure happened past the point of no return and no
 * fatal signal is pending, SIGSEGV is forced on the current task.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
	int retval;

	retval = prepare_bprm_creds(bprm);
	if (retval)
		return retval;

	/*
	 * Check for unsafe execution states before exec_binprm(), which
	 * will call back into begin_new_exec(), into bprm_creds_from_file(),
	 * where setuid-ness is evaluated.
	 */
	check_unsafe_exec(bprm);
	current->in_execve = 1;
	sched_mm_cid_before_execve(current);

	sched_exec();

	/* Set the unchanging part of bprm->cred */
	retval = security_bprm_creds_for_exec(bprm);
	/* A check-only request (bprm->is_check) stops here, even on success. */
	if (retval || bprm->is_check)
		goto out;

	retval = exec_binprm(bprm);
	if (retval < 0)
		goto out;

	sched_mm_cid_after_execve(current);
	rseq_execve(current);
	/* execve succeeded */
	current->in_execve = 0;
	user_events_execve(current);
	acct_update_integrals(current);
	task_numa_free(current, false);
	return retval;

out:
	/*
	 * If past the point of no return ensure the code never
	 * returns to the userspace process.  Use an existing fatal
	 * signal if present otherwise terminate the process with
	 * SIGSEGV.
	 */
	if (bprm->point_of_no_return && !fatal_signal_pending(current))
		force_fatal_sig(SIGSEGV);

	sched_mm_cid_after_execve(current);
	rseq_set_notify_resume(current);
	current->in_execve = 0;

	return retval;
}

/*
 * Common implementation behind the execve()/execveat() entry points.
 *
 * Always consumes @filename via putname(), except when @filename is
 * itself an error pointer, in which case that error is returned.
 */
static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
{
	struct linux_binprm *bprm;
	int retval;

	if (IS_ERR(filename))
		return PTR_ERR(filename);

	/*
	 * We move the actual failure in case of RLIMIT_NPROC excess from
	 * set*uid() to execve() because too many poorly written programs
	 * don't check setuid() return code.  Here we additionally recheck
	 * whether NPROC limit is still exceeded.
	 */
	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
		retval = -EAGAIN;
		goto out_ret;
	}

	/* We're below the limit (still or again), so we don't want to make
	 * further execve() calls fail. */
	current->flags &= ~PF_NPROC_EXCEEDED;

	bprm = alloc_bprm(fd, filename, flags);
	if (IS_ERR(bprm)) {
		retval = PTR_ERR(bprm);
		goto out_ret;
	}

	retval = count(argv, MAX_ARG_STRINGS);
	if (retval < 0)
		goto out_free;
	bprm->argc = retval;

	retval = count(envp, MAX_ARG_STRINGS);
	if (retval < 0)
		goto out_free;
	bprm->envc = retval;

	retval = bprm_stack_limits(bprm);
	if (retval < 0)
		goto out_free;

	/* Copy order: filename first, then envp, then argv. */
	retval = copy_string_kernel(bprm->filename, bprm);
	if (retval < 0)
		goto out_free;
	bprm->exec = bprm->p;

	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out_free;

	retval = copy_strings(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out_free;

	/*
	 * When argv is empty, add an empty string ("") as argv[0] to
	 * ensure confused userspace programs that start processing
	 * from argv[1] won't end up walking envp. See also
	 * bprm_stack_limits().
	 */
	if (bprm->argc == 0) {
		retval = copy_string_kernel("", bprm);
		if (retval < 0)
			goto out_free;
		bprm->argc = 1;

		pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
			     current->comm, bprm->filename);
	}

	retval = bprm_execve(bprm);
out_free:
	free_bprm(bprm);

out_ret:
	putname(filename);
	return retval;
}

/*
 * execve() variant taking kernel-space strings.
 *
 * Rejects kernel threads and an empty @argv with -EINVAL (each with a
 * one-time warning).
 */
int kernel_execve(const char *kernel_filename,
		  const char *const *argv, const char *const *envp)
{
	struct filename *filename;
	struct linux_binprm *bprm;
	int fd = AT_FDCWD;
	int retval;

	/* It is non-sense for kernel threads to call execve */
	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
		return -EINVAL;

	filename = getname_kernel(kernel_filename);
	if (IS_ERR(filename))
		return PTR_ERR(filename);

	bprm = alloc_bprm(fd, filename, 0);
	if (IS_ERR(bprm)) {
		retval = PTR_ERR(bprm);
		goto out_ret;
	}

	retval = count_strings_kernel(argv);
	/* Unlike the userspace path, an empty argv is an error here. */
	if (WARN_ON_ONCE(retval == 0))
		retval = -EINVAL;
	if (retval < 0)
		goto out_free;
	bprm->argc = retval;

	retval = count_strings_kernel(envp);
	if (retval < 0)
		goto out_free;
	bprm->envc = retval;

	retval = bprm_stack_limits(bprm);
	if (retval < 0)
		goto out_free;

	retval = copy_string_kernel(bprm->filename, bprm);
	if (retval < 0)
		goto out_free;
	bprm->exec = bprm->p;

	retval = copy_strings_kernel(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out_free;

	retval = copy_strings_kernel(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out_free;

	retval = bprm_execve(bprm);
out_free:
	free_bprm(bprm);
out_ret:
	putname(filename);
	return retval;
}

/* execve() with native pointers: AT_FDCWD and no flags. */
static int do_execve(struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp)
{
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/* execveat() with native pointers. */
static int do_execveat(int fd, struct filename *filename,
		const char __user *const __user *__argv,
		const char __user *const __user *__envp,
		int flags)
{
	struct user_arg_ptr argv = { .ptr.native = __argv };
	struct user_arg_ptr envp = { .ptr.native = __envp };

	return do_execveat_common(fd, filename, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/* execve() with compat pointers: AT_FDCWD and no flags. */
static int compat_do_execve(struct filename *filename,
	const compat_uptr_t __user *__argv,
	const compat_uptr_t __user *__envp)
{
	struct user_arg_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr envp = {
		.is_compat = true,
		.ptr.compat = __envp,
	};
	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/* execveat() with compat pointers. */
static int compat_do_execveat(int fd, struct filename *filename,
			      const compat_uptr_t __user *__argv,
			      const compat_uptr_t __user *__envp,
			      int flags)
{
	struct user_arg_ptr argv = {
		.is_compat = true,
		.ptr.compat = __argv,
	};
	struct user_arg_ptr envp = {
		.is_compat = true,
		.ptr.compat = __envp,
	};
	return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif

/*
 * Make @new the binfmt of current->mm, moving the module reference from
 * the previous binfmt (if any) to @new (if non-NULL).
 */
void set_binfmt(struct linux_binfmt *new)
{
	struct mm_struct *mm = current->mm;

	if (mm->binfmt)
		module_put(mm->binfmt->module);

	mm->binfmt = new;
	if (new)
		__module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
2044 */ 2045 void set_dumpable(struct mm_struct *mm, int value) 2046 { 2047 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) 2048 return; 2049 2050 set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); 2051 } 2052 2053 SYSCALL_DEFINE3(execve, 2054 const char __user *, filename, 2055 const char __user *const __user *, argv, 2056 const char __user *const __user *, envp) 2057 { 2058 return do_execve(getname(filename), argv, envp); 2059 } 2060 2061 SYSCALL_DEFINE5(execveat, 2062 int, fd, const char __user *, filename, 2063 const char __user *const __user *, argv, 2064 const char __user *const __user *, envp, 2065 int, flags) 2066 { 2067 return do_execveat(fd, 2068 getname_uflags(filename, flags), 2069 argv, envp, flags); 2070 } 2071 2072 #ifdef CONFIG_COMPAT 2073 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, 2074 const compat_uptr_t __user *, argv, 2075 const compat_uptr_t __user *, envp) 2076 { 2077 return compat_do_execve(getname(filename), argv, envp); 2078 } 2079 2080 COMPAT_SYSCALL_DEFINE5(execveat, int, fd, 2081 const char __user *, filename, 2082 const compat_uptr_t __user *, argv, 2083 const compat_uptr_t __user *, envp, 2084 int, flags) 2085 { 2086 return compat_do_execveat(fd, 2087 getname_uflags(filename, flags), 2088 argv, envp, flags); 2089 } 2090 #endif 2091 2092 #ifdef CONFIG_SYSCTL 2093 2094 static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, 2095 void *buffer, size_t *lenp, loff_t *ppos) 2096 { 2097 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 2098 2099 if (!error) 2100 validate_coredump_safety(); 2101 return error; 2102 } 2103 2104 static const struct ctl_table fs_exec_sysctls[] = { 2105 { 2106 .procname = "suid_dumpable", 2107 .data = &suid_dumpable, 2108 .maxlen = sizeof(int), 2109 .mode = 0644, 2110 .proc_handler = proc_dointvec_minmax_coredump, 2111 .extra1 = SYSCTL_ZERO, 2112 .extra2 = SYSCTL_TWO, 2113 }, 2114 }; 2115 2116 static int __init init_fs_exec_sysctls(void) 2117 { 2118 
register_sysctl_init("fs", fs_exec_sysctls); 2119 return 0; 2120 } 2121 2122 fs_initcall(init_fs_exec_sysctls); 2123 #endif /* CONFIG_SYSCTL */ 2124 2125 #ifdef CONFIG_EXEC_KUNIT_TEST 2126 #include "tests/exec_kunit.c" 2127 #endif 2128