// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/sort.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <asm/exec.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

static bool dump_vma_snapshot(struct coredump_params *cprm);
static void free_vma_snapshot(struct coredump_params *cprm);

#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
/* Define a reasonable max cap */
#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)

static int core_uses_pid;
static unsigned int core_pipe_limit;
static unsigned int core_sort_vma;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;

struct core_name {
	char *corename;
	int used, size;
};

static int expand_corename(struct core_name *cn, int size)
{
	char *corename;

	size = kmalloc_size_roundup(size);
	corename = krealloc(cn->corename, size, GFP_KERNEL);

	if (!corename)
		return -ENOMEM;

	if (size > core_name_size) /* racy but harmless */
		core_name_size = size;

	cn->size = size;
	cn->corename = corename;
	return 0;
}

static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
				     va_list arg)
{
	int free, need;
	va_list arg_copy;

again:
	free = cn->size - cn->used;

	va_copy(arg_copy, arg);
	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
	va_end(arg_copy);

	if (need < free) {
		cn->used += need;
		return 0;
	}

	if (!expand_corename(cn, cn->size + need - free + 1))
		goto again;

	return -ENOMEM;
}

static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
	va_list arg;
	int ret;

	va_start(arg, fmt);
	ret = cn_vprintf(cn, fmt, arg);
	va_end(arg);

	return ret;
}

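/*
 * Like cn_printf(), but escape the result so it cannot form a path
 * component: a bare "." or ".." expansion is defused and every '/'
 * is replaced with '!'.
 */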
static __printf(2, 3)
int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
	int cur = cn->used;
	va_list arg;
	int ret;

	va_start(arg, fmt);
	ret = cn_vprintf(cn, fmt, arg);
	va_end(arg);

	if (ret == 0) {
		/*
		 * Ensure that this coredump name component can't cause the
		 * resulting corefile path to consist of a ".." or ".".
		 */
		if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
		    (cn->used - cur == 2 && cn->corename[cur] == '.'
				&& cn->corename[cur+1] == '.'))
			cn->corename[cur] = '!';

		/*
		 * Empty names are fishy and could be used to create a "//" in a
		 * corefile name, causing the coredump to happen one directory
		 * level too high. Enforce that all components of the core
		 * pattern are at least one character long.
		 */
		if (cn->used == cur)
			ret = cn_printf(cn, "!");
	}

	for (; cur < cn->used; ++cur) {
		if (cn->corename[cur] == '/')
			cn->corename[cur] = '!';
	}
	return ret;
}

static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
	struct file *exe_file;
	char *pathbuf, *path, *ptr;
	int ret;

	exe_file = get_mm_exe_file(current->mm);
	if (!exe_file)
		return cn_esc_printf(cn, "%s (path unknown)", current->comm);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!pathbuf) {
		ret = -ENOMEM;
		goto put_exe_file;
	}

	path = file_path(exe_file, pathbuf, PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		goto free_buf;
	}

	if (name_only) {
		ptr = strrchr(path, '/');
		if (ptr)
			path = ptr + 1;
	}
	ret = cn_esc_printf(cn, "%s", path);

free_buf:
	kfree(pathbuf);
put_exe_file:
	fput(exe_file);
	return ret;
}

/* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
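/*
 * For example, with core_pattern set to "core.%e.%p", a crash of a
 * process named "myapp" with PID 1234 expands to "core.myapp.1234".
 */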
static int format_corename(struct core_name *cn, struct coredump_params *cprm,
			   size_t **argv, int *argc)
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	int ispipe = (*pat_ptr == '|');
	bool was_space = false;
	int pid_in_pattern = 0;
	int err = 0;

	cn->used = 0;
	cn->corename = NULL;
	if (expand_corename(cn, core_name_size))
		return -ENOMEM;
	cn->corename[0] = '\0';

	if (ispipe) {
		int argvs = sizeof(core_pattern) / 2;
		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
		if (!(*argv))
			return -ENOMEM;
		(*argv)[(*argc)++] = 0;
		++pat_ptr;
		if (!(*pat_ptr))
			return -ENOMEM;
	}

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
		/*
		 * Split on spaces before doing template expansion so that
		 * %e and %E don't get split if they have spaces in them
		 */
		if (ispipe) {
			if (isspace(*pat_ptr)) {
				if (cn->used != 0)
					was_space = true;
				pat_ptr++;
				continue;
			} else if (was_space) {
				was_space = false;
				err = cn_printf(cn, "%c", '\0');
				if (err)
					return err;
				(*argv)[(*argc)++] = cn->used;
			}
		}
		if (*pat_ptr != '%') {
			err = cn_printf(cn, "%c", *pat_ptr++);
		} else {
			switch (*++pat_ptr) {
			/* single % at the end, drop that */
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				err = cn_printf(cn, "%c", '%');
				break;
			/* pid */
			case 'p':
				pid_in_pattern = 1;
				err = cn_printf(cn, "%d",
						task_tgid_vnr(current));
				break;
			/* global pid */
			case 'P':
				err = cn_printf(cn, "%d",
						task_tgid_nr(current));
				break;
			case 'i':
				err = cn_printf(cn, "%d",
						task_pid_vnr(current));
				break;
			case 'I':
				err = cn_printf(cn, "%d",
						task_pid_nr(current));
				break;
			/* uid */
			case 'u':
				err = cn_printf(cn, "%u",
						from_kuid(&init_user_ns,
							  cred->uid));
				break;
			/* gid */
			case 'g':
				err = cn_printf(cn, "%u",
						from_kgid(&init_user_ns,
							  cred->gid));
				break;
			case 'd':
				err = cn_printf(cn, "%d",
						__get_dumpable(cprm->mm_flags));
				break;
			/* signal that caused the coredump */
			case 's':
				err = cn_printf(cn, "%d",
						cprm->siginfo->si_signo);
				break;
			/* UNIX time of coredump */
			case 't': {
				time64_t time;

				time = ktime_get_real_seconds();
				err = cn_printf(cn, "%lld", time);
				break;
			}
			/* hostname */
			case 'h':
				down_read(&uts_sem);
				err = cn_esc_printf(cn, "%s",
						    utsname()->nodename);
				up_read(&uts_sem);
				break;
			/* executable, could be changed by prctl PR_SET_NAME etc */
			case 'e':
				err = cn_esc_printf(cn, "%s", current->comm);
				break;
			/* file name of executable */
			case 'f':
				err = cn_print_exe_file(cn, true);
				break;
			case 'E':
				err = cn_print_exe_file(cn, false);
				break;
			/* core limit size */
			case 'c':
				err = cn_printf(cn, "%lu",
						rlimit(RLIMIT_CORE));
				break;
			/* CPU the task ran on */
			case 'C':
				err = cn_printf(cn, "%d", cprm->cpu);
				break;
			default:
				break;
			}
			++pat_ptr;
		}

		if (err)
			return err;
	}

out:
	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands.
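	 * For example, the default pattern "core" then becomes "core.1234".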
	 */
	if (!ispipe && !pid_in_pattern && core_uses_pid) {
		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
		if (err)
			return err;
	}
	return ispipe;
}

static int zap_process(struct signal_struct *signal, int exit_code)
{
	struct task_struct *t;
	int nr = 0;

	signal->flags = SIGNAL_GROUP_EXIT;
	signal->group_exit_code = exit_code;
	signal->group_stop_count = 0;

	__for_each_thread(signal, t) {
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
			sigaddset(&t->pending.signal, SIGKILL);
			signal_wake_up(t, 1);
			nr++;
		}
	}

	return nr;
}

static int zap_threads(struct task_struct *tsk,
			struct core_state *core_state, int exit_code)
{
	struct signal_struct *signal = tsk->signal;
	int nr = -EAGAIN;

	spin_lock_irq(&tsk->sighand->siglock);
	if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) {
		/* Allow SIGKILL, see prepare_signal() */
		signal->core_state = core_state;
		nr = zap_process(signal, exit_code);
		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
		tsk->flags |= PF_DUMPCORE;
		atomic_set(&core_state->nr_threads, nr);
	}
	spin_unlock_irq(&tsk->sighand->siglock);
	return nr;
}

static int coredump_wait(int exit_code, struct core_state *core_state)
{
	struct task_struct *tsk = current;
	int core_waiters = -EBUSY;

	init_completion(&core_state->startup);
	core_state->dumper.task = tsk;
	core_state->dumper.next = NULL;

	core_waiters = zap_threads(tsk, core_state, exit_code);
	if (core_waiters > 0) {
		struct core_thread *ptr;

		wait_for_completion_state(&core_state->startup,
					  TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
		/*
		 * Wait for all the threads to become inactive, so that
		 * all the thread context (extended register state, like
		 * fpu etc) gets copied to the memory.
		 */
		ptr = core_state->dumper.next;
		while (ptr != NULL) {
			wait_task_inactive(ptr->task, TASK_ANY);
			ptr = ptr->next;
		}
	}

	return core_waiters;
}

static void coredump_finish(bool core_dumped)
{
	struct core_thread *curr, *next;
	struct task_struct *task;

	spin_lock_irq(&current->sighand->siglock);
	if (core_dumped && !__fatal_signal_pending(current))
		current->signal->group_exit_code |= 0x80;
	next = current->signal->core_state->dumper.next;
	current->signal->core_state = NULL;
	spin_unlock_irq(&current->sighand->siglock);

	while ((curr = next) != NULL) {
		next = curr->next;
		task = curr->task;
		/*
		 * see coredump_task_exit(), curr->task must not see
		 * ->task == NULL before we read ->next.
		 */
		smp_mb();
		curr->task = NULL;
		wake_up_process(task);
	}
}

static bool dump_interrupted(void)
{
	/*
	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
	 * can do try_to_freeze() and check __fatal_signal_pending(),
	 * but then we need to teach dump_write() to restart and clear
	 * TIF_SIGPENDING.
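	 * This is checked before each chunk of output, so a fatal signal
	 * stops an in-progress dump promptly.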
468 */ 469 return fatal_signal_pending(current) || freezing(current); 470 } 471 472 static void wait_for_dump_helpers(struct file *file) 473 { 474 struct pipe_inode_info *pipe = file->private_data; 475 476 pipe_lock(pipe); 477 pipe->readers++; 478 pipe->writers--; 479 wake_up_interruptible_sync(&pipe->rd_wait); 480 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 481 pipe_unlock(pipe); 482 483 /* 484 * We actually want wait_event_freezable() but then we need 485 * to clear TIF_SIGPENDING and improve dump_interrupted(). 486 */ 487 wait_event_interruptible(pipe->rd_wait, pipe->readers == 1); 488 489 pipe_lock(pipe); 490 pipe->readers--; 491 pipe->writers++; 492 pipe_unlock(pipe); 493 } 494 495 /* 496 * umh_pipe_setup 497 * helper function to customize the process used 498 * to collect the core in userspace. Specifically 499 * it sets up a pipe and installs it as fd 0 (stdin) 500 * for the process. Returns 0 on success, or 501 * PTR_ERR on failure. 502 * Note that it also sets the core limit to 1. This 503 * is a special value that we use to trap recursive 504 * core dumps 505 */ 506 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) 507 { 508 struct file *files[2]; 509 struct coredump_params *cp = (struct coredump_params *)info->data; 510 int err = create_pipe_files(files, 0); 511 if (err) 512 return err; 513 514 cp->file = files[1]; 515 516 err = replace_fd(0, files[0], 0); 517 fput(files[0]); 518 /* and disallow core files too */ 519 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; 520 521 return err; 522 } 523 524 void do_coredump(const kernel_siginfo_t *siginfo) 525 { 526 struct core_state core_state; 527 struct core_name cn; 528 struct mm_struct *mm = current->mm; 529 struct linux_binfmt * binfmt; 530 const struct cred *old_cred; 531 struct cred *cred; 532 int retval = 0; 533 int ispipe; 534 size_t *argv = NULL; 535 int argc = 0; 536 /* require nonrelative corefile path and be extra careful */ 537 bool need_suid_safe = false; 538 bool core_dumped = false; 539 static atomic_t core_dump_count = ATOMIC_INIT(0); 540 struct coredump_params cprm = { 541 .siginfo = siginfo, 542 .limit = rlimit(RLIMIT_CORE), 543 /* 544 * We must use the same mm->flags while dumping core to avoid 545 * inconsistency of bit flags, since this flag is not protected 546 * by any locks. 547 */ 548 .mm_flags = mm->flags, 549 .vma_meta = NULL, 550 .cpu = raw_smp_processor_id(), 551 }; 552 553 audit_core_dumps(siginfo->si_signo); 554 555 binfmt = mm->binfmt; 556 if (!binfmt || !binfmt->core_dump) 557 goto fail; 558 if (!__get_dumpable(cprm.mm_flags)) 559 goto fail; 560 561 cred = prepare_creds(); 562 if (!cred) 563 goto fail; 564 /* 565 * We cannot trust fsuid as being the "true" uid of the process 566 * nor do we know its entire history. We only know it was tainted 567 * so we dump it as root in mode 2, and only into a controlled 568 * environment (pipe handler or fully qualified path). 
569 */ 570 if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { 571 /* Setuid core dump mode */ 572 cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ 573 need_suid_safe = true; 574 } 575 576 retval = coredump_wait(siginfo->si_signo, &core_state); 577 if (retval < 0) 578 goto fail_creds; 579 580 old_cred = override_creds(cred); 581 582 ispipe = format_corename(&cn, &cprm, &argv, &argc); 583 584 if (ispipe) { 585 int argi; 586 int dump_count; 587 char **helper_argv; 588 struct subprocess_info *sub_info; 589 590 if (ispipe < 0) { 591 coredump_report_failure("format_corename failed, aborting core"); 592 goto fail_unlock; 593 } 594 595 if (cprm.limit == 1) { 596 /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. 597 * 598 * Normally core limits are irrelevant to pipes, since 599 * we're not writing to the file system, but we use 600 * cprm.limit of 1 here as a special value, this is a 601 * consistent way to catch recursive crashes. 602 * We can still crash if the core_pattern binary sets 603 * RLIM_CORE = !1, but it runs as root, and can do 604 * lots of stupid things. 605 * 606 * Note that we use task_tgid_vnr here to grab the pid 607 * of the process group leader. That way we get the 608 * right pid if a thread in a multi-threaded 609 * core_pattern process dies. 610 */ 611 coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); 612 goto fail_unlock; 613 } 614 cprm.limit = RLIM_INFINITY; 615 616 dump_count = atomic_inc_return(&core_dump_count); 617 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 618 coredump_report_failure("over core_pipe_limit, skipping core dump"); 619 goto fail_dropcount; 620 } 621 622 helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), 623 GFP_KERNEL); 624 if (!helper_argv) { 625 coredump_report_failure("%s failed to allocate memory", __func__); 626 goto fail_dropcount; 627 } 628 for (argi = 0; argi < argc; argi++) 629 helper_argv[argi] = cn.corename + argv[argi]; 630 helper_argv[argi] = NULL; 631 632 retval = -ENOMEM; 633 sub_info = call_usermodehelper_setup(helper_argv[0], 634 helper_argv, NULL, GFP_KERNEL, 635 umh_pipe_setup, NULL, &cprm); 636 if (sub_info) 637 retval = call_usermodehelper_exec(sub_info, 638 UMH_WAIT_EXEC); 639 640 kfree(helper_argv); 641 if (retval) { 642 coredump_report_failure("|%s pipe failed", cn.corename); 643 goto close_fail; 644 } 645 } else { 646 struct mnt_idmap *idmap; 647 struct inode *inode; 648 int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | 649 O_LARGEFILE | O_EXCL; 650 651 if (cprm.limit < binfmt->min_coredump) 652 goto fail_unlock; 653 654 if (need_suid_safe && cn.corename[0] != '/') { 655 coredump_report_failure( 656 "this process can only dump core to a fully qualified path, skipping core dump"); 657 goto fail_unlock; 658 } 659 660 /* 661 * Unlink the file if it exists unless this is a SUID 662 * binary - in that case, we're running around with root 663 * privs and don't want to unlink another user's coredump. 664 */ 665 if (!need_suid_safe) { 666 /* 667 * If it doesn't exist, that's fine. If there's some 668 * other problem, we'll catch it at the filp_open(). 669 */ 670 do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); 671 } 672 673 /* 674 * There is a race between unlinking and creating the 675 * file, but if that causes an EEXIST here, that's 676 * fine - another process raced with us while creating 677 * the corefile, and the other process won. To userspace, 678 * what matters is that at least one of the two processes 679 * writes its coredump successfully, not which one. 
680 */ 681 if (need_suid_safe) { 682 /* 683 * Using user namespaces, normal user tasks can change 684 * their current->fs->root to point to arbitrary 685 * directories. Since the intention of the "only dump 686 * with a fully qualified path" rule is to control where 687 * coredumps may be placed using root privileges, 688 * current->fs->root must not be used. Instead, use the 689 * root directory of init_task. 690 */ 691 struct path root; 692 693 task_lock(&init_task); 694 get_fs_root(init_task.fs, &root); 695 task_unlock(&init_task); 696 cprm.file = file_open_root(&root, cn.corename, 697 open_flags, 0600); 698 path_put(&root); 699 } else { 700 cprm.file = filp_open(cn.corename, open_flags, 0600); 701 } 702 if (IS_ERR(cprm.file)) 703 goto fail_unlock; 704 705 inode = file_inode(cprm.file); 706 if (inode->i_nlink > 1) 707 goto close_fail; 708 if (d_unhashed(cprm.file->f_path.dentry)) 709 goto close_fail; 710 /* 711 * AK: actually i see no reason to not allow this for named 712 * pipes etc, but keep the previous behaviour for now. 713 */ 714 if (!S_ISREG(inode->i_mode)) 715 goto close_fail; 716 /* 717 * Don't dump core if the filesystem changed owner or mode 718 * of the file during file creation. This is an issue when 719 * a process dumps core while its cwd is e.g. on a vfat 720 * filesystem. 721 */ 722 idmap = file_mnt_idmap(cprm.file); 723 if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), 724 current_fsuid())) { 725 coredump_report_failure("Core dump to %s aborted: " 726 "cannot preserve file owner", cn.corename); 727 goto close_fail; 728 } 729 if ((inode->i_mode & 0677) != 0600) { 730 coredump_report_failure("Core dump to %s aborted: " 731 "cannot preserve file permissions", cn.corename); 732 goto close_fail; 733 } 734 if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) 735 goto close_fail; 736 if (do_truncate(idmap, cprm.file->f_path.dentry, 737 0, 0, cprm.file)) 738 goto close_fail; 739 } 740 741 /* get us an unshared descriptor table; almost always a no-op */ 742 /* The cell spufs coredump code reads the file descriptor tables */ 743 retval = unshare_files(); 744 if (retval) 745 goto close_fail; 746 if (!dump_interrupted()) { 747 /* 748 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would 749 * have this set to NULL. 750 */ 751 if (!cprm.file) { 752 coredump_report_failure("Core dump to |%s disabled", cn.corename); 753 goto close_fail; 754 } 755 if (!dump_vma_snapshot(&cprm)) 756 goto close_fail; 757 758 file_start_write(cprm.file); 759 core_dumped = binfmt->core_dump(&cprm); 760 /* 761 * Ensures that file size is big enough to contain the current 762 * file postion. This prevents gdb from complaining about 763 * a truncated file if the last "write" to the file was 764 * dump_skip. 765 */ 766 if (cprm.to_skip) { 767 cprm.to_skip--; 768 dump_emit(&cprm, "", 1); 769 } 770 file_end_write(cprm.file); 771 free_vma_snapshot(&cprm); 772 } 773 if (ispipe && core_pipe_limit) 774 wait_for_dump_helpers(cprm.file); 775 close_fail: 776 if (cprm.file) 777 filp_close(cprm.file, NULL); 778 fail_dropcount: 779 if (ispipe) 780 atomic_dec(&core_dump_count); 781 fail_unlock: 782 kfree(argv); 783 kfree(cn.corename); 784 coredump_finish(core_dumped); 785 revert_creds(old_cred); 786 fail_creds: 787 put_cred(cred); 788 fail: 789 return; 790 } 791 792 /* 793 * Core dumping helper functions. These are the only things you should 794 * do on a core-file: use only these functions to write out all the 795 * necessary info. 
796 */ 797 static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) 798 { 799 struct file *file = cprm->file; 800 loff_t pos = file->f_pos; 801 ssize_t n; 802 if (cprm->written + nr > cprm->limit) 803 return 0; 804 805 806 if (dump_interrupted()) 807 return 0; 808 n = __kernel_write(file, addr, nr, &pos); 809 if (n != nr) 810 return 0; 811 file->f_pos = pos; 812 cprm->written += n; 813 cprm->pos += n; 814 815 return 1; 816 } 817 818 static int __dump_skip(struct coredump_params *cprm, size_t nr) 819 { 820 static char zeroes[PAGE_SIZE]; 821 struct file *file = cprm->file; 822 if (file->f_mode & FMODE_LSEEK) { 823 if (dump_interrupted() || 824 vfs_llseek(file, nr, SEEK_CUR) < 0) 825 return 0; 826 cprm->pos += nr; 827 return 1; 828 } else { 829 while (nr > PAGE_SIZE) { 830 if (!__dump_emit(cprm, zeroes, PAGE_SIZE)) 831 return 0; 832 nr -= PAGE_SIZE; 833 } 834 return __dump_emit(cprm, zeroes, nr); 835 } 836 } 837 838 int dump_emit(struct coredump_params *cprm, const void *addr, int nr) 839 { 840 if (cprm->to_skip) { 841 if (!__dump_skip(cprm, cprm->to_skip)) 842 return 0; 843 cprm->to_skip = 0; 844 } 845 return __dump_emit(cprm, addr, nr); 846 } 847 EXPORT_SYMBOL(dump_emit); 848 849 void dump_skip_to(struct coredump_params *cprm, unsigned long pos) 850 { 851 cprm->to_skip = pos - cprm->pos; 852 } 853 EXPORT_SYMBOL(dump_skip_to); 854 855 void dump_skip(struct coredump_params *cprm, size_t nr) 856 { 857 cprm->to_skip += nr; 858 } 859 EXPORT_SYMBOL(dump_skip); 860 861 #ifdef CONFIG_ELF_CORE 862 static int dump_emit_page(struct coredump_params *cprm, struct page *page) 863 { 864 struct bio_vec bvec; 865 struct iov_iter iter; 866 struct file *file = cprm->file; 867 loff_t pos; 868 ssize_t n; 869 870 if (!page) 871 return 0; 872 873 if (cprm->to_skip) { 874 if (!__dump_skip(cprm, cprm->to_skip)) 875 return 0; 876 cprm->to_skip = 0; 877 } 878 if (cprm->written + PAGE_SIZE > cprm->limit) 879 return 0; 880 if (dump_interrupted()) 881 return 0; 882 pos = file->f_pos; 883 bvec_set_page(&bvec, page, PAGE_SIZE, 0); 884 iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); 885 n = __kernel_write_iter(cprm->file, &iter, &pos); 886 if (n != PAGE_SIZE) 887 return 0; 888 file->f_pos = pos; 889 cprm->written += PAGE_SIZE; 890 cprm->pos += PAGE_SIZE; 891 892 return 1; 893 } 894 895 /* 896 * If we might get machine checks from kernel accesses during the 897 * core dump, let's get those errors early rather than during the 898 * IO. This is not performance-critical enough to warrant having 899 * all the machine check logic in the iovec paths. 900 */ 901 #ifdef copy_mc_to_kernel 902 903 #define dump_page_alloc() alloc_page(GFP_KERNEL) 904 #define dump_page_free(x) __free_page(x) 905 static struct page *dump_page_copy(struct page *src, struct page *dst) 906 { 907 void *buf = kmap_local_page(src); 908 size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE); 909 kunmap_local(buf); 910 return left ? NULL : dst; 911 } 912 913 #else 914 915 /* We just want to return non-NULL; it's never used. 
 */
#define dump_page_alloc() ERR_PTR(-EINVAL)
#define dump_page_free(x) ((void)(x))
static inline struct page *dump_page_copy(struct page *src, struct page *dst)
{
	return src;
}
#endif

int dump_user_range(struct coredump_params *cprm, unsigned long start,
		    unsigned long len)
{
	unsigned long addr;
	struct page *dump_page;
	int locked, ret;

	dump_page = dump_page_alloc();
	if (!dump_page)
		return 0;

	ret = 0;
	locked = 0;
	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page;

		if (!locked) {
			if (mmap_read_lock_killable(current->mm))
				goto out;
			locked = 1;
		}

		/*
		 * To avoid having to allocate page tables for virtual address
		 * ranges that have never been used yet, and also to make it
		 * easy to generate sparse core files, use a helper that returns
		 * NULL when encountering an empty page table entry that would
		 * otherwise have been filled with the zero page.
		 */
		page = get_dump_page(addr, &locked);
		if (page) {
			if (locked) {
				mmap_read_unlock(current->mm);
				locked = 0;
			}
			int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
			put_page(page);
			if (stop)
				goto out;
		} else {
			dump_skip(cprm, PAGE_SIZE);
		}

		if (dump_interrupted())
			goto out;

		if (!need_resched())
			continue;
		if (locked) {
			mmap_read_unlock(current->mm);
			locked = 0;
		}
		cond_resched();
	}
	ret = 1;
out:
	if (locked)
		mmap_read_unlock(current->mm);

	dump_page_free(dump_page);
	return ret;
}
#endif

int dump_align(struct coredump_params *cprm, int align)
{
	unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);

	if (align & (align - 1))
		return 0;
	if (mod)
		cprm->to_skip += align - mod;
	return 1;
}
EXPORT_SYMBOL(dump_align);

#ifdef CONFIG_SYSCTL

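/*
 * Warn when the configured core_pattern cannot honour fs.suid_dumpable=2:
 * set-id cores must go to a pipe handler or an absolute path.
 */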
" 1008 "Set kernel.core_pattern before fs.suid_dumpable."); 1009 } 1010 } 1011 1012 static int proc_dostring_coredump(const struct ctl_table *table, int write, 1013 void *buffer, size_t *lenp, loff_t *ppos) 1014 { 1015 int error = proc_dostring(table, write, buffer, lenp, ppos); 1016 1017 if (!error) 1018 validate_coredump_safety(); 1019 return error; 1020 } 1021 1022 static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; 1023 static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; 1024 1025 static const struct ctl_table coredump_sysctls[] = { 1026 { 1027 .procname = "core_uses_pid", 1028 .data = &core_uses_pid, 1029 .maxlen = sizeof(int), 1030 .mode = 0644, 1031 .proc_handler = proc_dointvec, 1032 }, 1033 { 1034 .procname = "core_pattern", 1035 .data = core_pattern, 1036 .maxlen = CORENAME_MAX_SIZE, 1037 .mode = 0644, 1038 .proc_handler = proc_dostring_coredump, 1039 }, 1040 { 1041 .procname = "core_pipe_limit", 1042 .data = &core_pipe_limit, 1043 .maxlen = sizeof(unsigned int), 1044 .mode = 0644, 1045 .proc_handler = proc_dointvec_minmax, 1046 .extra1 = SYSCTL_ZERO, 1047 .extra2 = SYSCTL_INT_MAX, 1048 }, 1049 { 1050 .procname = "core_file_note_size_limit", 1051 .data = &core_file_note_size_limit, 1052 .maxlen = sizeof(unsigned int), 1053 .mode = 0644, 1054 .proc_handler = proc_douintvec_minmax, 1055 .extra1 = (unsigned int *)&core_file_note_size_min, 1056 .extra2 = (unsigned int *)&core_file_note_size_max, 1057 }, 1058 { 1059 .procname = "core_sort_vma", 1060 .data = &core_sort_vma, 1061 .maxlen = sizeof(int), 1062 .mode = 0644, 1063 .proc_handler = proc_douintvec_minmax, 1064 .extra1 = SYSCTL_ZERO, 1065 .extra2 = SYSCTL_ONE, 1066 }, 1067 }; 1068 1069 static int __init init_fs_coredump_sysctls(void) 1070 { 1071 register_sysctl_init("kernel", coredump_sysctls); 1072 return 0; 1073 } 1074 fs_initcall(init_fs_coredump_sysctls); 1075 #endif /* CONFIG_SYSCTL */ 1076 1077 /* 1078 * The purpose of always_dump_vma() is to make sure that special kernel mappings 1079 * that are useful for post-mortem analysis are included in every core dump. 1080 * In that way we ensure that the core dump is fully interpretable later 1081 * without matching up the same kernel and hardware config to see what PC values 1082 * meant. These special mappings include - vDSO, vsyscall, and other 1083 * architecture specific mappings 1084 */ 1085 static bool always_dump_vma(struct vm_area_struct *vma) 1086 { 1087 /* Any vsyscall mappings? */ 1088 if (vma == get_gate_vma(vma->vm_mm)) 1089 return true; 1090 1091 /* 1092 * Assume that all vmas with a .name op should always be dumped. 1093 * If this changes, a new vm_ops field can easily be added. 1094 */ 1095 if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) 1096 return true; 1097 1098 /* 1099 * arch_vma_name() returns non-NULL for special architecture mappings, 1100 * such as vDSO sections. 1101 */ 1102 if (arch_vma_name(vma)) 1103 return true; 1104 1105 return false; 1106 } 1107 1108 #define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 1109 1110 /* 1111 * Decide how much of @vma's contents should be included in a core dump. 
1112 */ 1113 static unsigned long vma_dump_size(struct vm_area_struct *vma, 1114 unsigned long mm_flags) 1115 { 1116 #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) 1117 1118 /* always dump the vdso and vsyscall sections */ 1119 if (always_dump_vma(vma)) 1120 goto whole; 1121 1122 if (vma->vm_flags & VM_DONTDUMP) 1123 return 0; 1124 1125 /* support for DAX */ 1126 if (vma_is_dax(vma)) { 1127 if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) 1128 goto whole; 1129 if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) 1130 goto whole; 1131 return 0; 1132 } 1133 1134 /* Hugetlb memory check */ 1135 if (is_vm_hugetlb_page(vma)) { 1136 if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) 1137 goto whole; 1138 if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) 1139 goto whole; 1140 return 0; 1141 } 1142 1143 /* Do not dump I/O mapped devices or special mappings */ 1144 if (vma->vm_flags & VM_IO) 1145 return 0; 1146 1147 /* By default, dump shared memory if mapped from an anonymous file. */ 1148 if (vma->vm_flags & VM_SHARED) { 1149 if (file_inode(vma->vm_file)->i_nlink == 0 ? 1150 FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) 1151 goto whole; 1152 return 0; 1153 } 1154 1155 /* Dump segments that have been written to. */ 1156 if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) 1157 goto whole; 1158 if (vma->vm_file == NULL) 1159 return 0; 1160 1161 if (FILTER(MAPPED_PRIVATE)) 1162 goto whole; 1163 1164 /* 1165 * If this is the beginning of an executable file mapping, 1166 * dump the first page to aid in determining what was mapped here. 1167 */ 1168 if (FILTER(ELF_HEADERS) && 1169 vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { 1170 if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) 1171 return PAGE_SIZE; 1172 1173 /* 1174 * ELF libraries aren't always executable. 1175 * We'll want to check whether the mapping starts with the ELF 1176 * magic, but not now - we're holding the mmap lock, 1177 * so copy_from_user() doesn't work here. 1178 * Use a placeholder instead, and fix it up later in 1179 * dump_vma_snapshot(). 1180 */ 1181 return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; 1182 } 1183 1184 #undef FILTER 1185 1186 return 0; 1187 1188 whole: 1189 return vma->vm_end - vma->vm_start; 1190 } 1191 1192 /* 1193 * Helper function for iterating across a vma list. It ensures that the caller 1194 * will visit `gate_vma' prior to terminating the search. 
1195 */ 1196 static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, 1197 struct vm_area_struct *vma, 1198 struct vm_area_struct *gate_vma) 1199 { 1200 if (gate_vma && (vma == gate_vma)) 1201 return NULL; 1202 1203 vma = vma_next(vmi); 1204 if (vma) 1205 return vma; 1206 return gate_vma; 1207 } 1208 1209 static void free_vma_snapshot(struct coredump_params *cprm) 1210 { 1211 if (cprm->vma_meta) { 1212 int i; 1213 for (i = 0; i < cprm->vma_count; i++) { 1214 struct file *file = cprm->vma_meta[i].file; 1215 if (file) 1216 fput(file); 1217 } 1218 kvfree(cprm->vma_meta); 1219 cprm->vma_meta = NULL; 1220 } 1221 } 1222 1223 static int cmp_vma_size(const void *vma_meta_lhs_ptr, const void *vma_meta_rhs_ptr) 1224 { 1225 const struct core_vma_metadata *vma_meta_lhs = vma_meta_lhs_ptr; 1226 const struct core_vma_metadata *vma_meta_rhs = vma_meta_rhs_ptr; 1227 1228 if (vma_meta_lhs->dump_size < vma_meta_rhs->dump_size) 1229 return -1; 1230 if (vma_meta_lhs->dump_size > vma_meta_rhs->dump_size) 1231 return 1; 1232 return 0; 1233 } 1234 1235 /* 1236 * Under the mmap_lock, take a snapshot of relevant information about the task's 1237 * VMAs. 1238 */ 1239 static bool dump_vma_snapshot(struct coredump_params *cprm) 1240 { 1241 struct vm_area_struct *gate_vma, *vma = NULL; 1242 struct mm_struct *mm = current->mm; 1243 VMA_ITERATOR(vmi, mm, 0); 1244 int i = 0; 1245 1246 /* 1247 * Once the stack expansion code is fixed to not change VMA bounds 1248 * under mmap_lock in read mode, this can be changed to take the 1249 * mmap_lock in read mode. 1250 */ 1251 if (mmap_write_lock_killable(mm)) 1252 return false; 1253 1254 cprm->vma_data_size = 0; 1255 gate_vma = get_gate_vma(mm); 1256 cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); 1257 1258 cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); 1259 if (!cprm->vma_meta) { 1260 mmap_write_unlock(mm); 1261 return false; 1262 } 1263 1264 while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { 1265 struct core_vma_metadata *m = cprm->vma_meta + i; 1266 1267 m->start = vma->vm_start; 1268 m->end = vma->vm_end; 1269 m->flags = vma->vm_flags; 1270 m->dump_size = vma_dump_size(vma, cprm->mm_flags); 1271 m->pgoff = vma->vm_pgoff; 1272 m->file = vma->vm_file; 1273 if (m->file) 1274 get_file(m->file); 1275 i++; 1276 } 1277 1278 mmap_write_unlock(mm); 1279 1280 for (i = 0; i < cprm->vma_count; i++) { 1281 struct core_vma_metadata *m = cprm->vma_meta + i; 1282 1283 if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { 1284 char elfmag[SELFMAG]; 1285 1286 if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || 1287 memcmp(elfmag, ELFMAG, SELFMAG) != 0) { 1288 m->dump_size = 0; 1289 } else { 1290 m->dump_size = PAGE_SIZE; 1291 } 1292 } 1293 1294 cprm->vma_data_size += m->dump_size; 1295 } 1296 1297 if (core_sort_vma) 1298 sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), 1299 cmp_vma_size, NULL); 1300 1301 return true; 1302 } 1303