1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/slab.h> 3 #include <linux/file.h> 4 #include <linux/fdtable.h> 5 #include <linux/freezer.h> 6 #include <linux/mm.h> 7 #include <linux/stat.h> 8 #include <linux/fcntl.h> 9 #include <linux/swap.h> 10 #include <linux/ctype.h> 11 #include <linux/string.h> 12 #include <linux/init.h> 13 #include <linux/pagemap.h> 14 #include <linux/perf_event.h> 15 #include <linux/highmem.h> 16 #include <linux/spinlock.h> 17 #include <linux/key.h> 18 #include <linux/personality.h> 19 #include <linux/binfmts.h> 20 #include <linux/coredump.h> 21 #include <linux/sort.h> 22 #include <linux/sched/coredump.h> 23 #include <linux/sched/signal.h> 24 #include <linux/sched/task_stack.h> 25 #include <linux/utsname.h> 26 #include <linux/pid_namespace.h> 27 #include <linux/module.h> 28 #include <linux/namei.h> 29 #include <linux/mount.h> 30 #include <linux/security.h> 31 #include <linux/syscalls.h> 32 #include <linux/tsacct_kern.h> 33 #include <linux/cn_proc.h> 34 #include <linux/audit.h> 35 #include <linux/kmod.h> 36 #include <linux/fsnotify.h> 37 #include <linux/fs_struct.h> 38 #include <linux/pipe_fs_i.h> 39 #include <linux/oom.h> 40 #include <linux/compat.h> 41 #include <linux/fs.h> 42 #include <linux/path.h> 43 #include <linux/timekeeping.h> 44 #include <linux/sysctl.h> 45 #include <linux/elf.h> 46 #include <linux/pidfs.h> 47 #include <linux/net.h> 48 #include <linux/socket.h> 49 #include <net/af_unix.h> 50 #include <net/net_namespace.h> 51 #include <net/sock.h> 52 #include <uapi/linux/pidfd.h> 53 #include <uapi/linux/un.h> 54 #include <uapi/linux/coredump.h> 55 56 #include <linux/uaccess.h> 57 #include <asm/mmu_context.h> 58 #include <asm/tlb.h> 59 #include <asm/exec.h> 60 61 #include <trace/events/task.h> 62 #include "internal.h" 63 64 #include <trace/events/sched.h> 65 66 #define CREATE_TRACE_POINTS 67 #include <trace/events/coredump.h> 68 69 static bool dump_vma_snapshot(struct coredump_params *cprm); 70 static void 
free_vma_snapshot(struct coredump_params *cprm); 71 72 #define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) 73 /* Define a reasonable max cap */ 74 #define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) 75 /* 76 * File descriptor number for the pidfd for the thread-group leader of 77 * the coredumping task installed into the usermode helper's file 78 * descriptor table. 79 */ 80 #define COREDUMP_PIDFD_NUMBER 3 81 82 static int core_uses_pid; 83 static unsigned int core_pipe_limit; 84 static unsigned int core_sort_vma; 85 static char core_pattern[CORENAME_MAX_SIZE] = "core"; 86 static int core_name_size = CORENAME_MAX_SIZE; 87 unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; 88 static atomic_t core_pipe_count = ATOMIC_INIT(0); 89 90 enum coredump_type_t { 91 COREDUMP_FILE = 1, 92 COREDUMP_PIPE = 2, 93 COREDUMP_SOCK = 3, 94 COREDUMP_SOCK_REQ = 4, 95 }; 96 97 struct core_name { 98 char *corename __counted_by_ptr(size); 99 int used, size; 100 unsigned int core_pipe_limit; 101 bool core_dumped; 102 enum coredump_type_t core_type; 103 u64 mask; 104 }; 105 106 static int expand_corename(struct core_name *cn, int size) 107 { 108 char *corename; 109 110 size = kmalloc_size_roundup(size); 111 corename = krealloc(cn->corename, size, GFP_KERNEL); 112 if (!corename) 113 return -ENOMEM; 114 115 cn->corename = corename; 116 cn->size = size; 117 118 if (size > core_name_size) /* racy but harmless */ 119 core_name_size = size; 120 121 return 0; 122 } 123 124 static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt, 125 va_list arg) 126 { 127 int free, need; 128 va_list arg_copy; 129 130 again: 131 free = cn->size - cn->used; 132 133 va_copy(arg_copy, arg); 134 need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); 135 va_end(arg_copy); 136 137 if (need < free) { 138 cn->used += need; 139 return 0; 140 } 141 142 if (!expand_corename(cn, cn->size + need - free + 1)) 143 goto again; 144 145 return -ENOMEM; 146 } 147 148 static __printf(2, 3) int 
cn_printf(struct core_name *cn, const char *fmt, ...) 149 { 150 va_list arg; 151 int ret; 152 153 va_start(arg, fmt); 154 ret = cn_vprintf(cn, fmt, arg); 155 va_end(arg); 156 157 return ret; 158 } 159 160 static __printf(2, 3) 161 int cn_esc_printf(struct core_name *cn, const char *fmt, ...) 162 { 163 int cur = cn->used; 164 va_list arg; 165 int ret; 166 167 va_start(arg, fmt); 168 ret = cn_vprintf(cn, fmt, arg); 169 va_end(arg); 170 171 if (ret == 0) { 172 /* 173 * Ensure that this coredump name component can't cause the 174 * resulting corefile path to consist of a ".." or ".". 175 */ 176 if ((cn->used - cur == 1 && cn->corename[cur] == '.') || 177 (cn->used - cur == 2 && cn->corename[cur] == '.' 178 && cn->corename[cur+1] == '.')) 179 cn->corename[cur] = '!'; 180 181 /* 182 * Empty names are fishy and could be used to create a "//" in a 183 * corefile name, causing the coredump to happen one directory 184 * level too high. Enforce that all components of the core 185 * pattern are at least one character long. 
186 */ 187 if (cn->used == cur) 188 ret = cn_printf(cn, "!"); 189 } 190 191 for (; cur < cn->used; ++cur) { 192 if (cn->corename[cur] == '/') 193 cn->corename[cur] = '!'; 194 } 195 return ret; 196 } 197 198 static int cn_print_exe_file(struct core_name *cn, bool name_only) 199 { 200 struct file *exe_file; 201 char *pathbuf, *path, *ptr; 202 int ret; 203 204 exe_file = get_mm_exe_file(current->mm); 205 if (!exe_file) 206 return cn_esc_printf(cn, "%s (path unknown)", current->comm); 207 208 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 209 if (!pathbuf) { 210 ret = -ENOMEM; 211 goto put_exe_file; 212 } 213 214 path = file_path(exe_file, pathbuf, PATH_MAX); 215 if (IS_ERR(path)) { 216 ret = PTR_ERR(path); 217 goto free_buf; 218 } 219 220 if (name_only) { 221 ptr = strrchr(path, '/'); 222 if (ptr) 223 path = ptr + 1; 224 } 225 ret = cn_esc_printf(cn, "%s", path); 226 227 free_buf: 228 kfree(pathbuf); 229 put_exe_file: 230 fput(exe_file); 231 return ret; 232 } 233 234 /* 235 * coredump_parse will inspect the pattern parameter, and output a name 236 * into corename, which must have space for at least CORENAME_MAX_SIZE 237 * bytes plus one byte for the zero terminator. 
238 */ 239 static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, 240 size_t **argv, int *argc) 241 { 242 const struct cred *cred = current_cred(); 243 const char *pat_ptr = core_pattern; 244 bool was_space = false; 245 int pid_in_pattern = 0; 246 int err = 0; 247 248 cn->mask = COREDUMP_KERNEL; 249 if (core_pipe_limit) 250 cn->mask |= COREDUMP_WAIT; 251 cn->used = 0; 252 cn->corename = NULL; 253 cn->core_pipe_limit = 0; 254 cn->core_dumped = false; 255 if (*pat_ptr == '|') 256 cn->core_type = COREDUMP_PIPE; 257 else if (*pat_ptr == '@') 258 cn->core_type = COREDUMP_SOCK; 259 else 260 cn->core_type = COREDUMP_FILE; 261 if (expand_corename(cn, core_name_size)) 262 return false; 263 cn->corename[0] = '\0'; 264 265 switch (cn->core_type) { 266 case COREDUMP_PIPE: { 267 int argvs = sizeof(core_pattern) / 2; 268 (*argv) = kmalloc_objs(**argv, argvs); 269 if (!(*argv)) 270 return false; 271 (*argv)[(*argc)++] = 0; 272 ++pat_ptr; 273 if (!(*pat_ptr)) 274 return false; 275 break; 276 } 277 case COREDUMP_SOCK: { 278 /* skip the @ */ 279 pat_ptr++; 280 if (!(*pat_ptr)) 281 return false; 282 if (*pat_ptr == '@') { 283 pat_ptr++; 284 if (!(*pat_ptr)) 285 return false; 286 287 cn->core_type = COREDUMP_SOCK_REQ; 288 } 289 290 err = cn_printf(cn, "%s", pat_ptr); 291 if (err) 292 return false; 293 294 /* Require absolute paths. */ 295 if (cn->corename[0] != '/') 296 return false; 297 298 /* 299 * Ensure we can uses spaces to indicate additional 300 * parameters in the future. 301 */ 302 if (strchr(cn->corename, ' ')) { 303 coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename); 304 return false; 305 } 306 307 /* Must not contain ".." in the path. */ 308 if (name_contains_dotdot(cn->corename)) { 309 coredump_report_failure("Coredump socket may not %s contain '..' 
spaces", cn->corename); 310 return false; 311 } 312 313 if (strlen(cn->corename) >= UNIX_PATH_MAX) { 314 coredump_report_failure("Coredump socket path %s too long", cn->corename); 315 return false; 316 } 317 318 /* 319 * Currently no need to parse any other options. 320 * Relevant information can be retrieved from the peer 321 * pidfd retrievable via SO_PEERPIDFD by the receiver or 322 * via /proc/<pid>, using the SO_PEERPIDFD to guard 323 * against pid recycling when opening /proc/<pid>. 324 */ 325 return true; 326 } 327 case COREDUMP_FILE: 328 break; 329 default: 330 WARN_ON_ONCE(true); 331 return false; 332 } 333 334 /* Repeat as long as we have more pattern to process and more output 335 space */ 336 while (*pat_ptr) { 337 /* 338 * Split on spaces before doing template expansion so that 339 * %e and %E don't get split if they have spaces in them 340 */ 341 if (cn->core_type == COREDUMP_PIPE) { 342 if (isspace(*pat_ptr)) { 343 if (cn->used != 0) 344 was_space = true; 345 pat_ptr++; 346 continue; 347 } else if (was_space) { 348 was_space = false; 349 err = cn_printf(cn, "%c", '\0'); 350 if (err) 351 return false; 352 (*argv)[(*argc)++] = cn->used; 353 } 354 } 355 if (*pat_ptr != '%') { 356 err = cn_printf(cn, "%c", *pat_ptr++); 357 } else { 358 switch (*++pat_ptr) { 359 /* single % at the end, drop that */ 360 case 0: 361 goto out; 362 /* Double percent, output one percent */ 363 case '%': 364 err = cn_printf(cn, "%c", '%'); 365 break; 366 /* pid */ 367 case 'p': 368 pid_in_pattern = 1; 369 err = cn_printf(cn, "%d", 370 task_tgid_vnr(current)); 371 break; 372 /* global pid */ 373 case 'P': 374 err = cn_printf(cn, "%d", 375 task_tgid_nr(current)); 376 break; 377 case 'i': 378 err = cn_printf(cn, "%d", 379 task_pid_vnr(current)); 380 break; 381 case 'I': 382 err = cn_printf(cn, "%d", 383 task_pid_nr(current)); 384 break; 385 /* uid */ 386 case 'u': 387 err = cn_printf(cn, "%u", 388 from_kuid(&init_user_ns, 389 cred->uid)); 390 break; 391 /* gid */ 392 case 'g': 393 
err = cn_printf(cn, "%u", 394 from_kgid(&init_user_ns, 395 cred->gid)); 396 break; 397 case 'd': 398 err = cn_printf(cn, "%d", 399 __get_dumpable(cprm->mm_flags)); 400 break; 401 /* signal that caused the coredump */ 402 case 's': 403 err = cn_printf(cn, "%d", 404 cprm->siginfo->si_signo); 405 break; 406 /* UNIX time of coredump */ 407 case 't': { 408 time64_t time; 409 410 time = ktime_get_real_seconds(); 411 err = cn_printf(cn, "%lld", time); 412 break; 413 } 414 /* hostname */ 415 case 'h': 416 down_read(&uts_sem); 417 err = cn_esc_printf(cn, "%s", 418 utsname()->nodename); 419 up_read(&uts_sem); 420 break; 421 /* executable, could be changed by prctl PR_SET_NAME etc */ 422 case 'e': 423 err = cn_esc_printf(cn, "%s", current->comm); 424 break; 425 /* file name of executable */ 426 case 'f': 427 err = cn_print_exe_file(cn, true); 428 break; 429 case 'E': 430 err = cn_print_exe_file(cn, false); 431 break; 432 /* core limit size */ 433 case 'c': 434 err = cn_printf(cn, "%lu", 435 rlimit(RLIMIT_CORE)); 436 break; 437 /* CPU the task ran on */ 438 case 'C': 439 err = cn_printf(cn, "%d", cprm->cpu); 440 break; 441 /* pidfd number */ 442 case 'F': { 443 /* 444 * Installing a pidfd only makes sense if 445 * we actually spawn a usermode helper. 446 */ 447 if (cn->core_type != COREDUMP_PIPE) 448 break; 449 450 /* 451 * Note that we'll install a pidfd for the 452 * thread-group leader. We know that task 453 * linkage hasn't been removed yet and even if 454 * this @current isn't the actual thread-group 455 * leader we know that the thread-group leader 456 * cannot be reaped until @current has exited. 
457 */ 458 cprm->pid = task_tgid(current); 459 err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); 460 break; 461 } 462 default: 463 break; 464 } 465 ++pat_ptr; 466 } 467 468 if (err) 469 return false; 470 } 471 472 out: 473 /* Backward compatibility with core_uses_pid: 474 * 475 * If core_pattern does not include a %p (as is the default) 476 * and core_uses_pid is set, then .%pid will be appended to 477 * the filename. Do not do this for piped commands. */ 478 if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid) 479 return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0; 480 481 return true; 482 } 483 484 static int zap_process(struct signal_struct *signal, int exit_code) 485 { 486 struct task_struct *t; 487 int nr = 0; 488 489 signal->flags = SIGNAL_GROUP_EXIT; 490 signal->group_exit_code = exit_code; 491 signal->group_stop_count = 0; 492 493 __for_each_thread(signal, t) { 494 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); 495 if (t != current && !(t->flags & PF_POSTCOREDUMP)) { 496 sigaddset(&t->pending.signal, SIGKILL); 497 signal_wake_up(t, 1); 498 nr++; 499 } 500 } 501 502 return nr; 503 } 504 505 static int zap_threads(struct task_struct *tsk, 506 struct core_state *core_state, int exit_code) 507 { 508 struct signal_struct *signal = tsk->signal; 509 int nr = -EAGAIN; 510 511 spin_lock_irq(&tsk->sighand->siglock); 512 if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) { 513 /* Allow SIGKILL, see prepare_signal() */ 514 signal->core_state = core_state; 515 nr = zap_process(signal, exit_code); 516 clear_tsk_thread_flag(tsk, TIF_SIGPENDING); 517 tsk->flags |= PF_DUMPCORE; 518 atomic_set(&core_state->nr_threads, nr); 519 } 520 spin_unlock_irq(&tsk->sighand->siglock); 521 return nr; 522 } 523 524 static int coredump_wait(int exit_code, struct core_state *core_state) 525 { 526 struct task_struct *tsk = current; 527 int core_waiters = -EBUSY; 528 529 init_completion(&core_state->startup); 530 core_state->dumper.task = tsk; 
531 core_state->dumper.next = NULL; 532 533 core_waiters = zap_threads(tsk, core_state, exit_code); 534 if (core_waiters > 0) { 535 struct core_thread *ptr; 536 537 wait_for_completion_state(&core_state->startup, 538 TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); 539 /* 540 * Wait for all the threads to become inactive, so that 541 * all the thread context (extended register state, like 542 * fpu etc) gets copied to the memory. 543 */ 544 ptr = core_state->dumper.next; 545 while (ptr != NULL) { 546 wait_task_inactive(ptr->task, TASK_ANY); 547 ptr = ptr->next; 548 } 549 } 550 551 return core_waiters; 552 } 553 554 static void coredump_finish(bool core_dumped) 555 { 556 struct core_thread *curr, *next; 557 struct task_struct *task; 558 559 spin_lock_irq(¤t->sighand->siglock); 560 if (core_dumped && !__fatal_signal_pending(current)) 561 current->signal->group_exit_code |= 0x80; 562 next = current->signal->core_state->dumper.next; 563 current->signal->core_state = NULL; 564 spin_unlock_irq(¤t->sighand->siglock); 565 566 while ((curr = next) != NULL) { 567 next = curr->next; 568 task = curr->task; 569 /* 570 * see coredump_task_exit(), curr->task must not see 571 * ->task == NULL before we read ->next. 572 */ 573 smp_mb(); 574 curr->task = NULL; 575 wake_up_process(task); 576 } 577 } 578 579 static bool dump_interrupted(void) 580 { 581 /* 582 * SIGKILL or freezing() interrupt the coredumping. Perhaps we 583 * can do try_to_freeze() and check __fatal_signal_pending(), 584 * but then we need to teach dump_write() to restart and clear 585 * TIF_SIGPENDING. 
*/
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Blocks until pipe->readers drops back to 1, or the wait is interrupted.
 * The dumper counts itself as a reader and stops counting as a writer for
 * the duration of the wait; both counters are restored before returning.
 * @file->private_data is used as the pipe, so @file must be a pipe file.
 */
static void wait_for_dump_helpers(struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;
	wake_up_interruptible_sync(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	pipe_unlock(pipe);

	/*
	 * We actually want wait_event_freezable() but then we need
	 * to clear TIF_SIGPENDING and improve dump_interrupted().
	 */
	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);

	pipe_lock(pipe);
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);
}

/*
 * umh_coredump_setup
 * helper function to customize the process used
 * to collect the core in userspace.  Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process.  When cp->pid is set it also installs
 * a pidfd for that pid as fd COREDUMP_PIDFD_NUMBER.
 * Returns 0 on success, or the error code of the
 * step that failed.
 * Note that it also sets the core limit to 1.  This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err;

	if (cp->pid) {
		/* Our reference is dropped automatically when the scope ends. */
		struct file *pidfs_file __free(fput) = NULL;

		pidfs_file = pidfs_alloc_file(cp->pid, 0);
		if (IS_ERR(pidfs_file))
			return PTR_ERR(pidfs_file);

		pidfs_coredump(cp);

		/*
		 * Usermode helpers are children of either
		 * system_dfl_wq or of kthreadd. So we know that
		 * we're starting off with a clean file descriptor
		 * table. So we should always be able to use
		 * COREDUMP_PIDFD_NUMBER as our file descriptor value.
		 */
		err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
		if (err < 0)
			return err;
	}

	err = create_pipe_files(files, 0);
	if (err)
		return err;

	/* files[1] is kept by the dumper in cp->file; files[0] becomes the helper's fd 0. */
	cp->file = files[1];

	err = replace_fd(0, files[0], 0);
	/* Drop our reference to files[0] whether or not replace_fd() succeeded. */
	fput(files[0]);
	if (err < 0)
		return err;

	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return 0;
}

#ifdef CONFIG_UNIX
/*
 * Creates an AF_UNIX stream socket and connects it to the path held in
 * cn->corename. On success cprm->file owns the socket file and
 * cprm->limit is set to RLIM_INFINITY. Returns false on any failure.
 */
static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm)
{
	struct file *file __free(fput) = NULL;
	struct sockaddr_un addr = {
		.sun_family = AF_UNIX,
	};
	ssize_t addr_len;
	int retval;
	struct socket *socket;

	addr_len = strscpy(addr.sun_path, cn->corename);
	if (addr_len < 0)
		return false;
	/* Address length: header up to sun_path, the path, and one more byte. */
	addr_len += offsetof(struct sockaddr_un, sun_path) + 1;

	/*
	 * It is possible that the userspace process which is supposed
	 * to handle the coredump and is listening on the AF_UNIX socket
	 * coredumps. Userspace should just mark itself non dumpable.
	 */

	retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
	if (retval < 0)
		return false;

	file = sock_alloc_file(socket, 0, NULL);
	if (IS_ERR(file))
		return false;

	/*
	 * Set the thread-group leader pid which is used for the peer
	 * credentials during connect() below. Then immediately register
	 * it in pidfs...
	 */
	cprm->pid = task_tgid(current);
	retval = pidfs_register_pid(cprm->pid);
	if (retval)
		return false;

	/*
	 * ... and set the coredump information so userspace has it
	 * available after connect()...
*/
	pidfs_coredump(cprm);

	retval = kernel_connect(socket, (struct sockaddr_unsized *)(&addr), addr_len,
				O_NONBLOCK | SOCK_COREDUMP);

	if (retval) {
		if (retval == -EAGAIN)
			coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
		else
			coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
		return false;
	}

	/* ... and validate that @sk_peer_pid matches @cprm.pid. */
	if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid))
		return false;

	cprm->limit = RLIM_INFINITY;
	/* Hand the file to cprm so the scope-exit fput() does not release it. */
	cprm->file = no_free_ptr(file);

	return true;
}

/*
 * Zeroes @size bytes at @ack, then receives into it from the socket
 * behind @file. Returns true only if exactly @size bytes arrived.
 */
static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags)
{
	struct msghdr msg = {};
	struct kvec iov = { .iov_base = ack, .iov_len = size };
	ssize_t ret;

	memset(ack, 0, size);
	ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags);
	return ret == size;
}

/* Sends @req in full with MSG_NOSIGNAL; true only if all of it was sent. */
static inline bool coredump_sock_send(struct file *file, struct coredump_req *req)
{
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) };
	ssize_t ret;

	ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req));
	return ret == sizeof(*req);
}

/* The mark is sent as raw bytes below, so its size must stay 32 bits. */
static_assert(sizeof(enum coredump_mark) == sizeof(__u32));

/* Sends a single @mark value to the peer; true only if all of it was sent. */
static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark)
{
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) };
	ssize_t ret;

	ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark));
	return ret == sizeof(mark);
}

/* Blocks in a one-byte read until the peer closes the socket or sends data. */
static inline void coredump_sock_wait(struct file *file)
{
	ssize_t n;

	/*
	 * We use a simple read to wait for the coredump processing to
	 * finish. Either the socket is closed or we get sent unexpected
	 * data. In both cases, we're done.
	 */
	n = __kernel_read(file, &(char){ 0 }, 1, NULL);
	if (n > 0)
		coredump_report_failure("Coredump socket had unexpected data");
	else if (n < 0)
		coredump_report_failure("Coredump socket failed");
}

/* Shuts down the write side of @file; does nothing if @file is not a socket. */
static inline void coredump_sock_shutdown(struct file *file)
{
	struct socket *socket;

	socket = sock_from_file(file);
	if (!socket)
		return;

	/* Let userspace know we're done processing the coredump. */
	kernel_sock_shutdown(socket, SHUT_WR);
}

/*
 * For COREDUMP_SOCK_REQ only: sends a coredump_req listing the supported
 * mask bits, reads back and validates the peer's coredump_ack, and stores
 * the acknowledged mask in cn->mask. Rejections that send a mark tell the
 * peer which check failed. Returns true immediately for any other core
 * type, and false if the exchange or validation fails.
 */
static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm)
{
	struct coredump_req req = {
		.size		= sizeof(struct coredump_req),
		.mask		= COREDUMP_KERNEL | COREDUMP_USERSPACE |
				  COREDUMP_REJECT | COREDUMP_WAIT,
		.size_ack	= sizeof(struct coredump_ack),
	};
	struct coredump_ack ack = {};
	ssize_t usize;

	if (cn->core_type != COREDUMP_SOCK_REQ)
		return true;

	/* Let userspace know what we support. */
	if (!coredump_sock_send(cprm->file, &req))
		return false;

	/* Peek the size of the coredump_ack. */
	if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size),
				MSG_PEEK | MSG_WAITALL))
		return false;

	/* Refuse unknown coredump_ack sizes. */
	usize = ack.size;
	if (usize < COREDUMP_ACK_SIZE_VER0) {
		coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE);
		return false;
	}

	if (usize > sizeof(ack)) {
		coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE);
		return false;
	}

	/* Now retrieve the coredump_ack. */
	if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL))
		return false;
	/* The size in the full message must match the one we peeked. */
	if (ack.size != usize)
		return false;

	/* Refuse unknown coredump_ack flags. */
	if (ack.mask & ~req.mask) {
		coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
		return false;
	}

	/* Refuse mutually exclusive options. */
	if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL |
				  COREDUMP_REJECT)) != 1) {
		coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING);
		return false;
	}

	/* The spare field must be zero. */
	if (ack.spare) {
		coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
		return false;
	}

	cn->mask = ack.mask;
	return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK);
}

/* Connects to the coredump socket, then runs the request/ack exchange. */
static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm)
{
	if (!coredump_sock_connect(cn, cprm))
		return false;

	return coredump_sock_request(cn, cprm);
}
#else
/* Without CONFIG_UNIX, socket coredumps always fail and the helpers are no-ops. */
static inline void coredump_sock_wait(struct file *file) { }
static inline void coredump_sock_shutdown(struct file *file) { }
static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
#endif

/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
{
	/* Require nonrelative corefile path and be extra careful.
*/
	return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
}

/*
 * Creates the core file named by cn->corename and checks that it is safe
 * to write to. On success cprm->file owns the open file. Returns false if
 * the core limit is below the format's minimum, the file cannot be
 * created, or any check fails; a file opened here is then released by
 * the __free(fput) cleanup.
 */
static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
			  const struct linux_binfmt *binfmt)
{
	struct mnt_idmap *idmap;
	struct inode *inode;
	struct file *file __free(fput) = NULL;
	int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL;

	if (cprm->limit < binfmt->min_coredump)
		return false;

	if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') {
		coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump");
		return false;
	}

	/*
	 * Unlink the file if it exists unless this is a SUID
	 * binary - in that case, we're running around with root
	 * privs and don't want to unlink another user's coredump.
	 */
	if (!coredump_force_suid_safe(cprm)) {
		CLASS(filename_kernel, name)(cn->corename);
		/*
		 * If it doesn't exist, that's fine. If there's some
		 * other problem, we'll catch it at the filp_open().
		 */
		filename_unlinkat(AT_FDCWD, name);
	}

	/*
	 * There is a race between unlinking and creating the
	 * file, but if that causes an EEXIST here, that's
	 * fine - another process raced with us while creating
	 * the corefile, and the other process won. To userspace,
	 * what matters is that at least one of the two processes
	 * writes its coredump successfully, not which one.
	 */
	if (coredump_force_suid_safe(cprm)) {
		/*
		 * Using user namespaces, normal user tasks can change
		 * their current->fs->root to point to arbitrary
		 * directories. Since the intention of the "only dump
		 * with a fully qualified path" rule is to control where
		 * coredumps may be placed using root privileges,
		 * current->fs->root must not be used. Instead, use the
		 * root directory of init_task.
		 */
		struct path root;

		task_lock(&init_task);
		get_fs_root(init_task.fs, &root);
		task_unlock(&init_task);
		file = file_open_root(&root, cn->corename, open_flags, 0600);
		path_put(&root);
	} else {
		file = filp_open(cn->corename, open_flags, 0600);
	}
	if (IS_ERR(file))
		return false;

	inode = file_inode(file);
	/* Refuse files that have more than one link. */
	if (inode->i_nlink > 1)
		return false;
	/* Refuse files whose dentry is already unhashed. */
	if (d_unhashed(file->f_path.dentry))
		return false;
	/*
	 * AK: actually i see no reason to not allow this for named
	 * pipes etc, but keep the previous behaviour for now.
	 */
	if (!S_ISREG(inode->i_mode))
		return false;
	/*
	 * Don't dump core if the filesystem changed owner or mode
	 * of the file during file creation. This is an issue when
	 * a process dumps core while its cwd is e.g. on a vfat
	 * filesystem.
	 */
	idmap = file_mnt_idmap(file);
	if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) {
		coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename);
		return false;
	}
	if ((inode->i_mode & 0677) != 0600) {
		coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename);
		return false;
	}
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return false;
	if (do_truncate(idmap, file->f_path.dentry, 0, 0, file))
		return false;

	/* Transfer ownership to cprm so the scope-exit fput() is skipped. */
	cprm->file = no_free_ptr(file);
	return true;
}

/*
 * Starts the usermode helper described by the parsed pipe pattern.
 * @argv holds @argc offsets into cn->corename, one per helper argument.
 * Returns true once the helper has been started and cprm->file is set.
 */
static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
			  size_t *argv, int argc)
{
	int argi;
	char **helper_argv __free(kfree) = NULL;
	struct subprocess_info *sub_info;

	if (cprm->limit == 1) {
		/* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
*
		 * Normally core limits are irrelevant to pipes, since
		 * we're not writing to the file system, but we use
		 * cprm.limit of 1 here as a special value, this is a
		 * consistent way to catch recursive crashes.
		 * We can still crash if the core_pattern binary sets
		 * RLIM_CORE = !1, but it runs as root, and can do
		 * lots of stupid things.
		 *
		 * Note that we use task_tgid_vnr here to grab the pid
		 * of the process group leader.  That way we get the
		 * right pid if a thread in a multi-threaded
		 * core_pattern process dies.
		 *
		 * NOTE(review): nothing in this function calls
		 * task_tgid_vnr(); the paragraph above looks stale.
		 */
		coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
		return false;
	}
	cprm->limit = RLIM_INFINITY;

	/*
	 * Count this dump against core_pipe_limit. A non-zero
	 * cn->core_pipe_limit makes coredump_cleanup() drop the count again.
	 */
	cn->core_pipe_limit = atomic_inc_return(&core_pipe_count);
	if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) {
		coredump_report_failure("over core_pipe_limit, skipping core dump");
		return false;
	}

	helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL);
	if (!helper_argv) {
		coredump_report_failure("%s failed to allocate memory", __func__);
		return false;
	}
	/* Turn the recorded offsets into pointers into cn->corename. */
	for (argi = 0; argi < argc; argi++)
		helper_argv[argi] = cn->corename + argv[argi];
	helper_argv[argi] = NULL;

	sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL,
					     GFP_KERNEL, umh_coredump_setup,
					     NULL, cprm);
	if (!sub_info)
		return false;

	if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) {
		coredump_report_failure("|%s pipe failed", cn->corename);
		return false;
	}

	/*
	 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
	 * have this set to NULL.
	 */
	if (!cprm->file) {
		coredump_report_failure("Core dump to |%s disabled", cn->corename);
		return false;
	}

	return true;
}

/*
 * Writes the core through binfmt->core_dump() and records its result in
 * cn->core_dumped. Returns false only if the VMA snapshot cannot be
 * taken. An already-interrupted dump returns true without writing.
 */
static bool coredump_write(struct core_name *cn,
			  struct coredump_params *cprm,
			  const struct linux_binfmt *binfmt)
{

	if (dump_interrupted())
		return true;

	if (!dump_vma_snapshot(cprm))
		return false;

	file_start_write(cprm->file);
	cn->core_dumped = binfmt->core_dump(cprm);
	/*
	 * Ensures that file size is big enough to contain the current
	 * file position. This prevents gdb from complaining about
	 * a truncated file if the last "write" to the file was
	 * dump_skip.
	 */
	if (cprm->to_skip) {
		/* Skip one byte less, then emit that last byte as a real write. */
		cprm->to_skip--;
		dump_emit(cprm, "", 1);
	}
	file_end_write(cprm->file);
	free_vma_snapshot(cprm);
	return true;
}

/*
 * Releases what the dump acquired: closes cprm->file if set, drops the
 * pipe count if this dump took one, frees the name buffer and finishes
 * the dump via coredump_finish().
 */
static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm)
{
	if (cprm->file)
		filp_close(cprm->file, NULL);
	if (cn->core_pipe_limit) {
		VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE);
		atomic_dec(&core_pipe_count);
	}
	kfree(cn->corename);
	coredump_finish(cn->core_dumped);
}

/*
 * Returns true when no dump should be attempted: there is no binfmt, the
 * binfmt has no core_dump handler, or the snapshotted flags are not
 * dumpable.
 */
static inline bool coredump_skip(const struct coredump_params *cprm,
				 const struct linux_binfmt *binfmt)
{
	if (!binfmt)
		return true;
	if (!binfmt->core_dump)
		return true;
	if (!__get_dumpable(cprm->mm_flags))
		return true;
	return false;
}

/*
 * Parses the core pattern, opens the matching output (file, pipe or
 * socket), writes the dump and optionally waits for the consumer.
 * Cleanup is left to the caller.
 */
static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
			size_t **argv, int *argc, const struct linux_binfmt *binfmt)
{
	trace_coredump(cprm->siginfo->si_signo);

	if (!coredump_parse(cn, cprm, argv, argc)) {
		coredump_report_failure("format_corename failed, aborting core");
		return;
	}

	switch (cn->core_type) {
	case COREDUMP_FILE:
		if (!coredump_file(cn, cprm, binfmt))
return;
		break;
	case COREDUMP_PIPE:
		if (!coredump_pipe(cn, cprm, *argv, *argc))
			return;
		break;
	case COREDUMP_SOCK_REQ:
		fallthrough;
	case COREDUMP_SOCK:
		if (!coredump_socket(cn, cprm))
			return;
		break;
	default:
		WARN_ON_ONCE(true);
		return;
	}

	/* Don't even generate the coredump. */
	if (cn->mask & COREDUMP_REJECT)
		return;

	/* get us an unshared descriptor table; almost always a no-op */
	/* The cell spufs coredump code reads the file descriptor tables */
	if (unshare_files())
		return;

	/* Only write the core ourselves when the kernel was asked to. */
	if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt))
		return;

	/* Does nothing unless cprm->file is a socket. */
	coredump_sock_shutdown(cprm->file);

	/* Let the parent know that a coredump was generated. */
	if (cn->mask & COREDUMP_USERSPACE)
		cn->core_dumped = true;

	/*
	 * When core_pipe_limit is set we wait for the coredump server
	 * or usermodehelper to finish before exiting so it can e.g.,
	 * inspect /proc/<pid>.
	 */
	if (cn->mask & COREDUMP_WAIT) {
		switch (cn->core_type) {
		case COREDUMP_PIPE:
			wait_for_dump_helpers(cprm->file);
			break;
		case COREDUMP_SOCK_REQ:
			fallthrough;
		case COREDUMP_SOCK:
			coredump_sock_wait(cprm->file);
			break;
		default:
			break;
		}
	}
}

/*
 * Entry point for dumping core of the current task on @siginfo.
 * Snapshots the dump parameters, decides whether to dump at all,
 * prepares credentials, claims the thread group via coredump_wait(),
 * runs do_coredump() under those credentials and then cleans up.
 */
void vfs_coredump(const kernel_siginfo_t *siginfo)
{
	size_t *argv __free(kfree) = NULL;
	struct core_state core_state;
	struct core_name cn;
	const struct mm_struct *mm = current->mm;
	const struct linux_binfmt *binfmt = mm->binfmt;
	int argc = 0;
	struct coredump_params cprm = {
		.siginfo = siginfo,
		.limit = rlimit(RLIMIT_CORE),
		/*
		 * We must use the same mm->flags while dumping core to avoid
		 * inconsistency of bit flags, since this flag is not protected
		 * by any locks.
		 *
		 * Note that we only care about MMF_DUMP* flags.
		 */
		.mm_flags = __mm_flags_get_dumpable(mm),
		.vma_meta = NULL,
		.cpu = raw_smp_processor_id(),
	};

	audit_core_dumps(siginfo->si_signo);

	if (coredump_skip(&cprm, binfmt))
		return;

	CLASS(prepare_creds, cred)();
	if (!cred)
		return;
	/*
	 * We cannot trust fsuid as being the "true" uid of the process
	 * nor do we know its entire history. We only know it was tainted
	 * so we dump it as root in mode 2, and only into a controlled
	 * environment (pipe handler or fully qualified path).
	 */
	if (coredump_force_suid_safe(&cprm))
		cred->fsuid = GLOBAL_ROOT_UID;

	/* A negative result means the thread group could not be claimed. */
	if (coredump_wait(siginfo->si_signo, &core_state) < 0)
		return;

	scoped_with_creds(cred)
		do_coredump(&cn, &cprm, &argv, &argc, binfmt);
	/* Runs whether or not do_coredump() produced a dump. */
	coredump_cleanup(&cn, &cprm);
	return;
}

/*
 * Core dumping helper functions.  These are the only things you should
 * do on a core-file: use only these functions to write out all the
 * necessary info.
1215 */ 1216 static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) 1217 { 1218 struct file *file = cprm->file; 1219 loff_t pos = file->f_pos; 1220 ssize_t n; 1221 1222 if (cprm->written + nr > cprm->limit) 1223 return 0; 1224 if (dump_interrupted()) 1225 return 0; 1226 n = __kernel_write(file, addr, nr, &pos); 1227 if (n != nr) 1228 return 0; 1229 file->f_pos = pos; 1230 cprm->written += n; 1231 cprm->pos += n; 1232 1233 return 1; 1234 } 1235 1236 static int __dump_skip(struct coredump_params *cprm, size_t nr) 1237 { 1238 static char zeroes[PAGE_SIZE]; 1239 struct file *file = cprm->file; 1240 1241 if (file->f_mode & FMODE_LSEEK) { 1242 if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0) 1243 return 0; 1244 cprm->pos += nr; 1245 return 1; 1246 } 1247 1248 while (nr > PAGE_SIZE) { 1249 if (!__dump_emit(cprm, zeroes, PAGE_SIZE)) 1250 return 0; 1251 nr -= PAGE_SIZE; 1252 } 1253 1254 return __dump_emit(cprm, zeroes, nr); 1255 } 1256 1257 int dump_emit(struct coredump_params *cprm, const void *addr, int nr) 1258 { 1259 if (cprm->to_skip) { 1260 if (!__dump_skip(cprm, cprm->to_skip)) 1261 return 0; 1262 cprm->to_skip = 0; 1263 } 1264 return __dump_emit(cprm, addr, nr); 1265 } 1266 EXPORT_SYMBOL(dump_emit); 1267 1268 void dump_skip_to(struct coredump_params *cprm, unsigned long pos) 1269 { 1270 cprm->to_skip = pos - cprm->pos; 1271 } 1272 EXPORT_SYMBOL(dump_skip_to); 1273 1274 void dump_skip(struct coredump_params *cprm, size_t nr) 1275 { 1276 cprm->to_skip += nr; 1277 } 1278 EXPORT_SYMBOL(dump_skip); 1279 1280 #ifdef CONFIG_ELF_CORE 1281 static int dump_emit_page(struct coredump_params *cprm, struct page *page) 1282 { 1283 struct bio_vec bvec; 1284 struct iov_iter iter; 1285 struct file *file = cprm->file; 1286 loff_t pos; 1287 ssize_t n; 1288 1289 if (!page) 1290 return 0; 1291 1292 if (cprm->to_skip) { 1293 if (!__dump_skip(cprm, cprm->to_skip)) 1294 return 0; 1295 cprm->to_skip = 0; 1296 } 1297 if (cprm->written + PAGE_SIZE > 
cprm->limit) 1298 return 0; 1299 if (dump_interrupted()) 1300 return 0; 1301 pos = file->f_pos; 1302 bvec_set_page(&bvec, page, PAGE_SIZE, 0); 1303 iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); 1304 n = __kernel_write_iter(cprm->file, &iter, &pos); 1305 if (n != PAGE_SIZE) 1306 return 0; 1307 file->f_pos = pos; 1308 cprm->written += PAGE_SIZE; 1309 cprm->pos += PAGE_SIZE; 1310 1311 return 1; 1312 } 1313 1314 /* 1315 * If we might get machine checks from kernel accesses during the 1316 * core dump, let's get those errors early rather than during the 1317 * IO. This is not performance-critical enough to warrant having 1318 * all the machine check logic in the iovec paths. 1319 */ 1320 #ifdef copy_mc_to_kernel 1321 1322 #define dump_page_alloc() alloc_page(GFP_KERNEL) 1323 #define dump_page_free(x) __free_page(x) 1324 static struct page *dump_page_copy(struct page *src, struct page *dst) 1325 { 1326 void *buf = kmap_local_page(src); 1327 size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE); 1328 kunmap_local(buf); 1329 return left ? NULL : dst; 1330 } 1331 1332 #else 1333 1334 /* We just want to return non-NULL; it's never used. 
 */
#define dump_page_alloc() ERR_PTR(-EINVAL)
#define dump_page_free(x) ((void)(x))
/* Without copy_mc_to_kernel the source page is used directly; @dst is ignored. */
static inline struct page *dump_page_copy(struct page *src, struct page *dst)
{
	return src;
}
#endif

/*
 * Emit the user address range [start, start + len) into the core file, one
 * page at a time.
 *
 * Pages returned by get_dump_page() go through dump_page_copy() and are
 * written with dump_emit_page(); addresses for which it returns NULL are
 * recorded as a PAGE_SIZE skip instead.
 *
 * Returns 1 when the whole range was processed.  Returns 0 when the scratch
 * page could not be allocated, the mmap read lock could not be taken, a
 * page failed to emit, or dump_interrupted() reported an interruption.
 */
int dump_user_range(struct coredump_params *cprm, unsigned long start,
		    unsigned long len)
{
	unsigned long addr;
	struct page *dump_page;
	/* locked tracks whether this function currently holds the mmap read lock. */
	int locked, ret;

	dump_page = dump_page_alloc();
	if (!dump_page)
		return 0;

	ret = 0;
	locked = 0;
	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page;

		if (!locked) {
			if (mmap_read_lock_killable(current->mm))
				goto out;
			locked = 1;
		}

		/*
		 * To avoid having to allocate page tables for virtual address
		 * ranges that have never been used yet, and also to make it
		 * easy to generate sparse core files, use a helper that returns
		 * NULL when encountering an empty page table entry that would
		 * otherwise have been filled with the zero page.
		 *
		 * NOTE(review): get_dump_page() is handed &locked, so it
		 * presumably may drop the lock and clear the flag - confirm.
		 */
		page = get_dump_page(addr, &locked);
		if (page) {
			/* Never write the page out while holding the mmap lock. */
			if (locked) {
				mmap_read_unlock(current->mm);
				locked = 0;
			}
			int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
			put_page(page);
			if (stop)
				goto out;
		} else {
			dump_skip(cprm, PAGE_SIZE);
		}

		if (dump_interrupted())
			goto out;

		/* Only give up the lock and reschedule when a resched is due. */
		if (!need_resched())
			continue;
		if (locked) {
			mmap_read_unlock(current->mm);
			locked = 0;
		}
		cond_resched();
	}
	ret = 1;
out:
	if (locked)
		mmap_read_unlock(current->mm);

	dump_page_free(dump_page);
	return ret;
}
#endif

/*
 * Grow the pending skip so that pos + to_skip becomes a multiple of @align.
 * Returns 0 without changing anything when @align has more than one bit
 * set, 1 otherwise.
 */
int dump_align(struct coredump_params *cprm, int align)
{
	unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);
	if (align & (align - 1))
		return 0;
	if (mod)
		cprm->to_skip += align - mod;
	return 1;
}
EXPORT_SYMBOL(dump_align);

#ifdef CONFIG_SYSCTL

/*
 * Report a failure message when suid_dumpable is SUID_DUMP_ROOT but
 * core_pattern starts with none of '/', '|' or '@'.  Nothing is changed or
 * rejected here; the function only reports.
 */
void validate_coredump_safety(void)
{
	if (suid_dumpable == SUID_DUMP_ROOT &&
	    core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {

		coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
			"pipe handler or fully qualified core dump path required. "
			"Set kernel.core_pattern before fs.suid_dumpable.");
	}
}

/*
 * Validate the current core_pattern when it selects a coredump socket
 * (leading '@').  Patterns that do not start with '@' are always accepted.
 * Returns true when the pattern is acceptable.
 */
static inline bool check_coredump_socket(void)
{
	/* Start of the socket path inside core_pattern, past the '@' prefix(es). */
	const char *p;

	if (core_pattern[0] != '@')
		return true;

	/*
	 * Coredump socket must be located in the initial mount
	 * namespace. Don't give the impression that anything else is
	 * supported right now.
	 */
	if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
		return false;

	/* Must be an absolute path... */
	if (core_pattern[1] != '/') {
		/* ... or the socket request protocol... */
		if (core_pattern[1] != '@')
			return false;
		/* ... and if so must be an absolute path. */
		if (core_pattern[2] != '/')
			return false;
		p = &core_pattern[2];
	} else {
		p = &core_pattern[1];
	}

	/* The path obviously cannot exceed UNIX_PATH_MAX. */
	if (strlen(p) >= UNIX_PATH_MAX)
		return false;

	/*
	 * Must not contain ".." in the path.  The check is run on the whole
	 * pattern, prefix included, not just on p.
	 */
	if (name_contains_dotdot(core_pattern))
		return false;

	return true;
}

/*
 * sysctl handler for core_pattern.
 *
 * Reads are passed straight to proc_dostring().  On a write the previous
 * pattern is saved, proc_dostring() stores the new value, and if
 * check_coredump_socket() rejects it the saved pattern is copied back and
 * -EINVAL is returned.  Accepted writes are then run through
 * validate_coredump_safety().
 */
static int proc_dostring_coredump(const struct ctl_table *table, int write,
		  void *buffer, size_t *lenp, loff_t *ppos)
{
	int error;
	ssize_t retval;
	char old_core_pattern[CORENAME_MAX_SIZE];

	if (!write)
		return proc_dostring(table, write, buffer, lenp, ppos);

	/* retval is reused below to bound the restore copy (retval + 1 bytes). */
	retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);

	error = proc_dostring(table, write, buffer, lenp, ppos);
	if (error)
		return error;

	if (!check_coredump_socket()) {
		strscpy(core_pattern, old_core_pattern, retval + 1);
		return -EINVAL;
	}

	validate_coredump_safety();
	return error;
}

/* Bounds used by the core_file_note_size_limit sysctl entry below. */
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
/* Newline-separated list exposed by core_modes; "socket" needs CONFIG_UNIX. */
static char core_modes[] = {
	"file\npipe"
#ifdef CONFIG_UNIX
	"\nsocket"
#endif
};

/* Coredump sysctl entries; registered under "kernel" by the initcall below. */
static const struct ctl_table coredump_sysctls[] = {
	{
		.procname	= "core_uses_pid",
		.data		= &core_uses_pid,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Writes go through proc_dostring_coredump() for validation. */
		.procname	= "core_pattern",
		.data		= core_pattern,
		.maxlen		= CORENAME_MAX_SIZE,
		.mode		= 0644,
		.proc_handler	= proc_dostring_coredump,
	},
	{
		/* Bounded by SYSCTL_ZERO and SYSCTL_INT_MAX. */
		.procname	= "core_pipe_limit",
		.data		= &core_pipe_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_INT_MAX,
	},
	{
		/* Bounded by core_file_note_size_min and core_file_note_size_max. */
		.procname	= "core_file_note_size_limit",
		.data		= &core_file_note_size_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= (unsigned int *)&core_file_note_size_min,
		.extra2		= (unsigned int *)&core_file_note_size_max,
	},
	{
		/* Bounded by SYSCTL_ZERO and SYSCTL_ONE. */
		.procname	= "core_sort_vma",
		.data		= &core_sort_vma,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		/* Mode 0444; maxlen excludes the string's terminating NUL. */
		.procname	= "core_modes",
		.data		= core_modes,
		.maxlen		= sizeof(core_modes) - 1,
		.mode		= 0444,
		.proc_handler	= proc_dostring,
	},
};

/* Register the table above under "kernel" at fs_initcall time. */
static int __init init_fs_coredump_sysctls(void)
{
	register_sysctl_init("kernel", coredump_sysctls);
	return 0;
}
fs_initcall(init_fs_coredump_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * The purpose of always_dump_vma() is to make sure that special kernel mappings
 * that are useful for post-mortem analysis are included in every core dump.
 * In that way we ensure that the core dump is fully interpretable later
 * without matching up the same kernel and hardware config to see what PC values
 * meant. These special mappings include - vDSO, vsyscall, and other
 * architecture specific mappings
 */
static bool always_dump_vma(struct vm_area_struct *vma)
{
	/* Any vsyscall mappings? */
	if (vma == get_gate_vma(vma->vm_mm))
		return true;

	/*
	 * Assume that all vmas with a .name op should always be dumped.
	 * If this changes, a new vm_ops field can easily be added.
	 */
	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
		return true;

	/*
	 * arch_vma_name() returns non-NULL for special architecture mappings,
	 * such as vDSO sections.
	 */
	if (arch_vma_name(vma))
		return true;

	return false;
}

/*
 * Sentinel dump size returned by vma_dump_size() when the ELF magic still
 * has to be checked; dump_vma_snapshot() replaces it with 0 or PAGE_SIZE.
 */
#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1

/*
 * Decide how much of @vma's contents should be included in a core dump.
 *
 * Returns 0 (nothing), the full VMA length, PAGE_SIZE (first page only) or
 * DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER.  @mm_flags is the snapshot of the
 * MMF_DUMP_* filter bits.
 */
static unsigned long vma_dump_size(struct vm_area_struct *vma,
				   unsigned long mm_flags)
{
/* Non-zero when the MMF_DUMP_<type> bit is set in mm_flags. */
#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

	/* always dump the vdso and vsyscall sections */
	if (always_dump_vma(vma))
		goto whole;

	if (vma->vm_flags & VM_DONTDUMP)
		return 0;

	/* support for DAX */
	if (vma_is_dax(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
			goto whole;
		return 0;
	}

	/* Hugetlb memory check */
	if (is_vm_hugetlb_page(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
			goto whole;
		return 0;
	}

	/* Do not dump I/O mapped devices or special mappings */
	if (vma->vm_flags & VM_IO)
		return 0;

	/* By default, dump shared memory if mapped from an anonymous file. */
	if (vma->vm_flags & VM_SHARED) {
		/* A link count of zero selects the ANON_SHARED filter. */
		if (file_inode(vma->vm_file)->i_nlink == 0 ?
		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
			goto whole;
		return 0;
	}

	/* Dump segments that have been written to. */
	if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
		goto whole;
	if (vma->vm_file == NULL)
		return 0;

	if (FILTER(MAPPED_PRIVATE))
		goto whole;

	/*
	 * If this is the beginning of an executable file mapping,
	 * dump the first page to aid in determining what was mapped here.
	 */
	if (FILTER(ELF_HEADERS) &&
	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
		if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
			return PAGE_SIZE;

		/*
		 * ELF libraries aren't always executable.
		 * We'll want to check whether the mapping starts with the ELF
		 * magic, but not now - we're holding the mmap lock,
		 * so copy_from_user() doesn't work here.
		 * Use a placeholder instead, and fix it up later in
		 * dump_vma_snapshot().
		 */
		return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
	}

#undef	FILTER

	return 0;

whole:
	return vma->vm_end - vma->vm_start;
}

/*
 * Helper function for iterating across a vma list.  It ensures that the caller
 * will visit `gate_vma' prior to terminating the search.
 *
 * Returns the next VMA from @vmi, then @gate_vma once the iterator is
 * exhausted, and NULL after @gate_vma itself has been passed in as @vma.
 */
static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi,
					struct vm_area_struct *vma,
					struct vm_area_struct *gate_vma)
{
	if (gate_vma && (vma == gate_vma))
		return NULL;

	vma = vma_next(vmi);
	if (vma)
		return vma;
	return gate_vma;
}

/*
 * Release a VMA snapshot: fput() every non-NULL file recorded in it, free
 * the array and reset cprm->vma_meta to NULL.  Does nothing when
 * cprm->vma_meta is already NULL.
 */
static void free_vma_snapshot(struct coredump_params *cprm)
{
	if (cprm->vma_meta) {
		int i;
		for (i = 0; i < cprm->vma_count; i++) {
			struct file *file = cprm->vma_meta[i].file;
			if (file)
				fput(file);
		}
		kvfree(cprm->vma_meta);
		cprm->vma_meta = NULL;
	}
}

/* sort() comparator: orders core_vma_metadata entries by ascending dump_size. */
static int cmp_vma_size(const void *vma_meta_lhs_ptr, const void *vma_meta_rhs_ptr)
{
	const struct core_vma_metadata *vma_meta_lhs = vma_meta_lhs_ptr;
	const struct core_vma_metadata *vma_meta_rhs = vma_meta_rhs_ptr;

	if (vma_meta_lhs->dump_size < vma_meta_rhs->dump_size)
		return -1;
	if (vma_meta_lhs->dump_size > vma_meta_rhs->dump_size)
		return 1;
	return 0;
}

/*
 * Under the mmap_lock, take a snapshot of relevant information about the task's
 * VMAs.
1724 */ 1725 static bool dump_vma_snapshot(struct coredump_params *cprm) 1726 { 1727 struct vm_area_struct *gate_vma, *vma = NULL; 1728 struct mm_struct *mm = current->mm; 1729 VMA_ITERATOR(vmi, mm, 0); 1730 int i = 0; 1731 1732 /* 1733 * Once the stack expansion code is fixed to not change VMA bounds 1734 * under mmap_lock in read mode, this can be changed to take the 1735 * mmap_lock in read mode. 1736 */ 1737 if (mmap_write_lock_killable(mm)) 1738 return false; 1739 1740 cprm->vma_data_size = 0; 1741 gate_vma = get_gate_vma(mm); 1742 cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); 1743 1744 cprm->vma_meta = kvmalloc_objs(*cprm->vma_meta, cprm->vma_count); 1745 if (!cprm->vma_meta) { 1746 mmap_write_unlock(mm); 1747 return false; 1748 } 1749 1750 while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { 1751 struct core_vma_metadata *m = cprm->vma_meta + i; 1752 1753 m->start = vma->vm_start; 1754 m->end = vma->vm_end; 1755 m->flags = vma->vm_flags; 1756 m->dump_size = vma_dump_size(vma, cprm->mm_flags); 1757 m->pgoff = vma->vm_pgoff; 1758 m->file = vma->vm_file; 1759 if (m->file) 1760 get_file(m->file); 1761 i++; 1762 } 1763 1764 mmap_write_unlock(mm); 1765 1766 for (i = 0; i < cprm->vma_count; i++) { 1767 struct core_vma_metadata *m = cprm->vma_meta + i; 1768 1769 if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { 1770 char elfmag[SELFMAG]; 1771 1772 if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || 1773 memcmp(elfmag, ELFMAG, SELFMAG) != 0) { 1774 m->dump_size = 0; 1775 } else { 1776 m->dump_size = PAGE_SIZE; 1777 } 1778 } 1779 1780 cprm->vma_data_size += m->dump_size; 1781 } 1782 1783 if (core_sort_vma) 1784 sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), 1785 cmp_vma_size, NULL); 1786 1787 return true; 1788 } 1789