1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 4 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdlib.h> 8 #include <stdbool.h> 9 #include <unistd.h> 10 #include <sched.h> 11 #include <errno.h> 12 #include <string.h> 13 #include <fcntl.h> 14 #include <mem_user.h> 15 #include <sys/mman.h> 16 #include <sys/wait.h> 17 #include <sys/stat.h> 18 #include <asm/unistd.h> 19 #include <as-layout.h> 20 #include <init.h> 21 #include <kern_util.h> 22 #include <mem.h> 23 #include <os.h> 24 #include <ptrace_user.h> 25 #include <registers.h> 26 #include <skas.h> 27 #include <sysdep/stub.h> 28 #include <linux/threads.h> 29 #include <timetravel.h> 30 #include "../internal.h" 31 32 int is_skas_winch(int pid, int fd, void *data) 33 { 34 return pid == getpgrp(); 35 } 36 37 static const char *ptrace_reg_name(int idx) 38 { 39 #define R(n) case HOST_##n: return #n 40 41 switch (idx) { 42 #ifdef __x86_64__ 43 R(BX); 44 R(CX); 45 R(DI); 46 R(SI); 47 R(DX); 48 R(BP); 49 R(AX); 50 R(R8); 51 R(R9); 52 R(R10); 53 R(R11); 54 R(R12); 55 R(R13); 56 R(R14); 57 R(R15); 58 R(ORIG_AX); 59 R(CS); 60 R(SS); 61 R(EFLAGS); 62 #elif defined(__i386__) 63 R(IP); 64 R(SP); 65 R(EFLAGS); 66 R(AX); 67 R(BX); 68 R(CX); 69 R(DX); 70 R(SI); 71 R(DI); 72 R(BP); 73 R(CS); 74 R(SS); 75 R(DS); 76 R(FS); 77 R(ES); 78 R(GS); 79 R(ORIG_AX); 80 #endif 81 } 82 return ""; 83 } 84 85 static int ptrace_dump_regs(int pid) 86 { 87 unsigned long regs[MAX_REG_NR]; 88 int i; 89 90 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 91 return -errno; 92 93 printk(UM_KERN_ERR "Stub registers -\n"); 94 for (i = 0; i < ARRAY_SIZE(regs); i++) { 95 const char *regname = ptrace_reg_name(i); 96 97 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 98 } 99 100 return 0; 101 } 102 103 /* 104 * Signals that are OK to receive in the stub - we'll just continue it. 105 * SIGWINCH will happen when UML is inside a detached screen. 106 */ 107 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 108 109 /* Signals that the stub will finish with - anything else is an error */ 110 #define STUB_DONE_MASK (1 << SIGTRAP) 111 112 void wait_stub_done(int pid) 113 { 114 int n, status, err; 115 116 while (1) { 117 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 118 if ((n < 0) || !WIFSTOPPED(status)) 119 goto bad_wait; 120 121 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 122 break; 123 124 err = ptrace(PTRACE_CONT, pid, 0, 0); 125 if (err) { 126 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 127 __func__, errno); 128 fatal_sigsegv(); 129 } 130 } 131 132 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 133 return; 134 135 bad_wait: 136 err = ptrace_dump_regs(pid); 137 if (err) 138 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 139 -err); 140 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 141 __func__, pid, n, errno, status); 142 fatal_sigsegv(); 143 } 144 145 extern unsigned long current_stub_stack(void); 146 147 static void get_skas_faultinfo(int pid, struct faultinfo *fi) 148 { 149 int err; 150 151 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 152 if (err) { 153 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 154 "errno = %d\n", pid, errno); 155 fatal_sigsegv(); 156 } 157 wait_stub_done(pid); 158 159 /* 160 * faultinfo is prepared by the stub_segv_handler at start of 161 * the stub stack page. We just have to copy it. 162 */ 163 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 164 } 165 166 static void handle_segv(int pid, struct uml_pt_regs *regs) 167 { 168 get_skas_faultinfo(pid, ®s->faultinfo); 169 segv(regs->faultinfo, 0, 1, NULL); 170 } 171 172 static void handle_trap(int pid, struct uml_pt_regs *regs) 173 { 174 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 175 fatal_sigsegv(); 176 177 handle_syscall(regs); 178 } 179 180 extern char __syscall_stub_start[]; 181 182 static int stub_exe_fd; 183 184 #ifndef CLOSE_RANGE_CLOEXEC 185 #define CLOSE_RANGE_CLOEXEC (1U << 2) 186 #endif 187 188 static int userspace_tramp(void *stack) 189 { 190 char *const argv[] = { "uml-userspace", NULL }; 191 int pipe_fds[2]; 192 unsigned long long offset; 193 struct stub_init_data init_data = { 194 .stub_start = STUB_START, 195 .segv_handler = STUB_CODE + 196 (unsigned long) stub_segv_handler - 197 (unsigned long) __syscall_stub_start, 198 }; 199 struct iomem_region *iomem; 200 int ret; 201 202 init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), 203 &offset); 204 init_data.stub_code_offset = MMAP_OFFSET(offset); 205 206 init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); 207 init_data.stub_data_offset = MMAP_OFFSET(offset); 208 209 /* 210 * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs 211 * and then unsetting it on all memory related FDs. 212 * This is not strictly necessary from a safety perspective. 213 */ 214 syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); 215 216 fcntl(init_data.stub_data_fd, F_SETFD, 0); 217 for (iomem = iomem_regions; iomem; iomem = iomem->next) 218 fcntl(iomem->fd, F_SETFD, 0); 219 220 /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ 221 if (pipe(pipe_fds)) 222 exit(2); 223 224 if (dup2(pipe_fds[0], 0) < 0) 225 exit(3); 226 close(pipe_fds[0]); 227 228 /* Write init_data and close write side */ 229 ret = write(pipe_fds[1], &init_data, sizeof(init_data)); 230 close(pipe_fds[1]); 231 232 if (ret != sizeof(init_data)) 233 exit(4); 234 235 /* Raw execveat for compatibility with older libc versions */ 236 syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", 237 (unsigned long)argv, NULL, AT_EMPTY_PATH); 238 239 exit(5); 240 } 241 242 extern char stub_exe_start[]; 243 extern char stub_exe_end[]; 244 245 extern char *tempdir; 246 247 #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" 248 249 #ifndef MFD_EXEC 250 #define MFD_EXEC 0x0010U 251 #endif 252 253 static int __init init_stub_exe_fd(void) 254 { 255 size_t written = 0; 256 char *tmpfile = NULL; 257 258 stub_exe_fd = memfd_create("uml-userspace", 259 MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); 260 261 if (stub_exe_fd < 0) { 262 printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); 263 264 tmpfile = malloc(strlen(tempdir) + 265 strlen(STUB_EXE_NAME_TEMPLATE) + 1); 266 if (tmpfile == NULL) 267 panic("Failed to allocate memory for stub binary name"); 268 269 strcpy(tmpfile, tempdir); 270 strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); 271 272 stub_exe_fd = mkstemp(tmpfile); 273 if (stub_exe_fd < 0) 274 panic("Could not create temporary file for stub binary: %d", 275 -errno); 276 } 277 278 while (written < stub_exe_end - stub_exe_start) { 279 ssize_t res = write(stub_exe_fd, stub_exe_start + written, 280 stub_exe_end - stub_exe_start - written); 281 if (res < 0) { 282 if (errno == EINTR) 283 continue; 284 285 if (tmpfile) 286 unlink(tmpfile); 287 panic("Failed write stub binary: %d", -errno); 288 } 289 290 written += res; 291 } 292 293 if (!tmpfile) { 294 fcntl(stub_exe_fd, F_ADD_SEALS, 295 F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); 296 } else { 297 if (fchmod(stub_exe_fd, 00500) < 0) { 298 unlink(tmpfile); 299 panic("Could not make stub binary executable: %d", 300 -errno); 301 } 302 303 close(stub_exe_fd); 304 stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); 305 if (stub_exe_fd < 0) { 306 unlink(tmpfile); 307 panic("Could not reopen stub binary: %d", -errno); 308 } 309 310 unlink(tmpfile); 311 free(tmpfile); 312 } 313 314 return 0; 315 } 316 __initcall(init_stub_exe_fd); 317 318 int userspace_pid[NR_CPUS]; 319 320 /** 321 * start_userspace() - prepare a new userspace process 322 * @stub_stack: pointer to the stub stack. 323 * 324 * Setups a new temporary stack page that is used while userspace_tramp() runs 325 * Clones the kernel process into a new userspace process, with FDs only. 326 * 327 * Return: When positive: the process id of the new userspace process, 328 * when negative: an error number. 329 * FIXME: can PIDs become negative?! 330 */ 331 int start_userspace(unsigned long stub_stack) 332 { 333 void *stack; 334 unsigned long sp; 335 int pid, status, n, err; 336 337 /* setup a temporary stack page */ 338 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 339 PROT_READ | PROT_WRITE | PROT_EXEC, 340 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 341 if (stack == MAP_FAILED) { 342 err = -errno; 343 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 344 __func__, errno); 345 return err; 346 } 347 348 /* set stack pointer to the end of the stack page, so it can grow downwards */ 349 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 350 351 /* clone into new userspace process */ 352 pid = clone(userspace_tramp, (void *) sp, 353 CLONE_VFORK | CLONE_VM | SIGCHLD, 354 (void *)stub_stack); 355 if (pid < 0) { 356 err = -errno; 357 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 358 __func__, errno); 359 return err; 360 } 361 362 do { 363 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 364 if (n < 0) { 365 err = -errno; 366 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 367 __func__, errno); 368 goto out_kill; 369 } 370 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 371 372 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 373 err = -EINVAL; 374 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 375 __func__, status); 376 goto out_kill; 377 } 378 379 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 380 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 381 err = -errno; 382 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 383 __func__, errno); 384 goto out_kill; 385 } 386 387 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 388 err = -errno; 389 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 390 __func__, errno); 391 goto out_kill; 392 } 393 394 return pid; 395 396 out_kill: 397 os_kill_ptraced_process(pid, 1); 398 return err; 399 } 400 401 int unscheduled_userspace_iterations; 402 extern unsigned long tt_extra_sched_jiffies; 403 404 void userspace(struct uml_pt_regs *regs) 405 { 406 int err, status, op, pid = userspace_pid[0]; 407 siginfo_t si; 408 409 /* Handle any immediate reschedules or signals */ 410 interrupt_end(); 411 412 while (1) { 413 /* 414 * When we are in time-travel mode, userspace can theoretically 415 * do a *lot* of work without being scheduled. The problem with 416 * this is that it will prevent kernel bookkeeping (primarily 417 * the RCU) from running and this can for example cause OOM 418 * situations. 419 * 420 * This code accounts a jiffie against the scheduling clock 421 * after the defined userspace iterations in the same thread. 422 * By doing so the situation is effectively prevented. 423 */ 424 if (time_travel_mode == TT_MODE_INFCPU || 425 time_travel_mode == TT_MODE_EXTERNAL) { 426 #ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS 427 if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && 428 unscheduled_userspace_iterations++ > 429 CONFIG_UML_MAX_USERSPACE_ITERATIONS) { 430 tt_extra_sched_jiffies += 1; 431 unscheduled_userspace_iterations = 0; 432 } 433 #endif 434 } 435 436 time_travel_print_bc_msg(); 437 438 current_mm_sync(); 439 440 /* Flush out any pending syscalls */ 441 err = syscall_stub_flush(current_mm_id()); 442 if (err) { 443 if (err == -ENOMEM) 444 report_enomem(); 445 446 printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", 447 __func__, -err); 448 fatal_sigsegv(); 449 } 450 451 /* 452 * This can legitimately fail if the process loads a 453 * bogus value into a segment register. It will 454 * segfault and PTRACE_GETREGS will read that value 455 * out of the process. However, PTRACE_SETREGS will 456 * fail. In this case, there is nothing to do but 457 * just kill the process. 458 */ 459 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 460 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 461 __func__, errno); 462 fatal_sigsegv(); 463 } 464 465 if (put_fp_registers(pid, regs->fp)) { 466 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 467 __func__, errno); 468 fatal_sigsegv(); 469 } 470 471 if (singlestepping()) 472 op = PTRACE_SYSEMU_SINGLESTEP; 473 else 474 op = PTRACE_SYSEMU; 475 476 if (ptrace(op, pid, 0, 0)) { 477 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 478 __func__, op, errno); 479 fatal_sigsegv(); 480 } 481 482 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 483 if (err < 0) { 484 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 485 __func__, errno); 486 fatal_sigsegv(); 487 } 488 489 regs->is_user = 1; 490 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 491 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 492 __func__, errno); 493 fatal_sigsegv(); 494 } 495 496 if (get_fp_registers(pid, regs->fp)) { 497 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 498 __func__, errno); 499 fatal_sigsegv(); 500 } 501 502 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 503 504 if (WIFSTOPPED(status)) { 505 int sig = WSTOPSIG(status); 506 507 /* These signal handlers need the si argument. 508 * The SIGIO and SIGALARM handlers which constitute the 509 * majority of invocations, do not use it. 510 */ 511 switch (sig) { 512 case SIGSEGV: 513 case SIGTRAP: 514 case SIGILL: 515 case SIGBUS: 516 case SIGFPE: 517 case SIGWINCH: 518 ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); 519 break; 520 } 521 522 switch (sig) { 523 case SIGSEGV: 524 if (PTRACE_FULL_FAULTINFO) { 525 get_skas_faultinfo(pid, 526 ®s->faultinfo); 527 (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, 528 regs); 529 } 530 else handle_segv(pid, regs); 531 break; 532 case SIGTRAP + 0x80: 533 handle_trap(pid, regs); 534 break; 535 case SIGTRAP: 536 relay_signal(SIGTRAP, (struct siginfo *)&si, regs); 537 break; 538 case SIGALRM: 539 break; 540 case SIGIO: 541 case SIGILL: 542 case SIGBUS: 543 case SIGFPE: 544 case SIGWINCH: 545 block_signals_trace(); 546 (*sig_info[sig])(sig, (struct siginfo *)&si, regs); 547 unblock_signals_trace(); 548 break; 549 default: 550 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 551 __func__, sig); 552 fatal_sigsegv(); 553 } 554 pid = userspace_pid[0]; 555 interrupt_end(); 556 557 /* Avoid -ERESTARTSYS handling in host */ 558 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 559 PT_SYSCALL_NR(regs->gp) = -1; 560 } 561 } 562 } 563 564 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 565 { 566 (*buf)[0].JB_IP = (unsigned long) handler; 567 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 568 sizeof(void *); 569 } 570 571 #define INIT_JMP_NEW_THREAD 0 572 #define INIT_JMP_CALLBACK 1 573 #define INIT_JMP_HALT 2 574 #define INIT_JMP_REBOOT 3 575 576 void switch_threads(jmp_buf *me, jmp_buf *you) 577 { 578 unscheduled_userspace_iterations = 0; 579 580 if (UML_SETJMP(me) == 0) 581 UML_LONGJMP(you, 1); 582 } 583 584 static jmp_buf initial_jmpbuf; 585 586 /* XXX Make these percpu */ 587 static void (*cb_proc)(void *arg); 588 static void *cb_arg; 589 static jmp_buf *cb_back; 590 591 int start_idle_thread(void *stack, jmp_buf *switch_buf) 592 { 593 int n; 594 595 set_handler(SIGWINCH); 596 597 /* 598 * Can't use UML_SETJMP or UML_LONGJMP here because they save 599 * and restore signals, with the possible side-effect of 600 * trying to handle any signals which came when they were 601 * blocked, which can't be done on this stack. 602 * Signals must be blocked when jumping back here and restored 603 * after returning to the jumper. 604 */ 605 n = setjmp(initial_jmpbuf); 606 switch (n) { 607 case INIT_JMP_NEW_THREAD: 608 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 609 (*switch_buf)[0].JB_SP = (unsigned long) stack + 610 UM_THREAD_SIZE - sizeof(void *); 611 break; 612 case INIT_JMP_CALLBACK: 613 (*cb_proc)(cb_arg); 614 longjmp(*cb_back, 1); 615 break; 616 case INIT_JMP_HALT: 617 kmalloc_ok = 0; 618 return 0; 619 case INIT_JMP_REBOOT: 620 kmalloc_ok = 0; 621 return 1; 622 default: 623 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 624 __func__, n); 625 fatal_sigsegv(); 626 } 627 longjmp(*switch_buf, 1); 628 629 /* unreachable */ 630 printk(UM_KERN_ERR "impossible long jump!"); 631 fatal_sigsegv(); 632 return 0; 633 } 634 635 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 636 { 637 jmp_buf here; 638 639 cb_proc = proc; 640 cb_arg = arg; 641 cb_back = &here; 642 643 block_signals_trace(); 644 if (UML_SETJMP(&here) == 0) 645 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 646 unblock_signals_trace(); 647 648 cb_proc = NULL; 649 cb_arg = NULL; 650 cb_back = NULL; 651 } 652 653 void halt_skas(void) 654 { 655 block_signals_trace(); 656 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 657 } 658 659 static bool noreboot; 660 661 static int __init noreboot_cmd_param(char *str, int *add) 662 { 663 *add = 0; 664 noreboot = true; 665 return 0; 666 } 667 668 __uml_setup("noreboot", noreboot_cmd_param, 669 "noreboot\n" 670 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 671 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 672 " crashes in CI\n"); 673 674 void reboot_skas(void) 675 { 676 block_signals_trace(); 677 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 678 } 679 680 void __switch_mm(struct mm_id *mm_idp) 681 { 682 userspace_pid[0] = mm_idp->pid; 683 } 684