1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 4 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdlib.h> 8 #include <stdbool.h> 9 #include <unistd.h> 10 #include <sched.h> 11 #include <errno.h> 12 #include <string.h> 13 #include <fcntl.h> 14 #include <mem_user.h> 15 #include <sys/mman.h> 16 #include <sys/wait.h> 17 #include <sys/stat.h> 18 #include <asm/unistd.h> 19 #include <as-layout.h> 20 #include <init.h> 21 #include <kern_util.h> 22 #include <mem.h> 23 #include <os.h> 24 #include <ptrace_user.h> 25 #include <registers.h> 26 #include <skas.h> 27 #include <sysdep/stub.h> 28 #include <linux/threads.h> 29 #include <timetravel.h> 30 #include "../internal.h" 31 32 int is_skas_winch(int pid, int fd, void *data) 33 { 34 return pid == getpgrp(); 35 } 36 37 static const char *ptrace_reg_name(int idx) 38 { 39 #define R(n) case HOST_##n: return #n 40 41 switch (idx) { 42 #ifdef __x86_64__ 43 R(BX); 44 R(CX); 45 R(DI); 46 R(SI); 47 R(DX); 48 R(BP); 49 R(AX); 50 R(R8); 51 R(R9); 52 R(R10); 53 R(R11); 54 R(R12); 55 R(R13); 56 R(R14); 57 R(R15); 58 R(ORIG_AX); 59 R(CS); 60 R(SS); 61 R(EFLAGS); 62 #elif defined(__i386__) 63 R(IP); 64 R(SP); 65 R(EFLAGS); 66 R(AX); 67 R(BX); 68 R(CX); 69 R(DX); 70 R(SI); 71 R(DI); 72 R(BP); 73 R(CS); 74 R(SS); 75 R(DS); 76 R(FS); 77 R(ES); 78 R(GS); 79 R(ORIG_AX); 80 #endif 81 } 82 return ""; 83 } 84 85 static int ptrace_dump_regs(int pid) 86 { 87 unsigned long regs[MAX_REG_NR]; 88 int i; 89 90 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 91 return -errno; 92 93 printk(UM_KERN_ERR "Stub registers -\n"); 94 for (i = 0; i < ARRAY_SIZE(regs); i++) { 95 const char *regname = ptrace_reg_name(i); 96 97 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 98 } 99 100 return 0; 101 } 102 103 /* 104 * Signals that are OK to receive in the stub - we'll just continue it. 105 * SIGWINCH will happen when UML is inside a detached screen. 106 */ 107 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 108 109 /* Signals that the stub will finish with - anything else is an error */ 110 #define STUB_DONE_MASK (1 << SIGTRAP) 111 112 void wait_stub_done(int pid) 113 { 114 int n, status, err; 115 116 while (1) { 117 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 118 if ((n < 0) || !WIFSTOPPED(status)) 119 goto bad_wait; 120 121 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 122 break; 123 124 err = ptrace(PTRACE_CONT, pid, 0, 0); 125 if (err) { 126 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 127 __func__, errno); 128 fatal_sigsegv(); 129 } 130 } 131 132 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 133 return; 134 135 bad_wait: 136 err = ptrace_dump_regs(pid); 137 if (err) 138 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 139 -err); 140 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 141 __func__, pid, n, errno, status); 142 fatal_sigsegv(); 143 } 144 145 extern unsigned long current_stub_stack(void); 146 147 static void get_skas_faultinfo(int pid, struct faultinfo *fi) 148 { 149 int err; 150 151 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 152 if (err) { 153 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 154 "errno = %d\n", pid, errno); 155 fatal_sigsegv(); 156 } 157 wait_stub_done(pid); 158 159 /* 160 * faultinfo is prepared by the stub_segv_handler at start of 161 * the stub stack page. We just have to copy it. 162 */ 163 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 164 } 165 166 static void handle_segv(int pid, struct uml_pt_regs *regs) 167 { 168 get_skas_faultinfo(pid, ®s->faultinfo); 169 segv(regs->faultinfo, 0, 1, NULL); 170 } 171 172 static void handle_trap(int pid, struct uml_pt_regs *regs) 173 { 174 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 175 fatal_sigsegv(); 176 177 handle_syscall(regs); 178 } 179 180 extern char __syscall_stub_start[]; 181 182 static int stub_exe_fd; 183 184 static int userspace_tramp(void *stack) 185 { 186 char *const argv[] = { "uml-userspace", NULL }; 187 int pipe_fds[2]; 188 unsigned long long offset; 189 struct stub_init_data init_data = { 190 .stub_start = STUB_START, 191 .segv_handler = STUB_CODE + 192 (unsigned long) stub_segv_handler - 193 (unsigned long) __syscall_stub_start, 194 }; 195 struct iomem_region *iomem; 196 int ret; 197 198 init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), 199 &offset); 200 init_data.stub_code_offset = MMAP_OFFSET(offset); 201 202 init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); 203 init_data.stub_data_offset = MMAP_OFFSET(offset); 204 205 /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ 206 close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); 207 208 fcntl(init_data.stub_data_fd, F_SETFD, 0); 209 for (iomem = iomem_regions; iomem; iomem = iomem->next) 210 fcntl(iomem->fd, F_SETFD, 0); 211 212 /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ 213 if (pipe(pipe_fds)) 214 exit(2); 215 216 if (dup2(pipe_fds[0], 0) < 0) 217 exit(3); 218 close(pipe_fds[0]); 219 220 /* Write init_data and close write side */ 221 ret = write(pipe_fds[1], &init_data, sizeof(init_data)); 222 close(pipe_fds[1]); 223 224 if (ret != sizeof(init_data)) 225 exit(4); 226 227 execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); 228 229 exit(5); 230 } 231 232 extern char stub_exe_start[]; 233 extern char stub_exe_end[]; 234 235 extern char *tempdir; 236 237 #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" 238 239 #ifndef MFD_EXEC 240 #define MFD_EXEC 0x0010U 241 #endif 242 243 static int __init init_stub_exe_fd(void) 244 { 245 size_t written = 0; 246 char *tmpfile = NULL; 247 248 stub_exe_fd = memfd_create("uml-userspace", 249 MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); 250 251 if (stub_exe_fd < 0) { 252 printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); 253 254 tmpfile = malloc(strlen(tempdir) + 255 strlen(STUB_EXE_NAME_TEMPLATE) + 1); 256 if (tmpfile == NULL) 257 panic("Failed to allocate memory for stub binary name"); 258 259 strcpy(tmpfile, tempdir); 260 strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); 261 262 stub_exe_fd = mkstemp(tmpfile); 263 if (stub_exe_fd < 0) 264 panic("Could not create temporary file for stub binary: %d", 265 -errno); 266 } 267 268 while (written < stub_exe_end - stub_exe_start) { 269 ssize_t res = write(stub_exe_fd, stub_exe_start + written, 270 stub_exe_end - stub_exe_start - written); 271 if (res < 0) { 272 if (errno == EINTR) 273 continue; 274 275 if (tmpfile) 276 unlink(tmpfile); 277 panic("Failed write stub binary: %d", -errno); 278 } 279 280 written += res; 281 } 282 283 if (!tmpfile) { 284 fcntl(stub_exe_fd, F_ADD_SEALS, 285 F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); 286 } else { 287 if (fchmod(stub_exe_fd, 00500) < 0) { 288 unlink(tmpfile); 289 panic("Could not make stub binary executable: %d", 290 -errno); 291 } 292 293 close(stub_exe_fd); 294 stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); 295 if (stub_exe_fd < 0) { 296 unlink(tmpfile); 297 panic("Could not reopen stub binary: %d", -errno); 298 } 299 300 unlink(tmpfile); 301 free(tmpfile); 302 } 303 304 return 0; 305 } 306 __initcall(init_stub_exe_fd); 307 308 int userspace_pid[NR_CPUS]; 309 310 /** 311 * start_userspace() - prepare a new userspace process 312 * @stub_stack: pointer to the stub stack. 313 * 314 * Setups a new temporary stack page that is used while userspace_tramp() runs 315 * Clones the kernel process into a new userspace process, with FDs only. 316 * 317 * Return: When positive: the process id of the new userspace process, 318 * when negative: an error number. 319 * FIXME: can PIDs become negative?! 320 */ 321 int start_userspace(unsigned long stub_stack) 322 { 323 void *stack; 324 unsigned long sp; 325 int pid, status, n, err; 326 327 /* setup a temporary stack page */ 328 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 329 PROT_READ | PROT_WRITE | PROT_EXEC, 330 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 331 if (stack == MAP_FAILED) { 332 err = -errno; 333 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 334 __func__, errno); 335 return err; 336 } 337 338 /* set stack pointer to the end of the stack page, so it can grow downwards */ 339 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 340 341 /* clone into new userspace process */ 342 pid = clone(userspace_tramp, (void *) sp, 343 CLONE_VFORK | CLONE_VM | SIGCHLD, 344 (void *)stub_stack); 345 if (pid < 0) { 346 err = -errno; 347 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 348 __func__, errno); 349 return err; 350 } 351 352 do { 353 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 354 if (n < 0) { 355 err = -errno; 356 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 357 __func__, errno); 358 goto out_kill; 359 } 360 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 361 362 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 363 err = -EINVAL; 364 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 365 __func__, status); 366 goto out_kill; 367 } 368 369 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 370 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 371 err = -errno; 372 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 373 __func__, errno); 374 goto out_kill; 375 } 376 377 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 378 err = -errno; 379 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 380 __func__, errno); 381 goto out_kill; 382 } 383 384 return pid; 385 386 out_kill: 387 os_kill_ptraced_process(pid, 1); 388 return err; 389 } 390 391 int unscheduled_userspace_iterations; 392 extern unsigned long tt_extra_sched_jiffies; 393 394 void userspace(struct uml_pt_regs *regs) 395 { 396 int err, status, op, pid = userspace_pid[0]; 397 siginfo_t si; 398 399 /* Handle any immediate reschedules or signals */ 400 interrupt_end(); 401 402 while (1) { 403 /* 404 * When we are in time-travel mode, userspace can theoretically 405 * do a *lot* of work without being scheduled. The problem with 406 * this is that it will prevent kernel bookkeeping (primarily 407 * the RCU) from running and this can for example cause OOM 408 * situations. 409 * 410 * This code accounts a jiffie against the scheduling clock 411 * after the defined userspace iterations in the same thread. 412 * By doing so the situation is effectively prevented. 413 */ 414 if (time_travel_mode == TT_MODE_INFCPU || 415 time_travel_mode == TT_MODE_EXTERNAL) { 416 #ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS 417 if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && 418 unscheduled_userspace_iterations++ > 419 CONFIG_UML_MAX_USERSPACE_ITERATIONS) { 420 tt_extra_sched_jiffies += 1; 421 unscheduled_userspace_iterations = 0; 422 } 423 #endif 424 } 425 426 time_travel_print_bc_msg(); 427 428 current_mm_sync(); 429 430 /* Flush out any pending syscalls */ 431 err = syscall_stub_flush(current_mm_id()); 432 if (err) { 433 if (err == -ENOMEM) 434 report_enomem(); 435 436 printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", 437 __func__, -err); 438 fatal_sigsegv(); 439 } 440 441 /* 442 * This can legitimately fail if the process loads a 443 * bogus value into a segment register. It will 444 * segfault and PTRACE_GETREGS will read that value 445 * out of the process. However, PTRACE_SETREGS will 446 * fail. In this case, there is nothing to do but 447 * just kill the process. 448 */ 449 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 450 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 451 __func__, errno); 452 fatal_sigsegv(); 453 } 454 455 if (put_fp_registers(pid, regs->fp)) { 456 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 457 __func__, errno); 458 fatal_sigsegv(); 459 } 460 461 if (singlestepping()) 462 op = PTRACE_SYSEMU_SINGLESTEP; 463 else 464 op = PTRACE_SYSEMU; 465 466 if (ptrace(op, pid, 0, 0)) { 467 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 468 __func__, op, errno); 469 fatal_sigsegv(); 470 } 471 472 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 473 if (err < 0) { 474 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 475 __func__, errno); 476 fatal_sigsegv(); 477 } 478 479 regs->is_user = 1; 480 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 481 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 482 __func__, errno); 483 fatal_sigsegv(); 484 } 485 486 if (get_fp_registers(pid, regs->fp)) { 487 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 488 __func__, errno); 489 fatal_sigsegv(); 490 } 491 492 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 493 494 if (WIFSTOPPED(status)) { 495 int sig = WSTOPSIG(status); 496 497 /* These signal handlers need the si argument. 498 * The SIGIO and SIGALARM handlers which constitute the 499 * majority of invocations, do not use it. 500 */ 501 switch (sig) { 502 case SIGSEGV: 503 case SIGTRAP: 504 case SIGILL: 505 case SIGBUS: 506 case SIGFPE: 507 case SIGWINCH: 508 ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); 509 break; 510 } 511 512 switch (sig) { 513 case SIGSEGV: 514 if (PTRACE_FULL_FAULTINFO) { 515 get_skas_faultinfo(pid, 516 ®s->faultinfo); 517 (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, 518 regs); 519 } 520 else handle_segv(pid, regs); 521 break; 522 case SIGTRAP + 0x80: 523 handle_trap(pid, regs); 524 break; 525 case SIGTRAP: 526 relay_signal(SIGTRAP, (struct siginfo *)&si, regs); 527 break; 528 case SIGALRM: 529 break; 530 case SIGIO: 531 case SIGILL: 532 case SIGBUS: 533 case SIGFPE: 534 case SIGWINCH: 535 block_signals_trace(); 536 (*sig_info[sig])(sig, (struct siginfo *)&si, regs); 537 unblock_signals_trace(); 538 break; 539 default: 540 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 541 __func__, sig); 542 fatal_sigsegv(); 543 } 544 pid = userspace_pid[0]; 545 interrupt_end(); 546 547 /* Avoid -ERESTARTSYS handling in host */ 548 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 549 PT_SYSCALL_NR(regs->gp) = -1; 550 } 551 } 552 } 553 554 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 555 { 556 (*buf)[0].JB_IP = (unsigned long) handler; 557 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 558 sizeof(void *); 559 } 560 561 #define INIT_JMP_NEW_THREAD 0 562 #define INIT_JMP_CALLBACK 1 563 #define INIT_JMP_HALT 2 564 #define INIT_JMP_REBOOT 3 565 566 void switch_threads(jmp_buf *me, jmp_buf *you) 567 { 568 unscheduled_userspace_iterations = 0; 569 570 if (UML_SETJMP(me) == 0) 571 UML_LONGJMP(you, 1); 572 } 573 574 static jmp_buf initial_jmpbuf; 575 576 /* XXX Make these percpu */ 577 static void (*cb_proc)(void *arg); 578 static void *cb_arg; 579 static jmp_buf *cb_back; 580 581 int start_idle_thread(void *stack, jmp_buf *switch_buf) 582 { 583 int n; 584 585 set_handler(SIGWINCH); 586 587 /* 588 * Can't use UML_SETJMP or UML_LONGJMP here because they save 589 * and restore signals, with the possible side-effect of 590 * trying to handle any signals which came when they were 591 * blocked, which can't be done on this stack. 592 * Signals must be blocked when jumping back here and restored 593 * after returning to the jumper. 594 */ 595 n = setjmp(initial_jmpbuf); 596 switch (n) { 597 case INIT_JMP_NEW_THREAD: 598 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 599 (*switch_buf)[0].JB_SP = (unsigned long) stack + 600 UM_THREAD_SIZE - sizeof(void *); 601 break; 602 case INIT_JMP_CALLBACK: 603 (*cb_proc)(cb_arg); 604 longjmp(*cb_back, 1); 605 break; 606 case INIT_JMP_HALT: 607 kmalloc_ok = 0; 608 return 0; 609 case INIT_JMP_REBOOT: 610 kmalloc_ok = 0; 611 return 1; 612 default: 613 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 614 __func__, n); 615 fatal_sigsegv(); 616 } 617 longjmp(*switch_buf, 1); 618 619 /* unreachable */ 620 printk(UM_KERN_ERR "impossible long jump!"); 621 fatal_sigsegv(); 622 return 0; 623 } 624 625 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 626 { 627 jmp_buf here; 628 629 cb_proc = proc; 630 cb_arg = arg; 631 cb_back = &here; 632 633 block_signals_trace(); 634 if (UML_SETJMP(&here) == 0) 635 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 636 unblock_signals_trace(); 637 638 cb_proc = NULL; 639 cb_arg = NULL; 640 cb_back = NULL; 641 } 642 643 void halt_skas(void) 644 { 645 block_signals_trace(); 646 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 647 } 648 649 static bool noreboot; 650 651 static int __init noreboot_cmd_param(char *str, int *add) 652 { 653 *add = 0; 654 noreboot = true; 655 return 0; 656 } 657 658 __uml_setup("noreboot", noreboot_cmd_param, 659 "noreboot\n" 660 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 661 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 662 " crashes in CI\n"); 663 664 void reboot_skas(void) 665 { 666 block_signals_trace(); 667 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 668 } 669 670 void __switch_mm(struct mm_id *mm_idp) 671 { 672 userspace_pid[0] = mm_idp->pid; 673 } 674