// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */

#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <mem_user.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <asm/unistd.h>
#include <as-layout.h>
#include <init.h>
#include <kern_util.h>
#include <mem.h>
#include <os.h>
#include <ptrace_user.h>
#include <registers.h>
#include <skas.h>
#include <sysdep/stub.h>
#include <linux/threads.h>
#include <timetravel.h>
#include "../internal.h"

/*
 * A SIGWINCH is "ours" (a console winch) when the sending pid is our
 * own process group leader; anything else is relayed elsewhere.
 */
int is_skas_winch(int pid, int fd, void *data)
{
	return pid == getpgrp();
}

/*
 * Map a HOST_* ptrace register index to its symbolic name for the
 * register dump below.  Returns "" for indices without a name on the
 * current architecture.
 */
static const char *ptrace_reg_name(int idx)
{
#define R(n) case HOST_##n: return #n

	switch (idx) {
#ifdef __x86_64__
	R(BX);
	R(CX);
	R(DI);
	R(SI);
	R(DX);
	R(BP);
	R(AX);
	R(R8);
	R(R9);
	R(R10);
	R(R11);
	R(R12);
	R(R13);
	R(R14);
	R(R15);
	R(ORIG_AX);
	R(CS);
	R(SS);
	R(EFLAGS);
#elif defined(__i386__)
	R(IP);
	R(SP);
	R(EFLAGS);
	R(AX);
	R(BX);
	R(CX);
	R(DX);
	R(SI);
	R(DI);
	R(BP);
	R(CS);
	R(SS);
	R(DS);
	R(FS);
	R(ES);
	R(GS);
	R(ORIG_AX);
#endif
	}
	return "";
}

/*
 * Fetch the stub child's general-purpose registers with
 * PTRACE_GETREGS and print them all (diagnostic aid when the stub
 * misbehaves).  Returns 0 on success or -errno if ptrace() failed.
 */
static int ptrace_dump_regs(int pid)
{
	unsigned long regs[MAX_REG_NR];
	int i;

	if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
		return -errno;

	printk(UM_KERN_ERR "Stub registers -\n");
	for (i = 0; i < ARRAY_SIZE(regs); i++) {
		const char *regname = ptrace_reg_name(i);

		printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]);
	}

	return 0;
}

/*
 * Signals that are OK to receive in the stub - we'll just continue it.
 * SIGWINCH will happen when UML is inside a detached screen.
 */
#define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH))

/* Signals that the stub will finish with - anything else is an error */
#define STUB_DONE_MASK (1 << SIGTRAP)

/*
 * Wait until the traced stub process stops with SIGTRAP (done).
 * Benign stops (STUB_SIG_MASK: SIGALRM/SIGWINCH) are simply continued
 * with PTRACE_CONT; any other outcome (wait error, exit, or an
 * unexpected stop signal) dumps the stub registers and calls
 * fatal_sigsegv() - this function does not return an error.
 */
void wait_stub_done(int pid)
{
	int n, status, err;

	while (1) {
		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
		if ((n < 0) || !WIFSTOPPED(status))
			goto bad_wait;

		/* Not one of the "just continue it" signals? Then inspect it. */
		if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0)
			break;

		err = ptrace(PTRACE_CONT, pid, 0, 0);
		if (err) {
			printk(UM_KERN_ERR "%s : continue failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}
	}

	if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0)
		return;

bad_wait:
	err = ptrace_dump_regs(pid);
	if (err)
		printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n",
		       -err);
	printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n",
	       __func__, pid, n, errno, status);
	fatal_sigsegv();
}

extern unsigned long current_stub_stack(void);

/*
 * Re-deliver the SIGSEGV to the stub so its in-stub segv handler runs,
 * wait for the stub to finish (SIGTRAP), then copy the faultinfo the
 * stub handler left at the start of the stub stack page into @fi.
 */
static void get_skas_faultinfo(int pid, struct faultinfo *fi)
{
	int err;

	err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
	if (err) {
		printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
		       "errno = %d\n", pid, errno);
		fatal_sigsegv();
	}
	wait_stub_done(pid);

	/*
	 * faultinfo is prepared by the stub_segv_handler at start of
	 * the stub stack page. We just have to copy it.
	 */
	memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
}

/*
 * Dispatch a syscall trap from the child.  A trap whose IP lies inside
 * the stub area [STUB_START, STUB_END) is not a legitimate user
 * syscall and is fatal; otherwise hand it to handle_syscall().
 */
static void handle_trap(int pid, struct uml_pt_regs *regs)
{
	if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
		fatal_sigsegv();

	handle_syscall(regs);
}

extern char __syscall_stub_start[];

/* FD of the stub executable image, set up by init_stub_exe_fd() below. */
static int stub_exe_fd;

/* Fallback definition for libcs whose headers predate close_range(2) flags. */
#ifndef CLOSE_RANGE_CLOEXEC
#define CLOSE_RANGE_CLOEXEC (1U << 2)
#endif

/*
 * Child-side trampoline run by clone() in start_userspace().  It
 * prepares a stub_init_data describing the stub code/data mappings,
 * restricts which FDs survive exec, ships init_data to the new
 * process via a pipe dup2()ed to stdin, and execveat()s the stub
 * binary.  Exit codes 2-5 identify which setup step failed; on
 * success execveat() does not return.
 */
static int userspace_tramp(void *stack)
{
	char *const argv[] = { "uml-userspace", NULL };
	int pipe_fds[2];
	unsigned long long offset;
	struct stub_init_data init_data = {
		.stub_start = STUB_START,
		/* segv handler address, relocated into the stub code mapping */
		.segv_handler = STUB_CODE +
				(unsigned long) stub_segv_handler -
				(unsigned long) __syscall_stub_start,
	};
	struct iomem_region *iomem;
	int ret;

	init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
					      &offset);
	init_data.stub_code_offset = MMAP_OFFSET(offset);

	init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
	init_data.stub_data_offset = MMAP_OFFSET(offset);

	/*
	 * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs
	 * and then unsetting it on all memory related FDs.
	 * This is not strictly necessary from a safety perspective.
	 */
	syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC);

	/* Clear CLOEXEC again on the FDs the stub must inherit. */
	fcntl(init_data.stub_data_fd, F_SETFD, 0);
	for (iomem = iomem_regions; iomem; iomem = iomem->next)
		fcntl(iomem->fd, F_SETFD, 0);

	/* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
	if (pipe(pipe_fds))
		exit(2);

	if (dup2(pipe_fds[0], 0) < 0)
		exit(3);
	close(pipe_fds[0]);

	/* Write init_data and close write side */
	ret = write(pipe_fds[1], &init_data, sizeof(init_data));
	close(pipe_fds[1]);

	if (ret != sizeof(init_data))
		exit(4);

	/* Raw execveat for compatibility with older libc versions */
	syscall(__NR_execveat, stub_exe_fd, (unsigned long)"",
		(unsigned long)argv, NULL, AT_EMPTY_PATH);

	/* Only reached if execveat() failed. */
	exit(5);
}

/* The embedded stub binary image (linked into this executable). */
extern char stub_exe_start[];
extern char stub_exe_end[];

extern char *tempdir;

#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"

/* Fallback for libc headers that predate memfd MFD_EXEC. */
#ifndef MFD_EXEC
#define MFD_EXEC 0x0010U
#endif

/*
 * Write the embedded stub binary out to an executable FD for later
 * execveat() in userspace_tramp().  Preferred path: a sealed
 * executable memfd.  Fallback (e.g. kernel without MFD_EXEC): a
 * mkstemp() file under tempdir, chmod'ed 0500 and reopened read-only.
 * Any failure panics; registered as an initcall below.
 */
static int __init init_stub_exe_fd(void)
{
	size_t written = 0;
	char *tmpfile = NULL;

	stub_exe_fd = memfd_create("uml-userspace",
				   MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (stub_exe_fd < 0) {
		printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!");

		tmpfile = malloc(strlen(tempdir) +
				 strlen(STUB_EXE_NAME_TEMPLATE) + 1);
		if (tmpfile == NULL)
			panic("Failed to allocate memory for stub binary name");

		strcpy(tmpfile, tempdir);
		strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);

		stub_exe_fd = mkstemp(tmpfile);
		if (stub_exe_fd < 0)
			panic("Could not create temporary file for stub binary: %d",
			      -errno);
	}

	/* Copy the whole embedded image, retrying on short writes/EINTR. */
	while (written < stub_exe_end - stub_exe_start) {
		ssize_t res = write(stub_exe_fd, stub_exe_start + written,
				    stub_exe_end - stub_exe_start - written);
		if (res < 0) {
			if (errno == EINTR)
				continue;

			if (tmpfile)
				unlink(tmpfile);
			panic("Failed write stub binary: %d", -errno);
		}

		written += res;
	}

	if (!tmpfile) {
		/* memfd path: seal the image against any further modification. */
		fcntl(stub_exe_fd, F_ADD_SEALS,
		      F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL);
	} else {
		/* tmpfile path: make it executable, then reopen read-only. */
		if (fchmod(stub_exe_fd, 00500) < 0) {
			unlink(tmpfile);
			panic("Could not make stub binary executable: %d",
			      -errno);
		}

		close(stub_exe_fd);
		stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
		if (stub_exe_fd < 0) {
			unlink(tmpfile);
			panic("Could not reopen stub binary: %d", -errno);
		}

		/* The open FD keeps the inode alive; remove the name now. */
		unlink(tmpfile);
		free(tmpfile);
	}

	return 0;
}
__initcall(init_stub_exe_fd);

/* Host pid of the userspace stub process, one slot per CPU. */
int userspace_pid[NR_CPUS];

/**
 * start_userspace() - prepare a new userspace process
 * @stub_stack:	pointer to the stub stack.
 *
 * Setups a new temporary stack page that is used while userspace_tramp() runs
 * Clones the kernel process into a new userspace process, with FDs only.
 *
 * Return: When positive: the process id of the new userspace process,
 *         when negative: an error number.
 * FIXME: can PIDs become negative?!
 */
int start_userspace(unsigned long stub_stack)
{
	void *stack;
	unsigned long sp;
	int pid, status, n, err;

	/* setup a temporary stack page */
	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
		     PROT_READ | PROT_WRITE | PROT_EXEC,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (stack == MAP_FAILED) {
		err = -errno;
		printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n",
		       __func__, errno);
		return err;
	}

	/* set stack pointer to the end of the stack page, so it can grow downwards */
	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;

	/* clone into new userspace process */
	pid = clone(userspace_tramp, (void *) sp,
		    CLONE_VFORK | CLONE_VM | SIGCHLD,
		    (void *)stub_stack);
	if (pid < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
		       __func__, errno);
		return err;
	}

	/* Swallow any SIGALRM stops; wait for the child's initial SIGSTOP. */
	do {
		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
		if (n < 0) {
			err = -errno;
			printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
			       __func__, errno);
			goto out_kill;
		}
	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));

	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
		err = -EINVAL;
		printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
		       __func__, status);
		goto out_kill;
	}

	/* TRACESYSGOOD makes syscall stops report as SIGTRAP | 0x80. */
	if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
		       __func__, errno);
		goto out_kill;
	}

	/* The temporary trampoline stack is no longer needed. */
	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n",
		       __func__, errno);
		goto out_kill;
	}

	return pid;

out_kill:
	os_kill_ptraced_process(pid, 1);
	return err;
}

int unscheduled_userspace_iterations;
extern unsigned long tt_extra_sched_jiffies;

/*
 * Main host-side loop: repeatedly run the traced userspace process
 * with PTRACE_SYSEMU and service whatever stop (syscall, fault,
 * signal) it reports.  Never returns.
 */
void userspace(struct
 uml_pt_regs *regs)
{
	int err, status, op, pid = userspace_pid[0];
	siginfo_t si;

	/* Handle any immediate reschedules or signals */
	interrupt_end();

	while (1) {
		/*
		 * When we are in time-travel mode, userspace can theoretically
		 * do a *lot* of work without being scheduled. The problem with
		 * this is that it will prevent kernel bookkeeping (primarily
		 * the RCU) from running and this can for example cause OOM
		 * situations.
		 *
		 * This code accounts a jiffie against the scheduling clock
		 * after the defined userspace iterations in the same thread.
		 * By doing so the situation is effectively prevented.
		 */
		if (time_travel_mode == TT_MODE_INFCPU ||
		    time_travel_mode == TT_MODE_EXTERNAL) {
#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS
			if (CONFIG_UML_MAX_USERSPACE_ITERATIONS &&
			    unscheduled_userspace_iterations++ >
			    CONFIG_UML_MAX_USERSPACE_ITERATIONS) {
				tt_extra_sched_jiffies += 1;
				unscheduled_userspace_iterations = 0;
			}
#endif
		}

		time_travel_print_bc_msg();

		current_mm_sync();

		/* Flush out any pending syscalls */
		err = syscall_stub_flush(current_mm_id());
		if (err) {
			if (err == -ENOMEM)
				report_enomem();

			printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d",
				__func__, -err);
			fatal_sigsegv();
		}

		/*
		 * This can legitimately fail if the process loads a
		 * bogus value into a segment register. It will
		 * segfault and PTRACE_GETREGS will read that value
		 * out of the process. However, PTRACE_SETREGS will
		 * fail. In this case, there is nothing to do but
		 * just kill the process.
		 */
		if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
			printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		if (put_fp_registers(pid, regs->fp)) {
			printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		/* SYSEMU stops at syscall entry without executing the syscall. */
		if (singlestepping())
			op = PTRACE_SYSEMU_SINGLESTEP;
		else
			op = PTRACE_SYSEMU;

		if (ptrace(op, pid, 0, 0)) {
			printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
			       __func__, op, errno);
			fatal_sigsegv();
		}

		CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
		if (err < 0) {
			printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		/* Pull the child's register state back into regs. */
		regs->is_user = 1;
		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
			printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		if (get_fp_registers(pid, regs->fp)) {
			printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */

		if (WIFSTOPPED(status)) {
			int sig = WSTOPSIG(status);

			/* These signal handlers need the si argument.
			 * The SIGIO and SIGALARM handlers which constitute the
			 * majority of invocations, do not use it.
			 */
			switch (sig) {
			case SIGSEGV:
			case SIGTRAP:
			case SIGILL:
			case SIGBUS:
			case SIGFPE:
			case SIGWINCH:
				ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si);
				break;
			}

			switch (sig) {
			case SIGSEGV:
				/* NOTE: mojibake "®s" in the pasted source restored to "&regs". */
				get_skas_faultinfo(pid, &regs->faultinfo);

				if (PTRACE_FULL_FAULTINFO)
					(*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si,
							     regs, NULL);
				else
					segv(regs->faultinfo, 0, 1, NULL, NULL);

				break;
			case SIGTRAP + 0x80:
				/* Syscall stop (PTRACE_O_TRACESYSGOOD marks it with 0x80). */
				handle_trap(pid, regs);
				break;
			case SIGTRAP:
				relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL);
				break;
			case SIGALRM:
				break;
			case SIGIO:
			case SIGILL:
			case SIGBUS:
			case SIGFPE:
			case SIGWINCH:
				block_signals_trace();
				(*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL);
				unblock_signals_trace();
				break;
			default:
				printk(UM_KERN_ERR "%s - child stopped with signal %d\n",
				       __func__, sig);
				fatal_sigsegv();
			}
			/* The handler may have switched processes; re-read the pid. */
			pid = userspace_pid[0];
			interrupt_end();

			/* Avoid -ERESTARTSYS handling in host */
			if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET)
				PT_SYSCALL_NR(regs->gp) = -1;
		}
	}
}

/*
 * Initialize @buf so a later longjmp starts @handler on the given
 * stack (IP at the handler, SP one pointer below the stack top).
 */
void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
{
	(*buf)[0].JB_IP = (unsigned long) handler;
	(*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE -
		sizeof(void *);
}

/* Return codes delivered to start_idle_thread()'s setjmp. */
#define INIT_JMP_NEW_THREAD 0
#define INIT_JMP_CALLBACK 1
#define INIT_JMP_HALT 2
#define INIT_JMP_REBOOT 3

/*
 * Switch kernel-side threads: save our context in @me and jump to
 * @you.  Also resets the time-travel iteration accounting.
 */
void switch_threads(jmp_buf *me, jmp_buf *you)
{
	unscheduled_userspace_iterations = 0;

	if (UML_SETJMP(me) == 0)
		UML_LONGJMP(you, 1);
}

static jmp_buf initial_jmpbuf;

/* XXX Make these percpu */
static void (*cb_proc)(void *arg);
static void *cb_arg;
static jmp_buf *cb_back;

/*
 * Park the initial host thread at a setjmp point and dispatch on the
 * INIT_JMP_* codes that later longjmps deliver.  Returns 0 for halt,
 * 1 for reboot.
 */
int start_idle_thread(void *stack, jmp_buf *switch_buf)
{
	int n;

	set_handler(SIGWINCH);

	/*
	 * Can't use UML_SETJMP or UML_LONGJMP here because they save
	 * and restore signals, with the possible side-effect of
	 * trying to handle any signals which came when they were
	 * blocked, which can't be done on this stack.
	 * Signals must be blocked when jumping back here and restored
	 * after returning to the jumper.
	 */
	n = setjmp(initial_jmpbuf);
	switch (n) {
	case INIT_JMP_NEW_THREAD:
		/* Point switch_buf at uml_finishsetup on the new stack. */
		(*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup;
		(*switch_buf)[0].JB_SP = (unsigned long) stack +
			UM_THREAD_SIZE - sizeof(void *);
		break;
	case INIT_JMP_CALLBACK:
		/* Run the callback registered by initial_thread_cb_skas(). */
		(*cb_proc)(cb_arg);
		longjmp(*cb_back, 1);
		break;
	case INIT_JMP_HALT:
		kmalloc_ok = 0;
		return 0;
	case INIT_JMP_REBOOT:
		kmalloc_ok = 0;
		return 1;
	default:
		printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n",
		       __func__, n);
		fatal_sigsegv();
	}
	longjmp(*switch_buf, 1);

	/* unreachable */
	printk(UM_KERN_ERR "impossible long jump!");
	fatal_sigsegv();
	return 0;
}

/*
 * Run @proc(@arg) on the initial thread's stack: register the
 * callback, longjmp to start_idle_thread()'s dispatch point, and
 * resume here once the callback longjmps back.
 */
void initial_thread_cb_skas(void (*proc)(void *), void *arg)
{
	jmp_buf here;

	cb_proc = proc;
	cb_arg = arg;
	cb_back = &here;

	block_signals_trace();
	if (UML_SETJMP(&here) == 0)
		UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK);
	unblock_signals_trace();

	cb_proc = NULL;
	cb_arg = NULL;
	cb_back = NULL;
}

/* Jump back to the initial thread with the HALT code; does not return. */
void halt_skas(void)
{
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT);
}

static bool noreboot;

/* "noreboot" command-line parameter: turn reboots into halts. */
static int __init noreboot_cmd_param(char *str, int *add)
{
	*add = 0;
	noreboot = true;
	return 0;
}

__uml_setup("noreboot", noreboot_cmd_param,
"noreboot\n"
"    Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n"
"    This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n"
"    crashes in CI\n");

void reboot_skas(void)
{
	/* Reboot becomes a halt when "noreboot" was given; does not return. */
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT);
}

/* Make @mm_idp's stub process the one userspace() will drive. */
void __switch_mm(struct mm_id *mm_idp)
{
	userspace_pid[0] = mm_idp->pid;
}