1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 4 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdlib.h> 8 #include <stdbool.h> 9 #include <unistd.h> 10 #include <sched.h> 11 #include <errno.h> 12 #include <string.h> 13 #include <fcntl.h> 14 #include <mem_user.h> 15 #include <sys/mman.h> 16 #include <sys/wait.h> 17 #include <sys/stat.h> 18 #include <asm/unistd.h> 19 #include <as-layout.h> 20 #include <init.h> 21 #include <kern_util.h> 22 #include <mem.h> 23 #include <os.h> 24 #include <ptrace_user.h> 25 #include <registers.h> 26 #include <skas.h> 27 #include <sysdep/stub.h> 28 #include <linux/threads.h> 29 #include <timetravel.h> 30 #include "../internal.h" 31 32 int is_skas_winch(int pid, int fd, void *data) 33 { 34 return pid == getpgrp(); 35 } 36 37 static const char *ptrace_reg_name(int idx) 38 { 39 #define R(n) case HOST_##n: return #n 40 41 switch (idx) { 42 #ifdef __x86_64__ 43 R(BX); 44 R(CX); 45 R(DI); 46 R(SI); 47 R(DX); 48 R(BP); 49 R(AX); 50 R(R8); 51 R(R9); 52 R(R10); 53 R(R11); 54 R(R12); 55 R(R13); 56 R(R14); 57 R(R15); 58 R(ORIG_AX); 59 R(CS); 60 R(SS); 61 R(EFLAGS); 62 #elif defined(__i386__) 63 R(IP); 64 R(SP); 65 R(EFLAGS); 66 R(AX); 67 R(BX); 68 R(CX); 69 R(DX); 70 R(SI); 71 R(DI); 72 R(BP); 73 R(CS); 74 R(SS); 75 R(DS); 76 R(FS); 77 R(ES); 78 R(GS); 79 R(ORIG_AX); 80 #endif 81 } 82 return ""; 83 } 84 85 static int ptrace_dump_regs(int pid) 86 { 87 unsigned long regs[MAX_REG_NR]; 88 int i; 89 90 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 91 return -errno; 92 93 printk(UM_KERN_ERR "Stub registers -\n"); 94 for (i = 0; i < ARRAY_SIZE(regs); i++) { 95 const char *regname = ptrace_reg_name(i); 96 97 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 98 } 99 100 return 0; 101 } 102 103 /* 104 * Signals that are OK to receive in the stub - we'll just continue it. 105 * SIGWINCH will happen when UML is inside a detached screen. 106 */ 107 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 108 109 /* Signals that the stub will finish with - anything else is an error */ 110 #define STUB_DONE_MASK (1 << SIGTRAP) 111 112 void wait_stub_done(int pid) 113 { 114 int n, status, err; 115 116 while (1) { 117 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 118 if ((n < 0) || !WIFSTOPPED(status)) 119 goto bad_wait; 120 121 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 122 break; 123 124 err = ptrace(PTRACE_CONT, pid, 0, 0); 125 if (err) { 126 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 127 __func__, errno); 128 fatal_sigsegv(); 129 } 130 } 131 132 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 133 return; 134 135 bad_wait: 136 err = ptrace_dump_regs(pid); 137 if (err) 138 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 139 -err); 140 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 141 __func__, pid, n, errno, status); 142 fatal_sigsegv(); 143 } 144 145 extern unsigned long current_stub_stack(void); 146 147 static void get_skas_faultinfo(int pid, struct faultinfo *fi) 148 { 149 int err; 150 151 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 152 if (err) { 153 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 154 "errno = %d\n", pid, errno); 155 fatal_sigsegv(); 156 } 157 wait_stub_done(pid); 158 159 /* 160 * faultinfo is prepared by the stub_segv_handler at start of 161 * the stub stack page. We just have to copy it. 162 */ 163 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 164 } 165 166 static void handle_segv(int pid, struct uml_pt_regs *regs) 167 { 168 get_skas_faultinfo(pid, ®s->faultinfo); 169 segv(regs->faultinfo, 0, 1, NULL); 170 } 171 172 static void handle_trap(int pid, struct uml_pt_regs *regs) 173 { 174 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 175 fatal_sigsegv(); 176 177 handle_syscall(regs); 178 } 179 180 extern char __syscall_stub_start[]; 181 182 static int stub_exe_fd; 183 184 static int userspace_tramp(void *stack) 185 { 186 char *const argv[] = { "uml-userspace", NULL }; 187 int pipe_fds[2]; 188 unsigned long long offset; 189 struct stub_init_data init_data = { 190 .stub_start = STUB_START, 191 .segv_handler = STUB_CODE + 192 (unsigned long) stub_segv_handler - 193 (unsigned long) __syscall_stub_start, 194 }; 195 struct iomem_region *iomem; 196 int ret; 197 198 init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), 199 &offset); 200 init_data.stub_code_offset = MMAP_OFFSET(offset); 201 202 init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); 203 init_data.stub_data_offset = MMAP_OFFSET(offset); 204 205 /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ 206 close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); 207 208 fcntl(init_data.stub_data_fd, F_SETFD, 0); 209 for (iomem = iomem_regions; iomem; iomem = iomem->next) 210 fcntl(iomem->fd, F_SETFD, 0); 211 212 /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ 213 if (pipe(pipe_fds)) 214 exit(2); 215 216 if (dup2(pipe_fds[0], 0) < 0) 217 exit(3); 218 close(pipe_fds[0]); 219 220 /* Write init_data and close write side */ 221 ret = write(pipe_fds[1], &init_data, sizeof(init_data)); 222 close(pipe_fds[1]); 223 224 if (ret != sizeof(init_data)) 225 exit(4); 226 227 execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); 228 229 exit(5); 230 } 231 232 extern char stub_exe_start[]; 233 extern char stub_exe_end[]; 234 235 extern char *tempdir; 236 237 #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" 238 239 #ifndef MFD_EXEC 240 #define MFD_EXEC 0x0010U 241 #endif 242 243 static int __init init_stub_exe_fd(void) 244 { 245 size_t written = 0; 246 char *tmpfile = NULL; 247 248 stub_exe_fd = memfd_create("uml-userspace", 249 MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); 250 251 if (stub_exe_fd < 0) { 252 printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); 253 254 tmpfile = malloc(strlen(tempdir) + 255 strlen(STUB_EXE_NAME_TEMPLATE) + 1); 256 if (tmpfile == NULL) 257 panic("Failed to allocate memory for stub binary name"); 258 259 strcpy(tmpfile, tempdir); 260 strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); 261 262 stub_exe_fd = mkstemp(tmpfile); 263 if (stub_exe_fd < 0) 264 panic("Could not create temporary file for stub binary: %d", 265 -errno); 266 } 267 268 while (written < stub_exe_end - stub_exe_start) { 269 ssize_t res = write(stub_exe_fd, stub_exe_start + written, 270 stub_exe_end - stub_exe_start - written); 271 if (res < 0) { 272 if (errno == EINTR) 273 continue; 274 275 if (tmpfile) 276 unlink(tmpfile); 277 panic("Failed write stub binary: %d", -errno); 278 } 279 280 written += res; 281 } 282 283 if (!tmpfile) { 284 fcntl(stub_exe_fd, F_ADD_SEALS, 285 F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); 286 } else { 287 if (fchmod(stub_exe_fd, 00500) < 0) { 288 unlink(tmpfile); 289 panic("Could not make stub binary executable: %d", 290 -errno); 291 } 292 293 close(stub_exe_fd); 294 stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); 295 if (stub_exe_fd < 0) { 296 unlink(tmpfile); 297 panic("Could not reopen stub binary: %d", -errno); 298 } 299 300 unlink(tmpfile); 301 free(tmpfile); 302 } 303 304 return 0; 305 } 306 __initcall(init_stub_exe_fd); 307 308 int userspace_pid[NR_CPUS]; 309 310 /** 311 * start_userspace() - prepare a new userspace process 312 * @stub_stack: pointer to the stub stack. 313 * 314 * Setups a new temporary stack page that is used while userspace_tramp() runs 315 * Clones the kernel process into a new userspace process, with FDs only. 316 * 317 * Return: When positive: the process id of the new userspace process, 318 * when negative: an error number. 319 * FIXME: can PIDs become negative?! 320 */ 321 int start_userspace(unsigned long stub_stack) 322 { 323 void *stack; 324 unsigned long sp; 325 int pid, status, n, err; 326 327 /* setup a temporary stack page */ 328 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 329 PROT_READ | PROT_WRITE | PROT_EXEC, 330 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 331 if (stack == MAP_FAILED) { 332 err = -errno; 333 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 334 __func__, errno); 335 return err; 336 } 337 338 /* set stack pointer to the end of the stack page, so it can grow downwards */ 339 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 340 341 /* clone into new userspace process */ 342 pid = clone(userspace_tramp, (void *) sp, 343 CLONE_VFORK | CLONE_VM | SIGCHLD, 344 (void *)stub_stack); 345 if (pid < 0) { 346 err = -errno; 347 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 348 __func__, errno); 349 return err; 350 } 351 352 do { 353 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 354 if (n < 0) { 355 err = -errno; 356 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 357 __func__, errno); 358 goto out_kill; 359 } 360 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 361 362 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 363 err = -EINVAL; 364 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 365 __func__, status); 366 goto out_kill; 367 } 368 369 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 370 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 371 err = -errno; 372 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 373 __func__, errno); 374 goto out_kill; 375 } 376 377 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 378 err = -errno; 379 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 380 __func__, errno); 381 goto out_kill; 382 } 383 384 return pid; 385 386 out_kill: 387 os_kill_ptraced_process(pid, 1); 388 return err; 389 } 390 391 void userspace(struct uml_pt_regs *regs) 392 { 393 int err, status, op, pid = userspace_pid[0]; 394 siginfo_t si; 395 396 /* Handle any immediate reschedules or signals */ 397 interrupt_end(); 398 399 while (1) { 400 time_travel_print_bc_msg(); 401 402 current_mm_sync(); 403 404 /* Flush out any pending syscalls */ 405 err = syscall_stub_flush(current_mm_id()); 406 if (err) { 407 if (err == -ENOMEM) 408 report_enomem(); 409 410 printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", 411 __func__, -err); 412 fatal_sigsegv(); 413 } 414 415 /* 416 * This can legitimately fail if the process loads a 417 * bogus value into a segment register. It will 418 * segfault and PTRACE_GETREGS will read that value 419 * out of the process. However, PTRACE_SETREGS will 420 * fail. In this case, there is nothing to do but 421 * just kill the process. 422 */ 423 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 424 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 425 __func__, errno); 426 fatal_sigsegv(); 427 } 428 429 if (put_fp_registers(pid, regs->fp)) { 430 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 431 __func__, errno); 432 fatal_sigsegv(); 433 } 434 435 if (singlestepping()) 436 op = PTRACE_SYSEMU_SINGLESTEP; 437 else 438 op = PTRACE_SYSEMU; 439 440 if (ptrace(op, pid, 0, 0)) { 441 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 442 __func__, op, errno); 443 fatal_sigsegv(); 444 } 445 446 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 447 if (err < 0) { 448 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 449 __func__, errno); 450 fatal_sigsegv(); 451 } 452 453 regs->is_user = 1; 454 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 455 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 456 __func__, errno); 457 fatal_sigsegv(); 458 } 459 460 if (get_fp_registers(pid, regs->fp)) { 461 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 462 __func__, errno); 463 fatal_sigsegv(); 464 } 465 466 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 467 468 if (WIFSTOPPED(status)) { 469 int sig = WSTOPSIG(status); 470 471 /* These signal handlers need the si argument. 472 * The SIGIO and SIGALARM handlers which constitute the 473 * majority of invocations, do not use it. 474 */ 475 switch (sig) { 476 case SIGSEGV: 477 case SIGTRAP: 478 case SIGILL: 479 case SIGBUS: 480 case SIGFPE: 481 case SIGWINCH: 482 ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); 483 break; 484 } 485 486 switch (sig) { 487 case SIGSEGV: 488 if (PTRACE_FULL_FAULTINFO) { 489 get_skas_faultinfo(pid, 490 ®s->faultinfo); 491 (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, 492 regs); 493 } 494 else handle_segv(pid, regs); 495 break; 496 case SIGTRAP + 0x80: 497 handle_trap(pid, regs); 498 break; 499 case SIGTRAP: 500 relay_signal(SIGTRAP, (struct siginfo *)&si, regs); 501 break; 502 case SIGALRM: 503 break; 504 case SIGIO: 505 case SIGILL: 506 case SIGBUS: 507 case SIGFPE: 508 case SIGWINCH: 509 block_signals_trace(); 510 (*sig_info[sig])(sig, (struct siginfo *)&si, regs); 511 unblock_signals_trace(); 512 break; 513 default: 514 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 515 __func__, sig); 516 fatal_sigsegv(); 517 } 518 pid = userspace_pid[0]; 519 interrupt_end(); 520 521 /* Avoid -ERESTARTSYS handling in host */ 522 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 523 PT_SYSCALL_NR(regs->gp) = -1; 524 } 525 } 526 } 527 528 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 529 { 530 (*buf)[0].JB_IP = (unsigned long) handler; 531 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 532 sizeof(void *); 533 } 534 535 #define INIT_JMP_NEW_THREAD 0 536 #define INIT_JMP_CALLBACK 1 537 #define INIT_JMP_HALT 2 538 #define INIT_JMP_REBOOT 3 539 540 void switch_threads(jmp_buf *me, jmp_buf *you) 541 { 542 if (UML_SETJMP(me) == 0) 543 UML_LONGJMP(you, 1); 544 } 545 546 static jmp_buf initial_jmpbuf; 547 548 /* XXX Make these percpu */ 549 static void (*cb_proc)(void *arg); 550 static void *cb_arg; 551 static jmp_buf *cb_back; 552 553 int start_idle_thread(void *stack, jmp_buf *switch_buf) 554 { 555 int n; 556 557 set_handler(SIGWINCH); 558 559 /* 560 * Can't use UML_SETJMP or UML_LONGJMP here because they save 561 * and restore signals, with the possible side-effect of 562 * trying to handle any signals which came when they were 563 * blocked, which can't be done on this stack. 564 * Signals must be blocked when jumping back here and restored 565 * after returning to the jumper. 566 */ 567 n = setjmp(initial_jmpbuf); 568 switch (n) { 569 case INIT_JMP_NEW_THREAD: 570 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 571 (*switch_buf)[0].JB_SP = (unsigned long) stack + 572 UM_THREAD_SIZE - sizeof(void *); 573 break; 574 case INIT_JMP_CALLBACK: 575 (*cb_proc)(cb_arg); 576 longjmp(*cb_back, 1); 577 break; 578 case INIT_JMP_HALT: 579 kmalloc_ok = 0; 580 return 0; 581 case INIT_JMP_REBOOT: 582 kmalloc_ok = 0; 583 return 1; 584 default: 585 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 586 __func__, n); 587 fatal_sigsegv(); 588 } 589 longjmp(*switch_buf, 1); 590 591 /* unreachable */ 592 printk(UM_KERN_ERR "impossible long jump!"); 593 fatal_sigsegv(); 594 return 0; 595 } 596 597 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 598 { 599 jmp_buf here; 600 601 cb_proc = proc; 602 cb_arg = arg; 603 cb_back = &here; 604 605 block_signals_trace(); 606 if (UML_SETJMP(&here) == 0) 607 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 608 unblock_signals_trace(); 609 610 cb_proc = NULL; 611 cb_arg = NULL; 612 cb_back = NULL; 613 } 614 615 void halt_skas(void) 616 { 617 block_signals_trace(); 618 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 619 } 620 621 static bool noreboot; 622 623 static int __init noreboot_cmd_param(char *str, int *add) 624 { 625 noreboot = true; 626 return 0; 627 } 628 629 __uml_setup("noreboot", noreboot_cmd_param, 630 "noreboot\n" 631 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 632 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 633 " crashes in CI\n"); 634 635 void reboot_skas(void) 636 { 637 block_signals_trace(); 638 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 639 } 640 641 void __switch_mm(struct mm_id *mm_idp) 642 { 643 userspace_pid[0] = mm_idp->pid; 644 } 645