1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 4 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdlib.h> 8 #include <stdbool.h> 9 #include <unistd.h> 10 #include <sched.h> 11 #include <errno.h> 12 #include <string.h> 13 #include <sys/mman.h> 14 #include <sys/wait.h> 15 #include <asm/unistd.h> 16 #include <as-layout.h> 17 #include <init.h> 18 #include <kern_util.h> 19 #include <mem.h> 20 #include <os.h> 21 #include <ptrace_user.h> 22 #include <registers.h> 23 #include <skas.h> 24 #include <sysdep/stub.h> 25 #include <linux/threads.h> 26 #include <timetravel.h> 27 #include "../internal.h" 28 29 int is_skas_winch(int pid, int fd, void *data) 30 { 31 return pid == getpgrp(); 32 } 33 34 static const char *ptrace_reg_name(int idx) 35 { 36 #define R(n) case HOST_##n: return #n 37 38 switch (idx) { 39 #ifdef __x86_64__ 40 R(BX); 41 R(CX); 42 R(DI); 43 R(SI); 44 R(DX); 45 R(BP); 46 R(AX); 47 R(R8); 48 R(R9); 49 R(R10); 50 R(R11); 51 R(R12); 52 R(R13); 53 R(R14); 54 R(R15); 55 R(ORIG_AX); 56 R(CS); 57 R(SS); 58 R(EFLAGS); 59 #elif defined(__i386__) 60 R(IP); 61 R(SP); 62 R(EFLAGS); 63 R(AX); 64 R(BX); 65 R(CX); 66 R(DX); 67 R(SI); 68 R(DI); 69 R(BP); 70 R(CS); 71 R(SS); 72 R(DS); 73 R(FS); 74 R(ES); 75 R(GS); 76 R(ORIG_AX); 77 #endif 78 } 79 return ""; 80 } 81 82 static int ptrace_dump_regs(int pid) 83 { 84 unsigned long regs[MAX_REG_NR]; 85 int i; 86 87 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 88 return -errno; 89 90 printk(UM_KERN_ERR "Stub registers -\n"); 91 for (i = 0; i < ARRAY_SIZE(regs); i++) { 92 const char *regname = ptrace_reg_name(i); 93 94 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 95 } 96 97 return 0; 98 } 99 100 /* 101 * Signals that are OK to receive in the stub - we'll just continue it. 102 * SIGWINCH will happen when UML is inside a detached screen. 103 */ 104 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 105 106 /* Signals that the stub will finish with - anything else is an error */ 107 #define STUB_DONE_MASK (1 << SIGTRAP) 108 109 void wait_stub_done(int pid) 110 { 111 int n, status, err; 112 113 while (1) { 114 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 115 if ((n < 0) || !WIFSTOPPED(status)) 116 goto bad_wait; 117 118 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 119 break; 120 121 err = ptrace(PTRACE_CONT, pid, 0, 0); 122 if (err) { 123 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 124 __func__, errno); 125 fatal_sigsegv(); 126 } 127 } 128 129 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 130 return; 131 132 bad_wait: 133 err = ptrace_dump_regs(pid); 134 if (err) 135 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 136 -err); 137 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 138 __func__, pid, n, errno, status); 139 fatal_sigsegv(); 140 } 141 142 extern unsigned long current_stub_stack(void); 143 144 static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux_fp_regs) 145 { 146 int err; 147 148 err = get_fp_registers(pid, aux_fp_regs); 149 if (err < 0) { 150 printk(UM_KERN_ERR "save_fp_registers returned %d\n", 151 err); 152 fatal_sigsegv(); 153 } 154 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 155 if (err) { 156 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 157 "errno = %d\n", pid, errno); 158 fatal_sigsegv(); 159 } 160 wait_stub_done(pid); 161 162 /* 163 * faultinfo is prepared by the stub_segv_handler at start of 164 * the stub stack page. We just have to copy it. 165 */ 166 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 167 168 err = put_fp_registers(pid, aux_fp_regs); 169 if (err < 0) { 170 printk(UM_KERN_ERR "put_fp_registers returned %d\n", 171 err); 172 fatal_sigsegv(); 173 } 174 } 175 176 static void handle_segv(int pid, struct uml_pt_regs *regs, unsigned long *aux_fp_regs) 177 { 178 get_skas_faultinfo(pid, ®s->faultinfo, aux_fp_regs); 179 segv(regs->faultinfo, 0, 1, NULL); 180 } 181 182 static void handle_trap(int pid, struct uml_pt_regs *regs) 183 { 184 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 185 fatal_sigsegv(); 186 187 handle_syscall(regs); 188 } 189 190 extern char __syscall_stub_start[]; 191 192 /** 193 * userspace_tramp() - userspace trampoline 194 * @stack: pointer to the new userspace stack page 195 * 196 * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed. 197 * This function will run on a temporary stack page. 198 * It ptrace()'es itself, then 199 * Two pages are mapped into the userspace address space: 200 * - STUB_CODE (with EXEC), which contains the skas stub code 201 * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel. 202 * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process. 203 * And last the process stops itself to give control to the UML kernel for this userspace process. 204 * 205 * Return: Always zero, otherwise the current userspace process is ended with non null exit() call 206 */ 207 static int userspace_tramp(void *stack) 208 { 209 struct sigaction sa; 210 void *addr; 211 int fd; 212 unsigned long long offset; 213 unsigned long segv_handler = STUB_CODE + 214 (unsigned long) stub_segv_handler - 215 (unsigned long) __syscall_stub_start; 216 217 ptrace(PTRACE_TRACEME, 0, 0, 0); 218 219 signal(SIGTERM, SIG_DFL); 220 signal(SIGWINCH, SIG_IGN); 221 222 fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); 223 addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, 224 PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); 225 if (addr == MAP_FAILED) { 226 os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", 227 STUB_CODE, errno); 228 exit(1); 229 } 230 231 fd = phys_mapping(uml_to_phys(stack), &offset); 232 addr = mmap((void *) STUB_DATA, 233 STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, 234 MAP_FIXED | MAP_SHARED, fd, offset); 235 if (addr == MAP_FAILED) { 236 os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", 237 STUB_DATA, errno); 238 exit(1); 239 } 240 241 set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); 242 sigemptyset(&sa.sa_mask); 243 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 244 sa.sa_sigaction = (void *) segv_handler; 245 sa.sa_restorer = NULL; 246 if (sigaction(SIGSEGV, &sa, NULL) < 0) { 247 os_info("%s - setting SIGSEGV handler failed - errno = %d\n", 248 __func__, errno); 249 exit(1); 250 } 251 252 kill(os_getpid(), SIGSTOP); 253 return 0; 254 } 255 256 int userspace_pid[NR_CPUS]; 257 int kill_userspace_mm[NR_CPUS]; 258 259 /** 260 * start_userspace() - prepare a new userspace process 261 * @stub_stack: pointer to the stub stack. 262 * 263 * Setups a new temporary stack page that is used while userspace_tramp() runs 264 * Clones the kernel process into a new userspace process, with FDs only. 265 * 266 * Return: When positive: the process id of the new userspace process, 267 * when negative: an error number. 268 * FIXME: can PIDs become negative?! 269 */ 270 int start_userspace(unsigned long stub_stack) 271 { 272 void *stack; 273 unsigned long sp; 274 int pid, status, n, flags, err; 275 276 /* setup a temporary stack page */ 277 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 278 PROT_READ | PROT_WRITE | PROT_EXEC, 279 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 280 if (stack == MAP_FAILED) { 281 err = -errno; 282 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 283 __func__, errno); 284 return err; 285 } 286 287 /* set stack pointer to the end of the stack page, so it can grow downwards */ 288 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 289 290 flags = CLONE_FILES | SIGCHLD; 291 292 /* clone into new userspace process */ 293 pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); 294 if (pid < 0) { 295 err = -errno; 296 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 297 __func__, errno); 298 return err; 299 } 300 301 do { 302 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 303 if (n < 0) { 304 err = -errno; 305 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 306 __func__, errno); 307 goto out_kill; 308 } 309 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 310 311 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 312 err = -EINVAL; 313 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 314 __func__, status); 315 goto out_kill; 316 } 317 318 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 319 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 320 err = -errno; 321 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 322 __func__, errno); 323 goto out_kill; 324 } 325 326 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 327 err = -errno; 328 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 329 __func__, errno); 330 goto out_kill; 331 } 332 333 return pid; 334 335 out_kill: 336 os_kill_ptraced_process(pid, 1); 337 return err; 338 } 339 340 void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) 341 { 342 int err, status, op, pid = userspace_pid[0]; 343 siginfo_t si; 344 345 /* Handle any immediate reschedules or signals */ 346 interrupt_end(); 347 348 while (1) { 349 time_travel_print_bc_msg(); 350 351 if (kill_userspace_mm[0]) 352 fatal_sigsegv(); 353 354 /* 355 * This can legitimately fail if the process loads a 356 * bogus value into a segment register. It will 357 * segfault and PTRACE_GETREGS will read that value 358 * out of the process. However, PTRACE_SETREGS will 359 * fail. In this case, there is nothing to do but 360 * just kill the process. 361 */ 362 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 363 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 364 __func__, errno); 365 fatal_sigsegv(); 366 } 367 368 if (put_fp_registers(pid, regs->fp)) { 369 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 370 __func__, errno); 371 fatal_sigsegv(); 372 } 373 374 if (singlestepping()) 375 op = PTRACE_SYSEMU_SINGLESTEP; 376 else 377 op = PTRACE_SYSEMU; 378 379 if (ptrace(op, pid, 0, 0)) { 380 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 381 __func__, op, errno); 382 fatal_sigsegv(); 383 } 384 385 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 386 if (err < 0) { 387 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 388 __func__, errno); 389 fatal_sigsegv(); 390 } 391 392 regs->is_user = 1; 393 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 394 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 395 __func__, errno); 396 fatal_sigsegv(); 397 } 398 399 if (get_fp_registers(pid, regs->fp)) { 400 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 401 __func__, errno); 402 fatal_sigsegv(); 403 } 404 405 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 406 407 if (WIFSTOPPED(status)) { 408 int sig = WSTOPSIG(status); 409 410 /* These signal handlers need the si argument. 411 * The SIGIO and SIGALARM handlers which constitute the 412 * majority of invocations, do not use it. 413 */ 414 switch (sig) { 415 case SIGSEGV: 416 case SIGTRAP: 417 case SIGILL: 418 case SIGBUS: 419 case SIGFPE: 420 case SIGWINCH: 421 ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); 422 break; 423 } 424 425 switch (sig) { 426 case SIGSEGV: 427 if (PTRACE_FULL_FAULTINFO) { 428 get_skas_faultinfo(pid, 429 ®s->faultinfo, aux_fp_regs); 430 (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, 431 regs); 432 } 433 else handle_segv(pid, regs, aux_fp_regs); 434 break; 435 case SIGTRAP + 0x80: 436 handle_trap(pid, regs); 437 break; 438 case SIGTRAP: 439 relay_signal(SIGTRAP, (struct siginfo *)&si, regs); 440 break; 441 case SIGALRM: 442 break; 443 case SIGIO: 444 case SIGILL: 445 case SIGBUS: 446 case SIGFPE: 447 case SIGWINCH: 448 block_signals_trace(); 449 (*sig_info[sig])(sig, (struct siginfo *)&si, regs); 450 unblock_signals_trace(); 451 break; 452 default: 453 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 454 __func__, sig); 455 fatal_sigsegv(); 456 } 457 pid = userspace_pid[0]; 458 interrupt_end(); 459 460 /* Avoid -ERESTARTSYS handling in host */ 461 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 462 PT_SYSCALL_NR(regs->gp) = -1; 463 } 464 } 465 } 466 467 static unsigned long thread_regs[MAX_REG_NR]; 468 static unsigned long thread_fp_regs[FP_SIZE]; 469 470 static int __init init_thread_regs(void) 471 { 472 get_safe_registers(thread_regs, thread_fp_regs); 473 /* Set parent's instruction pointer to start of clone-stub */ 474 thread_regs[REGS_IP_INDEX] = STUB_CODE + 475 (unsigned long) stub_clone_handler - 476 (unsigned long) __syscall_stub_start; 477 478 /* syscall data as a temporary stack area (top half). */ 479 thread_regs[REGS_SP_INDEX] = STUB_DATA + 480 offsetof(struct stub_data, syscall_data) + 481 sizeof(((struct stub_data *) 0)->syscall_data) - 482 sizeof(void *); 483 return 0; 484 } 485 486 __initcall(init_thread_regs); 487 488 int copy_context_skas0(unsigned long new_stack, int pid) 489 { 490 int err; 491 unsigned long current_stack = current_stub_stack(); 492 struct stub_data *data = (struct stub_data *) current_stack; 493 struct stub_data *child_data = (struct stub_data *) new_stack; 494 unsigned long long new_offset; 495 int new_fd = phys_mapping(uml_to_phys((void *)new_stack), &new_offset); 496 497 /* 498 * prepare offset and fd of child's stack as argument for parent's 499 * and child's mmap2 calls 500 */ 501 *data = ((struct stub_data) { 502 .offset = MMAP_OFFSET(new_offset), 503 .fd = new_fd, 504 .err = -ESRCH, 505 .child_err = 0, 506 }); 507 508 *child_data = ((struct stub_data) { 509 .child_err = -ESRCH, 510 }); 511 512 err = ptrace_setregs(pid, thread_regs); 513 if (err < 0) { 514 err = -errno; 515 printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n", 516 __func__, pid, -err); 517 return err; 518 } 519 520 err = put_fp_registers(pid, thread_fp_regs); 521 if (err < 0) { 522 printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n", 523 __func__, pid, err); 524 return err; 525 } 526 527 /* 528 * Wait, until parent has finished its work: read child's pid from 529 * parent's stack, and check, if bad result. 530 */ 531 err = ptrace(PTRACE_CONT, pid, 0, 0); 532 if (err) { 533 err = -errno; 534 printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n", 535 pid, errno); 536 return err; 537 } 538 539 wait_stub_done(pid); 540 541 pid = data->err; 542 if (pid < 0) { 543 printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", 544 __func__, -pid); 545 return pid; 546 } 547 548 /* 549 * Wait, until child has finished too: read child's result from 550 * child's stack and check it. 551 */ 552 wait_stub_done(pid); 553 if (child_data->child_err != STUB_DATA) { 554 printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n", 555 __func__, pid, data->child_err); 556 err = data->child_err; 557 goto out_kill; 558 } 559 560 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 561 (void *)PTRACE_O_TRACESYSGOOD) < 0) { 562 err = -errno; 563 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 564 __func__, errno); 565 goto out_kill; 566 } 567 568 return pid; 569 570 out_kill: 571 os_kill_ptraced_process(pid, 1); 572 return err; 573 } 574 575 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 576 { 577 (*buf)[0].JB_IP = (unsigned long) handler; 578 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 579 sizeof(void *); 580 } 581 582 #define INIT_JMP_NEW_THREAD 0 583 #define INIT_JMP_CALLBACK 1 584 #define INIT_JMP_HALT 2 585 #define INIT_JMP_REBOOT 3 586 587 void switch_threads(jmp_buf *me, jmp_buf *you) 588 { 589 if (UML_SETJMP(me) == 0) 590 UML_LONGJMP(you, 1); 591 } 592 593 static jmp_buf initial_jmpbuf; 594 595 /* XXX Make these percpu */ 596 static void (*cb_proc)(void *arg); 597 static void *cb_arg; 598 static jmp_buf *cb_back; 599 600 int start_idle_thread(void *stack, jmp_buf *switch_buf) 601 { 602 int n; 603 604 set_handler(SIGWINCH); 605 606 /* 607 * Can't use UML_SETJMP or UML_LONGJMP here because they save 608 * and restore signals, with the possible side-effect of 609 * trying to handle any signals which came when they were 610 * blocked, which can't be done on this stack. 611 * Signals must be blocked when jumping back here and restored 612 * after returning to the jumper. 613 */ 614 n = setjmp(initial_jmpbuf); 615 switch (n) { 616 case INIT_JMP_NEW_THREAD: 617 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 618 (*switch_buf)[0].JB_SP = (unsigned long) stack + 619 UM_THREAD_SIZE - sizeof(void *); 620 break; 621 case INIT_JMP_CALLBACK: 622 (*cb_proc)(cb_arg); 623 longjmp(*cb_back, 1); 624 break; 625 case INIT_JMP_HALT: 626 kmalloc_ok = 0; 627 return 0; 628 case INIT_JMP_REBOOT: 629 kmalloc_ok = 0; 630 return 1; 631 default: 632 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 633 __func__, n); 634 fatal_sigsegv(); 635 } 636 longjmp(*switch_buf, 1); 637 638 /* unreachable */ 639 printk(UM_KERN_ERR "impossible long jump!"); 640 fatal_sigsegv(); 641 return 0; 642 } 643 644 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 645 { 646 jmp_buf here; 647 648 cb_proc = proc; 649 cb_arg = arg; 650 cb_back = &here; 651 652 block_signals_trace(); 653 if (UML_SETJMP(&here) == 0) 654 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 655 unblock_signals_trace(); 656 657 cb_proc = NULL; 658 cb_arg = NULL; 659 cb_back = NULL; 660 } 661 662 void halt_skas(void) 663 { 664 block_signals_trace(); 665 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 666 } 667 668 static bool noreboot; 669 670 static int __init noreboot_cmd_param(char *str, int *add) 671 { 672 noreboot = true; 673 return 0; 674 } 675 676 __uml_setup("noreboot", noreboot_cmd_param, 677 "noreboot\n" 678 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 679 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 680 " crashes in CI\n"); 681 682 void reboot_skas(void) 683 { 684 block_signals_trace(); 685 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 686 } 687 688 void __switch_mm(struct mm_id *mm_idp) 689 { 690 userspace_pid[0] = mm_idp->u.pid; 691 kill_userspace_mm[0] = mm_idp->kill; 692 } 693