1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 4 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdlib.h> 8 #include <stdbool.h> 9 #include <unistd.h> 10 #include <sched.h> 11 #include <errno.h> 12 #include <string.h> 13 #include <sys/mman.h> 14 #include <sys/wait.h> 15 #include <asm/unistd.h> 16 #include <as-layout.h> 17 #include <init.h> 18 #include <kern_util.h> 19 #include <mem.h> 20 #include <os.h> 21 #include <ptrace_user.h> 22 #include <registers.h> 23 #include <skas.h> 24 #include <sysdep/stub.h> 25 #include <linux/threads.h> 26 27 int is_skas_winch(int pid, int fd, void *data) 28 { 29 return pid == getpgrp(); 30 } 31 32 static const char *ptrace_reg_name(int idx) 33 { 34 #define R(n) case HOST_##n: return #n 35 36 switch (idx) { 37 #ifdef __x86_64__ 38 R(BX); 39 R(CX); 40 R(DI); 41 R(SI); 42 R(DX); 43 R(BP); 44 R(AX); 45 R(R8); 46 R(R9); 47 R(R10); 48 R(R11); 49 R(R12); 50 R(R13); 51 R(R14); 52 R(R15); 53 R(ORIG_AX); 54 R(CS); 55 R(SS); 56 R(EFLAGS); 57 #elif defined(__i386__) 58 R(IP); 59 R(SP); 60 R(EFLAGS); 61 R(AX); 62 R(BX); 63 R(CX); 64 R(DX); 65 R(SI); 66 R(DI); 67 R(BP); 68 R(CS); 69 R(SS); 70 R(DS); 71 R(FS); 72 R(ES); 73 R(GS); 74 R(ORIG_AX); 75 #endif 76 } 77 return ""; 78 } 79 80 static int ptrace_dump_regs(int pid) 81 { 82 unsigned long regs[MAX_REG_NR]; 83 int i; 84 85 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 86 return -errno; 87 88 printk(UM_KERN_ERR "Stub registers -\n"); 89 for (i = 0; i < ARRAY_SIZE(regs); i++) { 90 const char *regname = ptrace_reg_name(i); 91 92 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 93 } 94 95 return 0; 96 } 97 98 /* 99 * Signals that are OK to receive in the stub - we'll just continue it. 100 * SIGWINCH will happen when UML is inside a detached screen. 101 */ 102 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 103 104 /* Signals that the stub will finish with - anything else is an error */ 105 #define STUB_DONE_MASK (1 << SIGTRAP) 106 107 void wait_stub_done(int pid) 108 { 109 int n, status, err; 110 111 while (1) { 112 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 113 if ((n < 0) || !WIFSTOPPED(status)) 114 goto bad_wait; 115 116 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 117 break; 118 119 err = ptrace(PTRACE_CONT, pid, 0, 0); 120 if (err) { 121 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 122 __func__, errno); 123 fatal_sigsegv(); 124 } 125 } 126 127 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 128 return; 129 130 bad_wait: 131 err = ptrace_dump_regs(pid); 132 if (err) 133 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 134 -err); 135 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 136 __func__, pid, n, errno, status); 137 fatal_sigsegv(); 138 } 139 140 extern unsigned long current_stub_stack(void); 141 142 static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux_fp_regs) 143 { 144 int err; 145 146 err = get_fp_registers(pid, aux_fp_regs); 147 if (err < 0) { 148 printk(UM_KERN_ERR "save_fp_registers returned %d\n", 149 err); 150 fatal_sigsegv(); 151 } 152 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 153 if (err) { 154 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 155 "errno = %d\n", pid, errno); 156 fatal_sigsegv(); 157 } 158 wait_stub_done(pid); 159 160 /* 161 * faultinfo is prepared by the stub_segv_handler at start of 162 * the stub stack page. We just have to copy it. 163 */ 164 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 165 166 err = put_fp_registers(pid, aux_fp_regs); 167 if (err < 0) { 168 printk(UM_KERN_ERR "put_fp_registers returned %d\n", 169 err); 170 fatal_sigsegv(); 171 } 172 } 173 174 static void handle_segv(int pid, struct uml_pt_regs *regs, unsigned long *aux_fp_regs) 175 { 176 get_skas_faultinfo(pid, ®s->faultinfo, aux_fp_regs); 177 segv(regs->faultinfo, 0, 1, NULL); 178 } 179 180 static void handle_trap(int pid, struct uml_pt_regs *regs) 181 { 182 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 183 fatal_sigsegv(); 184 185 handle_syscall(regs); 186 } 187 188 extern char __syscall_stub_start[]; 189 190 /** 191 * userspace_tramp() - userspace trampoline 192 * @stack: pointer to the new userspace stack page 193 * 194 * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed. 195 * This function will run on a temporary stack page. 196 * It ptrace()'es itself, then 197 * Two pages are mapped into the userspace address space: 198 * - STUB_CODE (with EXEC), which contains the skas stub code 199 * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel. 200 * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process. 201 * And last the process stops itself to give control to the UML kernel for this userspace process. 202 * 203 * Return: Always zero, otherwise the current userspace process is ended with non null exit() call 204 */ 205 static int userspace_tramp(void *stack) 206 { 207 struct sigaction sa; 208 void *addr; 209 int fd; 210 unsigned long long offset; 211 unsigned long segv_handler = STUB_CODE + 212 (unsigned long) stub_segv_handler - 213 (unsigned long) __syscall_stub_start; 214 215 ptrace(PTRACE_TRACEME, 0, 0, 0); 216 217 signal(SIGTERM, SIG_DFL); 218 signal(SIGWINCH, SIG_IGN); 219 220 fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); 221 addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, 222 PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); 223 if (addr == MAP_FAILED) { 224 os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", 225 STUB_CODE, errno); 226 exit(1); 227 } 228 229 fd = phys_mapping(uml_to_phys(stack), &offset); 230 addr = mmap((void *) STUB_DATA, 231 STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, 232 MAP_FIXED | MAP_SHARED, fd, offset); 233 if (addr == MAP_FAILED) { 234 os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", 235 STUB_DATA, errno); 236 exit(1); 237 } 238 239 set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); 240 sigemptyset(&sa.sa_mask); 241 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 242 sa.sa_sigaction = (void *) segv_handler; 243 sa.sa_restorer = NULL; 244 if (sigaction(SIGSEGV, &sa, NULL) < 0) { 245 os_info("%s - setting SIGSEGV handler failed - errno = %d\n", 246 __func__, errno); 247 exit(1); 248 } 249 250 kill(os_getpid(), SIGSTOP); 251 return 0; 252 } 253 254 int userspace_pid[NR_CPUS]; 255 int kill_userspace_mm[NR_CPUS]; 256 257 /** 258 * start_userspace() - prepare a new userspace process 259 * @stub_stack: pointer to the stub stack. 260 * 261 * Setups a new temporary stack page that is used while userspace_tramp() runs 262 * Clones the kernel process into a new userspace process, with FDs only. 263 * 264 * Return: When positive: the process id of the new userspace process, 265 * when negative: an error number. 266 * FIXME: can PIDs become negative?! 267 */ 268 int start_userspace(unsigned long stub_stack) 269 { 270 void *stack; 271 unsigned long sp; 272 int pid, status, n, flags, err; 273 274 /* setup a temporary stack page */ 275 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 276 PROT_READ | PROT_WRITE | PROT_EXEC, 277 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 278 if (stack == MAP_FAILED) { 279 err = -errno; 280 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 281 __func__, errno); 282 return err; 283 } 284 285 /* set stack pointer to the end of the stack page, so it can grow downwards */ 286 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 287 288 flags = CLONE_FILES | SIGCHLD; 289 290 /* clone into new userspace process */ 291 pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); 292 if (pid < 0) { 293 err = -errno; 294 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 295 __func__, errno); 296 return err; 297 } 298 299 do { 300 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 301 if (n < 0) { 302 err = -errno; 303 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 304 __func__, errno); 305 goto out_kill; 306 } 307 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 308 309 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 310 err = -EINVAL; 311 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 312 __func__, status); 313 goto out_kill; 314 } 315 316 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 317 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 318 err = -errno; 319 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 320 __func__, errno); 321 goto out_kill; 322 } 323 324 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 325 err = -errno; 326 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 327 __func__, errno); 328 goto out_kill; 329 } 330 331 return pid; 332 333 out_kill: 334 os_kill_ptraced_process(pid, 1); 335 return err; 336 } 337 338 void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) 339 { 340 int err, status, op, pid = userspace_pid[0]; 341 siginfo_t si; 342 343 /* Handle any immediate reschedules or signals */ 344 interrupt_end(); 345 346 while (1) { 347 if (kill_userspace_mm[0]) 348 fatal_sigsegv(); 349 350 /* 351 * This can legitimately fail if the process loads a 352 * bogus value into a segment register. It will 353 * segfault and PTRACE_GETREGS will read that value 354 * out of the process. However, PTRACE_SETREGS will 355 * fail. In this case, there is nothing to do but 356 * just kill the process. 357 */ 358 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 359 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 360 __func__, errno); 361 fatal_sigsegv(); 362 } 363 364 if (put_fp_registers(pid, regs->fp)) { 365 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 366 __func__, errno); 367 fatal_sigsegv(); 368 } 369 370 if (singlestepping()) 371 op = PTRACE_SYSEMU_SINGLESTEP; 372 else 373 op = PTRACE_SYSEMU; 374 375 if (ptrace(op, pid, 0, 0)) { 376 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 377 __func__, op, errno); 378 fatal_sigsegv(); 379 } 380 381 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 382 if (err < 0) { 383 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 384 __func__, errno); 385 fatal_sigsegv(); 386 } 387 388 regs->is_user = 1; 389 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 390 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 391 __func__, errno); 392 fatal_sigsegv(); 393 } 394 395 if (get_fp_registers(pid, regs->fp)) { 396 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 397 __func__, errno); 398 fatal_sigsegv(); 399 } 400 401 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 402 403 if (WIFSTOPPED(status)) { 404 int sig = WSTOPSIG(status); 405 406 /* These signal handlers need the si argument. 407 * The SIGIO and SIGALARM handlers which constitute the 408 * majority of invocations, do not use it. 409 */ 410 switch (sig) { 411 case SIGSEGV: 412 case SIGTRAP: 413 case SIGILL: 414 case SIGBUS: 415 case SIGFPE: 416 case SIGWINCH: 417 ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); 418 break; 419 } 420 421 switch (sig) { 422 case SIGSEGV: 423 if (PTRACE_FULL_FAULTINFO) { 424 get_skas_faultinfo(pid, 425 ®s->faultinfo, aux_fp_regs); 426 (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, 427 regs); 428 } 429 else handle_segv(pid, regs, aux_fp_regs); 430 break; 431 case SIGTRAP + 0x80: 432 handle_trap(pid, regs); 433 break; 434 case SIGTRAP: 435 relay_signal(SIGTRAP, (struct siginfo *)&si, regs); 436 break; 437 case SIGALRM: 438 break; 439 case SIGIO: 440 case SIGILL: 441 case SIGBUS: 442 case SIGFPE: 443 case SIGWINCH: 444 block_signals_trace(); 445 (*sig_info[sig])(sig, (struct siginfo *)&si, regs); 446 unblock_signals_trace(); 447 break; 448 default: 449 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 450 __func__, sig); 451 fatal_sigsegv(); 452 } 453 pid = userspace_pid[0]; 454 interrupt_end(); 455 456 /* Avoid -ERESTARTSYS handling in host */ 457 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 458 PT_SYSCALL_NR(regs->gp) = -1; 459 } 460 } 461 } 462 463 static unsigned long thread_regs[MAX_REG_NR]; 464 static unsigned long thread_fp_regs[FP_SIZE]; 465 466 static int __init init_thread_regs(void) 467 { 468 get_safe_registers(thread_regs, thread_fp_regs); 469 /* Set parent's instruction pointer to start of clone-stub */ 470 thread_regs[REGS_IP_INDEX] = STUB_CODE + 471 (unsigned long) stub_clone_handler - 472 (unsigned long) __syscall_stub_start; 473 thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 474 sizeof(void *); 475 #ifdef __SIGNAL_FRAMESIZE 476 thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; 477 #endif 478 return 0; 479 } 480 481 __initcall(init_thread_regs); 482 483 int copy_context_skas0(unsigned long new_stack, int pid) 484 { 485 int err; 486 unsigned long current_stack = current_stub_stack(); 487 struct stub_data *data = (struct stub_data *) current_stack; 488 struct stub_data *child_data = (struct stub_data *) new_stack; 489 unsigned long long new_offset; 490 int new_fd = phys_mapping(uml_to_phys((void *)new_stack), &new_offset); 491 492 /* 493 * prepare offset and fd of child's stack as argument for parent's 494 * and child's mmap2 calls 495 */ 496 *data = ((struct stub_data) { 497 .offset = MMAP_OFFSET(new_offset), 498 .fd = new_fd, 499 .parent_err = -ESRCH, 500 .child_err = 0, 501 }); 502 503 *child_data = ((struct stub_data) { 504 .child_err = -ESRCH, 505 }); 506 507 err = ptrace_setregs(pid, thread_regs); 508 if (err < 0) { 509 err = -errno; 510 printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n", 511 __func__, pid, -err); 512 return err; 513 } 514 515 err = put_fp_registers(pid, thread_fp_regs); 516 if (err < 0) { 517 printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n", 518 __func__, pid, err); 519 return err; 520 } 521 522 /* 523 * Wait, until parent has finished its work: read child's pid from 524 * parent's stack, and check, if bad result. 525 */ 526 err = ptrace(PTRACE_CONT, pid, 0, 0); 527 if (err) { 528 err = -errno; 529 printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n", 530 pid, errno); 531 return err; 532 } 533 534 wait_stub_done(pid); 535 536 pid = data->parent_err; 537 if (pid < 0) { 538 printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", 539 __func__, -pid); 540 return pid; 541 } 542 543 /* 544 * Wait, until child has finished too: read child's result from 545 * child's stack and check it. 546 */ 547 wait_stub_done(pid); 548 if (child_data->child_err != STUB_DATA) { 549 printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n", 550 __func__, pid, data->child_err); 551 err = data->child_err; 552 goto out_kill; 553 } 554 555 if (ptrace(PTRACE_SETOPTIONS, pid, NULL, 556 (void *)PTRACE_O_TRACESYSGOOD) < 0) { 557 err = -errno; 558 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 559 __func__, errno); 560 goto out_kill; 561 } 562 563 return pid; 564 565 out_kill: 566 os_kill_ptraced_process(pid, 1); 567 return err; 568 } 569 570 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 571 { 572 (*buf)[0].JB_IP = (unsigned long) handler; 573 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 574 sizeof(void *); 575 } 576 577 #define INIT_JMP_NEW_THREAD 0 578 #define INIT_JMP_CALLBACK 1 579 #define INIT_JMP_HALT 2 580 #define INIT_JMP_REBOOT 3 581 582 void switch_threads(jmp_buf *me, jmp_buf *you) 583 { 584 if (UML_SETJMP(me) == 0) 585 UML_LONGJMP(you, 1); 586 } 587 588 static jmp_buf initial_jmpbuf; 589 590 /* XXX Make these percpu */ 591 static void (*cb_proc)(void *arg); 592 static void *cb_arg; 593 static jmp_buf *cb_back; 594 595 int start_idle_thread(void *stack, jmp_buf *switch_buf) 596 { 597 int n; 598 599 set_handler(SIGWINCH); 600 601 /* 602 * Can't use UML_SETJMP or UML_LONGJMP here because they save 603 * and restore signals, with the possible side-effect of 604 * trying to handle any signals which came when they were 605 * blocked, which can't be done on this stack. 606 * Signals must be blocked when jumping back here and restored 607 * after returning to the jumper. 608 */ 609 n = setjmp(initial_jmpbuf); 610 switch (n) { 611 case INIT_JMP_NEW_THREAD: 612 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 613 (*switch_buf)[0].JB_SP = (unsigned long) stack + 614 UM_THREAD_SIZE - sizeof(void *); 615 break; 616 case INIT_JMP_CALLBACK: 617 (*cb_proc)(cb_arg); 618 longjmp(*cb_back, 1); 619 break; 620 case INIT_JMP_HALT: 621 kmalloc_ok = 0; 622 return 0; 623 case INIT_JMP_REBOOT: 624 kmalloc_ok = 0; 625 return 1; 626 default: 627 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 628 __func__, n); 629 fatal_sigsegv(); 630 } 631 longjmp(*switch_buf, 1); 632 633 /* unreachable */ 634 printk(UM_KERN_ERR "impossible long jump!"); 635 fatal_sigsegv(); 636 return 0; 637 } 638 639 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 640 { 641 jmp_buf here; 642 643 cb_proc = proc; 644 cb_arg = arg; 645 cb_back = &here; 646 647 block_signals_trace(); 648 if (UML_SETJMP(&here) == 0) 649 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 650 unblock_signals_trace(); 651 652 cb_proc = NULL; 653 cb_arg = NULL; 654 cb_back = NULL; 655 } 656 657 void halt_skas(void) 658 { 659 block_signals_trace(); 660 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 661 } 662 663 static bool noreboot; 664 665 static int __init noreboot_cmd_param(char *str, int *add) 666 { 667 noreboot = true; 668 return 0; 669 } 670 671 __uml_setup("noreboot", noreboot_cmd_param, 672 "noreboot\n" 673 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 674 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 675 " crashes in CI\n"); 676 677 void reboot_skas(void) 678 { 679 block_signals_trace(); 680 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 681 } 682 683 void __switch_mm(struct mm_id *mm_idp) 684 { 685 userspace_pid[0] = mm_idp->u.pid; 686 kill_userspace_mm[0] = mm_idp->kill; 687 } 688