1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> 4 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) 5 * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 6 */ 7 8 #include <stdlib.h> 9 #include <stdbool.h> 10 #include <unistd.h> 11 #include <sched.h> 12 #include <errno.h> 13 #include <string.h> 14 #include <fcntl.h> 15 #include <mem_user.h> 16 #include <sys/mman.h> 17 #include <sys/wait.h> 18 #include <sys/stat.h> 19 #include <sys/socket.h> 20 #include <asm/unistd.h> 21 #include <as-layout.h> 22 #include <init.h> 23 #include <kern_util.h> 24 #include <mem.h> 25 #include <os.h> 26 #include <ptrace_user.h> 27 #include <registers.h> 28 #include <skas.h> 29 #include <sysdep/stub.h> 30 #include <sysdep/mcontext.h> 31 #include <linux/futex.h> 32 #include <linux/threads.h> 33 #include <timetravel.h> 34 #include <asm-generic/rwonce.h> 35 #include "../internal.h" 36 37 int is_skas_winch(int pid, int fd, void *data) 38 { 39 return pid == getpgrp(); 40 } 41 42 static const char *ptrace_reg_name(int idx) 43 { 44 #define R(n) case HOST_##n: return #n 45 46 switch (idx) { 47 #ifdef __x86_64__ 48 R(BX); 49 R(CX); 50 R(DI); 51 R(SI); 52 R(DX); 53 R(BP); 54 R(AX); 55 R(R8); 56 R(R9); 57 R(R10); 58 R(R11); 59 R(R12); 60 R(R13); 61 R(R14); 62 R(R15); 63 R(ORIG_AX); 64 R(CS); 65 R(SS); 66 R(EFLAGS); 67 #elif defined(__i386__) 68 R(IP); 69 R(SP); 70 R(EFLAGS); 71 R(AX); 72 R(BX); 73 R(CX); 74 R(DX); 75 R(SI); 76 R(DI); 77 R(BP); 78 R(CS); 79 R(SS); 80 R(DS); 81 R(FS); 82 R(ES); 83 R(GS); 84 R(ORIG_AX); 85 #endif 86 } 87 return ""; 88 } 89 90 static int ptrace_dump_regs(int pid) 91 { 92 unsigned long regs[MAX_REG_NR]; 93 int i; 94 95 if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0) 96 return -errno; 97 98 printk(UM_KERN_ERR "Stub registers -\n"); 99 for (i = 0; i < ARRAY_SIZE(regs); i++) { 100 const char *regname = ptrace_reg_name(i); 101 102 printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]); 103 } 104 105 return 0; 106 } 107 108 /* 109 * Signals that are OK to receive in the stub - we'll just continue it. 110 * SIGWINCH will happen when UML is inside a detached screen. 111 */ 112 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH)) 113 114 /* Signals that the stub will finish with - anything else is an error */ 115 #define STUB_DONE_MASK (1 << SIGTRAP) 116 117 void wait_stub_done(int pid) 118 { 119 int n, status, err; 120 121 while (1) { 122 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); 123 if ((n < 0) || !WIFSTOPPED(status)) 124 goto bad_wait; 125 126 if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0) 127 break; 128 129 err = ptrace(PTRACE_CONT, pid, 0, 0); 130 if (err) { 131 printk(UM_KERN_ERR "%s : continue failed, errno = %d\n", 132 __func__, errno); 133 fatal_sigsegv(); 134 } 135 } 136 137 if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0) 138 return; 139 140 bad_wait: 141 err = ptrace_dump_regs(pid); 142 if (err) 143 printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n", 144 -err); 145 printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n", 146 __func__, pid, n, errno, status); 147 fatal_sigsegv(); 148 } 149 150 void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) 151 { 152 struct stub_data *data = (void *)mm_idp->stack; 153 int ret; 154 155 do { 156 const char byte = 0; 157 struct iovec iov = { 158 .iov_base = (void *)&byte, 159 .iov_len = sizeof(byte), 160 }; 161 union { 162 char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))]; 163 struct cmsghdr align; 164 } ctrl; 165 struct msghdr msgh = { 166 .msg_iov = &iov, 167 .msg_iovlen = 1, 168 }; 169 170 if (!running) { 171 if (mm_idp->syscall_fd_num) { 172 unsigned int fds_size = 173 sizeof(int) * mm_idp->syscall_fd_num; 174 struct cmsghdr *cmsg; 175 176 msgh.msg_control = ctrl.data; 177 msgh.msg_controllen = CMSG_SPACE(fds_size); 178 cmsg = CMSG_FIRSTHDR(&msgh); 179 cmsg->cmsg_level = SOL_SOCKET; 180 cmsg->cmsg_type = SCM_RIGHTS; 181 cmsg->cmsg_len = CMSG_LEN(fds_size); 182 memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map, 183 fds_size); 184 185 CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, 186 &msgh, 0)); 187 } 188 189 data->signal = 0; 190 data->futex = FUTEX_IN_CHILD; 191 CATCH_EINTR(syscall(__NR_futex, &data->futex, 192 FUTEX_WAKE, 1, NULL, NULL, 0)); 193 } 194 195 do { 196 /* 197 * We need to check whether the child is still alive 198 * before and after the FUTEX_WAIT call. Before, in 199 * case it just died but we still updated data->futex 200 * to FUTEX_IN_CHILD. And after, in case it died while 201 * we were waiting (and SIGCHLD woke us up, see the 202 * IRQ handler in mmu.c). 203 * 204 * Either way, if PID is negative, then we have no 205 * choice but to kill the task. 206 */ 207 if (__READ_ONCE(mm_idp->pid) < 0) 208 goto out_kill; 209 210 ret = syscall(__NR_futex, &data->futex, 211 FUTEX_WAIT, FUTEX_IN_CHILD, 212 NULL, NULL, 0); 213 if (ret < 0 && errno != EINTR && errno != EAGAIN) { 214 printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", 215 __func__, errno); 216 goto out_kill; 217 } 218 } while (data->futex == FUTEX_IN_CHILD); 219 220 if (__READ_ONCE(mm_idp->pid) < 0) 221 goto out_kill; 222 223 running = 0; 224 225 /* We may receive a SIGALRM before SIGSYS, iterate again. */ 226 } while (wait_sigsys && data->signal == SIGALRM); 227 228 if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { 229 printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); 230 goto out_kill; 231 } 232 233 if (wait_sigsys && data->signal != SIGSYS) { 234 printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", 235 __func__, data->signal); 236 goto out_kill; 237 } 238 239 return; 240 241 out_kill: 242 printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", 243 __func__, mm_idp->pid, errno); 244 /* This is not true inside start_userspace */ 245 if (current_mm_id() == mm_idp) 246 fatal_sigsegv(); 247 } 248 249 extern unsigned long current_stub_stack(void); 250 251 static void get_skas_faultinfo(int pid, struct faultinfo *fi) 252 { 253 int err; 254 255 err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); 256 if (err) { 257 printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " 258 "errno = %d\n", pid, errno); 259 fatal_sigsegv(); 260 } 261 wait_stub_done(pid); 262 263 /* 264 * faultinfo is prepared by the stub_segv_handler at start of 265 * the stub stack page. We just have to copy it. 266 */ 267 memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); 268 } 269 270 static void handle_trap(struct uml_pt_regs *regs) 271 { 272 if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) 273 fatal_sigsegv(); 274 275 handle_syscall(regs); 276 } 277 278 extern char __syscall_stub_start[]; 279 280 static int stub_exe_fd; 281 282 struct tramp_data { 283 struct stub_data *stub_data; 284 /* 0 is inherited, 1 is the kernel side */ 285 int sockpair[2]; 286 }; 287 288 #ifndef CLOSE_RANGE_CLOEXEC 289 #define CLOSE_RANGE_CLOEXEC (1U << 2) 290 #endif 291 292 static int userspace_tramp(void *data) 293 { 294 struct tramp_data *tramp_data = data; 295 char *const argv[] = { "uml-userspace", NULL }; 296 unsigned long long offset; 297 struct stub_init_data init_data = { 298 .seccomp = using_seccomp, 299 .stub_start = STUB_START, 300 }; 301 int ret; 302 303 if (using_seccomp) { 304 init_data.signal_handler = STUB_CODE + 305 (unsigned long) stub_signal_interrupt - 306 (unsigned long) __syscall_stub_start; 307 init_data.signal_restorer = STUB_CODE + 308 (unsigned long) stub_signal_restorer - 309 (unsigned long) __syscall_stub_start; 310 } else { 311 init_data.signal_handler = STUB_CODE + 312 (unsigned long) stub_segv_handler - 313 (unsigned long) __syscall_stub_start; 314 init_data.signal_restorer = 0; 315 } 316 317 init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), 318 &offset); 319 init_data.stub_code_offset = MMAP_OFFSET(offset); 320 321 init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data), 322 &offset); 323 init_data.stub_data_offset = MMAP_OFFSET(offset); 324 325 /* 326 * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs 327 * and then unsetting it on all memory related FDs. 328 * This is not strictly necessary from a safety perspective. 329 */ 330 syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); 331 332 fcntl(init_data.stub_data_fd, F_SETFD, 0); 333 334 /* dup2 signaling FD/socket to STDIN */ 335 if (dup2(tramp_data->sockpair[0], 0) < 0) 336 exit(3); 337 close(tramp_data->sockpair[0]); 338 339 /* Write init_data and close write side */ 340 ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data)); 341 close(tramp_data->sockpair[1]); 342 343 if (ret != sizeof(init_data)) 344 exit(4); 345 346 /* Raw execveat for compatibility with older libc versions */ 347 syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", 348 (unsigned long)argv, NULL, AT_EMPTY_PATH); 349 350 exit(5); 351 } 352 353 extern char stub_exe_start[]; 354 extern char stub_exe_end[]; 355 356 extern char *tempdir; 357 358 #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" 359 360 #ifndef MFD_EXEC 361 #define MFD_EXEC 0x0010U 362 #endif 363 364 static int __init init_stub_exe_fd(void) 365 { 366 size_t written = 0; 367 char *tmpfile = NULL; 368 369 stub_exe_fd = memfd_create("uml-userspace", 370 MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); 371 372 if (stub_exe_fd < 0) { 373 printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); 374 375 tmpfile = malloc(strlen(tempdir) + 376 strlen(STUB_EXE_NAME_TEMPLATE) + 1); 377 if (tmpfile == NULL) 378 panic("Failed to allocate memory for stub binary name"); 379 380 strcpy(tmpfile, tempdir); 381 strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); 382 383 stub_exe_fd = mkstemp(tmpfile); 384 if (stub_exe_fd < 0) 385 panic("Could not create temporary file for stub binary: %d", 386 -errno); 387 } 388 389 while (written < stub_exe_end - stub_exe_start) { 390 ssize_t res = write(stub_exe_fd, stub_exe_start + written, 391 stub_exe_end - stub_exe_start - written); 392 if (res < 0) { 393 if (errno == EINTR) 394 continue; 395 396 if (tmpfile) 397 unlink(tmpfile); 398 panic("Failed write stub binary: %d", -errno); 399 } 400 401 written += res; 402 } 403 404 if (!tmpfile) { 405 fcntl(stub_exe_fd, F_ADD_SEALS, 406 F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); 407 } else { 408 if (fchmod(stub_exe_fd, 00500) < 0) { 409 unlink(tmpfile); 410 panic("Could not make stub binary executable: %d", 411 -errno); 412 } 413 414 close(stub_exe_fd); 415 stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); 416 if (stub_exe_fd < 0) { 417 unlink(tmpfile); 418 panic("Could not reopen stub binary: %d", -errno); 419 } 420 421 unlink(tmpfile); 422 free(tmpfile); 423 } 424 425 return 0; 426 } 427 __initcall(init_stub_exe_fd); 428 429 int using_seccomp; 430 431 /** 432 * start_userspace() - prepare a new userspace process 433 * @mm_id: The corresponding struct mm_id 434 * 435 * Setups a new temporary stack page that is used while userspace_tramp() runs 436 * Clones the kernel process into a new userspace process, with FDs only. 437 * 438 * Return: When positive: the process id of the new userspace process, 439 * when negative: an error number. 440 * FIXME: can PIDs become negative?! 441 */ 442 int start_userspace(struct mm_id *mm_id) 443 { 444 struct stub_data *proc_data = (void *)mm_id->stack; 445 struct tramp_data tramp_data = { 446 .stub_data = proc_data, 447 }; 448 void *stack; 449 unsigned long sp; 450 int status, n, err; 451 452 /* setup a temporary stack page */ 453 stack = mmap(NULL, UM_KERN_PAGE_SIZE, 454 PROT_READ | PROT_WRITE | PROT_EXEC, 455 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 456 if (stack == MAP_FAILED) { 457 err = -errno; 458 printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n", 459 __func__, errno); 460 return err; 461 } 462 463 /* set stack pointer to the end of the stack page, so it can grow downwards */ 464 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 465 466 /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */ 467 if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) { 468 err = -errno; 469 printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n", 470 __func__, errno); 471 return err; 472 } 473 474 if (using_seccomp) 475 proc_data->futex = FUTEX_IN_CHILD; 476 477 mm_id->pid = clone(userspace_tramp, (void *) sp, 478 CLONE_VFORK | CLONE_VM | SIGCHLD, 479 (void *)&tramp_data); 480 if (mm_id->pid < 0) { 481 err = -errno; 482 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", 483 __func__, errno); 484 goto out_close; 485 } 486 487 if (using_seccomp) { 488 wait_stub_done_seccomp(mm_id, 1, 1); 489 } else { 490 do { 491 CATCH_EINTR(n = waitpid(mm_id->pid, &status, 492 WUNTRACED | __WALL)); 493 if (n < 0) { 494 err = -errno; 495 printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", 496 __func__, errno); 497 goto out_kill; 498 } 499 } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); 500 501 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { 502 err = -EINVAL; 503 printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", 504 __func__, status); 505 goto out_kill; 506 } 507 508 if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL, 509 (void *) PTRACE_O_TRACESYSGOOD) < 0) { 510 err = -errno; 511 printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", 512 __func__, errno); 513 goto out_kill; 514 } 515 } 516 517 if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { 518 err = -errno; 519 printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n", 520 __func__, errno); 521 goto out_kill; 522 } 523 524 close(tramp_data.sockpair[0]); 525 if (using_seccomp) 526 mm_id->sock = tramp_data.sockpair[1]; 527 else 528 close(tramp_data.sockpair[1]); 529 530 return 0; 531 532 out_kill: 533 os_kill_ptraced_process(mm_id->pid, 1); 534 out_close: 535 close(tramp_data.sockpair[0]); 536 close(tramp_data.sockpair[1]); 537 538 mm_id->pid = -1; 539 540 return err; 541 } 542 543 static int unscheduled_userspace_iterations; 544 extern unsigned long tt_extra_sched_jiffies; 545 546 void userspace(struct uml_pt_regs *regs) 547 { 548 int err, status, op; 549 siginfo_t si_local; 550 siginfo_t *si; 551 int sig; 552 553 /* Handle any immediate reschedules or signals */ 554 interrupt_end(); 555 556 while (1) { 557 struct mm_id *mm_id = current_mm_id(); 558 559 /* 560 * At any given time, only one CPU thread can enter the 561 * turnstile to operate on the same stub process, including 562 * executing stub system calls (mmap and munmap). 563 */ 564 enter_turnstile(mm_id); 565 566 /* 567 * When we are in time-travel mode, userspace can theoretically 568 * do a *lot* of work without being scheduled. The problem with 569 * this is that it will prevent kernel bookkeeping (primarily 570 * the RCU) from running and this can for example cause OOM 571 * situations. 572 * 573 * This code accounts a jiffie against the scheduling clock 574 * after the defined userspace iterations in the same thread. 575 * By doing so the situation is effectively prevented. 576 */ 577 if (time_travel_mode == TT_MODE_INFCPU || 578 time_travel_mode == TT_MODE_EXTERNAL) { 579 #ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS 580 if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && 581 unscheduled_userspace_iterations++ > 582 CONFIG_UML_MAX_USERSPACE_ITERATIONS) { 583 tt_extra_sched_jiffies += 1; 584 unscheduled_userspace_iterations = 0; 585 } 586 #endif 587 } 588 589 time_travel_print_bc_msg(); 590 591 current_mm_sync(); 592 593 if (using_seccomp) { 594 struct stub_data *proc_data = (void *) mm_id->stack; 595 596 err = set_stub_state(regs, proc_data, singlestepping()); 597 if (err) { 598 printk(UM_KERN_ERR "%s - failed to set regs: %d", 599 __func__, err); 600 fatal_sigsegv(); 601 } 602 603 /* Must have been reset by the syscall caller */ 604 if (proc_data->restart_wait != 0) 605 panic("Programming error: Flag to only run syscalls in child was not cleared!"); 606 607 /* Mark pending syscalls for flushing */ 608 proc_data->syscall_data_len = mm_id->syscall_data_len; 609 610 wait_stub_done_seccomp(mm_id, 0, 0); 611 612 sig = proc_data->signal; 613 614 if (sig == SIGTRAP && proc_data->err != 0) { 615 printk(UM_KERN_ERR "%s - Error flushing stub syscalls", 616 __func__); 617 syscall_stub_dump_error(mm_id); 618 mm_id->syscall_data_len = proc_data->err; 619 fatal_sigsegv(); 620 } 621 622 mm_id->syscall_data_len = 0; 623 mm_id->syscall_fd_num = 0; 624 625 err = get_stub_state(regs, proc_data, NULL); 626 if (err) { 627 printk(UM_KERN_ERR "%s - failed to get regs: %d", 628 __func__, err); 629 fatal_sigsegv(); 630 } 631 632 if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) 633 panic("%s - Invalid siginfo offset from child", __func__); 634 635 si = &si_local; 636 memcpy(si, &proc_data->sigstack[proc_data->si_offset], sizeof(*si)); 637 638 regs->is_user = 1; 639 640 /* Fill in ORIG_RAX and extract fault information */ 641 PT_SYSCALL_NR(regs->gp) = si->si_syscall; 642 if (sig == SIGSEGV) { 643 mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; 644 645 GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); 646 } 647 } else { 648 int pid = mm_id->pid; 649 650 /* Flush out any pending syscalls */ 651 err = syscall_stub_flush(mm_id); 652 if (err) { 653 if (err == -ENOMEM) 654 report_enomem(); 655 656 printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", 657 __func__, -err); 658 fatal_sigsegv(); 659 } 660 661 /* 662 * This can legitimately fail if the process loads a 663 * bogus value into a segment register. It will 664 * segfault and PTRACE_GETREGS will read that value 665 * out of the process. However, PTRACE_SETREGS will 666 * fail. In this case, there is nothing to do but 667 * just kill the process. 668 */ 669 if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { 670 printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", 671 __func__, errno); 672 fatal_sigsegv(); 673 } 674 675 if (put_fp_registers(pid, regs->fp)) { 676 printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", 677 __func__, errno); 678 fatal_sigsegv(); 679 } 680 681 if (singlestepping()) 682 op = PTRACE_SYSEMU_SINGLESTEP; 683 else 684 op = PTRACE_SYSEMU; 685 686 if (ptrace(op, pid, 0, 0)) { 687 printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", 688 __func__, op, errno); 689 fatal_sigsegv(); 690 } 691 692 CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); 693 if (err < 0) { 694 printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", 695 __func__, errno); 696 fatal_sigsegv(); 697 } 698 699 regs->is_user = 1; 700 if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { 701 printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", 702 __func__, errno); 703 fatal_sigsegv(); 704 } 705 706 if (get_fp_registers(pid, regs->fp)) { 707 printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", 708 __func__, errno); 709 fatal_sigsegv(); 710 } 711 712 if (WIFSTOPPED(status)) { 713 sig = WSTOPSIG(status); 714 715 /* 716 * These signal handlers need the si argument 717 * and SIGSEGV needs the faultinfo. 718 * The SIGIO and SIGALARM handlers which constitute 719 * the majority of invocations, do not use it. 720 */ 721 switch (sig) { 722 case SIGSEGV: 723 get_skas_faultinfo(pid, 724 ®s->faultinfo); 725 fallthrough; 726 case SIGTRAP: 727 case SIGILL: 728 case SIGBUS: 729 case SIGFPE: 730 case SIGWINCH: 731 ptrace(PTRACE_GETSIGINFO, pid, 0, 732 (struct siginfo *)&si_local); 733 si = &si_local; 734 break; 735 default: 736 si = NULL; 737 break; 738 } 739 } else { 740 sig = 0; 741 } 742 } 743 744 exit_turnstile(mm_id); 745 746 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 747 748 if (sig) { 749 switch (sig) { 750 case SIGSEGV: 751 if (using_seccomp || PTRACE_FULL_FAULTINFO) 752 (*sig_info[SIGSEGV])(SIGSEGV, 753 (struct siginfo *)si, 754 regs, NULL); 755 else 756 segv(regs->faultinfo, 0, 1, NULL, NULL); 757 758 break; 759 case SIGSYS: 760 handle_syscall(regs); 761 break; 762 case SIGTRAP + 0x80: 763 handle_trap(regs); 764 break; 765 case SIGTRAP: 766 relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); 767 break; 768 case SIGALRM: 769 break; 770 case SIGIO: 771 case SIGILL: 772 case SIGBUS: 773 case SIGFPE: 774 case SIGWINCH: 775 block_signals_trace(); 776 (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); 777 unblock_signals_trace(); 778 break; 779 default: 780 printk(UM_KERN_ERR "%s - child stopped with signal %d\n", 781 __func__, sig); 782 fatal_sigsegv(); 783 } 784 interrupt_end(); 785 786 /* Avoid -ERESTARTSYS handling in host */ 787 if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET) 788 PT_SYSCALL_NR(regs->gp) = -1; 789 } 790 } 791 } 792 793 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) 794 { 795 (*buf)[0].JB_IP = (unsigned long) handler; 796 (*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE - 797 sizeof(void *); 798 } 799 800 #define INIT_JMP_NEW_THREAD 0 801 #define INIT_JMP_CALLBACK 1 802 #define INIT_JMP_HALT 2 803 #define INIT_JMP_REBOOT 3 804 805 void switch_threads(jmp_buf *me, jmp_buf *you) 806 { 807 unscheduled_userspace_iterations = 0; 808 809 if (UML_SETJMP(me) == 0) 810 UML_LONGJMP(you, 1); 811 } 812 813 static jmp_buf initial_jmpbuf; 814 815 static __thread void (*cb_proc)(void *arg); 816 static __thread void *cb_arg; 817 static __thread jmp_buf *cb_back; 818 819 int start_idle_thread(void *stack, jmp_buf *switch_buf) 820 { 821 int n; 822 823 set_handler(SIGWINCH); 824 825 /* 826 * Can't use UML_SETJMP or UML_LONGJMP here because they save 827 * and restore signals, with the possible side-effect of 828 * trying to handle any signals which came when they were 829 * blocked, which can't be done on this stack. 830 * Signals must be blocked when jumping back here and restored 831 * after returning to the jumper. 832 */ 833 n = setjmp(initial_jmpbuf); 834 switch (n) { 835 case INIT_JMP_NEW_THREAD: 836 (*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup; 837 (*switch_buf)[0].JB_SP = (unsigned long) stack + 838 UM_THREAD_SIZE - sizeof(void *); 839 break; 840 case INIT_JMP_CALLBACK: 841 (*cb_proc)(cb_arg); 842 longjmp(*cb_back, 1); 843 break; 844 case INIT_JMP_HALT: 845 kmalloc_ok = 0; 846 return 0; 847 case INIT_JMP_REBOOT: 848 kmalloc_ok = 0; 849 return 1; 850 default: 851 printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n", 852 __func__, n); 853 fatal_sigsegv(); 854 } 855 longjmp(*switch_buf, 1); 856 857 /* unreachable */ 858 printk(UM_KERN_ERR "impossible long jump!"); 859 fatal_sigsegv(); 860 return 0; 861 } 862 863 void initial_thread_cb_skas(void (*proc)(void *), void *arg) 864 { 865 jmp_buf here; 866 867 cb_proc = proc; 868 cb_arg = arg; 869 cb_back = &here; 870 871 initial_jmpbuf_lock(); 872 if (UML_SETJMP(&here) == 0) 873 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 874 initial_jmpbuf_unlock(); 875 876 cb_proc = NULL; 877 cb_arg = NULL; 878 cb_back = NULL; 879 } 880 881 void halt_skas(void) 882 { 883 initial_jmpbuf_lock(); 884 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 885 /* unreachable */ 886 } 887 888 static bool noreboot; 889 890 static int __init noreboot_cmd_param(char *str, int *add) 891 { 892 *add = 0; 893 noreboot = true; 894 return 0; 895 } 896 897 __uml_setup("noreboot", noreboot_cmd_param, 898 "noreboot\n" 899 " Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n" 900 " This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n" 901 " crashes in CI\n\n"); 902 903 void reboot_skas(void) 904 { 905 initial_jmpbuf_lock(); 906 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 907 /* unreachable */ 908 } 909