// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
 * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
 * Copyright (C) 2002-2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 */

#include <stdlib.h>
#include <stdbool.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <mem_user.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <asm/unistd.h>
#include <as-layout.h>
#include <init.h>
#include <kern_util.h>
#include <mem.h>
#include <os.h>
#include <ptrace_user.h>
#include <registers.h>
#include <skas.h>
#include <sysdep/stub.h>
#include <sysdep/mcontext.h>
#include <linux/futex.h>
#include <linux/threads.h>
#include <timetravel.h>
#include <asm-generic/rwonce.h>
#include "../internal.h"

int is_skas_winch(int pid, int fd, void *data)
{
	return pid == getpgrp();
}

static const char *ptrace_reg_name(int idx)
{
#define R(n) case HOST_##n: return #n

	switch (idx) {
#ifdef __x86_64__
	R(BX);
	R(CX);
	R(DI);
	R(SI);
	R(DX);
	R(BP);
	R(AX);
	R(R8);
	R(R9);
	R(R10);
	R(R11);
	R(R12);
	R(R13);
	R(R14);
	R(R15);
	R(ORIG_AX);
	R(CS);
	R(SS);
	R(EFLAGS);
#elif defined(__i386__)
	R(IP);
	R(SP);
	R(EFLAGS);
	R(AX);
	R(BX);
	R(CX);
	R(DX);
	R(SI);
	R(DI);
	R(BP);
	R(CS);
	R(SS);
	R(DS);
	R(FS);
	R(ES);
	R(GS);
	R(ORIG_AX);
#endif
	}
	return "";
}

static int ptrace_dump_regs(int pid)
{
	unsigned long regs[MAX_REG_NR];
	int i;

	if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
		return -errno;

	printk(UM_KERN_ERR "Stub registers -\n");
	for (i = 0; i < ARRAY_SIZE(regs); i++) {
		const char *regname = ptrace_reg_name(i);

		printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]);
	}

	return 0;
}

/*
 * Signals that are OK to receive in the stub - we'll just continue it.
 * SIGWINCH will happen when UML is inside a detached screen.
 */
#define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH))

/* Signals that the stub will finish with - anything else is an error */
#define STUB_DONE_MASK (1 << SIGTRAP)
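
/**
 * wait_stub_done() - wait for a ptraced stub to finish its work
 * @pid: host pid of the stub process
 *
 * Waits until the stub stops with one of the signals in STUB_DONE_MASK
 * (i.e. SIGTRAP). Stops caused by signals in STUB_SIG_MASK are benign and
 * the stub is simply continued. Any other stop, or a failed wait, dumps
 * the stub's registers and calls fatal_sigsegv().
 */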
void wait_stub_done(int pid)
{
	int n, status, err;

	while (1) {
		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
		if ((n < 0) || !WIFSTOPPED(status))
			goto bad_wait;

		if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0)
			break;

		err = ptrace(PTRACE_CONT, pid, 0, 0);
		if (err) {
			printk(UM_KERN_ERR "%s : continue failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}
	}

	if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0)
		return;

bad_wait:
	err = ptrace_dump_regs(pid);
	if (err)
		printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n",
		       -err);
	printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n",
	       __func__, pid, n, errno, status);
	fatal_sigsegv();
}
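
/**
 * wait_stub_done_seccomp() - wait for a seccomp stub to hand back control
 * @mm_idp: mm_id of the stub process to wait for
 * @running: non-zero if the stub is already running and does not need to
 *           be woken via the futex
 * @wait_sigsys: non-zero to keep waiting until the stub reports SIGSYS;
 *               an intervening SIGALRM restarts the wait
 *
 * The seccomp protocol replaces ptrace stops with a futex handshake on
 * stub_data->futex: the word is set to FUTEX_IN_CHILD before waking the
 * stub, and the stub changes it away from that value once it is done. If
 * the child dies (mm_idp->pid goes negative, see the SIGCHLD IRQ handler
 * in mmu.c) or the handshake fails, the stub is considered lost and,
 * except when called from start_userspace(), fatal_sigsegv() is called.
 */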
void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
{
	struct stub_data *data = (void *)mm_idp->stack;
	int ret;

	do {
		if (!running) {
			data->signal = 0;
			data->futex = FUTEX_IN_CHILD;
			CATCH_EINTR(syscall(__NR_futex, &data->futex,
					    FUTEX_WAKE, 1, NULL, NULL, 0));
		}

		do {
			/*
			 * We need to check whether the child is still alive
			 * before and after the FUTEX_WAIT call. Before, in
			 * case it just died but we still updated data->futex
			 * to FUTEX_IN_CHILD. And after, in case it died while
			 * we were waiting (and SIGCHLD woke us up, see the
			 * IRQ handler in mmu.c).
			 *
			 * Either way, if PID is negative, then we have no
			 * choice but to kill the task.
			 */
			if (__READ_ONCE(mm_idp->pid) < 0)
				goto out_kill;

			ret = syscall(__NR_futex, &data->futex,
				      FUTEX_WAIT, FUTEX_IN_CHILD,
				      NULL, NULL, 0);
			if (ret < 0 && errno != EINTR && errno != EAGAIN) {
				printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n",
				       __func__, errno);
				goto out_kill;
			}
		} while (data->futex == FUTEX_IN_CHILD);

		if (__READ_ONCE(mm_idp->pid) < 0)
			goto out_kill;

		running = 0;

		/* We may receive a SIGALRM before SIGSYS, iterate again. */
	} while (wait_sigsys && data->signal == SIGALRM);

	if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) {
		printk(UM_KERN_ERR "%s : invalid mcontext offset\n", __func__);
		goto out_kill;
	}

	if (wait_sigsys && data->signal != SIGSYS) {
		printk(UM_KERN_ERR "%s : expected SIGSYS but got %d\n",
		       __func__, data->signal);
		goto out_kill;
	}

	return;

out_kill:
	printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n",
	       __func__, mm_idp->pid, errno);
	/* This is not true inside start_userspace */
	if (current_mm_id() == mm_idp)
		fatal_sigsegv();
}

extern unsigned long current_stub_stack(void);

static void get_skas_faultinfo(int pid, struct faultinfo *fi)
{
	int err;

	err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
	if (err) {
		printk(UM_KERN_ERR "Failed to continue stub, pid = %d, errno = %d\n",
		       pid, errno);
		fatal_sigsegv();
	}
	wait_stub_done(pid);

	/*
	 * faultinfo is prepared by the stub_segv_handler at the start of
	 * the stub stack page. We just have to copy it.
	 */
	memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
}

static void handle_trap(int pid, struct uml_pt_regs *regs)
{
	if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
		fatal_sigsegv();

	handle_syscall(regs);
}

extern char __syscall_stub_start[];

static int stub_exe_fd;

#ifndef CLOSE_RANGE_CLOEXEC
#define CLOSE_RANGE_CLOEXEC (1U << 2)
#endif

static int userspace_tramp(void *stack)
{
	char *const argv[] = { "uml-userspace", NULL };
	int pipe_fds[2];
	unsigned long long offset;
	struct stub_init_data init_data = {
		.seccomp = using_seccomp,
		.stub_start = STUB_START,
	};
	struct iomem_region *iomem;
	int ret;

	if (using_seccomp) {
		init_data.signal_handler = STUB_CODE +
					   (unsigned long) stub_signal_interrupt -
					   (unsigned long) __syscall_stub_start;
		init_data.signal_restorer = STUB_CODE +
					    (unsigned long) stub_signal_restorer -
					    (unsigned long) __syscall_stub_start;
	} else {
		init_data.signal_handler = STUB_CODE +
					   (unsigned long) stub_segv_handler -
					   (unsigned long) __syscall_stub_start;
		init_data.signal_restorer = 0;
	}

	init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
					      &offset);
	init_data.stub_code_offset = MMAP_OFFSET(offset);

	init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
	init_data.stub_data_offset = MMAP_OFFSET(offset);

	/*
	 * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs
	 * and then unsetting it on all memory-related FDs.
	 * This is not strictly necessary from a safety perspective.
	 */
	syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC);

	fcntl(init_data.stub_data_fd, F_SETFD, 0);
	for (iomem = iomem_regions; iomem; iomem = iomem->next)
		fcntl(iomem->fd, F_SETFD, 0);

	/* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
	if (pipe(pipe_fds))
		exit(2);

	if (dup2(pipe_fds[0], 0) < 0)
		exit(3);
	close(pipe_fds[0]);

	/* Write init_data and close write side */
	ret = write(pipe_fds[1], &init_data, sizeof(init_data));
	close(pipe_fds[1]);

	if (ret != sizeof(init_data))
		exit(4);

	/* Raw execveat for compatibility with older libc versions */
	syscall(__NR_execveat, stub_exe_fd, (unsigned long)"",
		(unsigned long)argv, NULL, AT_EMPTY_PATH);

	exit(5);
}
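
/*
 * The stub executable is linked into the UML image between stub_exe_start
 * and stub_exe_end. At boot it is copied into an executable, sealed memfd
 * (or, if memfd_create() fails, into a temporary file in tempdir that is
 * made executable, reopened read-only and unlinked), so userspace_tramp()
 * can execveat() it by file descriptor alone.
 */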
extern char stub_exe_start[];
extern char stub_exe_end[];

extern char *tempdir;

#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"

#ifndef MFD_EXEC
#define MFD_EXEC 0x0010U
#endif

static int __init init_stub_exe_fd(void)
{
	size_t written = 0;
	char *tmpfile = NULL;

	stub_exe_fd = memfd_create("uml-userspace",
				   MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (stub_exe_fd < 0) {
		printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!\n");

		tmpfile = malloc(strlen(tempdir) +
				 strlen(STUB_EXE_NAME_TEMPLATE) + 1);
		if (tmpfile == NULL)
			panic("Failed to allocate memory for stub binary name");

		strcpy(tmpfile, tempdir);
		strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);

		stub_exe_fd = mkstemp(tmpfile);
		if (stub_exe_fd < 0)
			panic("Could not create temporary file for stub binary: %d",
			      -errno);
	}

	while (written < stub_exe_end - stub_exe_start) {
		ssize_t res = write(stub_exe_fd, stub_exe_start + written,
				    stub_exe_end - stub_exe_start - written);
		if (res < 0) {
			if (errno == EINTR)
				continue;

			if (tmpfile)
				unlink(tmpfile);
			panic("Failed to write stub binary: %d", -errno);
		}

		written += res;
	}

	if (!tmpfile) {
		fcntl(stub_exe_fd, F_ADD_SEALS,
		      F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL);
	} else {
		if (fchmod(stub_exe_fd, 00500) < 0) {
			unlink(tmpfile);
			panic("Could not make stub binary executable: %d",
			      -errno);
		}

		close(stub_exe_fd);
		stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
		if (stub_exe_fd < 0) {
			unlink(tmpfile);
			panic("Could not reopen stub binary: %d", -errno);
		}

		unlink(tmpfile);
		free(tmpfile);
	}

	return 0;
}
__initcall(init_stub_exe_fd);

int using_seccomp;
int userspace_pid[NR_CPUS];

/**
 * start_userspace() - prepare a new userspace process
 * @mm_id: mm_id of the new address space; mm_id->stack points at the
 *         stub stack.
 *
 * Sets up a new temporary stack page that is used while userspace_tramp()
 * runs. Clones the kernel process into a new userspace process, with FDs
 * only.
 *
 * Return: When positive: the process id of the new userspace process,
 *         when negative: an error number.
 * FIXME: can PIDs become negative?!
 */
int start_userspace(struct mm_id *mm_id)
{
	struct stub_data *proc_data = (void *)mm_id->stack;
	void *stack;
	unsigned long sp;
	int pid, status, n, err;

	/* set up a temporary stack page */
	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
		     PROT_READ | PROT_WRITE | PROT_EXEC,
		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (stack == MAP_FAILED) {
		err = -errno;
		printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n",
		       __func__, errno);
		return err;
	}

	/* set stack pointer to the end of the stack page, so it can grow downwards */
	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;

	if (using_seccomp)
		proc_data->futex = FUTEX_IN_CHILD;

	/* clone into new userspace process */
	pid = clone(userspace_tramp, (void *) sp,
		    CLONE_VFORK | CLONE_VM | SIGCHLD,
		    (void *)mm_id->stack);
	if (pid < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
		       __func__, errno);
		return err;
	}

	if (using_seccomp) {
		wait_stub_done_seccomp(mm_id, 1, 1);
	} else {
		do {
			CATCH_EINTR(n = waitpid(pid, &status,
						WUNTRACED | __WALL));
			if (n < 0) {
				err = -errno;
				printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
				       __func__, errno);
				goto out_kill;
			}
		} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));

		if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
			err = -EINVAL;
			printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
			       __func__, status);
			goto out_kill;
		}

		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
			   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
			err = -errno;
			printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
			       __func__, errno);
			goto out_kill;
		}
	}

	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n",
		       __func__, errno);
		goto out_kill;
	}

	mm_id->pid = pid;

	return pid;

out_kill:
	os_kill_ptraced_process(pid, 1);
	return err;
}

int unscheduled_userspace_iterations;
extern unsigned long tt_extra_sched_jiffies;
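
/**
 * userspace() - run the current process' userspace in the stub
 * @regs: register state to enter userspace with; updated from the stub
 *        whenever it stops
 *
 * The heart of the SKAS host loop; it does not return. Each iteration
 * enters userspace (via the futex handshake in seccomp mode, via
 * PTRACE_SYSEMU otherwise), waits for the next signal or syscall, copies
 * the register and fault state back into @regs, and dispatches the event
 * to the matching handler.
 */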
void userspace(struct uml_pt_regs *regs)
{
	int err, status, op, pid = userspace_pid[0];
	siginfo_t si_ptrace;
	siginfo_t *si;
	int sig;

	/* Handle any immediate reschedules or signals */
	interrupt_end();

	while (1) {
		/*
		 * When we are in time-travel mode, userspace can theoretically
		 * do a *lot* of work without being scheduled. The problem with
		 * this is that it will prevent kernel bookkeeping (primarily
		 * RCU) from running and this can for example cause OOM
		 * situations.
		 *
		 * This code accounts a jiffy against the scheduling clock
		 * after the defined number of userspace iterations in the
		 * same thread. By doing so the situation is effectively
		 * prevented.
		 */
		if (time_travel_mode == TT_MODE_INFCPU ||
		    time_travel_mode == TT_MODE_EXTERNAL) {
#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS
			if (CONFIG_UML_MAX_USERSPACE_ITERATIONS &&
			    unscheduled_userspace_iterations++ >
			    CONFIG_UML_MAX_USERSPACE_ITERATIONS) {
				tt_extra_sched_jiffies += 1;
				unscheduled_userspace_iterations = 0;
			}
#endif
		}

		time_travel_print_bc_msg();

		current_mm_sync();

		if (using_seccomp) {
			struct mm_id *mm_id = current_mm_id();
			struct stub_data *proc_data = (void *) mm_id->stack;
			int ret;

			ret = set_stub_state(regs, proc_data, singlestepping());
			if (ret) {
				printk(UM_KERN_ERR "%s - failed to set regs: %d\n",
				       __func__, ret);
				fatal_sigsegv();
			}

			/* Must have been reset by the syscall caller */
			if (proc_data->restart_wait != 0)
				panic("Programming error: Flag to only run syscalls in child was not cleared!");

			/* Mark pending syscalls for flushing */
			proc_data->syscall_data_len = mm_id->syscall_data_len;
			mm_id->syscall_data_len = 0;

			proc_data->signal = 0;
			proc_data->futex = FUTEX_IN_CHILD;
			CATCH_EINTR(syscall(__NR_futex, &proc_data->futex,
					    FUTEX_WAKE, 1, NULL, NULL, 0));
			do {
				ret = syscall(__NR_futex, &proc_data->futex,
					      FUTEX_WAIT, FUTEX_IN_CHILD,
					      NULL, NULL, 0);
			} while ((ret == -1 && errno == EINTR) ||
				 proc_data->futex == FUTEX_IN_CHILD);

			sig = proc_data->signal;

			if (sig == SIGTRAP && proc_data->err != 0) {
				printk(UM_KERN_ERR "%s - Error flushing stub syscalls\n",
				       __func__);
				syscall_stub_dump_error(mm_id);
				fatal_sigsegv();
			}

			ret = get_stub_state(regs, proc_data, NULL);
			if (ret) {
				printk(UM_KERN_ERR "%s - failed to get regs: %d\n",
				       __func__, ret);
				fatal_sigsegv();
			}

			if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si))
				panic("%s - Invalid siginfo offset from child",
				      __func__);
			si = (void *)&proc_data->sigstack[proc_data->si_offset];

			regs->is_user = 1;

			/* Fill in ORIG_RAX and extract fault information */
			PT_SYSCALL_NR(regs->gp) = si->si_syscall;
			if (sig == SIGSEGV) {
				mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset];

				GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext);
			}
		} else {
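			/*
			 * The ptrace path: load the stub's registers from
			 * regs, continue it with PTRACE_SYSEMU so syscalls
			 * trap back to us instead of executing on the host,
			 * then read the registers back after the next stop
			 * before dispatching the signal below.
			 */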
			/* Flush out any pending syscalls */
			err = syscall_stub_flush(current_mm_id());
			if (err) {
				if (err == -ENOMEM)
					report_enomem();

				printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d\n",
				       __func__, -err);
				fatal_sigsegv();
			}

			/*
			 * This can legitimately fail if the process loads a
			 * bogus value into a segment register. It will
			 * segfault and PTRACE_GETREGS will read that value
			 * out of the process. However, PTRACE_SETREGS will
			 * fail. In this case, there is nothing to do but
			 * just kill the process.
			 */
			if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
				printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
				       __func__, errno);
				fatal_sigsegv();
			}

			if (put_fp_registers(pid, regs->fp)) {
				printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
				       __func__, errno);
				fatal_sigsegv();
			}

			if (singlestepping())
				op = PTRACE_SYSEMU_SINGLESTEP;
			else
				op = PTRACE_SYSEMU;

			if (ptrace(op, pid, 0, 0)) {
				printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
				       __func__, op, errno);
				fatal_sigsegv();
			}

			CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
			if (err < 0) {
				printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
				       __func__, errno);
				fatal_sigsegv();
			}

			regs->is_user = 1;
			if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
				printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
				       __func__, errno);
				fatal_sigsegv();
			}

			if (get_fp_registers(pid, regs->fp)) {
				printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n",
				       __func__, errno);
				fatal_sigsegv();
			}

			if (WIFSTOPPED(status)) {
				sig = WSTOPSIG(status);

				/*
				 * These signal handlers need the si argument
				 * and SIGSEGV needs the faultinfo.
				 * The SIGIO and SIGALRM handlers, which
				 * constitute the majority of invocations,
				 * do not use it.
				 */
				switch (sig) {
				case SIGSEGV:
					get_skas_faultinfo(pid,
							   &regs->faultinfo);
					fallthrough;
				case SIGTRAP:
				case SIGILL:
				case SIGBUS:
				case SIGFPE:
				case SIGWINCH:
					ptrace(PTRACE_GETSIGINFO, pid, 0,
					       (struct siginfo *)&si_ptrace);
					si = &si_ptrace;
					break;
				default:
					si = NULL;
					break;
				}
			} else {
				sig = 0;
			}
		}

		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */

		if (sig) {
			switch (sig) {
			case SIGSEGV:
				if (using_seccomp || PTRACE_FULL_FAULTINFO)
					(*sig_info[SIGSEGV])(SIGSEGV,
							     (struct siginfo *)si,
							     regs, NULL);
				else
					segv(regs->faultinfo, 0, 1, NULL, NULL);

				break;
			case SIGSYS:
				handle_syscall(regs);
				break;
			case SIGTRAP + 0x80:
				handle_trap(pid, regs);
				break;
			case SIGTRAP:
				relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL);
				break;
			case SIGALRM:
				break;
			case SIGIO:
			case SIGILL:
			case SIGBUS:
			case SIGFPE:
			case SIGWINCH:
				block_signals_trace();
				(*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL);
				unblock_signals_trace();
				break;
			default:
				printk(UM_KERN_ERR "%s - child stopped with signal %d\n",
				       __func__, sig);
				fatal_sigsegv();
			}
			pid = userspace_pid[0];
			interrupt_end();

			/* Avoid -ERESTARTSYS handling in host */
			if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET)
				PT_SYSCALL_NR(regs->gp) = -1;
		}
	}
}

void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
{
	(*buf)[0].JB_IP = (unsigned long) handler;
	(*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE -
			  sizeof(void *);
}

#define INIT_JMP_NEW_THREAD 0
#define INIT_JMP_CALLBACK 1
#define INIT_JMP_HALT 2
#define INIT_JMP_REBOOT 3
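
/*
 * UML kernel threads are switched entirely in host userspace with
 * setjmp()/longjmp(): switch_threads() saves the current context in "me"
 * and jumps to "you". The INIT_JMP_* codes above select what
 * start_idle_thread() does when something longjmp()s back to the initial
 * stack.
 */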
void switch_threads(jmp_buf *me, jmp_buf *you)
{
	unscheduled_userspace_iterations = 0;

	if (UML_SETJMP(me) == 0)
		UML_LONGJMP(you, 1);
}

static jmp_buf initial_jmpbuf;

/* XXX Make these percpu */
static void (*cb_proc)(void *arg);
static void *cb_arg;
static jmp_buf *cb_back;

int start_idle_thread(void *stack, jmp_buf *switch_buf)
{
	int n;

	set_handler(SIGWINCH);

	/*
	 * Can't use UML_SETJMP or UML_LONGJMP here because they save
	 * and restore signals, with the possible side-effect of
	 * trying to handle any signals which came when they were
	 * blocked, which can't be done on this stack.
	 * Signals must be blocked when jumping back here and restored
	 * after returning to the jumper.
	 */
	n = setjmp(initial_jmpbuf);
	switch (n) {
	case INIT_JMP_NEW_THREAD:
		(*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup;
		(*switch_buf)[0].JB_SP = (unsigned long) stack +
					 UM_THREAD_SIZE - sizeof(void *);
		break;
	case INIT_JMP_CALLBACK:
		(*cb_proc)(cb_arg);
		longjmp(*cb_back, 1);
		break;
	case INIT_JMP_HALT:
		kmalloc_ok = 0;
		return 0;
	case INIT_JMP_REBOOT:
		kmalloc_ok = 0;
		return 1;
	default:
		printk(UM_KERN_ERR "Bad setjmp return in %s - %d\n",
		       __func__, n);
		fatal_sigsegv();
	}
	longjmp(*switch_buf, 1);

	/* unreachable */
	printk(UM_KERN_ERR "impossible long jump!\n");
	fatal_sigsegv();
	return 0;
}

void initial_thread_cb_skas(void (*proc)(void *), void *arg)
{
	jmp_buf here;

	cb_proc = proc;
	cb_arg = arg;
	cb_back = &here;

	block_signals_trace();
	if (UML_SETJMP(&here) == 0)
		UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK);
	unblock_signals_trace();

	cb_proc = NULL;
	cb_arg = NULL;
	cb_back = NULL;
}

void halt_skas(void)
{
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT);
}

static bool noreboot;

static int __init noreboot_cmd_param(char *str, int *add)
{
	*add = 0;
	noreboot = true;
	return 0;
}

__uml_setup("noreboot", noreboot_cmd_param,
"noreboot\n"
"    Rather than rebooting, always exit, akin to QEMU's -no-reboot option.\n"
"    This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n"
"    crashes in CI.\n");

void reboot_skas(void)
{
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT);
}

void __switch_mm(struct mm_id *mm_idp)
{
	userspace_pid[0] = mm_idp->pid;
}