xref: /linux/arch/um/os-Linux/skas/process.c (revision dac494bf54f764a114f16621ef04f534dd754ac1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
4  * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5  */
6 
7 #include <stdlib.h>
8 #include <stdbool.h>
9 #include <unistd.h>
10 #include <sched.h>
11 #include <errno.h>
12 #include <string.h>
13 #include <fcntl.h>
14 #include <mem_user.h>
15 #include <sys/mman.h>
16 #include <sys/wait.h>
17 #include <sys/stat.h>
18 #include <asm/unistd.h>
19 #include <as-layout.h>
20 #include <init.h>
21 #include <kern_util.h>
22 #include <mem.h>
23 #include <os.h>
24 #include <ptrace_user.h>
25 #include <registers.h>
26 #include <skas.h>
27 #include <sysdep/stub.h>
28 #include <linux/threads.h>
29 #include <timetravel.h>
30 #include "../internal.h"
31 
int is_skas_winch(int pid, int fd, void *data)
{
	/* Only a SIGWINCH originating from our own process group matters. */
	return getpgrp() == pid;
}
36 
37 static const char *ptrace_reg_name(int idx)
38 {
39 #define R(n) case HOST_##n: return #n
40 
41 	switch (idx) {
42 #ifdef __x86_64__
43 	R(BX);
44 	R(CX);
45 	R(DI);
46 	R(SI);
47 	R(DX);
48 	R(BP);
49 	R(AX);
50 	R(R8);
51 	R(R9);
52 	R(R10);
53 	R(R11);
54 	R(R12);
55 	R(R13);
56 	R(R14);
57 	R(R15);
58 	R(ORIG_AX);
59 	R(CS);
60 	R(SS);
61 	R(EFLAGS);
62 #elif defined(__i386__)
63 	R(IP);
64 	R(SP);
65 	R(EFLAGS);
66 	R(AX);
67 	R(BX);
68 	R(CX);
69 	R(DX);
70 	R(SI);
71 	R(DI);
72 	R(BP);
73 	R(CS);
74 	R(SS);
75 	R(DS);
76 	R(FS);
77 	R(ES);
78 	R(GS);
79 	R(ORIG_AX);
80 #endif
81 	}
82 	return "";
83 }
84 
85 static int ptrace_dump_regs(int pid)
86 {
87 	unsigned long regs[MAX_REG_NR];
88 	int i;
89 
90 	if (ptrace(PTRACE_GETREGS, pid, 0, regs) < 0)
91 		return -errno;
92 
93 	printk(UM_KERN_ERR "Stub registers -\n");
94 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
95 		const char *regname = ptrace_reg_name(i);
96 
97 		printk(UM_KERN_ERR "\t%s\t(%2d): %lx\n", regname, i, regs[i]);
98 	}
99 
100 	return 0;
101 }
102 
103 /*
104  * Signals that are OK to receive in the stub - we'll just continue it.
105  * SIGWINCH will happen when UML is inside a detached screen.
106  */
107 #define STUB_SIG_MASK ((1 << SIGALRM) | (1 << SIGWINCH))
108 
109 /* Signals that the stub will finish with - anything else is an error */
110 #define STUB_DONE_MASK (1 << SIGTRAP)
111 
/*
 * Wait for the stub process to finish (stop with SIGTRAP).  Benign stops
 * (STUB_SIG_MASK: SIGALRM, SIGWINCH) are continued transparently; any other
 * outcome dumps the stub's registers and aborts via fatal_sigsegv().
 */
void wait_stub_done(int pid)
{
	int n, status, err;

	while (1) {
		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
		if ((n < 0) || !WIFSTOPPED(status))
			goto bad_wait;

		/* Stopped with something outside the benign set - examine it */
		if (((1 << WSTOPSIG(status)) & STUB_SIG_MASK) == 0)
			break;

		/* Benign signal - just resume the stub and wait again */
		err = ptrace(PTRACE_CONT, pid, 0, 0);
		if (err) {
			printk(UM_KERN_ERR "%s : continue failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}
	}

	/* SIGTRAP (STUB_DONE_MASK) is the stub's normal completion signal */
	if (((1 << WSTOPSIG(status)) & STUB_DONE_MASK) != 0)
		return;

bad_wait:
	/* Unexpected wait result or stop signal - log state and die */
	err = ptrace_dump_regs(pid);
	if (err)
		printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n",
		       -err);
	printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n",
	       __func__, pid, n, errno, status);
	fatal_sigsegv();
}
144 
145 extern unsigned long current_stub_stack(void);
146 
147 static void get_skas_faultinfo(int pid, struct faultinfo *fi)
148 {
149 	int err;
150 
151 	err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
152 	if (err) {
153 		printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
154 		       "errno = %d\n", pid, errno);
155 		fatal_sigsegv();
156 	}
157 	wait_stub_done(pid);
158 
159 	/*
160 	 * faultinfo is prepared by the stub_segv_handler at start of
161 	 * the stub stack page. We just have to copy it.
162 	 */
163 	memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
164 }
165 
166 static void handle_trap(int pid, struct uml_pt_regs *regs)
167 {
168 	if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
169 		fatal_sigsegv();
170 
171 	handle_syscall(regs);
172 }
173 
174 extern char __syscall_stub_start[];
175 
176 static int stub_exe_fd;
177 
178 #ifndef CLOSE_RANGE_CLOEXEC
179 #define CLOSE_RANGE_CLOEXEC	(1U << 2)
180 #endif
181 
/*
 * Child side of the clone() in start_userspace(): prepare the stub's init
 * data, pass it over a pipe wired to stdin, and exec the stub binary
 * (stub_exe_fd).  Returns only on failure (via exit codes 2-5).
 */
static int userspace_tramp(void *stack)
{
	char *const argv[] = { "uml-userspace", NULL };
	int pipe_fds[2];
	unsigned long long offset;
	struct stub_init_data init_data = {
		.stub_start = STUB_START,
		/* segv handler address relocated into the stub's code mapping */
		.segv_handler = STUB_CODE +
				(unsigned long) stub_segv_handler -
				(unsigned long) __syscall_stub_start,
	};
	struct iomem_region *iomem;
	int ret;

	/* Resolve physical-memory FDs/offsets for the stub code and data */
	init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
					      &offset);
	init_data.stub_code_offset = MMAP_OFFSET(offset);

	init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
	init_data.stub_data_offset = MMAP_OFFSET(offset);

	/*
	 * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs
	 * and then unsetting it on all memory related FDs.
	 * This is not strictly necessary from a safety perspective.
	 */
	syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC);

	fcntl(init_data.stub_data_fd, F_SETFD, 0);
	for (iomem = iomem_regions; iomem; iomem = iomem->next)
		fcntl(iomem->fd, F_SETFD, 0);

	/* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
	if (pipe(pipe_fds))
		exit(2);

	if (dup2(pipe_fds[0], 0) < 0)
		exit(3);
	close(pipe_fds[0]);

	/* Write init_data and close write side */
	ret = write(pipe_fds[1], &init_data, sizeof(init_data));
	close(pipe_fds[1]);

	/* init_data is small, so a single write is expected to suffice */
	if (ret != sizeof(init_data))
		exit(4);

	/* Raw execveat for compatibility with older libc versions */
	syscall(__NR_execveat, stub_exe_fd, (unsigned long)"",
		(unsigned long)argv, NULL, AT_EMPTY_PATH);

	/* Only reached if execveat failed */
	exit(5);
}
235 
236 extern char stub_exe_start[];
237 extern char stub_exe_end[];
238 
239 extern char *tempdir;
240 
241 #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"
242 
243 #ifndef MFD_EXEC
244 #define MFD_EXEC 0x0010U
245 #endif
246 
/*
 * Create an executable FD (stub_exe_fd) holding the embedded stub binary
 * (stub_exe_start..stub_exe_end).  Prefers a sealed memfd; falls back to a
 * temporary file in tempdir when memfd_create() is unavailable.
 */
static int __init init_stub_exe_fd(void)
{
	size_t written = 0;
	char *tmpfile = NULL;

	stub_exe_fd = memfd_create("uml-userspace",
				   MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (stub_exe_fd < 0) {
		/* memfd failed (e.g. old kernel) - use a temp file instead */
		printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!");

		tmpfile = malloc(strlen(tempdir) +
				  strlen(STUB_EXE_NAME_TEMPLATE) + 1);
		if (tmpfile == NULL)
			panic("Failed to allocate memory for stub binary name");

		strcpy(tmpfile, tempdir);
		strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);

		stub_exe_fd = mkstemp(tmpfile);
		if (stub_exe_fd < 0)
			panic("Could not create temporary file for stub binary: %d",
			      -errno);
	}

	/* Copy the embedded stub image into the FD, retrying on EINTR */
	while (written < stub_exe_end - stub_exe_start) {
		ssize_t res = write(stub_exe_fd, stub_exe_start + written,
				    stub_exe_end - stub_exe_start - written);
		if (res < 0) {
			if (errno == EINTR)
				continue;

			if (tmpfile)
				unlink(tmpfile);
			panic("Failed write stub binary: %d", -errno);
		}

		written += res;
	}

	if (!tmpfile) {
		/* memfd path: seal against any further modification */
		fcntl(stub_exe_fd, F_ADD_SEALS,
		      F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL);
	} else {
		/* temp-file path: make it executable, reopen read-only */
		if (fchmod(stub_exe_fd, 00500) < 0) {
			unlink(tmpfile);
			panic("Could not make stub binary executable: %d",
			      -errno);
		}

		close(stub_exe_fd);
		stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
		if (stub_exe_fd < 0) {
			unlink(tmpfile);
			panic("Could not reopen stub binary: %d", -errno);
		}

		/* The open FD keeps the binary alive; the name can go */
		unlink(tmpfile);
		free(tmpfile);
	}

	return 0;
}
310 __initcall(init_stub_exe_fd);
311 
312 int userspace_pid[NR_CPUS];
313 
314 /**
315  * start_userspace() - prepare a new userspace process
316  * @stub_stack:	pointer to the stub stack.
317  *
318  * Setups a new temporary stack page that is used while userspace_tramp() runs
319  * Clones the kernel process into a new userspace process, with FDs only.
320  *
321  * Return: When positive: the process id of the new userspace process,
322  *         when negative: an error number.
323  * FIXME: can PIDs become negative?!
324  */
325 int start_userspace(unsigned long stub_stack)
326 {
327 	void *stack;
328 	unsigned long sp;
329 	int pid, status, n, err;
330 
331 	/* setup a temporary stack page */
332 	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
333 		     PROT_READ | PROT_WRITE | PROT_EXEC,
334 		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
335 	if (stack == MAP_FAILED) {
336 		err = -errno;
337 		printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n",
338 		       __func__, errno);
339 		return err;
340 	}
341 
342 	/* set stack pointer to the end of the stack page, so it can grow downwards */
343 	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
344 
345 	/* clone into new userspace process */
346 	pid = clone(userspace_tramp, (void *) sp,
347 		    CLONE_VFORK | CLONE_VM | SIGCHLD,
348 		    (void *)stub_stack);
349 	if (pid < 0) {
350 		err = -errno;
351 		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
352 		       __func__, errno);
353 		return err;
354 	}
355 
356 	do {
357 		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
358 		if (n < 0) {
359 			err = -errno;
360 			printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
361 			       __func__, errno);
362 			goto out_kill;
363 		}
364 	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
365 
366 	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
367 		err = -EINVAL;
368 		printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
369 		       __func__, status);
370 		goto out_kill;
371 	}
372 
373 	if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
374 		   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
375 		err = -errno;
376 		printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
377 		       __func__, errno);
378 		goto out_kill;
379 	}
380 
381 	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
382 		err = -errno;
383 		printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n",
384 		       __func__, errno);
385 		goto out_kill;
386 	}
387 
388 	return pid;
389 
390  out_kill:
391 	os_kill_ptraced_process(pid, 1);
392 	return err;
393 }
394 
395 int unscheduled_userspace_iterations;
396 extern unsigned long tt_extra_sched_jiffies;
397 
/*
 * Main userspace execution loop: resume the userspace process under
 * PTRACE_SYSEMU, wait for it to stop, pull its registers back, and
 * dispatch on the stop signal (page faults, syscalls, IRQ-like signals).
 * Never returns; fatal errors end in fatal_sigsegv().
 */
void userspace(struct uml_pt_regs *regs)
{
	int err, status, op, pid = userspace_pid[0];
	siginfo_t si;

	/* Handle any immediate reschedules or signals */
	interrupt_end();

	while (1) {
		/*
		 * When we are in time-travel mode, userspace can theoretically
		 * do a *lot* of work without being scheduled. The problem with
		 * this is that it will prevent kernel bookkeeping (primarily
		 * the RCU) from running and this can for example cause OOM
		 * situations.
		 *
		 * This code accounts a jiffie against the scheduling clock
		 * after the defined userspace iterations in the same thread.
		 * By doing so the situation is effectively prevented.
		 */
		if (time_travel_mode == TT_MODE_INFCPU ||
		    time_travel_mode == TT_MODE_EXTERNAL) {
#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS
			if (CONFIG_UML_MAX_USERSPACE_ITERATIONS &&
			    unscheduled_userspace_iterations++ >
			    CONFIG_UML_MAX_USERSPACE_ITERATIONS) {
				tt_extra_sched_jiffies += 1;
				unscheduled_userspace_iterations = 0;
			}
#endif
		}

		time_travel_print_bc_msg();

		/* Apply any pending address-space changes before resuming */
		current_mm_sync();

		/* Flush out any pending syscalls */
		err = syscall_stub_flush(current_mm_id());
		if (err) {
			if (err == -ENOMEM)
				report_enomem();

			printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d",
				__func__, -err);
			fatal_sigsegv();
		}

		/*
		 * This can legitimately fail if the process loads a
		 * bogus value into a segment register.  It will
		 * segfault and PTRACE_GETREGS will read that value
		 * out of the process.  However, PTRACE_SETREGS will
		 * fail.  In this case, there is nothing to do but
		 * just kill the process.
		 */
		if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
			printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		if (put_fp_registers(pid, regs->fp)) {
			printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		/* SYSEMU stops on syscall entry without executing the syscall */
		if (singlestepping())
			op = PTRACE_SYSEMU_SINGLESTEP;
		else
			op = PTRACE_SYSEMU;

		if (ptrace(op, pid, 0, 0)) {
			printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
			       __func__, op, errno);
			fatal_sigsegv();
		}

		CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
		if (err < 0) {
			printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		/* Read the stopped process's register state back into regs */
		regs->is_user = 1;
		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
			printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		if (get_fp_registers(pid, regs->fp)) {
			printk(UM_KERN_ERR "%s -  get_fp_registers failed, errno = %d\n",
			       __func__, errno);
			fatal_sigsegv();
		}

		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */

		if (WIFSTOPPED(status)) {
			int sig = WSTOPSIG(status);

			/* These signal handlers need the si argument.
			 * The SIGIO and SIGALARM handlers which constitute the
			 * majority of invocations, do not use it.
			 */
			switch (sig) {
			case SIGSEGV:
			case SIGTRAP:
			case SIGILL:
			case SIGBUS:
			case SIGFPE:
			case SIGWINCH:
				ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si);
				break;
			}

			switch (sig) {
			case SIGSEGV:
				/* Page fault: fetch faultinfo from the stub */
				get_skas_faultinfo(pid, &regs->faultinfo);

				if (PTRACE_FULL_FAULTINFO)
					(*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si,
							     regs, NULL);
				else
					segv(regs->faultinfo, 0, 1, NULL, NULL);

				break;
			case SIGTRAP + 0x80:
				/* Syscall stop (PTRACE_O_TRACESYSGOOD marker) */
				handle_trap(pid, regs);
				break;
			case SIGTRAP:
				relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL);
				break;
			case SIGALRM:
				break;
			case SIGIO:
			case SIGILL:
			case SIGBUS:
			case SIGFPE:
			case SIGWINCH:
				/* Run the UML handler with signals blocked */
				block_signals_trace();
				(*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL);
				unblock_signals_trace();
				break;
			default:
				printk(UM_KERN_ERR "%s - child stopped with signal %d\n",
				       __func__, sig);
				fatal_sigsegv();
			}
			/* A handler may have switched mm - refresh pid */
			pid = userspace_pid[0];
			interrupt_end();

			/* Avoid -ERESTARTSYS handling in host */
			if (PT_SYSCALL_NR_OFFSET != PT_SYSCALL_RET_OFFSET)
				PT_SYSCALL_NR(regs->gp) = -1;
		}
	}
}
558 
559 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
560 {
561 	(*buf)[0].JB_IP = (unsigned long) handler;
562 	(*buf)[0].JB_SP = (unsigned long) stack + UM_THREAD_SIZE -
563 		sizeof(void *);
564 }
565 
566 #define INIT_JMP_NEW_THREAD 0
567 #define INIT_JMP_CALLBACK 1
568 #define INIT_JMP_HALT 2
569 #define INIT_JMP_REBOOT 3
570 
571 void switch_threads(jmp_buf *me, jmp_buf *you)
572 {
573 	unscheduled_userspace_iterations = 0;
574 
575 	if (UML_SETJMP(me) == 0)
576 		UML_LONGJMP(you, 1);
577 }
578 
579 static jmp_buf initial_jmpbuf;
580 
581 /* XXX Make these percpu */
582 static void (*cb_proc)(void *arg);
583 static void *cb_arg;
584 static jmp_buf *cb_back;
585 
/*
 * Set up the initial jump buffer and dispatch the idle thread.  The
 * INIT_JMP_* codes longjmp'd into initial_jmpbuf select the action:
 * start a new thread, run a callback, halt (return 0) or reboot (return 1).
 */
int start_idle_thread(void *stack, jmp_buf *switch_buf)
{
	int n;

	set_handler(SIGWINCH);

	/*
	 * Can't use UML_SETJMP or UML_LONGJMP here because they save
	 * and restore signals, with the possible side-effect of
	 * trying to handle any signals which came when they were
	 * blocked, which can't be done on this stack.
	 * Signals must be blocked when jumping back here and restored
	 * after returning to the jumper.
	 */
	n = setjmp(initial_jmpbuf);
	switch (n) {
	case INIT_JMP_NEW_THREAD:
		/* Aim switch_buf at uml_finishsetup on the given stack */
		(*switch_buf)[0].JB_IP = (unsigned long) uml_finishsetup;
		(*switch_buf)[0].JB_SP = (unsigned long) stack +
			UM_THREAD_SIZE - sizeof(void *);
		break;
	case INIT_JMP_CALLBACK:
		/* Run the callback staged by initial_thread_cb_skas() */
		(*cb_proc)(cb_arg);
		longjmp(*cb_back, 1);
		break;
	case INIT_JMP_HALT:
		kmalloc_ok = 0;
		return 0;
	case INIT_JMP_REBOOT:
		kmalloc_ok = 0;
		return 1;
	default:
		printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n",
		       __func__, n);
		fatal_sigsegv();
	}
	longjmp(*switch_buf, 1);

	/* unreachable */
	printk(UM_KERN_ERR "impossible long jump!");
	fatal_sigsegv();
	return 0;
}
629 
/*
 * Run proc(arg) on the initial thread's stack: stage the callback in the
 * cb_* globals, jump to start_idle_thread()'s INIT_JMP_CALLBACK case, and
 * resume here once it longjmps back.
 */
void initial_thread_cb_skas(void (*proc)(void *), void *arg)
{
	jmp_buf here;

	cb_proc = proc;
	cb_arg = arg;
	cb_back = &here;

	/* Signals stay blocked across the jump; see start_idle_thread() */
	block_signals_trace();
	if (UML_SETJMP(&here) == 0)
		UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK);
	unblock_signals_trace();

	/* Clear the staging globals now that the callback has run */
	cb_proc = NULL;
	cb_arg = NULL;
	cb_back = NULL;
}
647 
/* Jump back to start_idle_thread(), making it return 0 (halt). */
void halt_skas(void)
{
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT);
}
653 
654 static bool noreboot;
655 
656 static int __init noreboot_cmd_param(char *str, int *add)
657 {
658 	*add = 0;
659 	noreboot = true;
660 	return 0;
661 }
662 
663 __uml_setup("noreboot", noreboot_cmd_param,
664 "noreboot\n"
665 "    Rather than rebooting, exit always, akin to QEMU's -no-reboot option.\n"
666 "    This is useful if you're using CONFIG_PANIC_TIMEOUT in order to catch\n"
667 "    crashes in CI\n");
668 
/*
 * Jump back to start_idle_thread(): return 1 (reboot), or 0 (halt) when
 * the "noreboot" command-line option was given.
 */
void reboot_skas(void)
{
	block_signals_trace();
	UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT);
}
674 
/* Switch address spaces by switching to the new mm's stub process pid. */
void __switch_mm(struct mm_id *mm_idp)
{
	userspace_pid[0] = mm_idp->pid;
}
679