xref: /linux/arch/um/os-Linux/start_up.c (revision 399ead3a6d76cbdd29a716660db5c84a314dab70)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5  */
6 
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <unistd.h>
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <sched.h>
14 #include <signal.h>
15 #include <string.h>
16 #include <sys/mman.h>
17 #include <sys/stat.h>
18 #include <sys/wait.h>
19 #include <sys/time.h>
20 #include <sys/resource.h>
21 #include <asm/ldt.h>
22 #include <asm/unistd.h>
23 #include <init.h>
24 #include <os.h>
25 #include <smp.h>
26 #include <kern_util.h>
27 #include <mem_user.h>
28 #include <ptrace_user.h>
29 #include <stdbool.h>
30 #include <stub-data.h>
31 #include <sys/prctl.h>
32 #include <linux/seccomp.h>
33 #include <linux/filter.h>
34 #include <sysdep/mcontext.h>
35 #include <sysdep/stub.h>
36 #include <registers.h>
37 #include <skas.h>
38 #include "internal.h"
39 
/*
 * Body of the forked helper process used by the ptrace checks.
 *
 * Requests tracing, stops itself with SIGSTOP so the parent can take over,
 * and then issues exactly one os_getpid() call.  The exit status reports
 * what that call returned:
 *   1 - our own pid (the parent modified nothing)
 *   0 - the parent's pid (the parent changed the call or its result)
 *   2 - anything else
 *
 * This function never returns.
 */
static void ptrace_child(void)
{
	/* Calling os_getpid because some libcs cached getpid incorrectly */
	int self = os_getpid();
	int parent = getppid();
	int seen;

	if (change_sig(SIGWINCH, 0) < 0 ||
	    ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
		perror("ptrace");
		kill(self, SIGKILL);
	}
	kill(self, SIGSTOP);

	/*
	 * This syscall will be intercepted by the parent. Don't call more than
	 * once, please.
	 */
	seen = os_getpid();

	/* Nothing modified by the parent, we are running normally. */
	if (seen == self)
		exit(1);

	/*
	 * Expected in check_ptrace and check_sysemu when they succeed
	 * in modifying the stack frame
	 */
	if (seen == parent)
		exit(0);

	/*
	 * Serious trouble! This could be caused by a bug in host 2.6
	 * SKAS3/2.6 patch before release -V6, together with a bug in
	 * the UML code itself.
	 */
	exit(2);
}
78 
/*
 * Report a failed libc/system call and terminate.
 *
 * @str: prefix handed to perror(), which appends the text for the current
 *       errno.
 *
 * Exits the process with status 1; never returns.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(1);
}
84 
/*
 * Print a printf-style message to stderr and terminate.
 *
 * @fmt: printf format string; it is only read, hence const (this also
 *       matches fatal_perror()).  The format attribute lets the compiler
 *       check the variadic arguments against @fmt.
 *
 * Exits the process with status 1; never returns.
 */
static void __attribute__((__format__(__printf__, 1, 2)))
fatal(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(1);
}
95 
/*
 * Print a printf-style message to stderr and return to the caller.
 *
 * @fmt: printf format string; it is only read, hence const.  The format
 *       attribute lets the compiler check the variadic arguments against
 *       @fmt.
 */
static void __attribute__((__format__(__printf__, 1, 2)))
non_fatal(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}
104 
/*
 * Fork a child running ptrace_child() and wait until it has stopped itself.
 *
 * Returns the pid of the child, which at that point is stopped with
 * SIGSTOP.  Any failure (fork, waitpid, or an unexpected wait status)
 * terminates the process via fatal_perror()/fatal().
 */
static int start_ptraced_child(void)
{
	int pid, n, status;

	/* Presumably so buffered stdout is not inherited by the child. */
	fflush(stdout);

	pid = fork();
	if (pid == 0)
		ptrace_child();
	else if (pid < 0)
		fatal_perror("start_ptraced_child : fork failed");

	CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
	if (n < 0)
		fatal_perror("start_ptraced_child : waitpid failed");
	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
		fatal("start_ptraced_child : expected SIGSTOP, got status = %d\n",
		      status);

	return pid;
}
126 
/*
 * Resume a stopped traced child and check how it terminates.
 *
 * @pid:      child previously returned by start_ptraced_child()
 * @exitcode: exit status the child is expected to terminate with
 *
 * Terminates the process via fatal_perror()/fatal() if the child cannot be
 * continued, cannot be waited for, or does not exit with @exitcode.
 */
static void stop_ptraced_child(int pid, int exitcode)
{
	int status, n;

	if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
		fatal_perror("stop_ptraced_child : ptrace failed");

	CATCH_EINTR(n = waitpid(pid, &status, 0));
	/* Without this check a failed waitpid() leaves status uninitialized. */
	if (n < 0)
		fatal_perror("stop_ptraced_child : waitpid failed");

	if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
		int exit_with = WEXITSTATUS(status);
		fatal("stop_ptraced_child : child exited with exitcode %d, "
		      "while expecting %d; status 0x%x\n", exit_with,
		      exitcode, status);
	}
}
142 
check_sysemu(void)143 static void __init check_sysemu(void)
144 {
145 	int pid, n, status, count=0;
146 
147 	os_info("Checking syscall emulation for ptrace...");
148 	pid = start_ptraced_child();
149 
150 	if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
151 		   (void *) PTRACE_O_TRACESYSGOOD) < 0))
152 		fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");
153 
154 	while (1) {
155 		count++;
156 		if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
157 			goto fail;
158 		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
159 		if (n < 0)
160 			fatal_perror("check_sysemu: wait failed");
161 
162 		if (WIFSTOPPED(status) &&
163 		    (WSTOPSIG(status) == (SIGTRAP|0x80))) {
164 			if (!count) {
165 				non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
166 					  "doesn't singlestep");
167 				goto fail;
168 			}
169 			n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
170 				   os_getpid());
171 			if (n < 0)
172 				fatal_perror("check_sysemu : failed to modify "
173 					     "system call return");
174 			break;
175 		}
176 		else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
177 			count++;
178 		else {
179 			non_fatal("check_sysemu: expected SIGTRAP or "
180 				  "(SIGTRAP | 0x80), got status = %d\n",
181 				  status);
182 			goto fail;
183 		}
184 	}
185 	stop_ptraced_child(pid, 0);
186 
187 	os_info("OK\n");
188 
189 	return;
190 
191 fail:
192 	stop_ptraced_child(pid, 1);
193 	fatal("missing\n");
194 }
195 
check_ptrace(void)196 static void __init check_ptrace(void)
197 {
198 	int pid, syscall, n, status;
199 
200 	os_info("Checking that ptrace can change system call numbers...");
201 	pid = start_ptraced_child();
202 
203 	if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
204 		   (void *) PTRACE_O_TRACESYSGOOD) < 0))
205 		fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");
206 
207 	while (1) {
208 		if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
209 			fatal_perror("check_ptrace : ptrace failed");
210 
211 		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
212 		if (n < 0)
213 			fatal_perror("check_ptrace : wait failed");
214 
215 		if (!WIFSTOPPED(status) ||
216 		   (WSTOPSIG(status) != (SIGTRAP | 0x80)))
217 			fatal("check_ptrace : expected (SIGTRAP|0x80), "
218 			       "got status = %d", status);
219 
220 		syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
221 				 0);
222 		if (syscall == __NR_getpid) {
223 			n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
224 				   __NR_getppid);
225 			if (n < 0)
226 				fatal_perror("check_ptrace : failed to modify "
227 					     "system call");
228 			break;
229 		}
230 	}
231 	stop_ptraced_child(pid, 0);
232 	os_info("OK\n");
233 	check_sysemu();
234 }
235 
236 extern unsigned long host_fp_size;
237 extern unsigned long exec_regs[MAX_REG_NR];
238 extern unsigned long *exec_fp_regs;
239 
240 __initdata static struct stub_data *seccomp_test_stub_data;
241 
sigsys_handler(int sig,siginfo_t * info,void * p)242 static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
243 {
244 	ucontext_t *uc = p;
245 
246 	/* Stow away the location of the mcontext in the stack */
247 	seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
248 					      (unsigned long)&seccomp_test_stub_data->sigstack[0];
249 
250 	/* Prevent libc from clearing memory (mctx_offset in particular) */
251 	syscall(__NR_exit, 0);
252 }
253 
seccomp_helper(void * data)254 static int __init seccomp_helper(void *data)
255 {
256 	static struct sock_filter filter[] = {
257 		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
258 			 offsetof(struct seccomp_data, nr)),
259 		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
260 		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
261 		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
262 	};
263 	static struct sock_fprog prog = {
264 		.len = ARRAY_SIZE(filter),
265 		.filter = filter,
266 	};
267 	struct sigaction sa;
268 
269 	/* close_range is needed for the stub */
270 	if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
271 		exit(1);
272 
273 	set_sigstack(seccomp_test_stub_data->sigstack,
274 			sizeof(seccomp_test_stub_data->sigstack));
275 
276 	sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
277 	sa.sa_sigaction = (void *) sigsys_handler;
278 	sa.sa_restorer = NULL;
279 	if (sigaction(SIGSYS, &sa, NULL) < 0)
280 		exit(2);
281 
282 	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
283 	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
284 			SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
285 		exit(3);
286 
287 	sleep(0);
288 
289 	/* Never reached. */
290 	_exit(4);
291 }
292 
init_seccomp(void)293 static bool __init init_seccomp(void)
294 {
295 	int pid;
296 	int status;
297 	int n;
298 	unsigned long sp;
299 
300 	/*
301 	 * We check that we can install a seccomp filter and then exit(0)
302 	 * from a trapped syscall.
303 	 *
304 	 * Note that we cannot verify that no seccomp filter already exists
305 	 * for a syscall that results in the process/thread to be killed.
306 	 */
307 
308 	os_info("Checking that seccomp filters can be installed...");
309 
310 	seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
311 				      PROT_READ | PROT_WRITE,
312 				      MAP_SHARED | MAP_ANON, 0, 0);
313 
314 	/* Use the syscall data area as stack, we just need something */
315 	sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
316 	     sizeof(seccomp_test_stub_data->syscall_data) -
317 	     sizeof(void *);
318 	pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
319 
320 	if (pid < 0)
321 		fatal_perror("check_seccomp : clone failed");
322 
323 	CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
324 	if (n < 0)
325 		fatal_perror("check_seccomp : waitpid failed");
326 
327 	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
328 		struct uml_pt_regs *regs;
329 		unsigned long fp_size;
330 		int r;
331 
332 		/* Fill in the host_fp_size from the mcontext. */
333 		regs = calloc(1, sizeof(struct uml_pt_regs));
334 		get_stub_state(regs, seccomp_test_stub_data, &fp_size);
335 		host_fp_size = fp_size;
336 		free(regs);
337 
338 		/* Repeat with the correct size */
339 		regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
340 		r = get_stub_state(regs, seccomp_test_stub_data, NULL);
341 
342 		/* Store as the default startup registers */
343 		exec_fp_regs = malloc(host_fp_size);
344 		memcpy(exec_regs, regs->gp, sizeof(exec_regs));
345 		memcpy(exec_fp_regs, regs->fp, host_fp_size);
346 
347 		munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
348 
349 		free(regs);
350 
351 		if (r) {
352 			os_info("failed to fetch registers: %d\n", r);
353 			return false;
354 		}
355 
356 		os_info("OK\n");
357 		return true;
358 	}
359 
360 	if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
361 		os_info("missing\n");
362 	else
363 		os_info("error\n");
364 
365 	munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
366 	return false;
367 }
368 
369 
check_coredump_limit(void)370 static void __init check_coredump_limit(void)
371 {
372 	struct rlimit lim;
373 	int err = getrlimit(RLIMIT_CORE, &lim);
374 
375 	if (err) {
376 		perror("Getting core dump limit");
377 		return;
378 	}
379 
380 	os_info("Core dump limits :\n\tsoft - ");
381 	if (lim.rlim_cur == RLIM_INFINITY)
382 		os_info("NONE\n");
383 	else
384 		os_info("%llu\n", (unsigned long long)lim.rlim_cur);
385 
386 	os_info("\thard - ");
387 	if (lim.rlim_max == RLIM_INFINITY)
388 		os_info("NONE\n");
389 	else
390 		os_info("%llu\n", (unsigned long long)lim.rlim_max);
391 }
392 
get_host_cpu_features(void (* flags_helper_func)(char * line),void (* cache_helper_func)(char * line))393 void  __init get_host_cpu_features(
394 		void (*flags_helper_func)(char *line),
395 		void (*cache_helper_func)(char *line))
396 {
397 	FILE *cpuinfo;
398 	char *line = NULL;
399 	size_t len = 0;
400 	int done_parsing = 0;
401 
402 	cpuinfo = fopen("/proc/cpuinfo", "r");
403 	if (cpuinfo == NULL) {
404 		os_info("Failed to get host CPU features\n");
405 	} else {
406 		while ((getline(&line, &len, cpuinfo)) != -1) {
407 			if (strstr(line, "flags")) {
408 				flags_helper_func(line);
409 				done_parsing++;
410 			}
411 			if (strstr(line, "cache_alignment")) {
412 				cache_helper_func(line);
413 				done_parsing++;
414 			}
415 			free(line);
416 			line = NULL;
417 			if (done_parsing > 1)
418 				break;
419 		}
420 		fclose(cpuinfo);
421 	}
422 }
423 
424 static int seccomp_config __initdata;
425 
uml_seccomp_config(char * line,int * add)426 static int __init uml_seccomp_config(char *line, int *add)
427 {
428 	*add = 0;
429 
430 	if (strcmp(line, "off") == 0)
431 		seccomp_config = 0;
432 	else if (strcmp(line, "auto") == 0)
433 		seccomp_config = 1;
434 	else if (strcmp(line, "on") == 0)
435 		seccomp_config = 2;
436 	else
437 		fatal("Invalid seccomp option '%s', expected on/auto/off\n",
438 		      line);
439 
440 	return 0;
441 }
442 
443 __uml_setup("seccomp=", uml_seccomp_config,
444 "seccomp=<on/auto/off>\n"
445 "    Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
446 "    processes work collaboratively with the kernel instead of being\n"
447 "    traced using ptrace. All syscalls from the application are caught and\n"
448 "    redirected using a signal. This signal handler in turn is permitted to\n"
449 "    do the selected set of syscalls to communicate with the UML kernel and\n"
450 "    do the required memory management.\n"
451 "\n"
452 "    This method is overall faster than the ptrace based userspace, primarily\n"
453 "    because it reduces the number of context switches for (minor) page faults.\n"
454 "\n"
455 "    However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
456 "    userspace from reading and writing all physical memory. Userspace\n"
457 "    processes could also trick the stub into disabling SIGALRM which\n"
458 "    prevents it from being interrupted for scheduling purposes.\n"
459 "\n"
460 "    This is insecure and should only be used with a trusted userspace\n\n"
461 );
462 
os_early_checks(void)463 void __init os_early_checks(void)
464 {
465 	int pid;
466 
467 	/* Print out the core dump limits early */
468 	check_coredump_limit();
469 
470 	/* Need to check this early because mmapping happens before the
471 	 * kernel is running.
472 	 */
473 	check_tmpexec();
474 
475 	if (seccomp_config) {
476 		if (init_seccomp()) {
477 			using_seccomp = 1;
478 			return;
479 		}
480 
481 		if (seccomp_config == 2)
482 			fatal("SECCOMP userspace requested but not functional!\n");
483 	}
484 
485 	if (uml_ncpus > 1)
486 		fatal("SMP is not supported with PTRACE userspace.\n");
487 
488 	using_seccomp = 0;
489 	check_ptrace();
490 
491 	pid = start_ptraced_child();
492 	if (init_pid_registers(pid))
493 		fatal("Failed to initialize default registers");
494 	stop_ptraced_child(pid, 1);
495 }
496