1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5 */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <unistd.h>
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <sched.h>
14 #include <signal.h>
15 #include <string.h>
16 #include <sys/mman.h>
17 #include <sys/stat.h>
18 #include <sys/wait.h>
19 #include <sys/time.h>
20 #include <sys/resource.h>
21 #include <asm/ldt.h>
22 #include <asm/unistd.h>
23 #include <init.h>
24 #include <os.h>
25 #include <smp.h>
26 #include <kern_util.h>
27 #include <mem_user.h>
28 #include <ptrace_user.h>
29 #include <stdbool.h>
30 #include <stub-data.h>
31 #include <sys/prctl.h>
32 #include <linux/seccomp.h>
33 #include <linux/filter.h>
34 #include <sysdep/mcontext.h>
35 #include <sysdep/stub.h>
36 #include <registers.h>
37 #include <skas.h>
38 #include "internal.h"
39
ptrace_child(void)40 static void ptrace_child(void)
41 {
42 int ret;
43 /* Calling os_getpid because some libcs cached getpid incorrectly */
44 int pid = os_getpid(), ppid = getppid();
45 int sc_result;
46
47 if (change_sig(SIGWINCH, 0) < 0 ||
48 ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
49 perror("ptrace");
50 kill(pid, SIGKILL);
51 }
52 kill(pid, SIGSTOP);
53
54 /*
55 * This syscall will be intercepted by the parent. Don't call more than
56 * once, please.
57 */
58 sc_result = os_getpid();
59
60 if (sc_result == pid)
61 /* Nothing modified by the parent, we are running normally. */
62 ret = 1;
63 else if (sc_result == ppid)
64 /*
65 * Expected in check_ptrace and check_sysemu when they succeed
66 * in modifying the stack frame
67 */
68 ret = 0;
69 else
70 /* Serious trouble! This could be caused by a bug in host 2.6
71 * SKAS3/2.6 patch before release -V6, together with a bug in
72 * the UML code itself.
73 */
74 ret = 2;
75
76 exit(ret);
77 }
78
/*
 * Print @str followed by the description of the current errno (via perror())
 * and terminate the process with a failure exit status. Does not return.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(EXIT_FAILURE);
}
84
/*
 * Write a printf-style message to stderr and terminate the process with a
 * failure exit status. Does not return.
 *
 * @fmt: printf-style format string, followed by its arguments.
 */
static void fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
95
/*
 * Write a printf-style message to stderr and return to the caller.
 *
 * @fmt: printf-style format string, followed by its arguments.
 */
static void non_fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}
104
/*
 * Fork a child running ptrace_child() and wait until it has stopped itself
 * with SIGSTOP.
 *
 * Returns the pid of the stopped child. Any failure (fork, waitpid, or an
 * unexpected wait status) terminates the process via fatal()/fatal_perror().
 */
static int start_ptraced_child(void)
{
	int child, rc, wstatus;

	/* Flush stdout before forking. */
	fflush(stdout);

	child = fork();
	if (child < 0)
		fatal_perror("start_ptraced_child : fork failed");
	if (child == 0)
		ptrace_child();

	CATCH_EINTR(rc = waitpid(child, &wstatus, WUNTRACED));
	if (rc < 0)
		fatal_perror("check_ptrace : waitpid failed");

	if (!(WIFSTOPPED(wstatus) && WSTOPSIG(wstatus) == SIGSTOP))
		fatal("check_ptrace : expected SIGSTOP, got status = %d",
		      wstatus);

	return child;
}
126
stop_ptraced_child(int pid,int exitcode)127 static void stop_ptraced_child(int pid, int exitcode)
128 {
129 int status, n;
130
131 if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
132 fatal_perror("stop_ptraced_child : ptrace failed");
133
134 CATCH_EINTR(n = waitpid(pid, &status, 0));
135 if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
136 int exit_with = WEXITSTATUS(status);
137 fatal("stop_ptraced_child : child exited with exitcode %d, "
138 "while expecting %d; status 0x%x\n", exit_with,
139 exitcode, status);
140 }
141 }
142
check_sysemu(void)143 static void __init check_sysemu(void)
144 {
145 int pid, n, status, count=0;
146
147 os_info("Checking syscall emulation for ptrace...");
148 pid = start_ptraced_child();
149
150 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
151 (void *) PTRACE_O_TRACESYSGOOD) < 0))
152 fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");
153
154 while (1) {
155 count++;
156 if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
157 goto fail;
158 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
159 if (n < 0)
160 fatal_perror("check_sysemu: wait failed");
161
162 if (WIFSTOPPED(status) &&
163 (WSTOPSIG(status) == (SIGTRAP|0x80))) {
164 if (!count) {
165 non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
166 "doesn't singlestep");
167 goto fail;
168 }
169 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
170 os_getpid());
171 if (n < 0)
172 fatal_perror("check_sysemu : failed to modify "
173 "system call return");
174 break;
175 }
176 else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
177 count++;
178 else {
179 non_fatal("check_sysemu: expected SIGTRAP or "
180 "(SIGTRAP | 0x80), got status = %d\n",
181 status);
182 goto fail;
183 }
184 }
185 stop_ptraced_child(pid, 0);
186
187 os_info("OK\n");
188
189 return;
190
191 fail:
192 stop_ptraced_child(pid, 1);
193 fatal("missing\n");
194 }
195
check_ptrace(void)196 static void __init check_ptrace(void)
197 {
198 int pid, syscall, n, status;
199
200 os_info("Checking that ptrace can change system call numbers...");
201 pid = start_ptraced_child();
202
203 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
204 (void *) PTRACE_O_TRACESYSGOOD) < 0))
205 fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");
206
207 while (1) {
208 if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
209 fatal_perror("check_ptrace : ptrace failed");
210
211 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
212 if (n < 0)
213 fatal_perror("check_ptrace : wait failed");
214
215 if (!WIFSTOPPED(status) ||
216 (WSTOPSIG(status) != (SIGTRAP | 0x80)))
217 fatal("check_ptrace : expected (SIGTRAP|0x80), "
218 "got status = %d", status);
219
220 syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
221 0);
222 if (syscall == __NR_getpid) {
223 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
224 __NR_getppid);
225 if (n < 0)
226 fatal_perror("check_ptrace : failed to modify "
227 "system call");
228 break;
229 }
230 }
231 stop_ptraced_child(pid, 0);
232 os_info("OK\n");
233 check_sysemu();
234 }
235
236 extern unsigned long host_fp_size;
237 extern unsigned long exec_regs[MAX_REG_NR];
238 extern unsigned long *exec_fp_regs;
239
240 __initdata static struct stub_data *seccomp_test_stub_data;
241
sigsys_handler(int sig,siginfo_t * info,void * p)242 static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
243 {
244 ucontext_t *uc = p;
245
246 /* Stow away the location of the mcontext in the stack */
247 seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
248 (unsigned long)&seccomp_test_stub_data->sigstack[0];
249
250 /* Prevent libc from clearing memory (mctx_offset in particular) */
251 syscall(__NR_exit, 0);
252 }
253
seccomp_helper(void * data)254 static int __init seccomp_helper(void *data)
255 {
256 static struct sock_filter filter[] = {
257 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
258 offsetof(struct seccomp_data, nr)),
259 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
260 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
261 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
262 };
263 static struct sock_fprog prog = {
264 .len = ARRAY_SIZE(filter),
265 .filter = filter,
266 };
267 struct sigaction sa;
268
269 /* close_range is needed for the stub */
270 if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
271 exit(1);
272
273 set_sigstack(seccomp_test_stub_data->sigstack,
274 sizeof(seccomp_test_stub_data->sigstack));
275
276 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
277 sa.sa_sigaction = (void *) sigsys_handler;
278 sa.sa_restorer = NULL;
279 if (sigaction(SIGSYS, &sa, NULL) < 0)
280 exit(2);
281
282 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
283 if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
284 SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
285 exit(3);
286
287 sleep(0);
288
289 /* Never reached. */
290 _exit(4);
291 }
292
init_seccomp(void)293 static bool __init init_seccomp(void)
294 {
295 int pid;
296 int status;
297 int n;
298 unsigned long sp;
299
300 /*
301 * We check that we can install a seccomp filter and then exit(0)
302 * from a trapped syscall.
303 *
304 * Note that we cannot verify that no seccomp filter already exists
305 * for a syscall that results in the process/thread to be killed.
306 */
307
308 os_info("Checking that seccomp filters can be installed...");
309
310 seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
311 PROT_READ | PROT_WRITE,
312 MAP_SHARED | MAP_ANON, 0, 0);
313
314 /* Use the syscall data area as stack, we just need something */
315 sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
316 sizeof(seccomp_test_stub_data->syscall_data) -
317 sizeof(void *);
318 pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
319
320 if (pid < 0)
321 fatal_perror("check_seccomp : clone failed");
322
323 CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
324 if (n < 0)
325 fatal_perror("check_seccomp : waitpid failed");
326
327 if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
328 struct uml_pt_regs *regs;
329 unsigned long fp_size;
330 int r;
331
332 /* Fill in the host_fp_size from the mcontext. */
333 regs = calloc(1, sizeof(struct uml_pt_regs));
334 get_stub_state(regs, seccomp_test_stub_data, &fp_size);
335 host_fp_size = fp_size;
336 free(regs);
337
338 /* Repeat with the correct size */
339 regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
340 r = get_stub_state(regs, seccomp_test_stub_data, NULL);
341
342 /* Store as the default startup registers */
343 exec_fp_regs = malloc(host_fp_size);
344 memcpy(exec_regs, regs->gp, sizeof(exec_regs));
345 memcpy(exec_fp_regs, regs->fp, host_fp_size);
346
347 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
348
349 free(regs);
350
351 if (r) {
352 os_info("failed to fetch registers: %d\n", r);
353 return false;
354 }
355
356 os_info("OK\n");
357 return true;
358 }
359
360 if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
361 os_info("missing\n");
362 else
363 os_info("error\n");
364
365 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
366 return false;
367 }
368
369
check_coredump_limit(void)370 static void __init check_coredump_limit(void)
371 {
372 struct rlimit lim;
373 int err = getrlimit(RLIMIT_CORE, &lim);
374
375 if (err) {
376 perror("Getting core dump limit");
377 return;
378 }
379
380 os_info("Core dump limits :\n\tsoft - ");
381 if (lim.rlim_cur == RLIM_INFINITY)
382 os_info("NONE\n");
383 else
384 os_info("%llu\n", (unsigned long long)lim.rlim_cur);
385
386 os_info("\thard - ");
387 if (lim.rlim_max == RLIM_INFINITY)
388 os_info("NONE\n");
389 else
390 os_info("%llu\n", (unsigned long long)lim.rlim_max);
391 }
392
get_host_cpu_features(void (* flags_helper_func)(char * line),void (* cache_helper_func)(char * line))393 void __init get_host_cpu_features(
394 void (*flags_helper_func)(char *line),
395 void (*cache_helper_func)(char *line))
396 {
397 FILE *cpuinfo;
398 char *line = NULL;
399 size_t len = 0;
400 int done_parsing = 0;
401
402 cpuinfo = fopen("/proc/cpuinfo", "r");
403 if (cpuinfo == NULL) {
404 os_info("Failed to get host CPU features\n");
405 } else {
406 while ((getline(&line, &len, cpuinfo)) != -1) {
407 if (strstr(line, "flags")) {
408 flags_helper_func(line);
409 done_parsing++;
410 }
411 if (strstr(line, "cache_alignment")) {
412 cache_helper_func(line);
413 done_parsing++;
414 }
415 free(line);
416 line = NULL;
417 if (done_parsing > 1)
418 break;
419 }
420 fclose(cpuinfo);
421 }
422 }
423
424 static int seccomp_config __initdata;
425
uml_seccomp_config(char * line,int * add)426 static int __init uml_seccomp_config(char *line, int *add)
427 {
428 *add = 0;
429
430 if (strcmp(line, "off") == 0)
431 seccomp_config = 0;
432 else if (strcmp(line, "auto") == 0)
433 seccomp_config = 1;
434 else if (strcmp(line, "on") == 0)
435 seccomp_config = 2;
436 else
437 fatal("Invalid seccomp option '%s', expected on/auto/off\n",
438 line);
439
440 return 0;
441 }
442
/*
 * Register uml_seccomp_config() as the handler for the "seccomp=" command
 * line option, together with the help text shown for it.
 */
__uml_setup("seccomp=", uml_seccomp_config,
"seccomp=<on/auto/off>\n"
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
" processes work collaboratively with the kernel instead of being\n"
" traced using ptrace. All syscalls from the application are caught and\n"
" redirected using a signal. This signal handler in turn is permitted to\n"
" do the selected set of syscalls to communicate with the UML kernel and\n"
" do the required memory management.\n"
"\n"
" This method is overall faster than the ptrace based userspace, primarily\n"
" because it reduces the number of context switches for (minor) page faults.\n"
"\n"
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
" userspace from reading and writing all physical memory. Userspace\n"
" processes could also trick the stub into disabling SIGALRM which\n"
" prevents it from being interrupted for scheduling purposes.\n"
"\n"
" This is insecure and should only be used with a trusted userspace\n\n"
);
462
os_early_checks(void)463 void __init os_early_checks(void)
464 {
465 int pid;
466
467 /* Print out the core dump limits early */
468 check_coredump_limit();
469
470 /* Need to check this early because mmapping happens before the
471 * kernel is running.
472 */
473 check_tmpexec();
474
475 if (seccomp_config) {
476 if (init_seccomp()) {
477 using_seccomp = 1;
478 return;
479 }
480
481 if (seccomp_config == 2)
482 fatal("SECCOMP userspace requested but not functional!\n");
483 }
484
485 if (uml_ncpus > 1)
486 fatal("SMP is not supported with PTRACE userspace.\n");
487
488 using_seccomp = 0;
489 check_ptrace();
490
491 pid = start_ptraced_child();
492 if (init_pid_registers(pid))
493 fatal("Failed to initialize default registers");
494 stop_ptraced_child(pid, 1);
495 }
496