1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
5 */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <unistd.h>
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <sched.h>
14 #include <signal.h>
15 #include <string.h>
16 #include <sys/mman.h>
17 #include <sys/stat.h>
18 #include <sys/wait.h>
19 #include <sys/time.h>
20 #include <sys/resource.h>
21 #include <asm/ldt.h>
22 #include <asm/unistd.h>
23 #include <init.h>
24 #include <os.h>
25 #include <kern_util.h>
26 #include <mem_user.h>
27 #include <ptrace_user.h>
28 #include <stdbool.h>
29 #include <stub-data.h>
30 #include <sys/prctl.h>
31 #include <linux/seccomp.h>
32 #include <linux/filter.h>
33 #include <sysdep/mcontext.h>
34 #include <sysdep/stub.h>
35 #include <registers.h>
36 #include <skas.h>
37 #include "internal.h"
38
/*
 * Body of the forked child used by the ptrace capability checks.
 *
 * The child asks to be traced, stops itself with SIGSTOP so the parent can
 * take control, then performs a single getpid syscall and reports what it
 * saw through its exit status:
 *   0 - the parent's pid came back (the tracer rewrote the syscall/result)
 *   1 - our own pid came back (nothing was modified)
 *   2 - some other value came back
 *
 * Never returns.
 */
static void ptrace_child(void)
{
	/* Calling os_getpid because some libcs cached getpid incorrectly */
	int self = os_getpid();
	int parent = getppid();
	int observed;

	if (change_sig(SIGWINCH, 0) < 0 ||
	    ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
		perror("ptrace");
		kill(self, SIGKILL);
	}
	kill(self, SIGSTOP);

	/*
	 * The parent intercepts this syscall, so it must be issued exactly
	 * once.
	 */
	observed = os_getpid();

	/* Untouched by the parent: we ran normally. */
	if (observed == self)
		exit(1);

	/*
	 * What check_ptrace and check_sysemu expect when they managed to
	 * modify the syscall.
	 */
	if (observed == parent)
		exit(0);

	/*
	 * Serious trouble! This could be caused by a bug in host 2.6
	 * SKAS3/2.6 patch before release -V6, together with a bug in
	 * the UML code itself.
	 */
	exit(2);
}
77
/*
 * Print @str followed by the description of the current errno value, then
 * terminate the process with exit status 1. Never returns.
 */
static void fatal_perror(const char *str)
{
	perror(str);
	exit(1);
}
83
/*
 * Print a printf-style message to stderr and terminate the process with exit
 * status 1. No newline is appended; callers supply their own. Never returns.
 */
static void fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(1);
}
94
/*
 * Print a printf-style message to stderr and return to the caller.
 * No newline is appended; callers supply their own.
 */
static void non_fatal(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}
103
/*
 * Fork a child running ptrace_child() and wait until it has stopped itself
 * with SIGSTOP.
 *
 * Returns the pid of the stopped child. Any failure (fork, waitpid, or an
 * unexpected wait status) terminates the process.
 */
static int start_ptraced_child(void)
{
	int status;
	int waited;
	int child;

	/* Flush stdout before forking so buffered output is not duplicated */
	fflush(stdout);

	child = fork();
	if (child < 0)
		fatal_perror("start_ptraced_child : fork failed");
	if (child == 0)
		ptrace_child();		/* exits, never returns */

	CATCH_EINTR(waited = waitpid(child, &status, WUNTRACED));
	if (waited < 0)
		fatal_perror("check_ptrace : waitpid failed");

	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
		return child;

	fatal("check_ptrace : expected SIGSTOP, got status = %d", status);
	return child;
}
125
/*
 * Resume the traced child @pid and wait for it to terminate.
 *
 * @pid:      child previously returned by start_ptraced_child()
 * @exitcode: exit status the child is expected to report
 *
 * The process is terminated via fatal()/fatal_perror() if the child cannot
 * be resumed, cannot be waited for, or does not exit with @exitcode.
 */
static void stop_ptraced_child(int pid, int exitcode)
{
	int status, n;

	if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
		fatal_perror("stop_ptraced_child : ptrace failed");

	CATCH_EINTR(n = waitpid(pid, &status, 0));
	/*
	 * status is only written by a successful waitpid(); bail out before
	 * inspecting it when the wait itself failed.
	 */
	if (n < 0)
		fatal_perror("stop_ptraced_child : waitpid failed");

	if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
		int exit_with = WEXITSTATUS(status);
		fatal("stop_ptraced_child : child exited with exitcode %d, "
		      "while expecting %d; status 0x%x\n", exit_with,
		      exitcode, status);
	}
}
141
check_sysemu(void)142 static void __init check_sysemu(void)
143 {
144 int pid, n, status, count=0;
145
146 os_info("Checking syscall emulation for ptrace...");
147 pid = start_ptraced_child();
148
149 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
150 (void *) PTRACE_O_TRACESYSGOOD) < 0))
151 fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");
152
153 while (1) {
154 count++;
155 if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
156 goto fail;
157 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
158 if (n < 0)
159 fatal_perror("check_sysemu: wait failed");
160
161 if (WIFSTOPPED(status) &&
162 (WSTOPSIG(status) == (SIGTRAP|0x80))) {
163 if (!count) {
164 non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
165 "doesn't singlestep");
166 goto fail;
167 }
168 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
169 os_getpid());
170 if (n < 0)
171 fatal_perror("check_sysemu : failed to modify "
172 "system call return");
173 break;
174 }
175 else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
176 count++;
177 else {
178 non_fatal("check_sysemu: expected SIGTRAP or "
179 "(SIGTRAP | 0x80), got status = %d\n",
180 status);
181 goto fail;
182 }
183 }
184 stop_ptraced_child(pid, 0);
185
186 os_info("OK\n");
187
188 return;
189
190 fail:
191 stop_ptraced_child(pid, 1);
192 fatal("missing\n");
193 }
194
check_ptrace(void)195 static void __init check_ptrace(void)
196 {
197 int pid, syscall, n, status;
198
199 os_info("Checking that ptrace can change system call numbers...");
200 pid = start_ptraced_child();
201
202 if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
203 (void *) PTRACE_O_TRACESYSGOOD) < 0))
204 fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");
205
206 while (1) {
207 if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
208 fatal_perror("check_ptrace : ptrace failed");
209
210 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
211 if (n < 0)
212 fatal_perror("check_ptrace : wait failed");
213
214 if (!WIFSTOPPED(status) ||
215 (WSTOPSIG(status) != (SIGTRAP | 0x80)))
216 fatal("check_ptrace : expected (SIGTRAP|0x80), "
217 "got status = %d", status);
218
219 syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
220 0);
221 if (syscall == __NR_getpid) {
222 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
223 __NR_getppid);
224 if (n < 0)
225 fatal_perror("check_ptrace : failed to modify "
226 "system call");
227 break;
228 }
229 }
230 stop_ptraced_child(pid, 0);
231 os_info("OK\n");
232 check_sysemu();
233 }
234
235 extern unsigned long host_fp_size;
236 extern unsigned long exec_regs[MAX_REG_NR];
237 extern unsigned long *exec_fp_regs;
238
239 __initdata static struct stub_data *seccomp_test_stub_data;
240
sigsys_handler(int sig,siginfo_t * info,void * p)241 static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
242 {
243 ucontext_t *uc = p;
244
245 /* Stow away the location of the mcontext in the stack */
246 seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
247 (unsigned long)&seccomp_test_stub_data->sigstack[0];
248
249 /* Prevent libc from clearing memory (mctx_offset in particular) */
250 syscall(__NR_exit, 0);
251 }
252
seccomp_helper(void * data)253 static int __init seccomp_helper(void *data)
254 {
255 static struct sock_filter filter[] = {
256 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
257 offsetof(struct seccomp_data, nr)),
258 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
259 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
260 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
261 };
262 static struct sock_fprog prog = {
263 .len = ARRAY_SIZE(filter),
264 .filter = filter,
265 };
266 struct sigaction sa;
267
268 /* close_range is needed for the stub */
269 if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
270 exit(1);
271
272 set_sigstack(seccomp_test_stub_data->sigstack,
273 sizeof(seccomp_test_stub_data->sigstack));
274
275 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
276 sa.sa_sigaction = (void *) sigsys_handler;
277 sa.sa_restorer = NULL;
278 if (sigaction(SIGSYS, &sa, NULL) < 0)
279 exit(2);
280
281 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
282 if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
283 SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
284 exit(3);
285
286 sleep(0);
287
288 /* Never reached. */
289 _exit(4);
290 }
291
init_seccomp(void)292 static bool __init init_seccomp(void)
293 {
294 int pid;
295 int status;
296 int n;
297 unsigned long sp;
298
299 /*
300 * We check that we can install a seccomp filter and then exit(0)
301 * from a trapped syscall.
302 *
303 * Note that we cannot verify that no seccomp filter already exists
304 * for a syscall that results in the process/thread to be killed.
305 */
306
307 os_info("Checking that seccomp filters can be installed...");
308
309 seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
310 PROT_READ | PROT_WRITE,
311 MAP_SHARED | MAP_ANON, 0, 0);
312
313 /* Use the syscall data area as stack, we just need something */
314 sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
315 sizeof(seccomp_test_stub_data->syscall_data) -
316 sizeof(void *);
317 pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
318
319 if (pid < 0)
320 fatal_perror("check_seccomp : clone failed");
321
322 CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
323 if (n < 0)
324 fatal_perror("check_seccomp : waitpid failed");
325
326 if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
327 struct uml_pt_regs *regs;
328 unsigned long fp_size;
329 int r;
330
331 /* Fill in the host_fp_size from the mcontext. */
332 regs = calloc(1, sizeof(struct uml_pt_regs));
333 get_stub_state(regs, seccomp_test_stub_data, &fp_size);
334 host_fp_size = fp_size;
335 free(regs);
336
337 /* Repeat with the correct size */
338 regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
339 r = get_stub_state(regs, seccomp_test_stub_data, NULL);
340
341 /* Store as the default startup registers */
342 exec_fp_regs = malloc(host_fp_size);
343 memcpy(exec_regs, regs->gp, sizeof(exec_regs));
344 memcpy(exec_fp_regs, regs->fp, host_fp_size);
345
346 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
347
348 free(regs);
349
350 if (r) {
351 os_info("failed to fetch registers: %d\n", r);
352 return false;
353 }
354
355 os_info("OK\n");
356 return true;
357 }
358
359 if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
360 os_info("missing\n");
361 else
362 os_info("error\n");
363
364 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
365 return false;
366 }
367
368
check_coredump_limit(void)369 static void __init check_coredump_limit(void)
370 {
371 struct rlimit lim;
372 int err = getrlimit(RLIMIT_CORE, &lim);
373
374 if (err) {
375 perror("Getting core dump limit");
376 return;
377 }
378
379 os_info("Core dump limits :\n\tsoft - ");
380 if (lim.rlim_cur == RLIM_INFINITY)
381 os_info("NONE\n");
382 else
383 os_info("%llu\n", (unsigned long long)lim.rlim_cur);
384
385 os_info("\thard - ");
386 if (lim.rlim_max == RLIM_INFINITY)
387 os_info("NONE\n");
388 else
389 os_info("%llu\n", (unsigned long long)lim.rlim_max);
390 }
391
get_host_cpu_features(void (* flags_helper_func)(char * line),void (* cache_helper_func)(char * line))392 void __init get_host_cpu_features(
393 void (*flags_helper_func)(char *line),
394 void (*cache_helper_func)(char *line))
395 {
396 FILE *cpuinfo;
397 char *line = NULL;
398 size_t len = 0;
399 int done_parsing = 0;
400
401 cpuinfo = fopen("/proc/cpuinfo", "r");
402 if (cpuinfo == NULL) {
403 os_info("Failed to get host CPU features\n");
404 } else {
405 while ((getline(&line, &len, cpuinfo)) != -1) {
406 if (strstr(line, "flags")) {
407 flags_helper_func(line);
408 done_parsing++;
409 }
410 if (strstr(line, "cache_alignment")) {
411 cache_helper_func(line);
412 done_parsing++;
413 }
414 free(line);
415 line = NULL;
416 if (done_parsing > 1)
417 break;
418 }
419 fclose(cpuinfo);
420 }
421 }
422
423 static int seccomp_config __initdata;
424
uml_seccomp_config(char * line,int * add)425 static int __init uml_seccomp_config(char *line, int *add)
426 {
427 *add = 0;
428
429 if (strcmp(line, "off") == 0)
430 seccomp_config = 0;
431 else if (strcmp(line, "auto") == 0)
432 seccomp_config = 1;
433 else if (strcmp(line, "on") == 0)
434 seccomp_config = 2;
435 else
436 fatal("Invalid seccomp option '%s', expected on/auto/off\n",
437 line);
438
439 return 0;
440 }
441
/*
 * Register the "seccomp=" command line option: values are handled by
 * uml_seccomp_config(), and the text below is the option's help message.
 */
__uml_setup("seccomp=", uml_seccomp_config,
"seccomp=<on/auto/off>\n"
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
" processes work collaboratively with the kernel instead of being\n"
" traced using ptrace. All syscalls from the application are caught and\n"
" redirected using a signal. This signal handler in turn is permitted to\n"
" do the selected set of syscalls to communicate with the UML kernel and\n"
" do the required memory management.\n"
"\n"
" This method is overall faster than the ptrace based userspace, primarily\n"
" because it reduces the number of context switches for (minor) page faults.\n"
"\n"
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
" userspace from reading and writing all physical memory. Userspace\n"
" processes could also trick the stub into disabling SIGALRM which\n"
" prevents it from being interrupted for scheduling purposes.\n"
"\n"
" This is insecure and should only be used with a trusted userspace\n\n"
);
461
os_early_checks(void)462 void __init os_early_checks(void)
463 {
464 int pid;
465
466 /* Print out the core dump limits early */
467 check_coredump_limit();
468
469 /* Need to check this early because mmapping happens before the
470 * kernel is running.
471 */
472 check_tmpexec();
473
474 if (seccomp_config) {
475 if (init_seccomp()) {
476 using_seccomp = 1;
477 return;
478 }
479
480 if (seccomp_config == 2)
481 fatal("SECCOMP userspace requested but not functional!\n");
482 }
483
484 using_seccomp = 0;
485 check_ptrace();
486
487 pid = start_ptraced_child();
488 if (init_pid_registers(pid))
489 fatal("Failed to initialize default registers");
490 stop_ptraced_child(pid, 1);
491 }
492
parse_iomem(char * str,int * add)493 int __init parse_iomem(char *str, int *add)
494 {
495 struct iomem_region *new;
496 struct stat64 buf;
497 char *file, *driver;
498 int fd, size;
499
500 driver = str;
501 file = strchr(str,',');
502 if (file == NULL) {
503 os_warn("parse_iomem : failed to parse iomem\n");
504 goto out;
505 }
506 *file = '\0';
507 file++;
508 fd = open(file, O_RDWR, 0);
509 if (fd < 0) {
510 perror("parse_iomem - Couldn't open io file");
511 goto out;
512 }
513
514 if (fstat64(fd, &buf) < 0) {
515 perror("parse_iomem - cannot stat_fd file");
516 goto out_close;
517 }
518
519 new = malloc(sizeof(*new));
520 if (new == NULL) {
521 perror("Couldn't allocate iomem_region struct");
522 goto out_close;
523 }
524
525 size = (buf.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1);
526
527 *new = ((struct iomem_region) { .next = iomem_regions,
528 .driver = driver,
529 .fd = fd,
530 .size = size,
531 .phys = 0,
532 .virt = 0 });
533 iomem_regions = new;
534 iomem_size += new->size + UM_KERN_PAGE_SIZE;
535
536 return 0;
537 out_close:
538 close(fd);
539 out:
540 return 1;
541 }
542