1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> 4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <stdarg.h> 10 #include <unistd.h> 11 #include <errno.h> 12 #include <fcntl.h> 13 #include <sched.h> 14 #include <signal.h> 15 #include <string.h> 16 #include <sys/mman.h> 17 #include <sys/stat.h> 18 #include <sys/wait.h> 19 #include <sys/time.h> 20 #include <sys/resource.h> 21 #include <asm/ldt.h> 22 #include <asm/unistd.h> 23 #include <init.h> 24 #include <os.h> 25 #include <smp.h> 26 #include <kern_util.h> 27 #include <mem_user.h> 28 #include <ptrace_user.h> 29 #include <stdbool.h> 30 #include <stub-data.h> 31 #include <sys/prctl.h> 32 #include <linux/seccomp.h> 33 #include <linux/filter.h> 34 #include <sysdep/mcontext.h> 35 #include <sysdep/stub.h> 36 #include <registers.h> 37 #include <skas.h> 38 #include "internal.h" 39 40 static void ptrace_child(void) 41 { 42 int ret; 43 /* Calling os_getpid because some libcs cached getpid incorrectly */ 44 int pid = os_getpid(), ppid = getppid(); 45 int sc_result; 46 47 if (change_sig(SIGWINCH, 0) < 0 || 48 ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) { 49 perror("ptrace"); 50 kill(pid, SIGKILL); 51 } 52 kill(pid, SIGSTOP); 53 54 /* 55 * This syscall will be intercepted by the parent. Don't call more than 56 * once, please. 57 */ 58 sc_result = os_getpid(); 59 60 if (sc_result == pid) 61 /* Nothing modified by the parent, we are running normally. */ 62 ret = 1; 63 else if (sc_result == ppid) 64 /* 65 * Expected in check_ptrace and check_sysemu when they succeed 66 * in modifying the stack frame 67 */ 68 ret = 0; 69 else 70 /* Serious trouble! This could be caused by a bug in host 2.6 71 * SKAS3/2.6 patch before release -V6, together with a bug in 72 * the UML code itself. 73 */ 74 ret = 2; 75 76 exit(ret); 77 } 78 79 static void fatal_perror(const char *str) 80 { 81 perror(str); 82 exit(1); 83 } 84 85 static void fatal(char *fmt, ...) 86 { 87 va_list list; 88 89 va_start(list, fmt); 90 vfprintf(stderr, fmt, list); 91 va_end(list); 92 93 exit(1); 94 } 95 96 static void non_fatal(char *fmt, ...) 97 { 98 va_list list; 99 100 va_start(list, fmt); 101 vfprintf(stderr, fmt, list); 102 va_end(list); 103 } 104 105 static int start_ptraced_child(void) 106 { 107 int pid, n, status; 108 109 fflush(stdout); 110 111 pid = fork(); 112 if (pid == 0) 113 ptrace_child(); 114 else if (pid < 0) 115 fatal_perror("start_ptraced_child : fork failed"); 116 117 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 118 if (n < 0) 119 fatal_perror("check_ptrace : waitpid failed"); 120 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) 121 fatal("check_ptrace : expected SIGSTOP, got status = %d", 122 status); 123 124 return pid; 125 } 126 127 static void stop_ptraced_child(int pid, int exitcode) 128 { 129 int status, n; 130 131 if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) 132 fatal_perror("stop_ptraced_child : ptrace failed"); 133 134 CATCH_EINTR(n = waitpid(pid, &status, 0)); 135 if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) { 136 int exit_with = WEXITSTATUS(status); 137 fatal("stop_ptraced_child : child exited with exitcode %d, " 138 "while expecting %d; status 0x%x\n", exit_with, 139 exitcode, status); 140 } 141 } 142 143 static void __init check_sysemu(void) 144 { 145 int pid, n, status, count=0; 146 147 os_info("Checking syscall emulation for ptrace..."); 148 pid = start_ptraced_child(); 149 150 if ((ptrace(PTRACE_SETOPTIONS, pid, 0, 151 (void *) PTRACE_O_TRACESYSGOOD) < 0)) 152 fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed"); 153 154 while (1) { 155 count++; 156 if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0) 157 goto fail; 158 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 159 if (n < 0) 160 fatal_perror("check_sysemu: wait failed"); 161 162 if (WIFSTOPPED(status) && 163 (WSTOPSIG(status) == (SIGTRAP|0x80))) { 164 if (!count) { 165 non_fatal("check_sysemu: SYSEMU_SINGLESTEP " 166 "doesn't singlestep"); 167 goto fail; 168 } 169 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, 170 os_getpid()); 171 if (n < 0) 172 fatal_perror("check_sysemu : failed to modify " 173 "system call return"); 174 break; 175 } 176 else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) 177 count++; 178 else { 179 non_fatal("check_sysemu: expected SIGTRAP or " 180 "(SIGTRAP | 0x80), got status = %d\n", 181 status); 182 goto fail; 183 } 184 } 185 stop_ptraced_child(pid, 0); 186 187 os_info("OK\n"); 188 189 return; 190 191 fail: 192 stop_ptraced_child(pid, 1); 193 fatal("missing\n"); 194 } 195 196 static void __init check_ptrace(void) 197 { 198 int pid, syscall, n, status; 199 200 os_info("Checking that ptrace can change system call numbers..."); 201 pid = start_ptraced_child(); 202 203 if ((ptrace(PTRACE_SETOPTIONS, pid, 0, 204 (void *) PTRACE_O_TRACESYSGOOD) < 0)) 205 fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed"); 206 207 while (1) { 208 if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) 209 fatal_perror("check_ptrace : ptrace failed"); 210 211 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 212 if (n < 0) 213 fatal_perror("check_ptrace : wait failed"); 214 215 if (!WIFSTOPPED(status) || 216 (WSTOPSIG(status) != (SIGTRAP | 0x80))) 217 fatal("check_ptrace : expected (SIGTRAP|0x80), " 218 "got status = %d", status); 219 220 syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, 221 0); 222 if (syscall == __NR_getpid) { 223 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, 224 __NR_getppid); 225 if (n < 0) 226 fatal_perror("check_ptrace : failed to modify " 227 "system call"); 228 break; 229 } 230 } 231 stop_ptraced_child(pid, 0); 232 os_info("OK\n"); 233 check_sysemu(); 234 } 235 236 extern unsigned long host_fp_size; 237 extern unsigned long exec_regs[MAX_REG_NR]; 238 extern unsigned long *exec_fp_regs; 239 240 __initdata static struct stub_data *seccomp_test_stub_data; 241 242 static void __init sigsys_handler(int sig, siginfo_t *info, void *p) 243 { 244 ucontext_t *uc = p; 245 246 /* Stow away the location of the mcontext in the stack */ 247 seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - 248 (unsigned long)&seccomp_test_stub_data->sigstack[0]; 249 250 /* Prevent libc from clearing memory (mctx_offset in particular) */ 251 syscall(__NR_exit, 0); 252 } 253 254 static int __init seccomp_helper(void *data) 255 { 256 static struct sock_filter filter[] = { 257 BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 258 offsetof(struct seccomp_data, nr)), 259 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), 260 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), 261 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), 262 }; 263 static struct sock_fprog prog = { 264 .len = ARRAY_SIZE(filter), 265 .filter = filter, 266 }; 267 struct sigaction sa; 268 269 /* close_range is needed for the stub */ 270 if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) 271 exit(1); 272 273 set_sigstack(seccomp_test_stub_data->sigstack, 274 sizeof(seccomp_test_stub_data->sigstack)); 275 276 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 277 sa.sa_sigaction = (void *) sigsys_handler; 278 sa.sa_restorer = NULL; 279 if (sigaction(SIGSYS, &sa, NULL) < 0) 280 exit(2); 281 282 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 283 if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 284 SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) 285 exit(3); 286 287 sleep(0); 288 289 /* Never reached. */ 290 _exit(4); 291 } 292 293 static bool __init init_seccomp(void) 294 { 295 int pid; 296 int status; 297 int n; 298 unsigned long sp; 299 300 /* 301 * We check that we can install a seccomp filter and then exit(0) 302 * from a trapped syscall. 303 * 304 * Note that we cannot verify that no seccomp filter already exists 305 * for a syscall that results in the process/thread to be killed. 306 */ 307 308 os_info("Checking that seccomp filters can be installed..."); 309 310 seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), 311 PROT_READ | PROT_WRITE, 312 MAP_SHARED | MAP_ANON, 0, 0); 313 314 /* Use the syscall data area as stack, we just need something */ 315 sp = (unsigned long)&seccomp_test_stub_data->syscall_data + 316 sizeof(seccomp_test_stub_data->syscall_data) - 317 sizeof(void *); 318 pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); 319 320 if (pid < 0) 321 fatal_perror("check_seccomp : clone failed"); 322 323 CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); 324 if (n < 0) 325 fatal_perror("check_seccomp : waitpid failed"); 326 327 if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { 328 struct uml_pt_regs *regs; 329 unsigned long fp_size; 330 int r; 331 332 /* Fill in the host_fp_size from the mcontext. */ 333 regs = calloc(1, sizeof(struct uml_pt_regs)); 334 get_stub_state(regs, seccomp_test_stub_data, &fp_size); 335 host_fp_size = fp_size; 336 free(regs); 337 338 /* Repeat with the correct size */ 339 regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); 340 r = get_stub_state(regs, seccomp_test_stub_data, NULL); 341 342 /* Store as the default startup registers */ 343 exec_fp_regs = malloc(host_fp_size); 344 memcpy(exec_regs, regs->gp, sizeof(exec_regs)); 345 memcpy(exec_fp_regs, regs->fp, host_fp_size); 346 347 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 348 349 free(regs); 350 351 if (r) { 352 os_info("failed to fetch registers: %d\n", r); 353 return false; 354 } 355 356 os_info("OK\n"); 357 return true; 358 } 359 360 if (WIFEXITED(status) && WEXITSTATUS(status) == 2) 361 os_info("missing\n"); 362 else 363 os_info("error\n"); 364 365 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 366 return false; 367 } 368 369 370 static void __init check_coredump_limit(void) 371 { 372 struct rlimit lim; 373 int err = getrlimit(RLIMIT_CORE, &lim); 374 375 if (err) { 376 perror("Getting core dump limit"); 377 return; 378 } 379 380 os_info("Core dump limits :\n\tsoft - "); 381 if (lim.rlim_cur == RLIM_INFINITY) 382 os_info("NONE\n"); 383 else 384 os_info("%llu\n", (unsigned long long)lim.rlim_cur); 385 386 os_info("\thard - "); 387 if (lim.rlim_max == RLIM_INFINITY) 388 os_info("NONE\n"); 389 else 390 os_info("%llu\n", (unsigned long long)lim.rlim_max); 391 } 392 393 void __init get_host_cpu_features( 394 void (*flags_helper_func)(char *line), 395 void (*cache_helper_func)(char *line)) 396 { 397 FILE *cpuinfo; 398 char *line = NULL; 399 size_t len = 0; 400 int done_parsing = 0; 401 402 cpuinfo = fopen("/proc/cpuinfo", "r"); 403 if (cpuinfo == NULL) { 404 os_info("Failed to get host CPU features\n"); 405 } else { 406 while ((getline(&line, &len, cpuinfo)) != -1) { 407 if (strstr(line, "flags")) { 408 flags_helper_func(line); 409 done_parsing++; 410 } 411 if (strstr(line, "cache_alignment")) { 412 cache_helper_func(line); 413 done_parsing++; 414 } 415 free(line); 416 line = NULL; 417 if (done_parsing > 1) 418 break; 419 } 420 fclose(cpuinfo); 421 } 422 } 423 424 static int seccomp_config __initdata; 425 426 static int __init uml_seccomp_config(char *line, int *add) 427 { 428 *add = 0; 429 430 if (strcmp(line, "off") == 0) 431 seccomp_config = 0; 432 else if (strcmp(line, "auto") == 0) 433 seccomp_config = 1; 434 else if (strcmp(line, "on") == 0) 435 seccomp_config = 2; 436 else 437 fatal("Invalid seccomp option '%s', expected on/auto/off\n", 438 line); 439 440 return 0; 441 } 442 443 __uml_setup("seccomp=", uml_seccomp_config, 444 "seccomp=<on/auto/off>\n" 445 " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" 446 " processes work collaboratively with the kernel instead of being\n" 447 " traced using ptrace. All syscalls from the application are caught and\n" 448 " redirected using a signal. This signal handler in turn is permitted to\n" 449 " do the selected set of syscalls to communicate with the UML kernel and\n" 450 " do the required memory management.\n" 451 "\n" 452 " This method is overall faster than the ptrace based userspace, primarily\n" 453 " because it reduces the number of context switches for (minor) page faults.\n" 454 "\n" 455 " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" 456 " userspace from reading and writing all physical memory. Userspace\n" 457 " processes could also trick the stub into disabling SIGALRM which\n" 458 " prevents it from being interrupted for scheduling purposes.\n" 459 "\n" 460 " This is insecure and should only be used with a trusted userspace\n\n" 461 ); 462 463 void __init os_early_checks(void) 464 { 465 int pid; 466 467 /* Print out the core dump limits early */ 468 check_coredump_limit(); 469 470 /* Need to check this early because mmapping happens before the 471 * kernel is running. 472 */ 473 check_tmpexec(); 474 475 if (seccomp_config) { 476 if (init_seccomp()) { 477 using_seccomp = 1; 478 return; 479 } 480 481 if (seccomp_config == 2) 482 fatal("SECCOMP userspace requested but not functional!\n"); 483 } 484 485 if (uml_ncpus > 1) 486 fatal("SMP is not supported with PTRACE userspace.\n"); 487 488 using_seccomp = 0; 489 check_ptrace(); 490 491 pid = start_ptraced_child(); 492 if (init_pid_registers(pid)) 493 fatal("Failed to initialize default registers"); 494 stop_ptraced_child(pid, 1); 495 } 496