1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4 */
5
6 #include <sysdep/stub.h>
7
8 #include <linux/futex.h>
9 #include <sys/socket.h>
10 #include <errno.h>
11
/*
 * Known security issues
 *
 * Userspace can jump to this address to execute *any* syscall that is
 * permitted by the stub. As we will return afterwards, it can do
 * whatever it likes, including:
 * - Tricking the kernel into handing out the memory FD
 * - Using this memory FD to read/write all physical memory
 * - Running in parallel to the kernel processing a syscall
 *   (possibly creating data races?)
 * - Blocking e.g. SIGALRM to avoid time based scheduling
 *
 * To avoid this, the permitted location for each syscall needs to be
 * checked for in the SECCOMP filter (which is reasonably simple). Also,
 * more care will need to go into considering how the code might be
 * tricked by using a prepared stack (or even by modifying the stack from
 * another thread in case SMP support is added).
 *
 * As for the SIGALRM, the best countermeasure will be to check in the
 * kernel that the process is reporting back the SIGALRM in a timely
 * fashion.
 */
syscall_handler(int fd_map[STUB_MAX_FDS])34 static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
35 {
36 struct stub_data *d = get_stub_data();
37 int i;
38 unsigned long res;
39 int fd;
40
41 for (i = 0; i < d->syscall_data_len; i++) {
42 struct stub_syscall *sc = &d->syscall_data[i];
43
44 switch (sc->syscall) {
45 case STUB_SYSCALL_MMAP:
46 if (fd_map)
47 fd = fd_map[sc->mem.fd];
48 else
49 fd = sc->mem.fd;
50
51 res = stub_syscall6(STUB_MMAP_NR,
52 sc->mem.addr, sc->mem.length,
53 sc->mem.prot,
54 MAP_SHARED | MAP_FIXED,
55 fd, sc->mem.offset);
56 if (res != sc->mem.addr) {
57 d->err = res;
58 d->syscall_data_len = i;
59 return -1;
60 }
61 break;
62 case STUB_SYSCALL_MUNMAP:
63 res = stub_syscall2(__NR_munmap,
64 sc->mem.addr, sc->mem.length);
65 if (res) {
66 d->err = res;
67 d->syscall_data_len = i;
68 return -1;
69 }
70 break;
71 default:
72 d->err = -95; /* EOPNOTSUPP */
73 d->syscall_data_len = i;
74 return -1;
75 }
76 }
77
78 d->err = 0;
79 d->syscall_data_len = 0;
80
81 return 0;
82 }
83
84 void __section(".__syscall_stub")
stub_syscall_handler(void)85 stub_syscall_handler(void)
86 {
87 syscall_handler(NULL);
88
89 trap_myself();
90 }
91
92 void __section(".__syscall_stub")
stub_signal_interrupt(int sig,siginfo_t * info,void * p)93 stub_signal_interrupt(int sig, siginfo_t *info, void *p)
94 {
95 struct stub_data *d = get_stub_data();
96 char rcv_data;
97 union {
98 char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
99 struct cmsghdr align;
100 } ctrl = {};
101 struct iovec iov = {
102 .iov_base = &rcv_data,
103 .iov_len = 1,
104 };
105 struct msghdr msghdr = {
106 .msg_iov = &iov,
107 .msg_iovlen = 1,
108 .msg_control = &ctrl,
109 .msg_controllen = sizeof(ctrl),
110 };
111 ucontext_t *uc = p;
112 struct cmsghdr *fd_msg;
113 int *fd_map;
114 int num_fds;
115 long res;
116
117 d->signal = sig;
118 d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
119 d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];
120
121 restart_wait:
122 d->futex = FUTEX_IN_KERN;
123 do {
124 res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
125 FUTEX_WAKE, 1);
126 } while (res == -EINTR);
127
128 do {
129 res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
130 FUTEX_WAIT, FUTEX_IN_KERN, 0);
131 } while (res == -EINTR || d->futex == FUTEX_IN_KERN);
132
133 if (res < 0 && res != -EAGAIN)
134 stub_syscall1(__NR_exit_group, 1);
135
136 if (d->syscall_data_len) {
137 /* Read passed FDs (if any) */
138 do {
139 res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
140 } while (res == -EINTR);
141
142 /* We should never have a receive error (other than -EAGAIN) */
143 if (res < 0 && res != -EAGAIN)
144 stub_syscall1(__NR_exit_group, 1);
145
146 /* Receive the FDs */
147 num_fds = 0;
148 fd_msg = msghdr.msg_control;
149 fd_map = (void *)&CMSG_DATA(fd_msg);
150 if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
151 num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
152
153 /* Try running queued syscalls. */
154 res = syscall_handler(fd_map);
155
156 while (num_fds)
157 stub_syscall2(__NR_close, fd_map[--num_fds], 0);
158 } else {
159 res = 0;
160 }
161
162 if (res < 0 || d->restart_wait) {
163 /* Report SIGSYS if we restart. */
164 d->signal = SIGSYS;
165 d->restart_wait = 0;
166
167 goto restart_wait;
168 }
169
170 /* Restore arch dependent state that is not part of the mcontext */
171 stub_seccomp_restore_state(&d->arch_data);
172
173 /* Return so that the host modified mcontext is restored. */
174 }
175
176 void __section(".__syscall_stub")
stub_signal_restorer(void)177 stub_signal_restorer(void)
178 {
179 /* We must not have anything on the stack when doing rt_sigreturn */
180 stub_syscall0(__NR_rt_sigreturn);
181 }
182