1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
4  */
5 
6 #include <sysdep/stub.h>
7 
8 #include <linux/futex.h>
9 #include <sys/socket.h>
10 #include <errno.h>
11 
12 /*
13  * Known security issues
14  *
15  * Userspace can jump to this address to execute *any* syscall that is
16  * permitted by the stub. As we will return afterwards, it can do
17  * whatever it likes, including:
18  * - Tricking the kernel into handing out the memory FD
19  * - Using this memory FD to read/write all physical memory
20  * - Running in parallel to the kernel processing a syscall
21  *   (possibly creating data races?)
22  * - Blocking e.g. SIGALRM to avoid time based scheduling
23  *
 * To avoid this, the permitted location for each syscall needs to be
 * checked in the SECCOMP filter (which is reasonably simple). Also,
 * more care needs to go into considering how the code might be tricked
 * by using a prepared stack (or even by modifying the stack from
 * another thread in case SMP support is added).
29  *
 * As for the SIGALRM, the best countermeasure will be to check in the
 * kernel that the process reports back the SIGALRM in a timely
 * fashion.
33  */
syscall_handler(int fd_map[STUB_MAX_FDS])34 static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
35 {
36 	struct stub_data *d = get_stub_data();
37 	int i;
38 	unsigned long res;
39 	int fd;
40 
41 	for (i = 0; i < d->syscall_data_len; i++) {
42 		struct stub_syscall *sc = &d->syscall_data[i];
43 
44 		switch (sc->syscall) {
45 		case STUB_SYSCALL_MMAP:
46 			if (fd_map)
47 				fd = fd_map[sc->mem.fd];
48 			else
49 				fd = sc->mem.fd;
50 
51 			res = stub_syscall6(STUB_MMAP_NR,
52 					    sc->mem.addr, sc->mem.length,
53 					    sc->mem.prot,
54 					    MAP_SHARED | MAP_FIXED,
55 					    fd, sc->mem.offset);
56 			if (res != sc->mem.addr) {
57 				d->err = res;
58 				d->syscall_data_len = i;
59 				return -1;
60 			}
61 			break;
62 		case STUB_SYSCALL_MUNMAP:
63 			res = stub_syscall2(__NR_munmap,
64 					    sc->mem.addr, sc->mem.length);
65 			if (res) {
66 				d->err = res;
67 				d->syscall_data_len = i;
68 				return -1;
69 			}
70 			break;
71 		default:
72 			d->err = -95; /* EOPNOTSUPP */
73 			d->syscall_data_len = i;
74 			return -1;
75 		}
76 	}
77 
78 	d->err = 0;
79 	d->syscall_data_len = 0;
80 
81 	return 0;
82 }
83 
84 void __section(".__syscall_stub")
stub_syscall_handler(void)85 stub_syscall_handler(void)
86 {
87 	syscall_handler(NULL);
88 
89 	trap_myself();
90 }
91 
92 void __section(".__syscall_stub")
stub_signal_interrupt(int sig,siginfo_t * info,void * p)93 stub_signal_interrupt(int sig, siginfo_t *info, void *p)
94 {
95 	struct stub_data *d = get_stub_data();
96 	char rcv_data;
97 	union {
98 		char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
99 		struct cmsghdr align;
100 	} ctrl = {};
101 	struct iovec iov = {
102 		.iov_base = &rcv_data,
103 		.iov_len = 1,
104 	};
105 	struct msghdr msghdr = {
106 		.msg_iov = &iov,
107 		.msg_iovlen = 1,
108 		.msg_control = &ctrl,
109 		.msg_controllen = sizeof(ctrl),
110 	};
111 	ucontext_t *uc = p;
112 	struct cmsghdr *fd_msg;
113 	int *fd_map;
114 	int num_fds;
115 	long res;
116 
117 	d->signal = sig;
118 	d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
119 	d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];
120 
121 restart_wait:
122 	d->futex = FUTEX_IN_KERN;
123 	do {
124 		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
125 				    FUTEX_WAKE, 1);
126 	} while (res == -EINTR);
127 
128 	do {
129 		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
130 				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
131 	} while (res == -EINTR || d->futex == FUTEX_IN_KERN);
132 
133 	if (res < 0 && res != -EAGAIN)
134 		stub_syscall1(__NR_exit_group, 1);
135 
136 	if (d->syscall_data_len) {
137 		/* Read passed FDs (if any) */
138 		do {
139 			res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
140 		} while (res == -EINTR);
141 
142 		/* We should never have a receive error (other than -EAGAIN) */
143 		if (res < 0 && res != -EAGAIN)
144 			stub_syscall1(__NR_exit_group, 1);
145 
146 		/* Receive the FDs */
147 		num_fds = 0;
148 		fd_msg = msghdr.msg_control;
149 		fd_map = (void *)&CMSG_DATA(fd_msg);
150 		if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
151 			num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
152 
153 		/* Try running queued syscalls. */
154 		res = syscall_handler(fd_map);
155 
156 		while (num_fds)
157 			stub_syscall2(__NR_close, fd_map[--num_fds], 0);
158 	} else {
159 		res = 0;
160 	}
161 
162 	if (res < 0 || d->restart_wait) {
163 		/* Report SIGSYS if we restart. */
164 		d->signal = SIGSYS;
165 		d->restart_wait = 0;
166 
167 		goto restart_wait;
168 	}
169 
170 	/* Restore arch dependent state that is not part of the mcontext */
171 	stub_seccomp_restore_state(&d->arch_data);
172 
173 	/* Return so that the host modified mcontext is restored. */
174 }
175 
176 void __section(".__syscall_stub")
stub_signal_restorer(void)177 stub_signal_restorer(void)
178 {
179 	/* We must not have anything on the stack when doing rt_sigreturn */
180 	stub_syscall0(__NR_rt_sigreturn);
181 }
182