/*
 * Sample: supervising a syscall with seccomp user notifications.
 *
 * A "worker" process installs a seccomp filter that traps mount(2) and hands
 * the resulting notification fd ("listener") to its parent over a socketpair.
 * A "tracer" process then services the notifications: it allows bind mounts
 * whose source and target both start with "/tmp/" by performing the mount on
 * the worker's behalf, and answers every other request with EPERM.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE	/* glibc hides syscall() and kill() in strict ISO C modes otherwise */
#endif

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <stddef.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <sys/mount.h>
#include <linux/limits.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

/*
 * Thin wrapper around the raw seccomp(2) syscall.
 *
 * Returns whatever the syscall returns (-1 with errno set on failure).
 * errno is cleared first so that callers see only this call's error.
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	errno = 0;
	return (int)syscall(__NR_seccomp, op, flags, args);
}

/*
 * Send file descriptor @fd over the UNIX socket @sock as SCM_RIGHTS
 * ancillary data, accompanied by a single dummy payload byte.
 *
 * Returns 0 on success, -1 (after printing a diagnostic) on failure.
 */
static int send_fd(int sock, int fd)
{
	/* The union guarantees the control buffer is aligned for a cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl.buf;
	msg.msg_controllen = sizeof(ctrl.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* memcpy rather than a pointer cast: no alignment/aliasing assumptions. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}

/*
 * Receive one file descriptor sent with send_fd() from the UNIX socket @sock.
 *
 * Returns the received descriptor, or -1 (after printing a diagnostic) if the
 * receive failed, the peer closed the socket, or the message did not carry
 * exactly one SCM_RIGHTS descriptor.
 */
static int recv_fd(int sock)
{
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl = {0};
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	ssize_t n;
	int fd;

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl.buf;
	msg.msg_controllen = sizeof(ctrl.buf);

	n = recvmsg(sock, &msg, 0);
	if (n < 0) {
		perror("recvmsg");
		return -1;
	}
	if (n == 0) {
		fprintf(stderr, "recvmsg: peer closed the socket without sending an fd\n");
		return -1;
	}

	/*
	 * CMSG_FIRSTHDR() yields NULL when no ancillary data arrived; never
	 * dereference it (or trust its contents) without checking.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: message carried no SCM_RIGHTS fd\n");
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}

/*
 * Install a seccomp filter on the calling task that returns
 * SECCOMP_RET_USER_NOTIF for syscall number @nr and SECCOMP_RET_ALLOW for
 * everything else. @flags is passed straight to SECCOMP_SET_MODE_FILTER.
 *
 * Returns the seccomp() result: with SECCOMP_FILTER_FLAG_NEW_LISTENER this is
 * used by main() as the listener fd; -1 on failure.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter filter[] = {
		/* Load the syscall number ... */
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		/* ... and skip the USER_NOTIF return unless it equals @nr. */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}

/*
 * Read a path string located at address @addr in the task whose
 * /proc/<pid>/mem is open as @mem, into @buf (capacity @size).
 *
 * Returns 0 when @buf holds a NUL-terminated string, 1 when the bytes that
 * could be read contain no terminator (the caller should deny the request),
 * and -1 (after printing a diagnostic) on an I/O error.
 */
static int read_remote_path(int mem, unsigned long long addr, char *buf,
			    size_t size)
{
	ssize_t n;

	if (lseek(mem, (off_t)addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, size);
	if (n < 0) {
		perror("read");
		return -1;
	}

	/*
	 * read() may return fewer bytes than requested and never appends a
	 * terminator; only accept the buffer if the bytes actually read
	 * include one, so later string calls cannot run into stale data.
	 */
	if (!memchr(buf, '\0', (size_t)n)) {
		fprintf(stderr, "path argument is not NUL-terminated\n");
		return 1;
	}

	return 0;
}

/*
 * Decide how to answer the notification @req and fill in @resp.
 *
 * @resp defaults to -EPERM; it is switched to success only after a bind mount
 * inside /tmp has actually been performed on the requester's behalf.
 * @listener is the notification fd, used to confirm @req is still live.
 *
 * Returns 0 when @resp is ready to be sent (allowed or denied), -1 on an
 * error that prevents producing a trustworthy response.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem, rc;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%u/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
	 * the pid that made the syscall may have died, we need to confirm that
	 * the pid is still valid after we open its /proc/pid/mem file. We can
	 * ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer args
	 * before we decide to allow the syscall.
	 */
	rc = read_remote_path(mem, req->data.args[0], source, sizeof(source));
	if (rc < 0)
		goto out;
	if (rc > 0) {
		/* Malformed argument: answer with the default EPERM. */
		ret = 0;
		goto out;
	}

	rc = read_remote_path(mem, req->data.args[1], target, sizeof(target));
	if (rc < 0)
		goto out;
	if (rc > 0) {
		ret = 0;
		goto out;
	}

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 *
	 * NOTE(review): this is a plain prefix comparison; the paths are not
	 * canonicalized, so something like "/tmp/../etc" passes it. Fine for
	 * a sample, not for a real policy.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			ret = -1;
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}

/*
 * Run the demo: fork the worker (which traps mount and tries one bad and one
 * good mount), fork the tracer (which answers the notifications), then wait
 * for the worker and clean up /tmp/foo.
 *
 * Returns 0 if the worker exited successfully and cleanup worked, 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		/* The worker only ever sends; it has no use for the other end. */
		close(sk_pair[0]);

		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Drop our copy of the sending end. Otherwise, if the worker dies
	 * before sending the listener, the socket still has a live peer (us)
	 * and the receive below would block forever instead of reporting
	 * that the peer went away.
	 */
	close(sk_pair[1]);
	sk_pair[1] = -1;

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		/* Sizes come from the kernel, not from sizeof() of our headers. */
		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		/* The loop only ends on error, so the tracer always exits 1. */
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}
	/*
	 * The worker has been reaped; forget its pid so the cleanup path
	 * cannot signal an unrelated process that reused it.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		/* Go through the common cleanup so the tracer is not left running. */
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	if (tracer > 0)
		kill(tracer, SIGKILL);
	if (worker > 0)
		kill(worker, SIGKILL);

close_pair:
	if (sk_pair[0] >= 0)
		close(sk_pair[0]);
	if (sk_pair[1] >= 0)
		close(sk_pair[1]);
	return ret;
}