1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 #include <sys/capability.h>
50 #include <linux/perf_event.h>
51 
52 #include <unistd.h>
53 #include <sys/syscall.h>
54 #include <poll.h>
55 
56 #include "../kselftest_harness.h"
57 #include "../clone3/clone3_selftests.h"
58 
59 /* Attempt to de-conflict with the selftests tree. */
60 #ifndef SKIP
61 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
62 #endif
63 
64 #ifndef MIN
65 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
66 #endif
67 
68 #ifndef PR_SET_PTRACER
69 # define PR_SET_PTRACER 0x59616d61
70 #endif
71 
72 #ifndef noinline
73 #define noinline __attribute__((noinline))
74 #endif
75 
76 #ifndef PR_SET_NO_NEW_PRIVS
77 #define PR_SET_NO_NEW_PRIVS 38
78 #define PR_GET_NO_NEW_PRIVS 39
79 #endif
80 
81 #ifndef PR_SECCOMP_EXT
82 #define PR_SECCOMP_EXT 43
83 #endif
84 
85 #ifndef SECCOMP_EXT_ACT
86 #define SECCOMP_EXT_ACT 1
87 #endif
88 
89 #ifndef SECCOMP_EXT_ACT_TSYNC
90 #define SECCOMP_EXT_ACT_TSYNC 1
91 #endif
92 
93 #ifndef SECCOMP_MODE_STRICT
94 #define SECCOMP_MODE_STRICT 1
95 #endif
96 
97 #ifndef SECCOMP_MODE_FILTER
98 #define SECCOMP_MODE_FILTER 2
99 #endif
100 
101 #ifndef SECCOMP_RET_ALLOW
102 struct seccomp_data {
103 	int nr;
104 	__u32 arch;
105 	__u64 instruction_pointer;
106 	__u64 args[6];
107 };
108 #endif
109 
110 #ifndef SECCOMP_RET_KILL_PROCESS
111 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
112 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
113 #endif
114 #ifndef SECCOMP_RET_KILL
115 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
116 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
117 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
118 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
119 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
120 #endif
121 #ifndef SECCOMP_RET_LOG
122 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
123 #endif
124 
125 #ifndef __NR_seccomp
126 # if defined(__i386__)
127 #  define __NR_seccomp 354
128 # elif defined(__x86_64__)
129 #  define __NR_seccomp 317
130 # elif defined(__arm__)
131 #  define __NR_seccomp 383
132 # elif defined(__aarch64__)
133 #  define __NR_seccomp 277
134 # elif defined(__riscv)
135 #  define __NR_seccomp 277
136 # elif defined(__csky__)
137 #  define __NR_seccomp 277
138 # elif defined(__loongarch__)
139 #  define __NR_seccomp 277
140 # elif defined(__hppa__)
141 #  define __NR_seccomp 338
142 # elif defined(__powerpc__)
143 #  define __NR_seccomp 358
144 # elif defined(__s390__)
145 #  define __NR_seccomp 348
146 # elif defined(__xtensa__)
147 #  define __NR_seccomp 337
148 # elif defined(__sh__)
149 #  define __NR_seccomp 372
150 # elif defined(__mc68000__)
151 #  define __NR_seccomp 380
152 # else
153 #  warning "seccomp syscall number unknown for this architecture"
154 #  define __NR_seccomp 0xffff
155 # endif
156 #endif
157 
158 #ifndef __NR_uretprobe
159 # if defined(__x86_64__)
160 #  define __NR_uretprobe 335
161 # endif
162 #endif
163 
164 #ifndef SECCOMP_SET_MODE_STRICT
165 #define SECCOMP_SET_MODE_STRICT 0
166 #endif
167 
168 #ifndef SECCOMP_SET_MODE_FILTER
169 #define SECCOMP_SET_MODE_FILTER 1
170 #endif
171 
172 #ifndef SECCOMP_GET_ACTION_AVAIL
173 #define SECCOMP_GET_ACTION_AVAIL 2
174 #endif
175 
176 #ifndef SECCOMP_GET_NOTIF_SIZES
177 #define SECCOMP_GET_NOTIF_SIZES 3
178 #endif
179 
180 #ifndef SECCOMP_FILTER_FLAG_TSYNC
181 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
182 #endif
183 
184 #ifndef SECCOMP_FILTER_FLAG_LOG
185 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
186 #endif
187 
188 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
189 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
190 #endif
191 
192 #ifndef PTRACE_SECCOMP_GET_METADATA
193 #define PTRACE_SECCOMP_GET_METADATA	0x420d
194 
195 struct seccomp_metadata {
196 	__u64 filter_off;       /* Input: which filter */
197 	__u64 flags;             /* Output: filter's flags */
198 };
199 #endif
200 
201 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
202 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
203 #endif
204 
205 #ifndef SECCOMP_RET_USER_NOTIF
206 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
207 
208 #define SECCOMP_IOC_MAGIC		'!'
209 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
210 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
211 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
212 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
213 
214 /* Flags for seccomp notification fd ioctl. */
215 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
216 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
217 						struct seccomp_notif_resp)
218 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
219 
220 struct seccomp_notif {
221 	__u64 id;
222 	__u32 pid;
223 	__u32 flags;
224 	struct seccomp_data data;
225 };
226 
227 struct seccomp_notif_resp {
228 	__u64 id;
229 	__s64 val;
230 	__s32 error;
231 	__u32 flags;
232 };
233 
234 struct seccomp_notif_sizes {
235 	__u16 seccomp_notif;
236 	__u16 seccomp_notif_resp;
237 	__u16 seccomp_data;
238 };
239 #endif
240 
241 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
242 /* On success, the return value is the remote process's added fd number */
243 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
244 						struct seccomp_notif_addfd)
245 
246 /* valid flags for seccomp_notif_addfd */
247 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
248 
249 struct seccomp_notif_addfd {
250 	__u64 id;
251 	__u32 flags;
252 	__u32 srcfd;
253 	__u32 newfd;
254 	__u32 newfd_flags;
255 };
256 #endif
257 
258 #ifndef SECCOMP_ADDFD_FLAG_SEND
259 #define SECCOMP_ADDFD_FLAG_SEND	(1UL << 1) /* Addfd and return it, atomically */
260 #endif
261 
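/*
 * Deliberately mis-sized variants of struct seccomp_notif_addfd, used
 * later to exercise how the kernel handles ADDFD ioctls whose argument
 * size does not match the native structure.
 */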
262 struct seccomp_notif_addfd_small {
263 	__u64 id;
264 	char weird[4];
265 };
266 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
267 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
268 
269 struct seccomp_notif_addfd_big {
270 	union {
271 		struct seccomp_notif_addfd addfd;
272 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
273 	};
274 };
275 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
276 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
277 
278 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
279 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
280 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
281 #endif
282 
283 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
284 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
285 #endif
286 
287 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
288 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
289 #endif
290 
291 #ifndef SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
292 #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
293 #endif
294 
295 #ifndef seccomp
296 int seccomp(unsigned int op, unsigned int flags, void *args)
297 {
298 	errno = 0;
299 	return syscall(__NR_seccomp, op, flags, args);
300 }
301 #endif
302 
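/*
 * seccomp_data.args[] entries are 64-bit, but classic BPF loads are 32-bit,
 * so compute the offset of the low word of each argument: it is the first
 * word on little-endian and the second word on big-endian.
 */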
303 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
304 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
305 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
306 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
307 #else
308 #error "wut? Unknown __BYTE_ORDER__?!"
309 #endif
310 
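/* Sentinel exit values used by sibling threads/processes in the tests below. */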
311 #define SIBLING_EXIT_UNKILLED	0xbadbeef
312 #define SIBLING_EXIT_FAILURE	0xbadface
313 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
314 
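/*
 * Compare two fds (possibly in different processes) via kcmp(): returns 0
 * when both refer to the same open file description, nonzero otherwise,
 * or -1 with errno set to ENOSYS when kcmp() is unavailable.
 */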
315 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
316 {
317 #ifdef __NR_kcmp
318 	errno = 0;
319 	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
320 #else
321 	errno = ENOSYS;
322 	return -1;
323 #endif
324 }
325 
326 /* Have TH_LOG report actual location filecmp() is used. */
327 #define filecmp(pid1, pid2, fd1, fd2)	({		\
328 	int _ret;					\
329 							\
330 	_ret = __filecmp(pid1, pid2, fd1, fd2);		\
331 	if (_ret != 0) {				\
332 		if (_ret < 0 && errno == ENOSYS) {	\
333 			TH_LOG("kcmp() syscall missing (test is less accurate)");\
334 			_ret = 0;			\
335 		}					\
336 	}						\
337 	_ret; })
338 
339 TEST(kcmp)
340 {
341 	int ret;
342 
343 	ret = __filecmp(getpid(), getpid(), 1, 1);
344 	EXPECT_EQ(ret, 0);
345 	if (ret != 0 && errno == ENOSYS)
346 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
347 }
348 
349 TEST(mode_strict_support)
350 {
351 	long ret;
352 
353 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
354 	ASSERT_EQ(0, ret) {
355 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
356 	}
357 	syscall(__NR_exit, 0);
358 }
359 
360 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
361 {
362 	long ret;
363 
364 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
365 	ASSERT_EQ(0, ret) {
366 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
367 	}
368 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
369 		NULL, NULL, NULL);
370 	EXPECT_FALSE(true) {
371 		TH_LOG("Unreachable!");
372 	}
373 }
374 
375 /* Note! This doesn't test no new privs behavior */
376 TEST(no_new_privs_support)
377 {
378 	long ret;
379 
380 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
381 	EXPECT_EQ(0, ret) {
382 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
383 	}
384 }
385 
386 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
387 TEST(mode_filter_support)
388 {
389 	long ret;
390 
391 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
392 	ASSERT_EQ(0, ret) {
393 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
394 	}
395 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
396 	EXPECT_EQ(-1, ret);
397 	EXPECT_EQ(EFAULT, errno) {
398 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
399 	}
400 }
401 
402 TEST(mode_filter_without_nnp)
403 {
404 	struct sock_filter filter[] = {
405 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
406 	};
407 	struct sock_fprog prog = {
408 		.len = (unsigned short)ARRAY_SIZE(filter),
409 		.filter = filter,
410 	};
411 	long ret;
412 	cap_t cap = cap_get_proc();
413 	cap_flag_value_t is_cap_sys_admin = 0;
414 
415 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
416 	ASSERT_LE(0, ret) {
417 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
418 	}
419 	errno = 0;
420 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
421 	/* Succeeds with CAP_SYS_ADMIN, fails without */
422 	cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);
423 	if (!is_cap_sys_admin) {
424 		EXPECT_EQ(-1, ret);
425 		EXPECT_EQ(EACCES, errno);
426 	} else {
427 		EXPECT_EQ(0, ret);
428 	}
429 }
430 
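/*
 * Mirrors the kernel's limit on the total number of instructions across all
 * filters attached to a task (each attached filter also carries a small
 * fixed overhead).
 */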
431 #define MAX_INSNS_PER_PATH 32768
432 
433 TEST(filter_size_limits)
434 {
435 	int i;
436 	int count = BPF_MAXINSNS + 1;
437 	struct sock_filter allow[] = {
438 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
439 	};
440 	struct sock_filter *filter;
441 	struct sock_fprog prog = { };
442 	long ret;
443 
444 	filter = calloc(count, sizeof(*filter));
445 	ASSERT_NE(NULL, filter);
446 
447 	for (i = 0; i < count; i++)
448 		filter[i] = allow[0];
449 
450 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
451 	ASSERT_EQ(0, ret);
452 
453 	prog.filter = filter;
454 	prog.len = count;
455 
456 	/* Too many filter instructions in a single filter. */
457 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
458 	ASSERT_NE(0, ret) {
459 		TH_LOG("Installing %d insn filter was allowed", prog.len);
460 	}
461 
462 	/* One less is okay, though. */
463 	prog.len -= 1;
464 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
465 	ASSERT_EQ(0, ret) {
466 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
467 	}
468 }
469 
470 TEST(filter_chain_limits)
471 {
472 	int i;
473 	int count = BPF_MAXINSNS;
474 	struct sock_filter allow[] = {
475 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
476 	};
477 	struct sock_filter *filter;
478 	struct sock_fprog prog = { };
479 	long ret;
480 
481 	filter = calloc(count, sizeof(*filter));
482 	ASSERT_NE(NULL, filter);
483 
484 	for (i = 0; i < count; i++)
485 		filter[i] = allow[0];
486 
487 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
488 	ASSERT_EQ(0, ret);
489 
490 	prog.filter = filter;
491 	prog.len = 1;
492 
493 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
494 	ASSERT_EQ(0, ret);
495 
496 	prog.len = count;
497 
498 	/* Too many total filter instructions. */
499 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
500 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
501 		if (ret != 0)
502 			break;
503 	}
504 	ASSERT_NE(0, ret) {
505 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
506 		       i, count, i * (count + 4));
507 	}
508 }
509 
510 TEST(mode_filter_cannot_move_to_strict)
511 {
512 	struct sock_filter filter[] = {
513 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
514 	};
515 	struct sock_fprog prog = {
516 		.len = (unsigned short)ARRAY_SIZE(filter),
517 		.filter = filter,
518 	};
519 	long ret;
520 
521 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
522 	ASSERT_EQ(0, ret);
523 
524 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
525 	ASSERT_EQ(0, ret);
526 
527 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
528 	EXPECT_EQ(-1, ret);
529 	EXPECT_EQ(EINVAL, errno);
530 }
531 
532 
533 TEST(mode_filter_get_seccomp)
534 {
535 	struct sock_filter filter[] = {
536 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
537 	};
538 	struct sock_fprog prog = {
539 		.len = (unsigned short)ARRAY_SIZE(filter),
540 		.filter = filter,
541 	};
542 	long ret;
543 
544 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
545 	ASSERT_EQ(0, ret);
546 
547 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
548 	EXPECT_EQ(0, ret);
549 
550 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
551 	ASSERT_EQ(0, ret);
552 
553 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
554 	EXPECT_EQ(2, ret);
555 }
556 
557 
558 TEST(ALLOW_all)
559 {
560 	struct sock_filter filter[] = {
561 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
562 	};
563 	struct sock_fprog prog = {
564 		.len = (unsigned short)ARRAY_SIZE(filter),
565 		.filter = filter,
566 	};
567 	long ret;
568 
569 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
570 	ASSERT_EQ(0, ret);
571 
572 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
573 	ASSERT_EQ(0, ret);
574 }
575 
576 TEST(empty_prog)
577 {
578 	struct sock_filter filter[] = {
579 	};
580 	struct sock_fprog prog = {
581 		.len = (unsigned short)ARRAY_SIZE(filter),
582 		.filter = filter,
583 	};
584 	long ret;
585 
586 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
587 	ASSERT_EQ(0, ret);
588 
589 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
590 	EXPECT_EQ(-1, ret);
591 	EXPECT_EQ(EINVAL, errno);
592 }
593 
594 TEST(log_all)
595 {
596 	struct sock_filter filter[] = {
597 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
598 	};
599 	struct sock_fprog prog = {
600 		.len = (unsigned short)ARRAY_SIZE(filter),
601 		.filter = filter,
602 	};
603 	long ret;
604 	pid_t parent = getppid();
605 
606 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
607 	ASSERT_EQ(0, ret);
608 
609 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
610 	ASSERT_EQ(0, ret);
611 
612 	/* getppid() should succeed and be logged (no check for logging) */
613 	EXPECT_EQ(parent, syscall(__NR_getppid));
614 }
615 
616 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
617 {
618 	struct sock_filter filter[] = {
619 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
620 	};
621 	struct sock_fprog prog = {
622 		.len = (unsigned short)ARRAY_SIZE(filter),
623 		.filter = filter,
624 	};
625 	long ret;
626 
627 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
628 	ASSERT_EQ(0, ret);
629 
630 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
631 	ASSERT_EQ(0, ret);
632 	EXPECT_EQ(0, syscall(__NR_getpid)) {
633 		TH_LOG("getpid() shouldn't ever return");
634 	}
635 }
636 
637 /* return code >= 0x80000000 is unused. */
638 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
639 {
640 	struct sock_filter filter[] = {
641 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
642 	};
643 	struct sock_fprog prog = {
644 		.len = (unsigned short)ARRAY_SIZE(filter),
645 		.filter = filter,
646 	};
647 	long ret;
648 
649 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
650 	ASSERT_EQ(0, ret);
651 
652 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
653 	ASSERT_EQ(0, ret);
654 	EXPECT_EQ(0, syscall(__NR_getpid)) {
655 		TH_LOG("getpid() shouldn't ever return");
656 	}
657 }
658 
659 TEST_SIGNAL(KILL_all, SIGSYS)
660 {
661 	struct sock_filter filter[] = {
662 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
663 	};
664 	struct sock_fprog prog = {
665 		.len = (unsigned short)ARRAY_SIZE(filter),
666 		.filter = filter,
667 	};
668 	long ret;
669 
670 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
671 	ASSERT_EQ(0, ret);
672 
673 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
674 	ASSERT_EQ(0, ret);
675 }
676 
677 TEST_SIGNAL(KILL_one, SIGSYS)
678 {
679 	struct sock_filter filter[] = {
680 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
681 			offsetof(struct seccomp_data, nr)),
682 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
683 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
684 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
685 	};
686 	struct sock_fprog prog = {
687 		.len = (unsigned short)ARRAY_SIZE(filter),
688 		.filter = filter,
689 	};
690 	long ret;
691 	pid_t parent = getppid();
692 
693 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
694 	ASSERT_EQ(0, ret);
695 
696 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
697 	ASSERT_EQ(0, ret);
698 
699 	EXPECT_EQ(parent, syscall(__NR_getppid));
700 	/* getpid() should never return. */
701 	EXPECT_EQ(0, syscall(__NR_getpid));
702 }
703 
704 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
705 {
706 	void *fatal_address;
707 	struct sock_filter filter[] = {
708 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
709 			offsetof(struct seccomp_data, nr)),
710 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
711 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
712 		/* Only bother with the lower 32 bits for now. */
713 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
714 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
715 			(unsigned long)&fatal_address, 0, 1),
716 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
717 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
718 	};
719 	struct sock_fprog prog = {
720 		.len = (unsigned short)ARRAY_SIZE(filter),
721 		.filter = filter,
722 	};
723 	long ret;
724 	pid_t parent = getppid();
725 	struct tms timebuf;
726 	clock_t clock = times(&timebuf);
727 
728 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
729 	ASSERT_EQ(0, ret);
730 
731 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
732 	ASSERT_EQ(0, ret);
733 
734 	EXPECT_EQ(parent, syscall(__NR_getppid));
735 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
736 	/* times() should never return. */
737 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
738 }
739 
740 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
741 {
742 #ifndef __NR_mmap2
743 	int sysno = __NR_mmap;
744 #else
745 	int sysno = __NR_mmap2;
746 #endif
747 	struct sock_filter filter[] = {
748 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
749 			offsetof(struct seccomp_data, nr)),
750 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
751 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
752 		/* Only bother with the lower 32 bits for now. */
753 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
754 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
755 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
756 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
757 	};
758 	struct sock_fprog prog = {
759 		.len = (unsigned short)ARRAY_SIZE(filter),
760 		.filter = filter,
761 	};
762 	long ret;
763 	pid_t parent = getppid();
764 	int fd;
765 	void *map1, *map2;
766 	int page_size = sysconf(_SC_PAGESIZE);
767 
768 	ASSERT_LT(0, page_size);
769 
770 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
771 	ASSERT_EQ(0, ret);
772 
773 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
774 	ASSERT_EQ(0, ret);
775 
776 	fd = open("/dev/zero", O_RDONLY);
777 	ASSERT_NE(-1, fd);
778 
779 	EXPECT_EQ(parent, syscall(__NR_getppid));
780 	map1 = (void *)syscall(sysno,
781 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
782 	EXPECT_NE(MAP_FAILED, map1);
783 	/* mmap2() should never return. */
784 	map2 = (void *)syscall(sysno,
785 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
786 	EXPECT_EQ(MAP_FAILED, map2);
787 
788 	/* The test failed, so clean up the resources. */
789 	munmap(map1, page_size);
790 	munmap(map2, page_size);
791 	close(fd);
792 }
793 
794 /* This is a thread task to die via seccomp filter violation. */
795 void *kill_thread(void *data)
796 {
797 	bool die = (bool)data;
798 
799 	if (die) {
800 		syscall(__NR_getpid);
801 		return (void *)SIBLING_EXIT_FAILURE;
802 	}
803 
804 	return (void *)SIBLING_EXIT_UNKILLED;
805 }
806 
807 enum kill_t {
808 	KILL_THREAD,
809 	KILL_PROCESS,
810 	RET_UNKNOWN
811 };
812 
813 /* Prepare a thread that will kill itself or both of us. */
814 void kill_thread_or_group(struct __test_metadata *_metadata,
815 			  enum kill_t kill_how)
816 {
817 	pthread_t thread;
818 	void *status;
819 	/* Kill only when calling __NR_getpid. */
820 	struct sock_filter filter_thread[] = {
821 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
822 			offsetof(struct seccomp_data, nr)),
823 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
824 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
825 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
826 	};
827 	struct sock_fprog prog_thread = {
828 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
829 		.filter = filter_thread,
830 	};
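	/*
	 * For RET_UNKNOWN, use an action value the kernel does not define;
	 * it is expected to be treated at least as severely as killing the
	 * whole process.
	 */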
831 	int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
832 	struct sock_filter filter_process[] = {
833 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
834 			offsetof(struct seccomp_data, nr)),
835 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
836 		BPF_STMT(BPF_RET|BPF_K, kill),
837 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
838 	};
839 	struct sock_fprog prog_process = {
840 		.len = (unsigned short)ARRAY_SIZE(filter_process),
841 		.filter = filter_process,
842 	};
843 
844 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
845 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
846 	}
847 
848 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
849 			     kill_how == KILL_THREAD ? &prog_thread
850 						     : &prog_process));
851 
852 	/*
853 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
854 	 * flag cannot be downgraded by a new filter.
855 	 */
856 	if (kill_how == KILL_PROCESS)
857 		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
858 
859 	/* Start a thread that will exit immediately. */
860 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
861 	ASSERT_EQ(0, pthread_join(thread, &status));
862 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
863 
864 	/* Start a thread that will die immediately. */
865 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
866 	ASSERT_EQ(0, pthread_join(thread, &status));
867 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
868 
869 	/*
870 	 * If we get here, only the spawned thread died. Let the parent know
871 	 * the whole process didn't die (i.e. this thread, the spawner,
872 	 * stayed running).
873 	 */
874 	exit(42);
875 }
876 
877 TEST(KILL_thread)
878 {
879 	int status;
880 	pid_t child_pid;
881 
882 	child_pid = fork();
883 	ASSERT_LE(0, child_pid);
884 	if (child_pid == 0) {
885 		kill_thread_or_group(_metadata, KILL_THREAD);
886 		_exit(38);
887 	}
888 
889 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
890 
891 	/* If only the thread was killed, we'll see exit 42. */
892 	ASSERT_TRUE(WIFEXITED(status));
893 	ASSERT_EQ(42, WEXITSTATUS(status));
894 }
895 
896 TEST(KILL_process)
897 {
898 	int status;
899 	pid_t child_pid;
900 
901 	child_pid = fork();
902 	ASSERT_LE(0, child_pid);
903 	if (child_pid == 0) {
904 		kill_thread_or_group(_metadata, KILL_PROCESS);
905 		_exit(38);
906 	}
907 
908 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
909 
910 	/* If the entire process was killed, we'll see SIGSYS. */
911 	ASSERT_TRUE(WIFSIGNALED(status));
912 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
913 }
914 
915 TEST(KILL_unknown)
916 {
917 	int status;
918 	pid_t child_pid;
919 
920 	child_pid = fork();
921 	ASSERT_LE(0, child_pid);
922 	if (child_pid == 0) {
923 		kill_thread_or_group(_metadata, RET_UNKNOWN);
924 		_exit(38);
925 	}
926 
927 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
928 
929 	/* If the entire process was killed, we'll see SIGSYS. */
930 	EXPECT_TRUE(WIFSIGNALED(status)) {
931 		TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
932 	}
933 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
934 }
935 
936 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
937 TEST(arg_out_of_range)
938 {
939 	struct sock_filter filter[] = {
940 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
941 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
942 	};
943 	struct sock_fprog prog = {
944 		.len = (unsigned short)ARRAY_SIZE(filter),
945 		.filter = filter,
946 	};
947 	long ret;
948 
949 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
950 	ASSERT_EQ(0, ret);
951 
952 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
953 	EXPECT_EQ(-1, ret);
954 	EXPECT_EQ(EINVAL, errno);
955 }
956 
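/*
 * Build a filter program that fails read() with the given errno and allows
 * every other syscall.
 */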
957 #define ERRNO_FILTER(name, errno)					\
958 	struct sock_filter _read_filter_##name[] = {			\
959 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
960 			offsetof(struct seccomp_data, nr)),		\
961 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
962 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),	\
963 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
964 	};								\
965 	struct sock_fprog prog_##name = {				\
966 		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
967 		.filter = _read_filter_##name,				\
968 	}
969 
970 /* Make sure basic errno values are correctly passed through a filter. */
971 TEST(ERRNO_valid)
972 {
973 	ERRNO_FILTER(valid, E2BIG);
974 	long ret;
975 	pid_t parent = getppid();
976 
977 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
978 	ASSERT_EQ(0, ret);
979 
980 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
981 	ASSERT_EQ(0, ret);
982 
983 	EXPECT_EQ(parent, syscall(__NR_getppid));
984 	EXPECT_EQ(-1, read(-1, NULL, 0));
985 	EXPECT_EQ(E2BIG, errno);
986 }
987 
988 /* Make sure an errno of zero is correctly handled by the arch code. */
989 TEST(ERRNO_zero)
990 {
991 	ERRNO_FILTER(zero, 0);
992 	long ret;
993 	pid_t parent = getppid();
994 
995 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
996 	ASSERT_EQ(0, ret);
997 
998 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
999 	ASSERT_EQ(0, ret);
1000 
1001 	EXPECT_EQ(parent, syscall(__NR_getppid));
1002 	/* "errno" of 0 is ok. */
1003 	EXPECT_EQ(0, read(-1, NULL, 0));
1004 }
1005 
1006 /*
1007  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
1008  * This tests that the errno value gets capped correctly, fixed by
1009  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
1010  */
1011 TEST(ERRNO_capped)
1012 {
1013 	ERRNO_FILTER(capped, 4096);
1014 	long ret;
1015 	pid_t parent = getppid();
1016 
1017 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1018 	ASSERT_EQ(0, ret);
1019 
1020 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
1021 	ASSERT_EQ(0, ret);
1022 
1023 	EXPECT_EQ(parent, syscall(__NR_getppid));
1024 	EXPECT_EQ(-1, read(-1, NULL, 0));
1025 	EXPECT_EQ(4095, errno);
1026 }
1027 
1028 /*
1029  * Filters are processed in reverse order: last applied is executed first.
1030  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1031  * SECCOMP_RET_DATA mask results will follow the most recently applied
1032  * matching filter return (and not the lowest or highest value).
1033  */
1034 TEST(ERRNO_order)
1035 {
1036 	ERRNO_FILTER(first,  11);
1037 	ERRNO_FILTER(second, 13);
1038 	ERRNO_FILTER(third,  12);
1039 	long ret;
1040 	pid_t parent = getppid();
1041 
1042 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1043 	ASSERT_EQ(0, ret);
1044 
1045 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1046 	ASSERT_EQ(0, ret);
1047 
1048 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1049 	ASSERT_EQ(0, ret);
1050 
1051 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1052 	ASSERT_EQ(0, ret);
1053 
1054 	EXPECT_EQ(parent, syscall(__NR_getppid));
1055 	EXPECT_EQ(-1, read(-1, NULL, 0));
1056 	EXPECT_EQ(12, errno);
1057 }
1058 
1059 FIXTURE(TRAP) {
1060 	struct sock_fprog prog;
1061 };
1062 
1063 FIXTURE_SETUP(TRAP)
1064 {
1065 	struct sock_filter filter[] = {
1066 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1067 			offsetof(struct seccomp_data, nr)),
1068 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1069 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1070 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1071 	};
1072 
1073 	memset(&self->prog, 0, sizeof(self->prog));
1074 	self->prog.filter = malloc(sizeof(filter));
1075 	ASSERT_NE(NULL, self->prog.filter);
1076 	memcpy(self->prog.filter, filter, sizeof(filter));
1077 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1078 }
1079 
1080 FIXTURE_TEARDOWN(TRAP)
1081 {
1082 	if (self->prog.filter)
1083 		free(self->prog.filter);
1084 }
1085 
1086 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1087 {
1088 	long ret;
1089 
1090 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1091 	ASSERT_EQ(0, ret);
1092 
1093 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1094 	ASSERT_EQ(0, ret);
1095 	syscall(__NR_getpid);
1096 }
1097 
1098 /* Ensure that SIGSYS overrides SIG_IGN */
1099 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1100 {
1101 	long ret;
1102 
1103 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1104 	ASSERT_EQ(0, ret);
1105 
1106 	signal(SIGSYS, SIG_IGN);
1107 
1108 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1109 	ASSERT_EQ(0, ret);
1110 	syscall(__NR_getpid);
1111 }
1112 
1113 static siginfo_t TRAP_info;
1114 static volatile int TRAP_nr;
1115 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1116 {
1117 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
1118 	TRAP_nr = nr;
1119 }
1120 
1121 TEST_F(TRAP, handler)
1122 {
1123 	int ret, test;
1124 	struct sigaction act;
1125 	sigset_t mask;
1126 
1127 	memset(&act, 0, sizeof(act));
1128 	sigemptyset(&mask);
1129 	sigaddset(&mask, SIGSYS);
1130 
1131 	act.sa_sigaction = &TRAP_action;
1132 	act.sa_flags = SA_SIGINFO;
1133 	ret = sigaction(SIGSYS, &act, NULL);
1134 	ASSERT_EQ(0, ret) {
1135 		TH_LOG("sigaction failed");
1136 	}
1137 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1138 	ASSERT_EQ(0, ret) {
1139 		TH_LOG("sigprocmask failed");
1140 	}
1141 
1142 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1143 	ASSERT_EQ(0, ret);
1144 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1145 	ASSERT_EQ(0, ret);
1146 	TRAP_nr = 0;
1147 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1148 	/* Expect the registers to be rolled back. (nr = error) may vary
1149 	 * based on arch. */
1150 	ret = syscall(__NR_getpid);
1151 	/* Silence gcc warning about volatile. */
1152 	test = TRAP_nr;
1153 	EXPECT_EQ(SIGSYS, test);
1154 	struct local_sigsys {
1155 		void *_call_addr;	/* calling user insn */
1156 		int _syscall;		/* triggering system call number */
1157 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1158 	} *sigsys = (struct local_sigsys *)
1159 #ifdef si_syscall
1160 		&(TRAP_info.si_call_addr);
1161 #else
1162 		&TRAP_info.si_pid;
1163 #endif
1164 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1165 	/* Make sure arch is non-zero. */
1166 	EXPECT_NE(0, sigsys->_arch);
1167 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1168 }
1169 
1170 FIXTURE(precedence) {
1171 	struct sock_fprog allow;
1172 	struct sock_fprog log;
1173 	struct sock_fprog trace;
1174 	struct sock_fprog error;
1175 	struct sock_fprog trap;
1176 	struct sock_fprog kill;
1177 };
1178 
1179 FIXTURE_SETUP(precedence)
1180 {
1181 	struct sock_filter allow_insns[] = {
1182 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1183 	};
1184 	struct sock_filter log_insns[] = {
1185 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1186 			offsetof(struct seccomp_data, nr)),
1187 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1188 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1189 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1190 	};
1191 	struct sock_filter trace_insns[] = {
1192 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1193 			offsetof(struct seccomp_data, nr)),
1194 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1195 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1196 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1197 	};
1198 	struct sock_filter error_insns[] = {
1199 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1200 			offsetof(struct seccomp_data, nr)),
1201 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1202 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1203 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1204 	};
1205 	struct sock_filter trap_insns[] = {
1206 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1207 			offsetof(struct seccomp_data, nr)),
1208 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1209 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1210 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1211 	};
1212 	struct sock_filter kill_insns[] = {
1213 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1214 			offsetof(struct seccomp_data, nr)),
1215 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1216 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1217 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1218 	};
1219 
1220 	memset(self, 0, sizeof(*self));
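/* FILTER_ALLOC() makes a heap copy of the named filter program for this fixture. */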
1221 #define FILTER_ALLOC(_x) \
1222 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1223 	ASSERT_NE(NULL, self->_x.filter); \
1224 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1225 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1226 	FILTER_ALLOC(allow);
1227 	FILTER_ALLOC(log);
1228 	FILTER_ALLOC(trace);
1229 	FILTER_ALLOC(error);
1230 	FILTER_ALLOC(trap);
1231 	FILTER_ALLOC(kill);
1232 }
1233 
1234 FIXTURE_TEARDOWN(precedence)
1235 {
1236 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1237 	FILTER_FREE(allow);
1238 	FILTER_FREE(log);
1239 	FILTER_FREE(trace);
1240 	FILTER_FREE(error);
1241 	FILTER_FREE(trap);
1242 	FILTER_FREE(kill);
1243 }
1244 
1245 TEST_F(precedence, allow_ok)
1246 {
1247 	pid_t parent, res = 0;
1248 	long ret;
1249 
1250 	parent = getppid();
1251 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1252 	ASSERT_EQ(0, ret);
1253 
1254 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1255 	ASSERT_EQ(0, ret);
1256 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1257 	ASSERT_EQ(0, ret);
1258 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1259 	ASSERT_EQ(0, ret);
1260 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1261 	ASSERT_EQ(0, ret);
1262 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1263 	ASSERT_EQ(0, ret);
1264 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1265 	ASSERT_EQ(0, ret);
1266 	/* Should work just fine. */
1267 	res = syscall(__NR_getppid);
1268 	EXPECT_EQ(parent, res);
1269 }
1270 
1271 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1272 {
1273 	pid_t parent, res = 0;
1274 	long ret;
1275 
1276 	parent = getppid();
1277 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1278 	ASSERT_EQ(0, ret);
1279 
1280 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1281 	ASSERT_EQ(0, ret);
1282 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1283 	ASSERT_EQ(0, ret);
1284 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1285 	ASSERT_EQ(0, ret);
1286 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1287 	ASSERT_EQ(0, ret);
1288 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1289 	ASSERT_EQ(0, ret);
1290 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1291 	ASSERT_EQ(0, ret);
1292 	/* Should work just fine. */
1293 	res = syscall(__NR_getppid);
1294 	EXPECT_EQ(parent, res);
1295 	/* getpid() should never return. */
1296 	res = syscall(__NR_getpid);
1297 	EXPECT_EQ(0, res);
1298 }
1299 
1300 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1301 {
1302 	pid_t parent;
1303 	long ret;
1304 
1305 	parent = getppid();
1306 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1307 	ASSERT_EQ(0, ret);
1308 
1309 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1310 	ASSERT_EQ(0, ret);
1311 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1312 	ASSERT_EQ(0, ret);
1313 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1314 	ASSERT_EQ(0, ret);
1315 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1316 	ASSERT_EQ(0, ret);
1317 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1318 	ASSERT_EQ(0, ret);
1319 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1320 	ASSERT_EQ(0, ret);
1321 	/* Should work just fine. */
1322 	EXPECT_EQ(parent, syscall(__NR_getppid));
1323 	/* getpid() should never return. */
1324 	EXPECT_EQ(0, syscall(__NR_getpid));
1325 }
1326 
1327 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1328 {
1329 	pid_t parent;
1330 	long ret;
1331 
1332 	parent = getppid();
1333 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1334 	ASSERT_EQ(0, ret);
1335 
1336 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1337 	ASSERT_EQ(0, ret);
1338 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1339 	ASSERT_EQ(0, ret);
1340 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1341 	ASSERT_EQ(0, ret);
1342 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1343 	ASSERT_EQ(0, ret);
1344 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1345 	ASSERT_EQ(0, ret);
1346 	/* Should work just fine. */
1347 	EXPECT_EQ(parent, syscall(__NR_getppid));
1348 	/* getpid() should never return. */
1349 	EXPECT_EQ(0, syscall(__NR_getpid));
1350 }
1351 
1352 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1353 {
1354 	pid_t parent;
1355 	long ret;
1356 
1357 	parent = getppid();
1358 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1359 	ASSERT_EQ(0, ret);
1360 
1361 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1362 	ASSERT_EQ(0, ret);
1363 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1364 	ASSERT_EQ(0, ret);
1365 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1366 	ASSERT_EQ(0, ret);
1367 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1368 	ASSERT_EQ(0, ret);
1369 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1370 	ASSERT_EQ(0, ret);
1371 	/* Should work just fine. */
1372 	EXPECT_EQ(parent, syscall(__NR_getppid));
1373 	/* getpid() should never return. */
1374 	EXPECT_EQ(0, syscall(__NR_getpid));
1375 }
1376 
1377 TEST_F(precedence, errno_is_third)
1378 {
1379 	pid_t parent;
1380 	long ret;
1381 
1382 	parent = getppid();
1383 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1384 	ASSERT_EQ(0, ret);
1385 
1386 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1387 	ASSERT_EQ(0, ret);
1388 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1389 	ASSERT_EQ(0, ret);
1390 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1391 	ASSERT_EQ(0, ret);
1392 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1393 	ASSERT_EQ(0, ret);
1394 	/* Should work just fine. */
1395 	EXPECT_EQ(parent, syscall(__NR_getppid));
1396 	EXPECT_EQ(0, syscall(__NR_getpid));
1397 }
1398 
1399 TEST_F(precedence, errno_is_third_in_any_order)
1400 {
1401 	pid_t parent;
1402 	long ret;
1403 
1404 	parent = getppid();
1405 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1406 	ASSERT_EQ(0, ret);
1407 
1408 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1409 	ASSERT_EQ(0, ret);
1410 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1411 	ASSERT_EQ(0, ret);
1412 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1413 	ASSERT_EQ(0, ret);
1414 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1415 	ASSERT_EQ(0, ret);
1416 	/* Should work just fine. */
1417 	EXPECT_EQ(parent, syscall(__NR_getppid));
1418 	EXPECT_EQ(0, syscall(__NR_getpid));
1419 }
1420 
1421 TEST_F(precedence, trace_is_fourth)
1422 {
1423 	pid_t parent;
1424 	long ret;
1425 
1426 	parent = getppid();
1427 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1428 	ASSERT_EQ(0, ret);
1429 
1430 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1431 	ASSERT_EQ(0, ret);
1432 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1433 	ASSERT_EQ(0, ret);
1434 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1435 	ASSERT_EQ(0, ret);
1436 	/* Should work just fine. */
1437 	EXPECT_EQ(parent, syscall(__NR_getppid));
1438 	/* No ptracer */
1439 	EXPECT_EQ(-1, syscall(__NR_getpid));
1440 }
1441 
1442 TEST_F(precedence, trace_is_fourth_in_any_order)
1443 {
1444 	pid_t parent;
1445 	long ret;
1446 
1447 	parent = getppid();
1448 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1449 	ASSERT_EQ(0, ret);
1450 
1451 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1452 	ASSERT_EQ(0, ret);
1453 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1454 	ASSERT_EQ(0, ret);
1455 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1456 	ASSERT_EQ(0, ret);
1457 	/* Should work just fine. */
1458 	EXPECT_EQ(parent, syscall(__NR_getppid));
1459 	/* No ptracer */
1460 	EXPECT_EQ(-1, syscall(__NR_getpid));
1461 }
1462 
1463 TEST_F(precedence, log_is_fifth)
1464 {
1465 	pid_t mypid, parent;
1466 	long ret;
1467 
1468 	mypid = getpid();
1469 	parent = getppid();
1470 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1471 	ASSERT_EQ(0, ret);
1472 
1473 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1474 	ASSERT_EQ(0, ret);
1475 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1476 	ASSERT_EQ(0, ret);
1477 	/* Should work just fine. */
1478 	EXPECT_EQ(parent, syscall(__NR_getppid));
1479 	/* Should also work just fine */
1480 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1481 }
1482 
1483 TEST_F(precedence, log_is_fifth_in_any_order)
1484 {
1485 	pid_t mypid, parent;
1486 	long ret;
1487 
1488 	mypid = getpid();
1489 	parent = getppid();
1490 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1491 	ASSERT_EQ(0, ret);
1492 
1493 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1494 	ASSERT_EQ(0, ret);
1495 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1496 	ASSERT_EQ(0, ret);
1497 	/* Should work just fine. */
1498 	EXPECT_EQ(parent, syscall(__NR_getppid));
1499 	/* Should also work just fine */
1500 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1501 }
1502 
1503 #ifndef PTRACE_O_TRACESECCOMP
1504 #define PTRACE_O_TRACESECCOMP	0x00000080
1505 #endif
1506 
1507 /* Catch the Ubuntu 12.04 value error. */
1508 #if PTRACE_EVENT_SECCOMP != 7
1509 #undef PTRACE_EVENT_SECCOMP
1510 #endif
1511 
1512 #ifndef PTRACE_EVENT_SECCOMP
1513 #define PTRACE_EVENT_SECCOMP 7
1514 #endif
1515 
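/* Extract the PTRACE_EVENT_* value from a waitpid() status word. */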
1516 #define PTRACE_EVENT_MASK(status) ((status) >> 16)
1517 bool tracer_running;
1518 void tracer_stop(int sig)
1519 {
1520 	tracer_running = false;
1521 }
1522 
1523 typedef void tracer_func_t(struct __test_metadata *_metadata,
1524 			   pid_t tracee, int status, void *args);
1525 
1526 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1527 	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1528 {
1529 	int ret = -1;
1530 	struct sigaction action = {
1531 		.sa_handler = tracer_stop,
1532 	};
1533 
1534 	/* Allow external shutdown. */
1535 	tracer_running = true;
1536 	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1537 
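	/*
	 * PTRACE_ATTACH can transiently fail (e.g. before the tracee has
	 * granted access via PR_SET_PTRACER), so retry until it succeeds
	 * or fails with EINVAL.
	 */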
1538 	errno = 0;
1539 	while (ret == -1 && errno != EINVAL)
1540 		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1541 	ASSERT_EQ(0, ret) {
1542 		kill(tracee, SIGKILL);
1543 	}
1544 	/* Wait for attach stop */
1545 	wait(NULL);
1546 
1547 	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1548 						      PTRACE_O_TRACESYSGOOD :
1549 						      PTRACE_O_TRACESECCOMP);
1550 	ASSERT_EQ(0, ret) {
1551 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1552 		kill(tracee, SIGKILL);
1553 	}
1554 	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1555 		     tracee, NULL, 0);
1556 	ASSERT_EQ(0, ret);
1557 
1558 	/* Unblock the tracee */
1559 	ASSERT_EQ(1, write(fd, "A", 1));
1560 	ASSERT_EQ(0, close(fd));
1561 
1562 	/* Run until we're shut down. Must assert to stop execution. */
1563 	while (tracer_running) {
1564 		int status;
1565 
1566 		if (wait(&status) != tracee)
1567 			continue;
1568 
1569 		if (WIFSIGNALED(status)) {
1570 			/* Child caught a fatal signal. */
1571 			return;
1572 		}
1573 		if (WIFEXITED(status)) {
1574 			/* Child exited with code. */
1575 			return;
1576 		}
1577 
1578 		/* Check if we got an expected event. */
1579 		ASSERT_EQ(WIFCONTINUED(status), false);
1580 		ASSERT_EQ(WIFSTOPPED(status), true);
1581 		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1582 			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1583 		}
1584 
1585 		tracer_func(_metadata, tracee, status, args);
1586 
1587 		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1588 			     tracee, NULL, 0);
1589 		ASSERT_EQ(0, ret);
1590 	}
1591 	/* Directly report the status of our test harness results. */
1592 	syscall(__NR_exit, _metadata->exit_code);
1593 }
1594 
1595 /* Common tracer setup/teardown functions. */
1596 void cont_handler(int num)
1597 { }
1598 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1599 			  tracer_func_t func, void *args, bool ptrace_syscall)
1600 {
1601 	char sync;
1602 	int pipefd[2];
1603 	pid_t tracer_pid;
1604 	pid_t tracee = getpid();
1605 
1606 	/* Setup a pipe for clean synchronization. */
1607 	ASSERT_EQ(0, pipe(pipefd));
1608 
1609 	/* Fork a child which we'll promote to tracer */
1610 	tracer_pid = fork();
1611 	ASSERT_LE(0, tracer_pid);
1612 	signal(SIGALRM, cont_handler);
1613 	if (tracer_pid == 0) {
1614 		close(pipefd[0]);
1615 		start_tracer(_metadata, pipefd[1], tracee, func, args,
1616 			     ptrace_syscall);
1617 		syscall(__NR_exit, 0);
1618 	}
1619 	close(pipefd[1]);
1620 	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1621 	read(pipefd[0], &sync, 1);
1622 	close(pipefd[0]);
1623 
1624 	return tracer_pid;
1625 }
1626 
1627 void teardown_trace_fixture(struct __test_metadata *_metadata,
1628 			    pid_t tracer)
1629 {
1630 	if (tracer) {
1631 		int status;
1632 		/*
1633 		 * Extract the exit code from the other process and
1634 		 * adopt it for ourselves in case its asserts failed.
1635 		 */
1636 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1637 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1638 		if (WEXITSTATUS(status))
1639 			_metadata->exit_code = KSFT_FAIL;
1640 	}
1641 }
1642 
1643 /* "poke" tracer arguments and function. */
1644 struct tracer_args_poke_t {
1645 	unsigned long poke_addr;
1646 };
1647 
1648 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1649 		 void *args)
1650 {
1651 	int ret;
1652 	unsigned long msg;
1653 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1654 
1655 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1656 	EXPECT_EQ(0, ret);
1657 	/* If this fails, don't try to recover. */
1658 	ASSERT_EQ(0x1001, msg) {
1659 		kill(tracee, SIGKILL);
1660 	}
1661 	/*
1662 	 * Poke in the message.
1663 	 * Registers are not touched to try to keep this relatively arch
1664 	 * agnostic.
1665 	 */
1666 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1667 	EXPECT_EQ(0, ret);
1668 }
1669 
1670 FIXTURE(TRACE_poke) {
1671 	struct sock_fprog prog;
1672 	pid_t tracer;
1673 	long poked;
1674 	struct tracer_args_poke_t tracer_args;
1675 };
1676 
1677 FIXTURE_SETUP(TRACE_poke)
1678 {
1679 	struct sock_filter filter[] = {
1680 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1681 			offsetof(struct seccomp_data, nr)),
1682 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1683 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1684 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1685 	};
1686 
1687 	self->poked = 0;
1688 	memset(&self->prog, 0, sizeof(self->prog));
1689 	self->prog.filter = malloc(sizeof(filter));
1690 	ASSERT_NE(NULL, self->prog.filter);
1691 	memcpy(self->prog.filter, filter, sizeof(filter));
1692 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1693 
1694 	/* Set up tracer args. */
1695 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1696 
1697 	/* Launch tracer. */
1698 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1699 					   &self->tracer_args, false);
1700 }
1701 
1702 FIXTURE_TEARDOWN(TRACE_poke)
1703 {
1704 	teardown_trace_fixture(_metadata, self->tracer);
1705 	if (self->prog.filter)
1706 		free(self->prog.filter);
1707 }
1708 
1709 TEST_F(TRACE_poke, read_has_side_effects)
1710 {
1711 	ssize_t ret;
1712 
1713 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1714 	ASSERT_EQ(0, ret);
1715 
1716 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1717 	ASSERT_EQ(0, ret);
1718 
1719 	EXPECT_EQ(0, self->poked);
1720 	ret = read(-1, NULL, 0);
1721 	EXPECT_EQ(-1, ret);
1722 	EXPECT_EQ(0x1001, self->poked);
1723 }
1724 
1725 TEST_F(TRACE_poke, getpid_runs_normally)
1726 {
1727 	long ret;
1728 
1729 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1730 	ASSERT_EQ(0, ret);
1731 
1732 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1733 	ASSERT_EQ(0, ret);
1734 
1735 	EXPECT_EQ(0, self->poked);
1736 	EXPECT_NE(0, syscall(__NR_getpid));
1737 	EXPECT_EQ(0, self->poked);
1738 }
1739 
1740 #if defined(__x86_64__)
1741 # define ARCH_REGS		struct user_regs_struct
1742 # define SYSCALL_NUM(_regs)	(_regs).orig_rax
1743 # define SYSCALL_RET(_regs)	(_regs).rax
1744 #elif defined(__i386__)
1745 # define ARCH_REGS		struct user_regs_struct
1746 # define SYSCALL_NUM(_regs)	(_regs).orig_eax
1747 # define SYSCALL_RET(_regs)	(_regs).eax
1748 #elif defined(__arm__)
1749 # define ARCH_REGS		struct pt_regs
1750 # define SYSCALL_NUM(_regs)	(_regs).ARM_r7
1751 # ifndef PTRACE_SET_SYSCALL
1752 #  define PTRACE_SET_SYSCALL   23
1753 # endif
1754 # define SYSCALL_NUM_SET(_regs, _nr)	\
1755 		EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1756 # define SYSCALL_RET(_regs)	(_regs).ARM_r0
1757 #elif defined(__aarch64__)
1758 # define ARCH_REGS		struct user_pt_regs
1759 # define SYSCALL_NUM(_regs)	(_regs).regs[8]
1760 # ifndef NT_ARM_SYSTEM_CALL
1761 #  define NT_ARM_SYSTEM_CALL 0x404
1762 # endif
1763 # define SYSCALL_NUM_SET(_regs, _nr)				\
1764 	do {							\
1765 		struct iovec __v;				\
1766 		typeof(_nr) __nr = (_nr);			\
1767 		__v.iov_base = &__nr;				\
1768 		__v.iov_len = sizeof(__nr);			\
1769 		EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,	\
1770 				    NT_ARM_SYSTEM_CALL, &__v));	\
1771 	} while (0)
1772 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1773 #elif defined(__loongarch__)
1774 # define ARCH_REGS		struct user_pt_regs
1775 # define SYSCALL_NUM(_regs)	(_regs).regs[11]
1776 # define SYSCALL_RET(_regs)	(_regs).regs[4]
1777 #elif defined(__riscv) && __riscv_xlen == 64
1778 # define ARCH_REGS		struct user_regs_struct
1779 # define SYSCALL_NUM(_regs)	(_regs).a7
1780 # define SYSCALL_RET(_regs)	(_regs).a0
1781 #elif defined(__csky__)
1782 # define ARCH_REGS		struct pt_regs
1783 #  if defined(__CSKYABIV2__)
1784 #   define SYSCALL_NUM(_regs)	(_regs).regs[3]
1785 #  else
1786 #   define SYSCALL_NUM(_regs)	(_regs).regs[9]
1787 #  endif
1788 # define SYSCALL_RET(_regs)	(_regs).a0
1789 #elif defined(__hppa__)
1790 # define ARCH_REGS		struct user_regs_struct
1791 # define SYSCALL_NUM(_regs)	(_regs).gr[20]
1792 # define SYSCALL_RET(_regs)	(_regs).gr[28]
1793 #elif defined(__powerpc__)
1794 # define ARCH_REGS		struct pt_regs
1795 # define SYSCALL_NUM(_regs)	(_regs).gpr[0]
1796 # define SYSCALL_RET(_regs)	(_regs).gpr[3]
1797 # define SYSCALL_RET_SET(_regs, _val)				\
1798 	do {							\
1799 		typeof(_val) _result = (_val);			\
1800 		if ((_regs.trap & 0xfff0) == 0x3000) {		\
1801 			/*					\
1802 			 * scv 0 system call uses -ve result	\
1803 			 * for error, so no need to adjust.	\
1804 			 */					\
1805 			SYSCALL_RET(_regs) = _result;		\
1806 		} else {					\
1807 			/*					\
1808 			 * A syscall error is signaled by the	\
1809 			 * CR0 SO bit and the code is stored as	\
1810 			 * a positive value.			\
1811 			 */					\
1812 			if (_result < 0) {			\
1813 				SYSCALL_RET(_regs) = -_result;	\
1814 				(_regs).ccr |= 0x10000000;	\
1815 			} else {				\
1816 				SYSCALL_RET(_regs) = _result;	\
1817 				(_regs).ccr &= ~0x10000000;	\
1818 			}					\
1819 		}						\
1820 	} while (0)
1821 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1822 #elif defined(__s390__)
1823 # define ARCH_REGS		s390_regs
1824 # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
1825 # define SYSCALL_RET_SET(_regs, _val)			\
1826 		TH_LOG("Can't modify syscall return on this architecture")
1827 #elif defined(__mips__)
1828 # include <asm/unistd_nr_n32.h>
1829 # include <asm/unistd_nr_n64.h>
1830 # include <asm/unistd_nr_o32.h>
1831 # define ARCH_REGS		struct pt_regs
1832 # define SYSCALL_NUM(_regs)				\
1833 	({						\
1834 		typeof((_regs).regs[2]) _nr;		\
1835 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1836 			_nr = (_regs).regs[4];		\
1837 		else					\
1838 			_nr = (_regs).regs[2];		\
1839 		_nr;					\
1840 	})
1841 # define SYSCALL_NUM_SET(_regs, _nr)			\
1842 	do {						\
1843 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1844 			(_regs).regs[4] = _nr;		\
1845 		else					\
1846 			(_regs).regs[2] = _nr;		\
1847 	} while (0)
1848 # define SYSCALL_RET_SET(_regs, _val)			\
1849 		TH_LOG("Can't modify syscall return on this architecture")
1850 #elif defined(__xtensa__)
1851 # define ARCH_REGS		struct user_pt_regs
1852 # define SYSCALL_NUM(_regs)	(_regs).syscall
1853 /*
1854  * On xtensa, the syscall return value is in register a2 of the
1855  * current register window, which is not at a fixed location.
1856  */
1857 #define SYSCALL_RET(_regs)	(_regs).a[(_regs).windowbase * 4 + 2]
1858 #elif defined(__sh__)
1859 # define ARCH_REGS		struct pt_regs
1860 # define SYSCALL_NUM(_regs)	(_regs).regs[3]
1861 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1862 #elif defined(__mc68000__)
1863 # define ARCH_REGS		struct user_regs_struct
1864 # define SYSCALL_NUM(_regs)	(_regs).orig_d0
1865 # define SYSCALL_RET(_regs)	(_regs).d0
1866 #else
1867 # error "Do not know how to find your architecture's registers and syscalls"
1868 #endif
1869 
1870 /*
1871  * Most architectures can change the syscall by just updating the
1872  * associated register. This is the default if not defined above.
1873  */
1874 #ifndef SYSCALL_NUM_SET
1875 # define SYSCALL_NUM_SET(_regs, _nr)		\
1876 	do {					\
1877 		SYSCALL_NUM(_regs) = (_nr);	\
1878 	} while (0)
1879 #endif
1880 /*
1881  * Most architectures can change the syscall return value by just
1882  * writing to the SYSCALL_RET register. This is the default if not
1883  * defined above. If an architecture cannot set the return value
1884  * (for example, when the syscall number and return value share a
1885  * register), report it with TH_LOG() in an arch-specific definition
1886  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1887  */
1888 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1889 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1890 #endif
1891 #ifndef SYSCALL_RET_SET
1892 # define SYSCALL_RET_SET(_regs, _val)		\
1893 	do {					\
1894 		SYSCALL_RET(_regs) = (_val);	\
1895 	} while (0)
1896 #endif
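
/*
 * Illustrative sketch (not part of the original helpers): the common
 * "skip this syscall" pattern built from the two macros above sets the
 * syscall number to -1 and plants the desired return value, e.g.:
 *
 *	SYSCALL_NUM_SET(regs, -1);
 *	SYSCALL_RET_SET(regs, -ESRCH);
 *
 * The ptrace helpers further below (change_syscall_ret()) do exactly
 * this against a traced child.
 */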
1897 
1898 /* When the syscall return can't be changed, stub out the tests for it. */
1899 #ifndef SYSCALL_RET
1900 # define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
1901 #else
1902 # define EXPECT_SYSCALL_RETURN(val, action)		\
1903 	do {						\
1904 		errno = 0;				\
1905 		if (val < 0) {				\
1906 			EXPECT_EQ(-1, action);		\
1907 			EXPECT_EQ(-(val), errno);	\
1908 		} else {				\
1909 			EXPECT_EQ(val, action);		\
1910 		}					\
1911 	} while (0)
1912 #endif
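
/*
 * Usage sketch (illustrative only): pass a negated errno for syscalls
 * expected to fail, or the plain value for ones expected to succeed:
 *
 *	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
 *	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
 *
 * On architectures without SYSCALL_RET this collapses to only checking
 * that the raw syscall returned -1.
 */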
1913 
1914 /*
1915  * Some architectures (e.g. powerpc) can only set syscall
1916  * return values on syscall exit during ptrace.
1917  */
1918 const bool ptrace_entry_set_syscall_nr = true;
1919 const bool ptrace_entry_set_syscall_ret =
1920 #ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
1921 	true;
1922 #else
1923 	false;
1924 #endif
1925 
1926 /*
1927  * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1928  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1929  */
1930 #if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
1931 # define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
1932 # define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
1933 #else
1934 # define ARCH_GETREGS(_regs)	({					\
1935 		struct iovec __v;					\
1936 		__v.iov_base = &(_regs);				\
1937 		__v.iov_len = sizeof(_regs);				\
1938 		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);	\
1939 	})
1940 # define ARCH_SETREGS(_regs)	({					\
1941 		struct iovec __v;					\
1942 		__v.iov_base = &(_regs);				\
1943 		__v.iov_len = sizeof(_regs);				\
1944 		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);	\
1945 	})
1946 #endif
1947 
1948 /* Architecture-specific syscall fetching routine. */
1949 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1950 {
1951 	ARCH_REGS regs;
1952 
1953 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1954 		return -1;
1955 	}
1956 
1957 	return SYSCALL_NUM(regs);
1958 }
1959 
1960 /* Architecture-specific syscall changing routine. */
1961 void __change_syscall(struct __test_metadata *_metadata,
1962 		    pid_t tracee, long *syscall, long *ret)
1963 {
1964 	ARCH_REGS orig, regs;
1965 
1966 	/* Do not get/set registers if we have nothing to do. */
1967 	if (!syscall && !ret)
1968 		return;
1969 
1970 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1971 		return;
1972 	}
1973 	orig = regs;
1974 
1975 	if (syscall)
1976 		SYSCALL_NUM_SET(regs, *syscall);
1977 
1978 	if (ret)
1979 		SYSCALL_RET_SET(regs, *ret);
1980 
1981 	/* Flush any register changes made. */
1982 	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1983 		EXPECT_EQ(0, ARCH_SETREGS(regs));
1984 }
1985 
1986 /* Change only syscall number. */
1987 void change_syscall_nr(struct __test_metadata *_metadata,
1988 		       pid_t tracee, long syscall)
1989 {
1990 	__change_syscall(_metadata, tracee, &syscall, NULL);
1991 }
1992 
1993 /* Change syscall return value (and set syscall number to -1). */
1994 void change_syscall_ret(struct __test_metadata *_metadata,
1995 			pid_t tracee, long ret)
1996 {
1997 	long syscall = -1;
1998 
1999 	__change_syscall(_metadata, tracee, &syscall, &ret);
2000 }
2001 
2002 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
2003 		    int status, void *args)
2004 {
2005 	int ret;
2006 	unsigned long msg;
2007 
2008 	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
2009 		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
2010 		return;
2011 	}
2012 
2013 	/* Make sure we got the right message. */
2014 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2015 	EXPECT_EQ(0, ret);
2016 
2017 	/* Validate and take action on expected syscalls. */
2018 	switch (msg) {
2019 	case 0x1002:
2020 		/* change getpid to getppid. */
2021 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
2022 		change_syscall_nr(_metadata, tracee, __NR_getppid);
2023 		break;
2024 	case 0x1003:
2025 		/* skip gettid with valid return code. */
2026 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
2027 		change_syscall_ret(_metadata, tracee, 45000);
2028 		break;
2029 	case 0x1004:
2030 		/* skip openat with error. */
2031 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
2032 		change_syscall_ret(_metadata, tracee, -ESRCH);
2033 		break;
2034 	case 0x1005:
2035 		/* do nothing (allow getppid) */
2036 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2037 		break;
2038 	default:
2039 		EXPECT_EQ(0, msg) {
2040 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2041 			kill(tracee, SIGKILL);
2042 		}
2043 	}
2044 
2045 }
2046 
2047 FIXTURE(TRACE_syscall) {
2048 	struct sock_fprog prog;
2049 	pid_t tracer, mytid, mypid, parent;
2050 	long syscall_nr;
2051 };
2052 
2053 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2054 		   int status, void *args)
2055 {
2056 	int ret;
2057 	unsigned long msg;
2058 	static bool entry;
2059 	long syscall_nr_val, syscall_ret_val;
2060 	long *syscall_nr = NULL, *syscall_ret = NULL;
2061 	FIXTURE_DATA(TRACE_syscall) *self = args;
2062 
2063 	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2064 		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2065 		return;
2066 	}
2067 
2068 	/*
2069 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
2070 	 * is by counting.
2071 	 */
2072 	entry = !entry;
2073 
2074 	/* Make sure we got an appropriate message. */
2075 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2076 	EXPECT_EQ(0, ret);
2077 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2078 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2079 
2080 	/*
2081 	 * Some architectures only support setting return values during
2082 	 * syscall exit under ptrace, and on exit the syscall number may
2083 	 * no longer be available. Therefore, save the initial syscall
2084 	 * number here, so it can be examined during both entry and exit
2085 	 * phases.
2086 	 */
2087 	if (entry)
2088 		self->syscall_nr = get_syscall(_metadata, tracee);
2089 
2090 	/*
2091 	 * Depending on the architecture's syscall setting abilities, we
2092 	 * pick which things to set during this phase (entry or exit).
2093 	 */
2094 	if (entry == ptrace_entry_set_syscall_nr)
2095 		syscall_nr = &syscall_nr_val;
2096 	if (entry == ptrace_entry_set_syscall_ret)
2097 		syscall_ret = &syscall_ret_val;
2098 
2099 	/* Now handle the actual rewriting cases. */
2100 	switch (self->syscall_nr) {
2101 	case __NR_getpid:
2102 		syscall_nr_val = __NR_getppid;
2103 		/* Never change syscall return for this case. */
2104 		syscall_ret = NULL;
2105 		break;
2106 	case __NR_gettid:
2107 		syscall_nr_val = -1;
2108 		syscall_ret_val = 45000;
2109 		break;
2110 	case __NR_openat:
2111 		syscall_nr_val = -1;
2112 		syscall_ret_val = -ESRCH;
2113 		break;
2114 	default:
2115 		/* Unhandled, do nothing. */
2116 		return;
2117 	}
2118 
2119 	__change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2120 }
2121 
2122 FIXTURE_VARIANT(TRACE_syscall) {
2123 	/*
2124 	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
2125 	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
2126 	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
2127 	 * ptrace (true).
2128 	 */
2129 	bool use_ptrace;
2130 };
2131 
2132 FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
2133 	.use_ptrace = true,
2134 };
2135 
2136 FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
2137 	.use_ptrace = false,
2138 };
2139 
2140 FIXTURE_SETUP(TRACE_syscall)
2141 {
2142 	struct sock_filter filter[] = {
2143 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2144 			offsetof(struct seccomp_data, nr)),
2145 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2146 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2147 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2148 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2149 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2150 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2151 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2152 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2153 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2154 	};
2155 	struct sock_fprog prog = {
2156 		.len = (unsigned short)ARRAY_SIZE(filter),
2157 		.filter = filter,
2158 	};
2159 	long ret;
2160 
2161 	/* Prepare some testable syscall results. */
2162 	self->mytid = syscall(__NR_gettid);
2163 	ASSERT_GT(self->mytid, 0);
2164 	ASSERT_NE(self->mytid, 1) {
2165 		TH_LOG("Running this test as init is not supported. :)");
2166 	}
2167 
2168 	self->mypid = getpid();
2169 	ASSERT_GT(self->mypid, 0);
2170 	ASSERT_EQ(self->mytid, self->mypid);
2171 
2172 	self->parent = getppid();
2173 	ASSERT_GT(self->parent, 0);
2174 	ASSERT_NE(self->parent, self->mypid);
2175 
2176 	/* Launch tracer. */
2177 	self->tracer = setup_trace_fixture(_metadata,
2178 					   variant->use_ptrace ? tracer_ptrace
2179 							       : tracer_seccomp,
2180 					   self, variant->use_ptrace);
2181 
2182 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2183 	ASSERT_EQ(0, ret);
2184 
2185 	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2186 	if (variant->use_ptrace)
2187 		return;
2188 
2189 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2190 	ASSERT_EQ(0, ret);
2191 }
2192 
2193 FIXTURE_TEARDOWN(TRACE_syscall)
2194 {
2195 	teardown_trace_fixture(_metadata, self->tracer);
2196 }
2197 
2198 TEST(negative_ENOSYS)
2199 {
2200 #if defined(__arm__)
2201 	SKIP(return, "arm32 does not support calling syscall -1");
2202 #endif
2203 	/*
2204 	 * There should be no difference between an "internal" skip
2205 	 * and userspace asking for syscall "-1".
2206 	 */
2207 	errno = 0;
2208 	EXPECT_EQ(-1, syscall(-1));
2209 	EXPECT_EQ(errno, ENOSYS);
2210 	/* And no difference for "still not valid but not -1". */
2211 	errno = 0;
2212 	EXPECT_EQ(-1, syscall(-101));
2213 	EXPECT_EQ(errno, ENOSYS);
2214 }
2215 
2216 TEST_F(TRACE_syscall, negative_ENOSYS)
2217 {
2218 	negative_ENOSYS(_metadata);
2219 }
2220 
2221 TEST_F(TRACE_syscall, syscall_allowed)
2222 {
2223 	/* getppid works as expected (no changes). */
2224 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
2225 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
2226 }
2227 
2228 TEST_F(TRACE_syscall, syscall_redirected)
2229 {
2230 	/* getpid has been redirected to getppid as expected. */
2231 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
2232 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2233 }
2234 
2235 TEST_F(TRACE_syscall, syscall_errno)
2236 {
2237 	/* Tracer should skip the open syscall, resulting in ESRCH. */
2238 	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
2239 }
2240 
2241 TEST_F(TRACE_syscall, syscall_faked)
2242 {
2243 	/* Tracer skips the gettid syscall and stores an altered return value. */
2244 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
2245 }
2246 
2247 TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
2248 {
2249 	struct sock_filter filter[] = {
2250 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2251 			offsetof(struct seccomp_data, nr)),
2252 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
2253 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
2254 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2255 	};
2256 	struct sock_fprog prog = {
2257 		.len = (unsigned short)ARRAY_SIZE(filter),
2258 		.filter = filter,
2259 	};
2260 	long ret;
2261 
2262 	/* Install "kill on mknodat" filter. */
2263 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2264 	ASSERT_EQ(0, ret);
2265 
2266 	/* This should immediately die with SIGSYS, regardless of tracer. */
2267 	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
2268 }
2269 
2270 TEST_F(TRACE_syscall, skip_after)
2271 {
2272 	struct sock_filter filter[] = {
2273 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2274 			offsetof(struct seccomp_data, nr)),
2275 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2276 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2277 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2278 	};
2279 	struct sock_fprog prog = {
2280 		.len = (unsigned short)ARRAY_SIZE(filter),
2281 		.filter = filter,
2282 	};
2283 	long ret;
2284 
2285 	/* Install additional "errno on getppid" filter. */
2286 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2287 	ASSERT_EQ(0, ret);
2288 
2289 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2290 	errno = 0;
2291 	EXPECT_EQ(-1, syscall(__NR_getpid));
2292 	EXPECT_EQ(EPERM, errno);
2293 }
2294 
2295 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2296 {
2297 	struct sock_filter filter[] = {
2298 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2299 			offsetof(struct seccomp_data, nr)),
2300 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2301 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2302 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2303 	};
2304 	struct sock_fprog prog = {
2305 		.len = (unsigned short)ARRAY_SIZE(filter),
2306 		.filter = filter,
2307 	};
2308 	long ret;
2309 
2310 	/* Install additional "death on getppid" filter. */
2311 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2312 	ASSERT_EQ(0, ret);
2313 
2314 	/* Tracer will redirect getpid to getppid, and we should die. */
2315 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2316 }
2317 
2318 TEST(seccomp_syscall)
2319 {
2320 	struct sock_filter filter[] = {
2321 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2322 	};
2323 	struct sock_fprog prog = {
2324 		.len = (unsigned short)ARRAY_SIZE(filter),
2325 		.filter = filter,
2326 	};
2327 	long ret;
2328 
2329 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2330 	ASSERT_EQ(0, ret) {
2331 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2332 	}
2333 
2334 	/* Reject insane operation. */
2335 	ret = seccomp(-1, 0, &prog);
2336 	ASSERT_NE(ENOSYS, errno) {
2337 		TH_LOG("Kernel does not support seccomp syscall!");
2338 	}
2339 	EXPECT_EQ(EINVAL, errno) {
2340 		TH_LOG("Did not reject crazy op value!");
2341 	}
2342 
2343 	/* Reject strict with flags or pointer. */
2344 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2345 	EXPECT_EQ(EINVAL, errno) {
2346 		TH_LOG("Did not reject mode strict with flags!");
2347 	}
2348 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2349 	EXPECT_EQ(EINVAL, errno) {
2350 		TH_LOG("Did not reject mode strict with uargs!");
2351 	}
2352 
2353 	/* Reject insane args for filter. */
2354 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2355 	EXPECT_EQ(EINVAL, errno) {
2356 		TH_LOG("Did not reject crazy filter flags!");
2357 	}
2358 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2359 	EXPECT_EQ(EFAULT, errno) {
2360 		TH_LOG("Did not reject NULL filter!");
2361 	}
2362 
2363 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2364 	EXPECT_EQ(0, errno) {
2365 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2366 			strerror(errno));
2367 	}
2368 }
2369 
2370 TEST(seccomp_syscall_mode_lock)
2371 {
2372 	struct sock_filter filter[] = {
2373 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2374 	};
2375 	struct sock_fprog prog = {
2376 		.len = (unsigned short)ARRAY_SIZE(filter),
2377 		.filter = filter,
2378 	};
2379 	long ret;
2380 
2381 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2382 	ASSERT_EQ(0, ret) {
2383 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2384 	}
2385 
2386 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2387 	ASSERT_NE(ENOSYS, errno) {
2388 		TH_LOG("Kernel does not support seccomp syscall!");
2389 	}
2390 	EXPECT_EQ(0, ret) {
2391 		TH_LOG("Could not install filter!");
2392 	}
2393 
2394 	/* Make sure neither entry point will switch to strict. */
2395 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2396 	EXPECT_EQ(EINVAL, errno) {
2397 		TH_LOG("Switched to mode strict!");
2398 	}
2399 
2400 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2401 	EXPECT_EQ(EINVAL, errno) {
2402 		TH_LOG("Switched to mode strict!");
2403 	}
2404 }
2405 
2406 /*
2407  * Test detection of known and unknown filter flags. Userspace needs to be able
2408  * to check if a filter flag is supported by the current kernel and a good way
2409  * of doing that is by attempting to enter filter mode, with the flag bit in
2410  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2411  * that the flag is valid and EINVAL indicates that the flag is invalid.
2412  */
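
/*
 * A minimal, hypothetical probe built on the rule above (the test below
 * open-codes the same check rather than using this helper):
 */
static inline bool __attribute__((unused))
seccomp_filter_flag_known(unsigned int flag)
{
	/* EFAULT means the flag was accepted but uargs was NULL. */
	return seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL) == -1 &&
	       errno == EFAULT;
}
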
2413 TEST(detect_seccomp_filter_flags)
2414 {
2415 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2416 				 SECCOMP_FILTER_FLAG_LOG,
2417 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2418 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2419 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2420 	unsigned int exclusive[] = {
2421 				SECCOMP_FILTER_FLAG_TSYNC,
2422 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2423 	unsigned int flag, all_flags, exclusive_mask;
2424 	int i;
2425 	long ret;
2426 
2427 	/* Test detection of individual known-good filter flags */
2428 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2429 		int bits = 0;
2430 
2431 		flag = flags[i];
2432 		/* Make sure the flag is a single bit! */
2433 		while (flag) {
2434 			if (flag & 0x1)
2435 				bits++;
2436 			flag >>= 1;
2437 		}
2438 		ASSERT_EQ(1, bits);
2439 		flag = flags[i];
2440 
2441 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2442 		ASSERT_NE(ENOSYS, errno) {
2443 			TH_LOG("Kernel does not support seccomp syscall!");
2444 		}
2445 		EXPECT_EQ(-1, ret);
2446 		EXPECT_EQ(EFAULT, errno) {
2447 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2448 			       flag);
2449 		}
2450 
2451 		all_flags |= flag;
2452 	}
2453 
2454 	/*
2455 	 * Test detection of all known-good filter flags combined. But
2456 	 * for the exclusive flags we need to mask them out and try them
2457 	 * individually for the "all flags" testing.
2458 	 */
2459 	exclusive_mask = 0;
2460 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2461 		exclusive_mask |= exclusive[i];
2462 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2463 		flag = all_flags & ~exclusive_mask;
2464 		flag |= exclusive[i];
2465 
2466 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2467 		EXPECT_EQ(-1, ret);
2468 		EXPECT_EQ(EFAULT, errno) {
2469 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2470 			       flag);
2471 		}
2472 	}
2473 
2474 	/* Test detection of an unknown filter flag, without exclusives. */
2475 	flag = -1;
2476 	flag &= ~exclusive_mask;
2477 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2478 	EXPECT_EQ(-1, ret);
2479 	EXPECT_EQ(EINVAL, errno) {
2480 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2481 		       flag);
2482 	}
2483 
2484 	/*
2485 	 * Test detection of an unknown filter flag that may simply need to be
2486 	 * added to this test.
2487 	 */
2488 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2489 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2490 	EXPECT_EQ(-1, ret);
2491 	EXPECT_EQ(EINVAL, errno) {
2492 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2493 		       flag);
2494 	}
2495 }
2496 
2497 TEST(TSYNC_first)
2498 {
2499 	struct sock_filter filter[] = {
2500 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2501 	};
2502 	struct sock_fprog prog = {
2503 		.len = (unsigned short)ARRAY_SIZE(filter),
2504 		.filter = filter,
2505 	};
2506 	long ret;
2507 
2508 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2509 	ASSERT_EQ(0, ret) {
2510 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2511 	}
2512 
2513 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2514 		      &prog);
2515 	ASSERT_NE(ENOSYS, errno) {
2516 		TH_LOG("Kernel does not support seccomp syscall!");
2517 	}
2518 	EXPECT_EQ(0, ret) {
2519 		TH_LOG("Could not install initial filter with TSYNC!");
2520 	}
2521 }
2522 
2523 #define TSYNC_SIBLINGS 2
2524 struct tsync_sibling {
2525 	pthread_t tid;
2526 	pid_t system_tid;
2527 	sem_t *started;
2528 	pthread_cond_t *cond;
2529 	pthread_mutex_t *mutex;
2530 	int diverge;
2531 	int num_waits;
2532 	struct sock_fprog *prog;
2533 	struct __test_metadata *metadata;
2534 };
2535 
2536 /*
2537  * To avoid joining joined threads (which is not allowed by Bionic),
2538  * make sure we both successfully join and clear the tid to skip a
2539  * later join attempt during fixture teardown. Any remaining threads
2540  * will be directly killed during teardown.
2541  */
2542 #define PTHREAD_JOIN(tid, status)					\
2543 	do {								\
2544 		int _rc = pthread_join(tid, status);			\
2545 		if (_rc) {						\
2546 			TH_LOG("pthread_join of tid %u failed: %d\n",	\
2547 				(unsigned int)tid, _rc);		\
2548 		} else {						\
2549 			tid = 0;					\
2550 		}							\
2551 	} while (0)
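
/*
 * Typical use in the TSYNC tests below (illustrative):
 *
 *	PTHREAD_JOIN(self->sibling[0].tid, &status);
 *	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
 */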
2552 
2553 FIXTURE(TSYNC) {
2554 	struct sock_fprog root_prog, apply_prog;
2555 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2556 	sem_t started;
2557 	pthread_cond_t cond;
2558 	pthread_mutex_t mutex;
2559 	int sibling_count;
2560 };
2561 
2562 FIXTURE_SETUP(TSYNC)
2563 {
2564 	struct sock_filter root_filter[] = {
2565 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2566 	};
2567 	struct sock_filter apply_filter[] = {
2568 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2569 			offsetof(struct seccomp_data, nr)),
2570 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2571 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2572 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2573 	};
2574 
2575 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2576 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2577 	memset(&self->sibling, 0, sizeof(self->sibling));
2578 	self->root_prog.filter = malloc(sizeof(root_filter));
2579 	ASSERT_NE(NULL, self->root_prog.filter);
2580 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2581 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2582 
2583 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2584 	ASSERT_NE(NULL, self->apply_prog.filter);
2585 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2586 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2587 
2588 	self->sibling_count = 0;
2589 	pthread_mutex_init(&self->mutex, NULL);
2590 	pthread_cond_init(&self->cond, NULL);
2591 	sem_init(&self->started, 0, 0);
2592 	self->sibling[0].tid = 0;
2593 	self->sibling[0].cond = &self->cond;
2594 	self->sibling[0].started = &self->started;
2595 	self->sibling[0].mutex = &self->mutex;
2596 	self->sibling[0].diverge = 0;
2597 	self->sibling[0].num_waits = 1;
2598 	self->sibling[0].prog = &self->root_prog;
2599 	self->sibling[0].metadata = _metadata;
2600 	self->sibling[1].tid = 0;
2601 	self->sibling[1].cond = &self->cond;
2602 	self->sibling[1].started = &self->started;
2603 	self->sibling[1].mutex = &self->mutex;
2604 	self->sibling[1].diverge = 0;
2605 	self->sibling[1].prog = &self->root_prog;
2606 	self->sibling[1].num_waits = 1;
2607 	self->sibling[1].metadata = _metadata;
2608 }
2609 
2610 FIXTURE_TEARDOWN(TSYNC)
2611 {
2612 	int sib = 0;
2613 
2614 	if (self->root_prog.filter)
2615 		free(self->root_prog.filter);
2616 	if (self->apply_prog.filter)
2617 		free(self->apply_prog.filter);
2618 
2619 	for ( ; sib < self->sibling_count; ++sib) {
2620 		struct tsync_sibling *s = &self->sibling[sib];
2621 
2622 		if (!s->tid)
2623 			continue;
2624 		/*
2625 		 * If a thread is still running, it may be stuck, so hit
2626 		 * it over the head really hard.
2627 		 */
2628 		pthread_kill(s->tid, 9);
2629 	}
2630 	pthread_mutex_destroy(&self->mutex);
2631 	pthread_cond_destroy(&self->cond);
2632 	sem_destroy(&self->started);
2633 }
2634 
2635 void *tsync_sibling(void *data)
2636 {
2637 	long ret = 0;
2638 	struct tsync_sibling *me = data;
2639 
2640 	me->system_tid = syscall(__NR_gettid);
2641 
2642 	pthread_mutex_lock(me->mutex);
2643 	if (me->diverge) {
2644 		/* Just re-apply the root prog to fork the tree */
2645 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2646 				me->prog, 0, 0);
2647 	}
2648 	sem_post(me->started);
2649 	/* Return after posting to "started" so the parent notices failures. */
2650 	if (ret) {
2651 		pthread_mutex_unlock(me->mutex);
2652 		return (void *)SIBLING_EXIT_FAILURE;
2653 	}
2654 	do {
2655 		pthread_cond_wait(me->cond, me->mutex);
2656 		me->num_waits = me->num_waits - 1;
2657 	} while (me->num_waits);
2658 	pthread_mutex_unlock(me->mutex);
2659 
2660 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2661 	if (!ret)
2662 		return (void *)SIBLING_EXIT_NEWPRIVS;
2663 	read(-1, NULL, 0);
2664 	return (void *)SIBLING_EXIT_UNKILLED;
2665 }
2666 
2667 void tsync_start_sibling(struct tsync_sibling *sibling)
2668 {
2669 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2670 }
2671 
2672 TEST_F(TSYNC, siblings_fail_prctl)
2673 {
2674 	long ret;
2675 	void *status;
2676 	struct sock_filter filter[] = {
2677 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2678 			offsetof(struct seccomp_data, nr)),
2679 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2680 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2681 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2682 	};
2683 	struct sock_fprog prog = {
2684 		.len = (unsigned short)ARRAY_SIZE(filter),
2685 		.filter = filter,
2686 	};
2687 
2688 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2689 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2690 	}
2691 
2692 	/* Check prctl failure detection by requesting sib 0 diverge. */
2693 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2694 	ASSERT_NE(ENOSYS, errno) {
2695 		TH_LOG("Kernel does not support seccomp syscall!");
2696 	}
2697 	ASSERT_EQ(0, ret) {
2698 		TH_LOG("setting filter failed");
2699 	}
2700 
2701 	self->sibling[0].diverge = 1;
2702 	tsync_start_sibling(&self->sibling[0]);
2703 	tsync_start_sibling(&self->sibling[1]);
2704 
2705 	while (self->sibling_count < TSYNC_SIBLINGS) {
2706 		sem_wait(&self->started);
2707 		self->sibling_count++;
2708 	}
2709 
2710 	/* Signal the threads to clean up. */
2711 	pthread_mutex_lock(&self->mutex);
2712 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2713 		TH_LOG("cond broadcast non-zero");
2714 	}
2715 	pthread_mutex_unlock(&self->mutex);
2716 
2717 	/* Ensure diverging sibling failed to call prctl. */
2718 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2719 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2720 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2721 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2722 }
2723 
2724 TEST_F(TSYNC, two_siblings_with_ancestor)
2725 {
2726 	long ret;
2727 	void *status;
2728 
2729 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2730 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2731 	}
2732 
2733 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2734 	ASSERT_NE(ENOSYS, errno) {
2735 		TH_LOG("Kernel does not support seccomp syscall!");
2736 	}
2737 	ASSERT_EQ(0, ret) {
2738 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2739 	}
2740 	tsync_start_sibling(&self->sibling[0]);
2741 	tsync_start_sibling(&self->sibling[1]);
2742 
2743 	while (self->sibling_count < TSYNC_SIBLINGS) {
2744 		sem_wait(&self->started);
2745 		self->sibling_count++;
2746 	}
2747 
2748 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2749 		      &self->apply_prog);
2750 	ASSERT_EQ(0, ret) {
2751 		TH_LOG("Could not install filter on all threads!");
2752 	}
2753 	/* Tell the siblings to test the policy */
2754 	pthread_mutex_lock(&self->mutex);
2755 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2756 		TH_LOG("cond broadcast non-zero");
2757 	}
2758 	pthread_mutex_unlock(&self->mutex);
2759 	/* Ensure they are both killed and don't exit cleanly. */
2760 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2761 	EXPECT_EQ(0x0, (long)status);
2762 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2763 	EXPECT_EQ(0x0, (long)status);
2764 }
2765 
2766 TEST_F(TSYNC, two_sibling_want_nnp)
2767 {
2768 	void *status;
2769 
2770 	/* start siblings before any prctl() operations */
2771 	tsync_start_sibling(&self->sibling[0]);
2772 	tsync_start_sibling(&self->sibling[1]);
2773 	while (self->sibling_count < TSYNC_SIBLINGS) {
2774 		sem_wait(&self->started);
2775 		self->sibling_count++;
2776 	}
2777 
2778 	/* Tell the siblings to test no policy */
2779 	pthread_mutex_lock(&self->mutex);
2780 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2781 		TH_LOG("cond broadcast non-zero");
2782 	}
2783 	pthread_mutex_unlock(&self->mutex);
2784 
2785 	/* Ensure they are both upset about lacking nnp. */
2786 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2787 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2788 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2789 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2790 }
2791 
2792 TEST_F(TSYNC, two_siblings_with_no_filter)
2793 {
2794 	long ret;
2795 	void *status;
2796 
2797 	/* start siblings before any prctl() operations */
2798 	tsync_start_sibling(&self->sibling[0]);
2799 	tsync_start_sibling(&self->sibling[1]);
2800 	while (self->sibling_count < TSYNC_SIBLINGS) {
2801 		sem_wait(&self->started);
2802 		self->sibling_count++;
2803 	}
2804 
2805 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2806 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2807 	}
2808 
2809 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2810 		      &self->apply_prog);
2811 	ASSERT_NE(ENOSYS, errno) {
2812 		TH_LOG("Kernel does not support seccomp syscall!");
2813 	}
2814 	ASSERT_EQ(0, ret) {
2815 		TH_LOG("Could not install filter on all threads!");
2816 	}
2817 
2818 	/* Tell the siblings to test the policy */
2819 	pthread_mutex_lock(&self->mutex);
2820 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2821 		TH_LOG("cond broadcast non-zero");
2822 	}
2823 	pthread_mutex_unlock(&self->mutex);
2824 
2825 	/* Ensure they are both killed and don't exit cleanly. */
2826 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2827 	EXPECT_EQ(0x0, (long)status);
2828 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2829 	EXPECT_EQ(0x0, (long)status);
2830 }
2831 
2832 TEST_F(TSYNC, two_siblings_with_one_divergence)
2833 {
2834 	long ret;
2835 	void *status;
2836 
2837 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2838 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2839 	}
2840 
2841 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2842 	ASSERT_NE(ENOSYS, errno) {
2843 		TH_LOG("Kernel does not support seccomp syscall!");
2844 	}
2845 	ASSERT_EQ(0, ret) {
2846 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2847 	}
2848 	self->sibling[0].diverge = 1;
2849 	tsync_start_sibling(&self->sibling[0]);
2850 	tsync_start_sibling(&self->sibling[1]);
2851 
2852 	while (self->sibling_count < TSYNC_SIBLINGS) {
2853 		sem_wait(&self->started);
2854 		self->sibling_count++;
2855 	}
2856 
2857 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2858 		      &self->apply_prog);
2859 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2860 		TH_LOG("Did not fail on diverged sibling.");
2861 	}
2862 
2863 	/* Wake the threads */
2864 	pthread_mutex_lock(&self->mutex);
2865 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2866 		TH_LOG("cond broadcast non-zero");
2867 	}
2868 	pthread_mutex_unlock(&self->mutex);
2869 
2870 	/* Ensure they are both unkilled. */
2871 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2872 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2873 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2874 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2875 }
2876 
2877 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2878 {
2879 	long ret, flags;
2880 	void *status;
2881 
2882 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2883 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2884 	}
2885 
2886 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2887 	ASSERT_NE(ENOSYS, errno) {
2888 		TH_LOG("Kernel does not support seccomp syscall!");
2889 	}
2890 	ASSERT_EQ(0, ret) {
2891 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2892 	}
2893 	self->sibling[0].diverge = 1;
2894 	tsync_start_sibling(&self->sibling[0]);
2895 	tsync_start_sibling(&self->sibling[1]);
2896 
2897 	while (self->sibling_count < TSYNC_SIBLINGS) {
2898 		sem_wait(&self->started);
2899 		self->sibling_count++;
2900 	}
2901 
2902 	flags = SECCOMP_FILTER_FLAG_TSYNC |
2903 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2904 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2905 	ASSERT_EQ(ESRCH, errno) {
2906 		TH_LOG("Did not return ESRCH for diverged sibling.");
2907 	}
2908 	ASSERT_EQ(-1, ret) {
2909 		TH_LOG("Did not fail on diverged sibling.");
2910 	}
2911 
2912 	/* Wake the threads */
2913 	pthread_mutex_lock(&self->mutex);
2914 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2915 		TH_LOG("cond broadcast non-zero");
2916 	}
2917 	pthread_mutex_unlock(&self->mutex);
2918 
2919 	/* Ensure they are both unkilled. */
2920 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2921 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2922 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2923 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2924 }
2925 
2926 TEST_F(TSYNC, two_siblings_not_under_filter)
2927 {
2928 	long ret, sib;
2929 	void *status;
2930 	struct timespec delay = { .tv_nsec = 100000000 };
2931 
2932 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2933 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2934 	}
2935 
2936 	/*
2937 	 * Sibling 0 will have its own seccomp policy
2938 	 * and Sibling 1 will not be under seccomp at
2939 	 * all. Sibling 1 will enter seccomp and 0
2940 	 * will cause failure.
2941 	 */
2942 	self->sibling[0].diverge = 1;
2943 	tsync_start_sibling(&self->sibling[0]);
2944 	tsync_start_sibling(&self->sibling[1]);
2945 
2946 	while (self->sibling_count < TSYNC_SIBLINGS) {
2947 		sem_wait(&self->started);
2948 		self->sibling_count++;
2949 	}
2950 
2951 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2952 	ASSERT_NE(ENOSYS, errno) {
2953 		TH_LOG("Kernel does not support seccomp syscall!");
2954 	}
2955 	ASSERT_EQ(0, ret) {
2956 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2957 	}
2958 
2959 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2960 		      &self->apply_prog);
2961 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2962 		TH_LOG("Did not fail on diverged sibling.");
2963 	}
2964 	sib = 1;
2965 	if (ret == self->sibling[0].system_tid)
2966 		sib = 0;
2967 
2968 	pthread_mutex_lock(&self->mutex);
2969 
2970 	/* Increment the other sibling's num_waits so we can clean up
2971 	 * the one we just saw.
2972 	 */
2973 	self->sibling[!sib].num_waits += 1;
2974 
2975 	/* Signal the thread to clean up. */
2976 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2977 		TH_LOG("cond broadcast non-zero");
2978 	}
2979 	pthread_mutex_unlock(&self->mutex);
2980 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2981 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2982 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2983 	while (!kill(self->sibling[sib].system_tid, 0))
2984 		nanosleep(&delay, NULL);
2985 	/* Switch to the remaining sibling */
2986 	sib = !sib;
2987 
2988 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2989 		      &self->apply_prog);
2990 	ASSERT_EQ(0, ret) {
2991 		TH_LOG("Expected the remaining sibling to sync");
2992 	};
2993 
2994 	pthread_mutex_lock(&self->mutex);
2995 
2996 	/* If the remaining sibling didn't have a chance to wake up during
2997 	 * the first broadcast, manually reduce the num_waits now.
2998 	 */
2999 	if (self->sibling[sib].num_waits > 1)
3000 		self->sibling[sib].num_waits = 1;
3001 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
3002 		TH_LOG("cond broadcast non-zero");
3003 	}
3004 	pthread_mutex_unlock(&self->mutex);
3005 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
3006 	EXPECT_EQ(0, (long)status);
3007 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
3008 	while (!kill(self->sibling[sib].system_tid, 0))
3009 		nanosleep(&delay, NULL);
3010 
3011 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
3012 		      &self->apply_prog);
3013 	ASSERT_EQ(0, ret);  /* just us chickens */
3014 }
3015 
3016 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
3017 TEST(syscall_restart)
3018 {
3019 	long ret;
3020 	unsigned long msg;
3021 	pid_t child_pid;
3022 	int pipefd[2];
3023 	int status;
3024 	siginfo_t info = { };
3025 	struct sock_filter filter[] = {
3026 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3027 			 offsetof(struct seccomp_data, nr)),
3028 
3029 #ifdef __NR_sigreturn
3030 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
3031 #endif
3032 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
3033 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
3034 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
3035 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
3036 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3037 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3038 
3039 		/* Allow __NR_write for easy logging. */
3040 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3041 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3042 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3043 		/* The nanosleep jump target. */
3044 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3045 		/* The restart_syscall jump target. */
3046 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3047 	};
3048 	struct sock_fprog prog = {
3049 		.len = (unsigned short)ARRAY_SIZE(filter),
3050 		.filter = filter,
3051 	};
3052 #if defined(__arm__)
3053 	struct utsname utsbuf;
3054 #endif
3055 
3056 	ASSERT_EQ(0, pipe(pipefd));
3057 
3058 	child_pid = fork();
3059 	ASSERT_LE(0, child_pid);
3060 	if (child_pid == 0) {
3061 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
3062 		char buf = ' ';
3063 		struct timespec timeout = { };
3064 
3065 		/* Attach parent as tracer and stop. */
3066 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3067 		EXPECT_EQ(0, raise(SIGSTOP));
3068 
3069 		EXPECT_EQ(0, close(pipefd[1]));
3070 
3071 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3072 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3073 		}
3074 
3075 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3076 		EXPECT_EQ(0, ret) {
3077 			TH_LOG("Failed to install filter!");
3078 		}
3079 
3080 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3081 			TH_LOG("Failed to read() sync from parent");
3082 		}
3083 		EXPECT_EQ('.', buf) {
3084 			TH_LOG("Failed to get sync data from read()");
3085 		}
3086 
3087 		/* Start nanosleep to be interrupted. */
3088 		timeout.tv_sec = 1;
3089 		errno = 0;
3090 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3091 			TH_LOG("Call to nanosleep() failed (errno %d: %s)",
3092 				errno, strerror(errno));
3093 		}
3094 
3095 		/* Read final sync from parent. */
3096 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3097 			TH_LOG("Failed final read() from parent");
3098 		}
3099 		EXPECT_EQ('!', buf) {
3100 			TH_LOG("Failed to get final data from read()");
3101 		}
3102 
3103 		/* Directly report the status of our test harness results. */
3104 		syscall(__NR_exit, _metadata->exit_code);
3105 	}
3106 	EXPECT_EQ(0, close(pipefd[0]));
3107 
3108 	/* Attach to child, setup options, and release. */
3109 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3110 	ASSERT_EQ(true, WIFSTOPPED(status));
3111 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3112 			    PTRACE_O_TRACESECCOMP));
3113 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3114 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
3115 
3116 	/* Wait for nanosleep() to start. */
3117 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3118 	ASSERT_EQ(true, WIFSTOPPED(status));
3119 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3120 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3121 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3122 	ASSERT_EQ(0x100, msg);
3123 	ret = get_syscall(_metadata, child_pid);
3124 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3125 
3126 	/* Might as well check siginfo for sanity while we're here. */
3127 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3128 	ASSERT_EQ(SIGTRAP, info.si_signo);
3129 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3130 	EXPECT_EQ(0, info.si_errno);
3131 	EXPECT_EQ(getuid(), info.si_uid);
3132 	/* Verify signal delivery came from child (seccomp-triggered). */
3133 	EXPECT_EQ(child_pid, info.si_pid);
3134 
3135 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3136 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3137 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3138 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3139 	ASSERT_EQ(true, WIFSTOPPED(status));
3140 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3141 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3142 	/*
3143 	 * There is no siginfo on SIGSTOP any more, so we can't verify
3144 	 * signal delivery came from parent now (getpid() == info.si_pid).
3145 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3146 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3147 	 */
3148 	EXPECT_EQ(SIGSTOP, info.si_signo);
3149 
3150 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3151 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
3152 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3153 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3154 	ASSERT_EQ(true, WIFSTOPPED(status));
3155 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3156 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3157 
3158 	/* Wait for restart_syscall() to start. */
3159 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3160 	ASSERT_EQ(true, WIFSTOPPED(status));
3161 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3162 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3163 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3164 
3165 	ASSERT_EQ(0x200, msg);
3166 	ret = get_syscall(_metadata, child_pid);
3167 #if defined(__arm__)
3168 	/*
3169 	 * FIXME:
3170 	 * - native ARM registers do NOT expose true syscall.
3171 	 * - compat ARM registers on ARM64 DO expose true syscall.
3172 	 */
3173 	ASSERT_EQ(0, uname(&utsbuf));
3174 	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
3175 		EXPECT_EQ(__NR_nanosleep, ret);
3176 	} else
3177 #endif
3178 	{
3179 		EXPECT_EQ(__NR_restart_syscall, ret);
3180 	}
3181 
3182 	/* Write again to end test. */
3183 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3184 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
3185 	EXPECT_EQ(0, close(pipefd[1]));
3186 
3187 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3188 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
3189 		_metadata->exit_code = KSFT_FAIL;
3190 }
3191 
3192 TEST_SIGNAL(filter_flag_log, SIGSYS)
3193 {
3194 	struct sock_filter allow_filter[] = {
3195 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3196 	};
3197 	struct sock_filter kill_filter[] = {
3198 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3199 			offsetof(struct seccomp_data, nr)),
3200 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3201 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3202 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3203 	};
3204 	struct sock_fprog allow_prog = {
3205 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
3206 		.filter = allow_filter,
3207 	};
3208 	struct sock_fprog kill_prog = {
3209 		.len = (unsigned short)ARRAY_SIZE(kill_filter),
3210 		.filter = kill_filter,
3211 	};
3212 	long ret;
3213 	pid_t parent = getppid();
3214 
3215 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3216 	ASSERT_EQ(0, ret);
3217 
3218 	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3219 	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3220 		      &allow_prog);
3221 	ASSERT_NE(ENOSYS, errno) {
3222 		TH_LOG("Kernel does not support seccomp syscall!");
3223 	}
3224 	EXPECT_NE(0, ret) {
3225 		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3226 	}
3227 	EXPECT_EQ(EINVAL, errno) {
3228 		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3229 	}
3230 
3231 	/* Verify that a simple, permissive filter can be added with no flags */
3232 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3233 	EXPECT_EQ(0, ret);
3234 
3235 	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3236 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3237 		      &allow_prog);
3238 	ASSERT_NE(EINVAL, errno) {
3239 		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3240 	}
3241 	EXPECT_EQ(0, ret);
3242 
3243 	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3244 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3245 		      &kill_prog);
3246 	EXPECT_EQ(0, ret);
3247 
3248 	EXPECT_EQ(parent, syscall(__NR_getppid));
3249 	/* getpid() should never return. */
3250 	EXPECT_EQ(0, syscall(__NR_getpid));
3251 }
3252 
3253 TEST(get_action_avail)
3254 {
3255 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3256 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3257 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3258 	__u32 unknown_action = 0x10000000U;
3259 	int i;
3260 	long ret;
3261 
3262 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3263 	ASSERT_NE(ENOSYS, errno) {
3264 		TH_LOG("Kernel does not support seccomp syscall!");
3265 	}
3266 	ASSERT_NE(EINVAL, errno) {
3267 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3268 	}
3269 	EXPECT_EQ(ret, 0);
3270 
3271 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3272 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3273 		EXPECT_EQ(ret, 0) {
3274 			TH_LOG("Expected action (0x%X) not available!",
3275 			       actions[i]);
3276 		}
3277 	}
3278 
3279 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3280 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3281 	EXPECT_EQ(ret, -1);
3282 	EXPECT_EQ(errno, EOPNOTSUPP);
3283 }
3284 
3285 TEST(get_metadata)
3286 {
3287 	pid_t pid;
3288 	int pipefd[2];
3289 	char buf;
3290 	struct seccomp_metadata md;
3291 	long ret;
3292 
3293 	/* Only real root can get metadata. */
3294 	if (geteuid()) {
3295 		SKIP(return, "get_metadata requires real root");
3296 		return;
3297 	}
3298 
3299 	ASSERT_EQ(0, pipe(pipefd));
3300 
3301 	pid = fork();
3302 	ASSERT_GE(pid, 0);
3303 	if (pid == 0) {
3304 		struct sock_filter filter[] = {
3305 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3306 		};
3307 		struct sock_fprog prog = {
3308 			.len = (unsigned short)ARRAY_SIZE(filter),
3309 			.filter = filter,
3310 		};
3311 
3312 		/* one with log, one without */
3313 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3314 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3315 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3316 
3317 		EXPECT_EQ(0, close(pipefd[0]));
3318 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3319 		ASSERT_EQ(0, close(pipefd[1]));
3320 
3321 		while (1)
3322 			sleep(100);
3323 	}
3324 
3325 	ASSERT_EQ(0, close(pipefd[1]));
3326 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3327 
3328 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3329 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3330 
3331 	/* Past here we must not use ASSERT, or the child process is never killed. */
3332 
3333 	md.filter_off = 0;
3334 	errno = 0;
3335 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3336 	EXPECT_EQ(sizeof(md), ret) {
3337 		if (errno == EINVAL)
3338 			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3339 	}
3340 
3341 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3342 	EXPECT_EQ(md.filter_off, 0);
3343 
3344 	md.filter_off = 1;
3345 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3346 	EXPECT_EQ(sizeof(md), ret);
3347 	EXPECT_EQ(md.flags, 0);
3348 	EXPECT_EQ(md.filter_off, 1);
3349 
3350 skip:
3351 	ASSERT_EQ(0, kill(pid, SIGKILL));
3352 }
3353 
3354 static int user_notif_syscall(int nr, unsigned int flags)
3355 {
3356 	struct sock_filter filter[] = {
3357 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3358 			offsetof(struct seccomp_data, nr)),
3359 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3360 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3361 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3362 	};
3363 
3364 	struct sock_fprog prog = {
3365 		.len = (unsigned short)ARRAY_SIZE(filter),
3366 		.filter = filter,
3367 	};
3368 
3369 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3370 }
3371 
3372 #define USER_NOTIF_MAGIC INT_MAX
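
/*
 * Sketch of the supervisor side exercised by the tests below (not a
 * complete program): poll the listener fd, receive a request, and
 * answer it, e.g.:
 *
 *	struct seccomp_notif req = {};
 *	struct seccomp_notif_resp resp = {};
 *
 *	ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
 *	resp.id = req.id;
 *	resp.error = 0;
 *	resp.val = USER_NOTIF_MAGIC;
 *	ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
 */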
3373 TEST(user_notification_basic)
3374 {
3375 	pid_t pid;
3376 	long ret;
3377 	int status, listener;
3378 	struct seccomp_notif req = {};
3379 	struct seccomp_notif_resp resp = {};
3380 	struct pollfd pollfd;
3381 
3382 	struct sock_filter filter[] = {
3383 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3384 	};
3385 	struct sock_fprog prog = {
3386 		.len = (unsigned short)ARRAY_SIZE(filter),
3387 		.filter = filter,
3388 	};
3389 
3390 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3391 	ASSERT_EQ(0, ret) {
3392 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3393 	}
3394 
3395 	pid = fork();
3396 	ASSERT_GE(pid, 0);
3397 
3398 	/* Check that we get -ENOSYS with no listener attached */
3399 	if (pid == 0) {
3400 		if (user_notif_syscall(__NR_getppid, 0) < 0)
3401 			exit(1);
3402 		ret = syscall(__NR_getppid);
3403 		exit(ret >= 0 || errno != ENOSYS);
3404 	}
3405 
3406 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3407 	EXPECT_EQ(true, WIFEXITED(status));
3408 	EXPECT_EQ(0, WEXITSTATUS(status));
3409 
3410 	/* Add some no-op filters for grins. */
3411 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3412 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3413 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3414 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3415 
3416 	/* Check that the basic notification machinery works */
3417 	listener = user_notif_syscall(__NR_getppid,
3418 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3419 	ASSERT_GE(listener, 0);
3420 
3421 	/* Installing a second listener in the chain should EBUSY */
3422 	EXPECT_EQ(user_notif_syscall(__NR_getppid,
3423 				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3424 		  -1);
3425 	EXPECT_EQ(errno, EBUSY);
3426 
3427 	pid = fork();
3428 	ASSERT_GE(pid, 0);
3429 
3430 	if (pid == 0) {
3431 		ret = syscall(__NR_getppid);
3432 		exit(ret != USER_NOTIF_MAGIC);
3433 	}
3434 
3435 	pollfd.fd = listener;
3436 	pollfd.events = POLLIN | POLLOUT;
3437 
3438 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3439 	EXPECT_EQ(pollfd.revents, POLLIN);
3440 
3441 	/* Test that we can't pass garbage to the kernel. */
3442 	memset(&req, 0, sizeof(req));
3443 	req.pid = -1;
3444 	errno = 0;
3445 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3446 	EXPECT_EQ(-1, ret);
3447 	EXPECT_EQ(EINVAL, errno);
3448 
3449 	if (ret) {
3450 		req.pid = 0;
3451 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3452 	}
3453 
3454 	pollfd.fd = listener;
3455 	pollfd.events = POLLIN | POLLOUT;
3456 
3457 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3458 	EXPECT_EQ(pollfd.revents, POLLOUT);
3459 
3460 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3461 
3462 	resp.id = req.id;
3463 	resp.error = 0;
3464 	resp.val = USER_NOTIF_MAGIC;
3465 
3466 	/* Check that the kernel enforces resp.flags == 0. */
3467 	resp.flags = 1;
3468 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3469 	EXPECT_EQ(errno, EINVAL);
3470 
3471 	resp.flags = 0;
3472 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3473 
3474 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3475 	EXPECT_EQ(true, WIFEXITED(status));
3476 	EXPECT_EQ(0, WEXITSTATUS(status));
3477 }
3478 
3479 TEST(user_notification_with_tsync)
3480 {
3481 	int ret;
3482 	unsigned int flags;
3483 
3484 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3485 	ASSERT_EQ(0, ret) {
3486 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3487 	}
3488 
3489 	/* TSYNC and NEW_LISTENER used to be mutually exclusive. */
3490 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3491 		SECCOMP_FILTER_FLAG_TSYNC;
3492 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3493 	ASSERT_EQ(EINVAL, errno);
3494 
3495 	/* With TSYNC_ESRCH they no longer are. */
3496 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3497 	ret = user_notif_syscall(__NR_getppid, flags);
3498 	close(ret);
3499 	ASSERT_LE(0, ret);
3500 }
3501 
3502 TEST(user_notification_kill_in_middle)
3503 {
3504 	pid_t pid;
3505 	long ret;
3506 	int listener;
3507 	struct seccomp_notif req = {};
3508 	struct seccomp_notif_resp resp = {};
3509 
3510 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3511 	ASSERT_EQ(0, ret) {
3512 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3513 	}
3514 
3515 	listener = user_notif_syscall(__NR_getppid,
3516 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3517 	ASSERT_GE(listener, 0);
3518 
3519 	/*
3520 	 * Check that nothing bad happens when we kill the task in the middle
3521 	 * of a syscall.
3522 	 */
3523 	pid = fork();
3524 	ASSERT_GE(pid, 0);
3525 
3526 	if (pid == 0) {
3527 		ret = syscall(__NR_getppid);
3528 		exit(ret != USER_NOTIF_MAGIC);
3529 	}
3530 
3531 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3532 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3533 
3534 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3535 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3536 
3537 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3538 
3539 	resp.id = req.id;
3540 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3541 	EXPECT_EQ(ret, -1);
3542 	EXPECT_EQ(errno, ENOENT);
3543 }
3544 
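/*
 * Written to by signal_handler() below; the tests set this to the child's end
 * of a socketpair so the parent can confirm the SIGUSR1 handler actually ran.
 */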
3545 static int handled = -1;
3546 
3547 static void signal_handler(int signal)
3548 {
3549 	if (write(handled, "c", 1) != 1)
3550 		perror("write from signal");
3551 }
3552 
3553 TEST(user_notification_signal)
3554 {
3555 	pid_t pid;
3556 	long ret;
3557 	int status, listener, sk_pair[2];
3558 	struct seccomp_notif req = {};
3559 	struct seccomp_notif_resp resp = {};
3560 	char c;
3561 
3562 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3563 	ASSERT_EQ(0, ret) {
3564 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3565 	}
3566 
3567 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3568 
3569 	listener = user_notif_syscall(__NR_gettid,
3570 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3571 	ASSERT_GE(listener, 0);
3572 
3573 	pid = fork();
3574 	ASSERT_GE(pid, 0);
3575 
3576 	if (pid == 0) {
3577 		close(sk_pair[0]);
3578 		handled = sk_pair[1];
3579 		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3580 			perror("signal");
3581 			exit(1);
3582 		}
3583 		/*
3584 		 * ERESTARTSYS behavior is a bit hard to test, because we need
3585 		 * to rely on a signal that has not yet been handled. Let's at
3586 		 * least check that the error code gets propagated through, and
3587 		 * hope that it doesn't break when there is actually a signal :)
3588 		 */
3589 		ret = syscall(__NR_gettid);
3590 		exit(!(ret == -1 && errno == 512));
3591 	}
3592 
3593 	close(sk_pair[1]);
3594 
3595 	memset(&req, 0, sizeof(req));
3596 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3597 
3598 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
3599 
3600 	/*
3601 	 * Make sure the signal really is delivered, which means we're not
3602 	 * stuck in the user notification code any more and the notification
3603 	 * should be dead.
3604 	 */
3605 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3606 
3607 	resp.id = req.id;
3608 	resp.error = -EPERM;
3609 	resp.val = 0;
3610 
3611 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3612 	EXPECT_EQ(errno, ENOENT);
3613 
3614 	memset(&req, 0, sizeof(req));
3615 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3616 
3617 	resp.id = req.id;
3618 	resp.error = -512; /* -ERESTARTSYS */
3619 	resp.val = 0;
3620 
3621 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3622 
3623 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3624 	EXPECT_EQ(true, WIFEXITED(status));
3625 	EXPECT_EQ(0, WEXITSTATUS(status));
3626 }
3627 
3628 TEST(user_notification_closed_listener)
3629 {
3630 	pid_t pid;
3631 	long ret;
3632 	int status, listener;
3633 
3634 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3635 	ASSERT_EQ(0, ret) {
3636 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3637 	}
3638 
3639 	listener = user_notif_syscall(__NR_getppid,
3640 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3641 	ASSERT_GE(listener, 0);
3642 
3643 	/*
3644 	 * Check that we get an ENOSYS when the listener is closed.
3645 	 */
3646 	pid = fork();
3647 	ASSERT_GE(pid, 0);
3648 	if (pid == 0) {
3649 		close(listener);
3650 		ret = syscall(__NR_getppid);
3651 		exit(ret != -1 || errno != ENOSYS);
3652 	}
3653 
3654 	close(listener);
3655 
3656 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3657 	EXPECT_EQ(true, WIFEXITED(status));
3658 	EXPECT_EQ(0, WEXITSTATUS(status));
3659 }
3660 
3661 /*
3662  * Check that a pid in a child namespace still shows up as valid in ours.
3663  */
3664 TEST(user_notification_child_pid_ns)
3665 {
3666 	pid_t pid;
3667 	int status, listener;
3668 	struct seccomp_notif req = {};
3669 	struct seccomp_notif_resp resp = {};
3670 
3671 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3672 		if (errno == EINVAL)
3673 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3674 	};
3675 
3676 	listener = user_notif_syscall(__NR_getppid,
3677 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3678 	ASSERT_GE(listener, 0);
3679 
3680 	pid = fork();
3681 	ASSERT_GE(pid, 0);
3682 
3683 	if (pid == 0)
3684 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3685 
3686 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3687 	EXPECT_EQ(req.pid, pid);
3688 
3689 	resp.id = req.id;
3690 	resp.error = 0;
3691 	resp.val = USER_NOTIF_MAGIC;
3692 
3693 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3694 
3695 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3696 	EXPECT_EQ(true, WIFEXITED(status));
3697 	EXPECT_EQ(0, WEXITSTATUS(status));
3698 	close(listener);
3699 }
3700 
3701 /*
3702  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3703  * invalid.
3704  */
3705 TEST(user_notification_sibling_pid_ns)
3706 {
3707 	pid_t pid, pid2;
3708 	int status, listener;
3709 	struct seccomp_notif req = {};
3710 	struct seccomp_notif_resp resp = {};
3711 
3712 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3713 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3714 	}
3715 
3716 	listener = user_notif_syscall(__NR_getppid,
3717 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3718 	ASSERT_GE(listener, 0);
3719 
3720 	pid = fork();
3721 	ASSERT_GE(pid, 0);
3722 
3723 	if (pid == 0) {
3724 		ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3725 			if (errno == EPERM)
3726 				SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3727 			else if (errno == EINVAL)
3728 				SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
3729 		}
3730 
3731 		pid2 = fork();
3732 		ASSERT_GE(pid2, 0);
3733 
3734 		if (pid2 == 0)
3735 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3736 
3737 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3738 		EXPECT_EQ(true, WIFEXITED(status));
3739 		EXPECT_EQ(0, WEXITSTATUS(status));
3740 		exit(WEXITSTATUS(status));
3741 	}
3742 
3743 	/* Create the sibling ns, and a sibling in it. */
3744 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3745 		if (errno == EPERM)
3746 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3747 		else if (errno == EINVAL)
3748 			SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
3749 	}
3750 	ASSERT_EQ(errno, 0);
3751 
3752 	pid2 = fork();
3753 	ASSERT_GE(pid2, 0);
3754 
3755 	if (pid2 == 0) {
3756 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3757 		/*
3758 		 * The pid should be 0, i.e. the task is in some namespace that
3759 		 * we can't "see".
3760 		 */
3761 		EXPECT_EQ(req.pid, 0);
3762 
3763 		resp.id = req.id;
3764 		resp.error = 0;
3765 		resp.val = USER_NOTIF_MAGIC;
3766 
3767 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3768 		exit(0);
3769 	}
3770 
3771 	close(listener);
3772 
3773 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3774 	EXPECT_EQ(true, WIFEXITED(status));
3775 	EXPECT_EQ(0, WEXITSTATUS(status));
3776 
3777 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3778 	EXPECT_EQ(true, WIFEXITED(status));
3779 	EXPECT_EQ(0, WEXITSTATUS(status));
3780 }
3781 
3782 TEST(user_notification_fault_recv)
3783 {
3784 	pid_t pid;
3785 	int status, listener;
3786 	struct seccomp_notif req = {};
3787 	struct seccomp_notif_resp resp = {};
3788 
3789 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
3790 		if (errno == EINVAL)
3791 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3792 	}
3793 
3794 	listener = user_notif_syscall(__NR_getppid,
3795 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3796 	ASSERT_GE(listener, 0);
3797 
3798 	pid = fork();
3799 	ASSERT_GE(pid, 0);
3800 
3801 	if (pid == 0)
3802 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3803 
3804 	/* Do a bad recv() */
3805 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3806 	EXPECT_EQ(errno, EFAULT);
3807 
3808 	/* We should still be able to receive this notification, though. */
3809 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3810 	EXPECT_EQ(req.pid, pid);
3811 
3812 	resp.id = req.id;
3813 	resp.error = 0;
3814 	resp.val = USER_NOTIF_MAGIC;
3815 
3816 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3817 
3818 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3819 	EXPECT_EQ(true, WIFEXITED(status));
3820 	EXPECT_EQ(0, WEXITSTATUS(status));
3821 }
3822 
3823 TEST(seccomp_get_notif_sizes)
3824 {
3825 	struct seccomp_notif_sizes sizes;
3826 
3827 	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3828 	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3829 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3830 }
3831 
3832 TEST(user_notification_continue)
3833 {
3834 	pid_t pid;
3835 	long ret;
3836 	int status, listener;
3837 	struct seccomp_notif req = {};
3838 	struct seccomp_notif_resp resp = {};
3839 	struct pollfd pollfd;
3840 
3841 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3842 	ASSERT_EQ(0, ret) {
3843 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3844 	}
3845 
3846 	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3847 	ASSERT_GE(listener, 0);
3848 
3849 	pid = fork();
3850 	ASSERT_GE(pid, 0);
3851 
3852 	if (pid == 0) {
3853 		int dup_fd, pipe_fds[2];
3854 		pid_t self;
3855 
3856 		ASSERT_GE(pipe(pipe_fds), 0);
3857 
3858 		dup_fd = dup(pipe_fds[0]);
3859 		ASSERT_GE(dup_fd, 0);
3860 		EXPECT_NE(pipe_fds[0], dup_fd);
3861 
3862 		self = getpid();
3863 		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3864 		exit(0);
3865 	}
3866 
3867 	pollfd.fd = listener;
3868 	pollfd.events = POLLIN | POLLOUT;
3869 
3870 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3871 	EXPECT_EQ(pollfd.revents, POLLIN);
3872 
3873 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3874 
3875 	pollfd.fd = listener;
3876 	pollfd.events = POLLIN | POLLOUT;
3877 
3878 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3879 	EXPECT_EQ(pollfd.revents, POLLOUT);
3880 
3881 	EXPECT_EQ(req.data.nr, __NR_dup);
3882 
3883 	resp.id = req.id;
3884 	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3885 
3886 	/*
3887 	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE requires the
3888 	 * other response fields (error and val) to be zero.
3889 	 */
3890 	resp.error = 0;
3891 	resp.val = USER_NOTIF_MAGIC;
3892 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3893 	EXPECT_EQ(errno, EINVAL);
3894 
3895 	resp.error = USER_NOTIF_MAGIC;
3896 	resp.val = 0;
3897 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3898 	EXPECT_EQ(errno, EINVAL);
3899 
3900 	resp.error = 0;
3901 	resp.val = 0;
3902 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3903 		if (errno == EINVAL)
3904 			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3905 	}
3906 
3907 skip:
3908 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3909 	EXPECT_EQ(true, WIFEXITED(status));
3910 	EXPECT_EQ(0, WEXITSTATUS(status)) {
3911 		if (WEXITSTATUS(status) == 2) {
3912 			SKIP(return, "Kernel does not support kcmp() syscall");
3913 			return;
3914 		}
3915 	}
3916 }
3917 
3918 TEST(user_notification_filter_empty)
3919 {
3920 	pid_t pid;
3921 	long ret;
3922 	int status;
3923 	struct pollfd pollfd;
3924 	struct __clone_args args = {
3925 		.flags = CLONE_FILES,
3926 		.exit_signal = SIGCHLD,
3927 	};
3928 
3929 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3930 	ASSERT_EQ(0, ret) {
3931 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3932 	}
3933 
3934 	if (__NR_clone3 < 0)
3935 		SKIP(return, "Test not built with clone3 support");
3936 
3937 	pid = sys_clone3(&args, sizeof(args));
3938 	ASSERT_GE(pid, 0);
3939 
3940 	if (pid == 0) {
3941 		int listener;
3942 
3943 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3944 		if (listener < 0)
3945 			_exit(EXIT_FAILURE);
3946 
3947 		if (dup2(listener, 200) != 200)
3948 			_exit(EXIT_FAILURE);
3949 
3950 		close(listener);
3951 
3952 		_exit(EXIT_SUCCESS);
3953 	}
3954 
3955 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3956 	EXPECT_EQ(true, WIFEXITED(status));
3957 	EXPECT_EQ(0, WEXITSTATUS(status));
3958 
3959 	/*
3960 	 * The seccomp filter has become unused so we should be notified once
3961 	 * the kernel gets around to cleaning up task struct.
3962 	 */
3963 	pollfd.fd = 200;
3964 	pollfd.events = POLLHUP;
3965 
3966 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3967 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3968 }
3969 
3970 TEST(user_ioctl_notification_filter_empty)
3971 {
3972 	pid_t pid;
3973 	long ret;
3974 	int status, p[2];
3975 	struct __clone_args args = {
3976 		.flags = CLONE_FILES,
3977 		.exit_signal = SIGCHLD,
3978 	};
3979 	struct seccomp_notif req = {};
3980 
3981 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3982 	ASSERT_EQ(0, ret) {
3983 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3984 	}
3985 
3986 	if (__NR_clone3 < 0)
3987 		SKIP(return, "Test not built with clone3 support");
3988 
3989 	ASSERT_EQ(0, pipe(p));
3990 
3991 	pid = sys_clone3(&args, sizeof(args));
3992 	ASSERT_GE(pid, 0);
3993 
3994 	if (pid == 0) {
3995 		int listener;
3996 
3997 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3998 		if (listener < 0)
3999 			_exit(EXIT_FAILURE);
4000 
4001 		if (dup2(listener, 200) != 200)
4002 			_exit(EXIT_FAILURE);
4003 		close(p[1]);
4004 		close(listener);
4005 		sleep(1);
4006 
4007 		_exit(EXIT_SUCCESS);
4008 	}
4009 	if (read(p[0], &status, 1) != 0)
4010 		_exit(EXIT_SUCCESS);
4011 	close(p[0]);
4012 	/*
4013 	 * The seccomp filter has become unused so we should be notified once
4014 	 * the kernel gets around to cleaning up task struct.
4015 	 */
4016 	EXPECT_EQ(ioctl(200, SECCOMP_IOCTL_NOTIF_RECV, &req), -1);
4017 	EXPECT_EQ(errno, ENOENT);
4018 
4019 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4020 	EXPECT_EQ(true, WIFEXITED(status));
4021 	EXPECT_EQ(0, WEXITSTATUS(status));
4022 }
4023 
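/* Trivial pthread body: the test below only needs threads to be created and joined. */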
4024 static void *do_thread(void *data)
4025 {
4026 	return NULL;
4027 }
4028 
4029 TEST(user_notification_filter_empty_threaded)
4030 {
4031 	pid_t pid;
4032 	long ret;
4033 	int status;
4034 	struct pollfd pollfd;
4035 	struct __clone_args args = {
4036 		.flags = CLONE_FILES,
4037 		.exit_signal = SIGCHLD,
4038 	};
4039 
4040 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4041 	ASSERT_EQ(0, ret) {
4042 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4043 	}
4044 
4045 	if (__NR_clone3 < 0)
4046 		SKIP(return, "Test not built with clone3 support");
4047 
4048 	pid = sys_clone3(&args, sizeof(args));
4049 	ASSERT_GE(pid, 0);
4050 
4051 	if (pid == 0) {
4052 		pid_t pid1, pid2;
4053 		int listener, status;
4054 		pthread_t thread;
4055 
4056 		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
4057 		if (listener < 0)
4058 			_exit(EXIT_FAILURE);
4059 
4060 		if (dup2(listener, 200) != 200)
4061 			_exit(EXIT_FAILURE);
4062 
4063 		close(listener);
4064 
4065 		pid1 = fork();
4066 		if (pid1 < 0)
4067 			_exit(EXIT_FAILURE);
4068 
4069 		if (pid1 == 0)
4070 			_exit(EXIT_SUCCESS);
4071 
4072 		pid2 = fork();
4073 		if (pid2 < 0)
4074 			_exit(EXIT_FAILURE);
4075 
4076 		if (pid2 == 0)
4077 			_exit(EXIT_SUCCESS);
4078 
4079 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
4080 		    pthread_join(thread, NULL))
4081 			_exit(EXIT_FAILURE);
4082 
4083 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
4084 		    pthread_join(thread, NULL))
4085 			_exit(EXIT_FAILURE);
4086 
4087 		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
4088 		    WEXITSTATUS(status))
4089 			_exit(EXIT_FAILURE);
4090 
4091 		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
4092 		    WEXITSTATUS(status))
4093 			_exit(EXIT_FAILURE);
4094 
4095 		exit(EXIT_SUCCESS);
4096 	}
4097 
4098 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4099 	EXPECT_EQ(true, WIFEXITED(status));
4100 	EXPECT_EQ(0, WEXITSTATUS(status));
4101 
4102 	/*
4103 	 * The seccomp filter has become unused so we should be notified once
4104 	 * the kernel gets around to cleaning up task struct.
4105 	 */
4106 	pollfd.fd = 200;
4107 	pollfd.events = POLLHUP;
4108 
4109 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
4110 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
4111 }
4112 
4113 
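/*
 * Return the lowest fd number above prev_fd that is not currently open
 * (fcntl(F_GETFD) fails on it), so the test can predict which fd number
 * the kernel will hand out next. Exits the process if no free fd is found
 * below FD_SETSIZE.
 */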
4114 int get_next_fd(int prev_fd)
4115 {
4116 	for (int i = prev_fd + 1; i < FD_SETSIZE; ++i) {
4117 		if (fcntl(i, F_GETFD) == -1)
4118 			return i;
4119 	}
4120 	_exit(EXIT_FAILURE);
4121 }
4122 
4123 TEST(user_notification_addfd)
4124 {
4125 	pid_t pid;
4126 	long ret;
4127 	int status, listener, memfd, fd, nextfd;
4128 	struct seccomp_notif_addfd addfd = {};
4129 	struct seccomp_notif_addfd_small small = {};
4130 	struct seccomp_notif_addfd_big big = {};
4131 	struct seccomp_notif req = {};
4132 	struct seccomp_notif_resp resp = {};
4133 	/* 100 ms */
4134 	struct timespec delay = { .tv_nsec = 100000000 };
4135 
4136 	/* There may be arbitrary already-open fds at test start. */
4137 	memfd = memfd_create("test", 0);
4138 	ASSERT_GE(memfd, 0);
4139 	nextfd = get_next_fd(memfd);
4140 
4141 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4142 	ASSERT_EQ(0, ret) {
4143 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4144 	}
4145 
4146 	/* The listener should land on the lowest available fd */
4147 	/* Check that the basic notification machinery works */
4148 	listener = user_notif_syscall(__NR_getppid,
4149 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4150 	ASSERT_EQ(listener, nextfd);
4151 	nextfd = get_next_fd(nextfd);
4152 
4153 	pid = fork();
4154 	ASSERT_GE(pid, 0);
4155 
4156 	if (pid == 0) {
4157 		/* fds will be added and this value is expected */
4158 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4159 			exit(1);
4160 
4161 		/* Atomic addfd+send is received here. Check it is a valid fd */
4162 		if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4163 			exit(1);
4164 
4165 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4166 	}
4167 
4168 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4169 
4170 	addfd.srcfd = memfd;
4171 	addfd.newfd = 0;
4172 	addfd.id = req.id;
4173 	addfd.flags = 0x0;
4174 
4175 	/* Verify bad newfd_flags cannot be set */
4176 	addfd.newfd_flags = ~O_CLOEXEC;
4177 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4178 	EXPECT_EQ(errno, EINVAL);
4179 	addfd.newfd_flags = O_CLOEXEC;
4180 
4181 	/* Verify bad flags cannot be set */
4182 	addfd.flags = 0xff;
4183 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4184 	EXPECT_EQ(errno, EINVAL);
4185 	addfd.flags = 0;
4186 
4187 	/* Verify that remote_fd cannot be set without setting flags */
4188 	addfd.newfd = 1;
4189 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4190 	EXPECT_EQ(errno, EINVAL);
4191 	addfd.newfd = 0;
4192 
4193 	/* Verify small size cannot be set */
4194 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4195 	EXPECT_EQ(errno, EINVAL);
4196 
4197 	/* Verify we can't send bits filled in unknown buffer area */
4198 	memset(&big, 0xAA, sizeof(big));
4199 	big.addfd = addfd;
4200 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4201 	EXPECT_EQ(errno, E2BIG);
4202 
4203 
4204 	/* Verify we can set an arbitrary remote fd */
4205 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4206 	EXPECT_EQ(fd, nextfd);
4207 	nextfd = get_next_fd(nextfd);
4208 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4209 
4210 	/* Verify we can set an arbitrary remote fd with large size */
4211 	memset(&big, 0x0, sizeof(big));
4212 	big.addfd = addfd;
4213 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4214 	EXPECT_EQ(fd, nextfd);
4215 	nextfd = get_next_fd(nextfd);
4216 
4217 	/* Verify we can set a specific remote fd */
4218 	addfd.newfd = 42;
4219 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4220 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4221 	EXPECT_EQ(fd, 42);
4222 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4223 
4224 	/* Resume syscall */
4225 	resp.id = req.id;
4226 	resp.error = 0;
4227 	resp.val = USER_NOTIF_MAGIC;
4228 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4229 
4230 	/*
4231 	 * Set the ADDFD request's ID to the previous notification's ID plus 1;
4232 	 * notification IDs increment by 1 per notification.
4233 	 */
4234 	addfd.id = req.id + 1;
4235 
4236 	/* This spins until the underlying notification is generated */
4237 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) == -1 &&
4238 	       errno != EINPROGRESS)
4239 		nanosleep(&delay, NULL);
4240 
4241 	memset(&req, 0, sizeof(req));
4242 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4243 	ASSERT_EQ(addfd.id, req.id);
4244 
4245 	/* Verify we can do an atomic addfd and send */
4246 	addfd.newfd = 0;
4247 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4248 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4249 	/*
4250 	 * Child has earlier "low" fds and now 42, so we expect the next
4251 	 * lowest available fd to be assigned here.
4252 	 */
4253 	EXPECT_EQ(fd, nextfd);
4254 	nextfd = get_next_fd(nextfd);
4255 	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4256 
4257 	/*
4258 	 * Set the ADDFD request's ID to the previous notification's ID plus 1;
4259 	 * notification IDs increment by 1 per notification.
4260 	 */
4261 	addfd.id = req.id + 1;
4262 
4263 	/* This spins until the underlying notification is generated */
4264 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) == -1 &&
4265 	       errno != EINPROGRESS)
4266 		nanosleep(&delay, NULL);
4267 
4268 	memset(&req, 0, sizeof(req));
4269 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4270 	ASSERT_EQ(addfd.id, req.id);
4271 
4272 	resp.id = req.id;
4273 	resp.error = 0;
4274 	resp.val = USER_NOTIF_MAGIC;
4275 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4276 
4277 	/* Wait for child to finish. */
4278 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4279 	EXPECT_EQ(true, WIFEXITED(status));
4280 	EXPECT_EQ(0, WEXITSTATUS(status));
4281 
4282 	close(memfd);
4283 }
4284 
4285 TEST(user_notification_addfd_rlimit)
4286 {
4287 	pid_t pid;
4288 	long ret;
4289 	int status, listener, memfd;
4290 	struct seccomp_notif_addfd addfd = {};
4291 	struct seccomp_notif req = {};
4292 	struct seccomp_notif_resp resp = {};
4293 	const struct rlimit lim = {
4294 		.rlim_cur	= 0,
4295 		.rlim_max	= 0,
4296 	};
4297 
4298 	memfd = memfd_create("test", 0);
4299 	ASSERT_GE(memfd, 0);
4300 
4301 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4302 	ASSERT_EQ(0, ret) {
4303 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4304 	}
4305 
4306 	/* Check that the basic notification machinery works */
4307 	listener = user_notif_syscall(__NR_getppid,
4308 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4309 	ASSERT_GE(listener, 0);
4310 
4311 	pid = fork();
4312 	ASSERT_GE(pid, 0);
4313 
4314 	if (pid == 0)
4315 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4316 
4317 
4318 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4319 
4320 	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
4321 
4322 	addfd.srcfd = memfd;
4323 	addfd.newfd_flags = O_CLOEXEC;
4324 	addfd.newfd = 0;
4325 	addfd.id = req.id;
4326 	addfd.flags = 0;
4327 
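	/*
	 * The target's RLIMIT_NOFILE is now 0, so injecting a new fd must fail
	 * with EMFILE, and placing one at a specific fd number beyond the
	 * limit must fail with EBADF.
	 */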
4328 	/* Should probably spot check /proc/sys/fs/file-nr */
4329 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4330 	EXPECT_EQ(errno, EMFILE);
4331 
4332 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4333 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4334 	EXPECT_EQ(errno, EMFILE);
4335 
4336 	addfd.newfd = 100;
4337 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4338 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4339 	EXPECT_EQ(errno, EBADF);
4340 
4341 	resp.id = req.id;
4342 	resp.error = 0;
4343 	resp.val = USER_NOTIF_MAGIC;
4344 
4345 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4346 
4347 	/* Wait for child to finish. */
4348 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4349 	EXPECT_EQ(true, WIFEXITED(status));
4350 	EXPECT_EQ(0, WEXITSTATUS(status));
4351 
4352 	close(memfd);
4353 }
4354 
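/* Fallback definitions for toolchains whose UAPI headers predate the sync wake-up flag. */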
4355 #ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
4356 #define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
4357 #define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
4358 #endif
4359 
4360 TEST(user_notification_sync)
4361 {
4362 	struct seccomp_notif req = {};
4363 	struct seccomp_notif_resp resp = {};
4364 	int status, listener;
4365 	pid_t pid;
4366 	long ret;
4367 
4368 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4369 	ASSERT_EQ(0, ret) {
4370 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4371 	}
4372 
4373 	listener = user_notif_syscall(__NR_getppid,
4374 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4375 	ASSERT_GE(listener, 0);
4376 
4377 	/* Try to set invalid flags. */
4378 	EXPECT_SYSCALL_RETURN(-EINVAL,
4379 		ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));
4380 
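	/*
	 * Enable synchronous wake-up for this listener. This is intended as a
	 * scheduling/latency hint and should not change the request/response
	 * behaviour exercised below.
	 */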
4381 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
4382 			SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);
4383 
4384 	pid = fork();
4385 	ASSERT_GE(pid, 0);
4386 	if (pid == 0) {
4387 		ret = syscall(__NR_getppid);
4388 		ASSERT_EQ(ret, USER_NOTIF_MAGIC) {
4389 			_exit(1);
4390 		}
4391 		_exit(0);
4392 	}
4393 
4394 	req.pid = 0;
4395 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4396 
4397 	ASSERT_EQ(req.data.nr,  __NR_getppid);
4398 
4399 	resp.id = req.id;
4400 	resp.error = 0;
4401 	resp.val = USER_NOTIF_MAGIC;
4402 	resp.flags = 0;
4403 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4404 
4405 	ASSERT_EQ(waitpid(pid, &status, 0), pid);
4406 	ASSERT_EQ(status, 0);
4407 }
4408 
4409 
4410 /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
4411 FIXTURE(O_SUSPEND_SECCOMP) {
4412 	pid_t pid;
4413 };
4414 
4415 FIXTURE_SETUP(O_SUSPEND_SECCOMP)
4416 {
4417 	ERRNO_FILTER(block_read, E2BIG);
4418 	cap_value_t cap_list[] = { CAP_SYS_ADMIN };
4419 	cap_t caps;
4420 
4421 	self->pid = 0;
4422 
4423 	/* make sure we don't have CAP_SYS_ADMIN */
4424 	caps = cap_get_proc();
4425 	ASSERT_NE(NULL, caps);
4426 	ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
4427 	ASSERT_EQ(0, cap_set_proc(caps));
4428 	cap_free(caps);
4429 
4430 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
4431 	ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));
4432 
4433 	self->pid = fork();
4434 	ASSERT_GE(self->pid, 0);
4435 
4436 	if (self->pid == 0) {
4437 		while (1)
4438 			pause();
4439 		_exit(127);
4440 	}
4441 }
4442 
4443 FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
4444 {
4445 	if (self->pid)
4446 		kill(self->pid, SIGKILL);
4447 }
4448 
4449 TEST_F(O_SUSPEND_SECCOMP, setoptions)
4450 {
4451 	int wstatus;
4452 
4453 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
4454 	ASSERT_EQ(self->pid, wait(&wstatus));
4455 	ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
4456 	if (errno == EINVAL)
4457 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4458 	ASSERT_EQ(EPERM, errno);
4459 }
4460 
4461 TEST_F(O_SUSPEND_SECCOMP, seize)
4462 {
4463 	int ret;
4464 
4465 	ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
4466 	ASSERT_EQ(-1, ret);
4467 	if (errno == EINVAL)
4468 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4469 	ASSERT_EQ(EPERM, errno);
4470 }
4471 
4472 /*
4473  * get_nth - Get the nth space-separated entry in a file.
4474  *
4475  * Returns the length of the field that was read.
4476  * Fails the test (via ASSERT) if the field is zero-length.
4477  */
4478 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4479 		     const unsigned int position, char **entry)
4480 {
4481 	char *line = NULL;
4482 	unsigned int i;
4483 	ssize_t nread;
4484 	size_t len = 0;
4485 	FILE *f;
4486 
4487 	f = fopen(path, "r");
4488 	ASSERT_NE(f, NULL) {
4489 		TH_LOG("Could not open %s: %s", path, strerror(errno));
4490 	}
4491 
4492 	for (i = 0; i < position; i++) {
4493 		nread = getdelim(&line, &len, ' ', f);
4494 		ASSERT_GE(nread, 0) {
4495 			TH_LOG("Failed to read %d entry in file %s", i, path);
4496 		}
4497 	}
4498 	fclose(f);
4499 
4500 	ASSERT_GT(nread, 0) {
4501 		TH_LOG("Entry in file %s had zero length", path);
4502 	}
4503 
4504 	*entry = line;
4505 	return nread - 1;
4506 }
4507 
4508 /* For a given PID, get the task state (D, R, etc...) */
4509 static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
4510 {
4511 	char proc_path[100] = {0};
4512 	char status;
4513 	char *line;
4514 
4515 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/stat", pid);
4516 	ASSERT_EQ(get_nth(_metadata, proc_path, 3, &line), 1);
4517 
4518 	status = *line;
4519 	free(line);
4520 
4521 	return status;
4522 }
4523 
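/*
 * Check that queued notifications are delivered to the listener in FIFO
 * order: the ids read back increase by one in arrival order.
 */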
4524 TEST(user_notification_fifo)
4525 {
4526 	struct seccomp_notif_resp resp = {};
4527 	struct seccomp_notif req = {};
4528 	int i, status, listener;
4529 	pid_t pid, pids[3];
4530 	__u64 baseid;
4531 	long ret;
4532 	/* 100 ms */
4533 	struct timespec delay = { .tv_nsec = 100000000 };
4534 
4535 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4536 	ASSERT_EQ(0, ret) {
4537 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4538 	}
4539 
4540 	/* Setup a listener */
4541 	listener = user_notif_syscall(__NR_getppid,
4542 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4543 	ASSERT_GE(listener, 0);
4544 
4545 	pid = fork();
4546 	ASSERT_GE(pid, 0);
4547 
4548 	if (pid == 0) {
4549 		ret = syscall(__NR_getppid);
4550 		exit(ret != USER_NOTIF_MAGIC);
4551 	}
4552 
4553 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4554 	baseid = req.id + 1;
4555 
4556 	resp.id = req.id;
4557 	resp.error = 0;
4558 	resp.val = USER_NOTIF_MAGIC;
4559 
4560 	/* Respond to the notification so the first child can exit */
4561 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4562 
4563 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4564 	EXPECT_EQ(true, WIFEXITED(status));
4565 	EXPECT_EQ(0, WEXITSTATUS(status));
4566 
4567 	/* Start children, and generate notifications */
4568 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4569 		pid = fork();
4570 		if (pid == 0) {
4571 			ret = syscall(__NR_getppid);
4572 			exit(ret != USER_NOTIF_MAGIC);
4573 		}
4574 		pids[i] = pid;
4575 	}
4576 
4577 	/* This spins until all of the children are sleeping */
4578 restart_wait:
4579 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4580 		if (get_proc_stat(_metadata, pids[i]) != 'S') {
4581 			nanosleep(&delay, NULL);
4582 			goto restart_wait;
4583 		}
4584 	}
4585 
4586 	/* Read the notifications in order (and respond) */
4587 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4588 		memset(&req, 0, sizeof(req));
4589 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4590 		EXPECT_EQ(req.id, baseid + i);
4591 		resp.id = req.id;
4592 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4593 	}
4594 
4595 	/* Make sure notifications were received */
4596 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4597 		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
4598 		EXPECT_EQ(true, WIFEXITED(status));
4599 		EXPECT_EQ(0, WEXITSTATUS(status));
4600 	}
4601 }
4602 
4603 /* get_proc_syscall - Get the syscall in progress for a given pid
4604  *
4605  * Returns the current syscall number for a given process
4606  * Returns -1 if not in syscall (running or blocked)
4607  */
4608 static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
4609 {
4610 	char proc_path[100] = {0};
4611 	long ret = -1;
4612 	ssize_t nread;
4613 	char *line;
4614 
4615 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
4616 	nread = get_nth(_metadata, proc_path, 1, &line);
4617 	ASSERT_GT(nread, 0);
4618 
4619 	if (!strncmp("running", line, MIN(7, nread)))
4620 		ret = strtol(line, NULL, 16);
4621 
4622 	free(line);
4623 	return ret;
4624 }
4625 
4626 /* Ensure non-fatal signals prior to receive are unmodified */
4627 TEST(user_notification_wait_killable_pre_notification)
4628 {
4629 	struct sigaction new_action = {
4630 		.sa_handler = signal_handler,
4631 	};
4632 	int listener, status, sk_pair[2];
4633 	pid_t pid;
4634 	long ret;
4635 	char c;
4636 	/* 100 ms */
4637 	struct timespec delay = { .tv_nsec = 100000000 };
4638 
4639 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4640 
4641 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4642 	ASSERT_EQ(0, ret)
4643 	{
4644 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4645 	}
4646 
4647 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4648 
4649 	listener = user_notif_syscall(
4650 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4651 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4652 	ASSERT_GE(listener, 0);
4653 
4654 	/*
4655 	 * Check that a SIGUSR1 sent prior to receiving the notification
4656 	 * interrupts the blocked syscall. SIGUSR1 is wired up to a custom
4657 	 * signal handler, and we make sure that handler actually gets called.
4658 	 */
4659 	pid = fork();
4660 	ASSERT_GE(pid, 0);
4661 
4662 	if (pid == 0) {
4663 		close(sk_pair[0]);
4664 		handled = sk_pair[1];
4665 
4666 		/* Setup the non-fatal sigaction without SA_RESTART */
4667 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4668 			perror("sigaction");
4669 			exit(1);
4670 		}
4671 
4672 		ret = syscall(__NR_getppid);
4673 		/* Make sure we got a return from a signal interruption */
4674 		exit(ret != -1 || errno != EINTR);
4675 	}
4676 
4677 	/*
4678 	 * Make sure we've gotten to the seccomp user notification wait
4679 	 * from getppid prior to sending any signals
4680 	 */
4681 	while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
4682 	       get_proc_stat(_metadata, pid) != 'S')
4683 		nanosleep(&delay, NULL);
4684 
4685 	/* Send non-fatal kill signal */
4686 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4687 
4688 	/* wait for process to exit (exit checks for EINTR) */
4689 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4690 	EXPECT_EQ(true, WIFEXITED(status));
4691 	EXPECT_EQ(0, WEXITSTATUS(status));
4692 
4693 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4694 }
4695 
4696 /* Ensure non-fatal signals after receive are blocked */
4697 TEST(user_notification_wait_killable)
4698 {
4699 	struct sigaction new_action = {
4700 		.sa_handler = signal_handler,
4701 	};
4702 	struct seccomp_notif_resp resp = {};
4703 	struct seccomp_notif req = {};
4704 	int listener, status, sk_pair[2];
4705 	pid_t pid;
4706 	long ret;
4707 	char c;
4708 	/* 100 ms */
4709 	struct timespec delay = { .tv_nsec = 100000000 };
4710 
4711 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4712 
4713 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4714 	ASSERT_EQ(0, ret)
4715 	{
4716 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4717 	}
4718 
4719 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4720 
4721 	listener = user_notif_syscall(
4722 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4723 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4724 	ASSERT_GE(listener, 0);
4725 
4726 	pid = fork();
4727 	ASSERT_GE(pid, 0);
4728 
4729 	if (pid == 0) {
4730 		close(sk_pair[0]);
4731 		handled = sk_pair[1];
4732 
4733 		/* Setup the sigaction without SA_RESTART */
4734 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4735 			perror("sigaction");
4736 			exit(1);
4737 		}
4738 
4739 		/* Make sure that the syscall is completed (no EINTR) */
4740 		ret = syscall(__NR_getppid);
4741 		exit(ret != USER_NOTIF_MAGIC);
4742 	}
4743 
4744 	/*
4745 	 * Get the notification, which moves the notifying process into a
4746 	 * non-preemptible (TASK_KILLABLE) state.
4747 	 */
4748 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4749 	/* Send non-fatal kill signal */
4750 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4751 
4752 	/*
4753 	 * Make sure the task moves to TASK_KILLABLE by waiting for the
4754 	 * D (disk sleep) state after it receives the non-fatal signal.
4755 	 */
4756 	while (get_proc_stat(_metadata, pid) != 'D')
4757 		nanosleep(&delay, NULL);
4758 
4759 	resp.id = req.id;
4760 	resp.val = USER_NOTIF_MAGIC;
4761 	/* Make sure the notification is found and able to be replied to */
4762 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4763 
4764 	/*
4765 	 * Make sure that the signal handler does get called once we're back in
4766 	 * userspace.
4767 	 */
4768 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4769 	/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
4770 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4771 	EXPECT_EQ(true, WIFEXITED(status));
4772 	EXPECT_EQ(0, WEXITSTATUS(status));
4773 }
4774 
4775 /* Ensure fatal signals after receive are not blocked */
4776 TEST(user_notification_wait_killable_fatal)
4777 {
4778 	struct seccomp_notif req = {};
4779 	int listener, status;
4780 	pid_t pid;
4781 	long ret;
4782 	/* 100 ms */
4783 	struct timespec delay = { .tv_nsec = 100000000 };
4784 
4785 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4786 	ASSERT_EQ(0, ret)
4787 	{
4788 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4789 	}
4790 
4791 	listener = user_notif_syscall(
4792 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4793 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4794 	ASSERT_GE(listener, 0);
4795 
4796 	pid = fork();
4797 	ASSERT_GE(pid, 0);
4798 
4799 	if (pid == 0) {
4800 		/* This should never complete as it should get a SIGTERM */
4801 		syscall(__NR_getppid);
4802 		exit(1);
4803 	}
4804 
4805 	while (get_proc_stat(_metadata, pid) != 'S')
4806 		nanosleep(&delay, NULL);
4807 
4808 	/*
4809 	 * Get the notification, which moves the notifying process into a
4810 	 * non-preemptible (TASK_KILLABLE) state.
4811 	 */
4812 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4813 	/* Kill the process with a fatal signal */
4814 	EXPECT_EQ(kill(pid, SIGTERM), 0);
4815 
4816 	/*
4817 	 * Wait for the process to exit, and make sure the process terminated
4818 	 * due to the SIGTERM signal.
4819 	 */
4820 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4821 	EXPECT_EQ(true, WIFSIGNALED(status));
4822 	EXPECT_EQ(SIGTERM, WTERMSIG(status));
4823 }
4824 
4825 struct tsync_vs_thread_leader_args {
4826 	pthread_t leader;
4827 };
4828 
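/*
 * Sibling thread for the tsync_vs_dead_thread_leader test below: it waits for
 * the thread leader to exit, checks the leader's return value, then installs
 * an allow-all filter with SECCOMP_FILTER_FLAG_TSYNC. Any failure terminates
 * the process with a distinct non-zero exit code.
 */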
4829 static void *tsync_vs_dead_thread_leader_sibling(void *_args)
4830 {
4831 	struct sock_filter allow_filter[] = {
4832 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
4833 	};
4834 	struct sock_fprog allow_prog = {
4835 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
4836 		.filter = allow_filter,
4837 	};
4838 	struct tsync_vs_thread_leader_args *args = _args;
4839 	void *retval;
4840 	long ret;
4841 
4842 	ret = pthread_join(args->leader, &retval);
4843 	if (ret)
4844 		exit(1);
4845 	if (retval != _args)
4846 		exit(2);
4847 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &allow_prog);
4848 	if (ret)
4849 		exit(3);
4850 
4851 	exit(0);
4852 }
4853 
4854 /*
4855  * Ensure that a dead thread leader doesn't prevent installing new filters with
4856  * SECCOMP_FILTER_FLAG_TSYNC from other threads.
4857  */
4858 TEST(tsync_vs_dead_thread_leader)
4859 {
4860 	int status;
4861 	pid_t pid;
4862 	long ret;
4863 
4864 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4865 	ASSERT_EQ(0, ret) {
4866 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4867 	}
4868 
4869 	pid = fork();
4870 	ASSERT_GE(pid, 0);
4871 
4872 	if (pid == 0) {
4873 		struct sock_filter allow_filter[] = {
4874 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
4875 		};
4876 		struct sock_fprog allow_prog = {
4877 			.len = (unsigned short)ARRAY_SIZE(allow_filter),
4878 			.filter = allow_filter,
4879 		};
4880 		struct  tsync_vs_thread_leader_args *args;
4881 		pthread_t sibling;
4882 
4883 		args = malloc(sizeof(*args));
4884 		ASSERT_NE(NULL, args);
4885 		args->leader = pthread_self();
4886 
4887 		ret = pthread_create(&sibling, NULL,
4888 				     tsync_vs_dead_thread_leader_sibling, args);
4889 		ASSERT_EQ(0, ret);
4890 
4891 		/* Install a new filter just to the leader thread. */
4892 		ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
4893 		ASSERT_EQ(0, ret);
4894 		pthread_exit(args);
4895 		exit(1);
4896 	}
4897 
4898 	EXPECT_EQ(pid, waitpid(pid, &status, 0));
4899 	EXPECT_EQ(0, status);
4900 }
4901 
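/*
 * Target function for the uretprobe tests: marked noinline so it keeps a real
 * call/return sequence whose offset get_uprobe_offset() can resolve.
 */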
4902 noinline int probed(void)
4903 {
4904 	return 1;
4905 }
4906 
4907 static int parse_uint_from_file(const char *file, const char *fmt)
4908 {
4909 	int err = -1, ret;
4910 	FILE *f;
4911 
4912 	f = fopen(file, "re");
4913 	if (f) {
4914 		err = fscanf(f, fmt, &ret);
4915 		fclose(f);
4916 	}
4917 	return err == 1 ? ret : err;
4918 }
4919 
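/* Read the uprobe PMU's dynamic event type id from sysfs. */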
4920 static int determine_uprobe_perf_type(void)
4921 {
4922 	const char *file = "/sys/bus/event_source/devices/uprobe/type";
4923 
4924 	return parse_uint_from_file(file, "%d\n");
4925 }
4926 
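/* Read which perf_event_attr config bit marks a return probe (uretprobe). */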
4927 static int determine_uprobe_retprobe_bit(void)
4928 {
4929 	const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe";
4930 
4931 	return parse_uint_from_file(file, "config:%d\n");
4932 }
4933 
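/*
 * Translate a virtual address in this process into an offset within the
 * executable mapping that contains it, by scanning /proc/self/maps. This is
 * the offset form passed to the uprobe PMU via perf_event_attr.config2 below.
 */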
4934 static ssize_t get_uprobe_offset(const void *addr)
4935 {
4936 	size_t start, base, end;
4937 	bool found = false;
4938 	char buf[256];
4939 	FILE *f;
4940 
4941 	f = fopen("/proc/self/maps", "r");
4942 	if (!f)
4943 		return -1;
4944 
4945 	while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
4946 		if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
4947 			found = true;
4948 			break;
4949 		}
4950 	}
4951 	fclose(f);
4952 	return found ? (uintptr_t)addr - start + base : -1;
4953 }
4954 
4955 FIXTURE(URETPROBE) {
4956 	int fd;
4957 };
4958 
4959 FIXTURE_VARIANT(URETPROBE) {
4960 	/*
4961 	 * All of the URETPROBE behaviors can be tested with the uretprobe
4962 	 * either attached or not.
4963 	 */
4964 	bool attach;
4965 };
4966 
4967 FIXTURE_VARIANT_ADD(URETPROBE, attached) {
4968 	.attach = true,
4969 };
4970 
4971 FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
4972 	.attach = false,
4973 };
4974 
4975 FIXTURE_SETUP(URETPROBE)
4976 {
4977 	const size_t attr_sz = sizeof(struct perf_event_attr);
4978 	struct perf_event_attr attr;
4979 	ssize_t offset;
4980 	int type, bit;
4981 
4982 #ifndef __NR_uretprobe
4983 	SKIP(return, "__NR_uretprobe syscall not defined");
4984 #endif
4985 
4986 	if (!variant->attach)
4987 		return;
4988 
4989 	memset(&attr, 0, attr_sz);
4990 
4991 	type = determine_uprobe_perf_type();
4992 	ASSERT_GE(type, 0);
4993 	bit = determine_uprobe_retprobe_bit();
4994 	ASSERT_GE(bit, 0);
4995 	offset = get_uprobe_offset(probed);
4996 	ASSERT_GE(offset, 0);
4997 
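	/* Setting the retprobe bit turns this uprobe event into a uretprobe. */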
4998 	attr.config |= 1 << bit;
4999 	attr.size = attr_sz;
5000 	attr.type = type;
5001 	attr.config1 = ptr_to_u64("/proc/self/exe");
5002 	attr.config2 = offset;
5003 
5004 	self->fd = syscall(__NR_perf_event_open, &attr,
5005 			   getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */,
5006 			   PERF_FLAG_FD_CLOEXEC);
5007 }
5008 
5009 FIXTURE_TEARDOWN(URETPROBE)
5010 {
5011 	/* We could call close(self->fd), but we'd need an extra filter rule
5012 	 * for that, and the test process exits right away anyway.
5013 	 */
5014 }
5015 
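/*
 * Apply the given seccomp filter (after setting NO_NEW_PRIVS) and then call
 * probed(); returns -1 if the filter could not be installed, 0 otherwise.
 */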
5016 static int run_probed_with_filter(struct sock_fprog *prog)
5017 {
5018 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
5019 	    seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) {
5020 		return -1;
5021 	}
5022 
5023 	probed();
5024 	return 0;
5025 }
5026 
5027 TEST_F(URETPROBE, uretprobe_default_allow)
5028 {
5029 	struct sock_filter filter[] = {
5030 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5031 	};
5032 	struct sock_fprog prog = {
5033 		.len = (unsigned short)ARRAY_SIZE(filter),
5034 		.filter = filter,
5035 	};
5036 
5037 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5038 }
5039 
5040 TEST_F(URETPROBE, uretprobe_default_block)
5041 {
5042 	struct sock_filter filter[] = {
5043 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5044 			offsetof(struct seccomp_data, nr)),
5045 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5046 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5047 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5048 	};
5049 	struct sock_fprog prog = {
5050 		.len = (unsigned short)ARRAY_SIZE(filter),
5051 		.filter = filter,
5052 	};
5053 
5054 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5055 }
5056 
5057 TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
5058 {
5059 	struct sock_filter filter[] = {
5060 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5061 			offsetof(struct seccomp_data, nr)),
5062 #ifdef __NR_uretprobe
5063 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
5064 #endif
5065 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5066 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5067 	};
5068 	struct sock_fprog prog = {
5069 		.len = (unsigned short)ARRAY_SIZE(filter),
5070 		.filter = filter,
5071 	};
5072 
5073 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5074 }
5075 
5076 TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
5077 {
5078 	struct sock_filter filter[] = {
5079 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5080 			offsetof(struct seccomp_data, nr)),
5081 #ifdef __NR_uretprobe
5082 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
5083 #endif
5084 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5085 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5086 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5087 	};
5088 	struct sock_fprog prog = {
5089 		.len = (unsigned short)ARRAY_SIZE(filter),
5090 		.filter = filter,
5091 	};
5092 
5093 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5094 }
5095 
5096 /*
5097  * TODO:
5098  * - expand NNP testing
5099  * - better arch-specific TRACE and TRAP handlers.
5100  * - endianness checking when appropriate
5101  * - 64-bit arg prodding
5102  * - arch value testing (x86 modes especially)
5103  * - verify that FILTER_FLAG_LOG filters generate log messages
5104  * - verify that RET_LOG generates log messages
5105  */
5106 
5107 TEST_HARNESS_MAIN
5108