xref: /linux/tools/testing/selftests/seccomp/seccomp_bpf.c (revision 41fa04327384148b0e2e828c9be9862c5240e9fa)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/time.h>
28 #include <sys/user.h>
29 #include <linux/prctl.h>
30 #include <linux/ptrace.h>
31 #include <linux/seccomp.h>
32 #include <pthread.h>
33 #include <semaphore.h>
34 #include <signal.h>
35 #include <stddef.h>
36 #include <stdbool.h>
37 #include <string.h>
38 #include <time.h>
39 #include <limits.h>
40 #include <linux/elf.h>
41 #include <sys/uio.h>
42 #include <sys/utsname.h>
43 #include <sys/fcntl.h>
44 #include <sys/mman.h>
45 #include <sys/times.h>
46 #include <sys/socket.h>
47 #include <sys/ioctl.h>
48 #include <linux/kcmp.h>
49 #include <sys/resource.h>
50 #include <sys/capability.h>
51 #include <linux/perf_event.h>
52 
53 #include <unistd.h>
54 #include <sys/syscall.h>
55 #include <poll.h>
56 
57 #include "kselftest_harness.h"
58 #include "../clone3/clone3_selftests.h"
59 
60 /* Attempt to de-conflict with the selftests tree. */
61 #ifndef SKIP
62 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
63 #endif
64 
65 #ifndef MIN
66 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
67 #endif
68 
69 #ifndef PR_SET_PTRACER
70 # define PR_SET_PTRACER 0x59616d61
71 #endif
72 
73 #ifndef noinline
74 #define noinline __attribute__((noinline))
75 #endif
76 
77 #ifndef __nocf_check
78 #define __nocf_check __attribute__((nocf_check))
79 #endif
80 
81 #ifndef __naked
82 #define __naked __attribute__((__naked__))
83 #endif
84 
85 #ifndef PR_SET_NO_NEW_PRIVS
86 #define PR_SET_NO_NEW_PRIVS 38
87 #define PR_GET_NO_NEW_PRIVS 39
88 #endif
89 
90 #ifndef PR_SECCOMP_EXT
91 #define PR_SECCOMP_EXT 43
92 #endif
93 
94 #ifndef SECCOMP_EXT_ACT
95 #define SECCOMP_EXT_ACT 1
96 #endif
97 
98 #ifndef SECCOMP_EXT_ACT_TSYNC
99 #define SECCOMP_EXT_ACT_TSYNC 1
100 #endif
101 
102 #ifndef SECCOMP_MODE_STRICT
103 #define SECCOMP_MODE_STRICT 1
104 #endif
105 
106 #ifndef SECCOMP_MODE_FILTER
107 #define SECCOMP_MODE_FILTER 2
108 #endif
109 
110 #ifndef SECCOMP_RET_ALLOW
111 struct seccomp_data {
112 	int nr;
113 	__u32 arch;
114 	__u64 instruction_pointer;
115 	__u64 args[6];
116 };
117 #endif
118 
119 #ifndef SECCOMP_RET_KILL_PROCESS
120 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
121 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
122 #endif
123 #ifndef SECCOMP_RET_KILL
124 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
125 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
126 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
127 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
128 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
129 #endif
130 #ifndef SECCOMP_RET_LOG
131 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
132 #endif
133 
134 #ifndef __NR_seccomp
135 # if defined(__i386__)
136 #  define __NR_seccomp 354
137 # elif defined(__x86_64__)
138 #  define __NR_seccomp 317
139 # elif defined(__arm__)
140 #  define __NR_seccomp 383
141 # elif defined(__aarch64__)
142 #  define __NR_seccomp 277
143 # elif defined(__riscv)
144 #  define __NR_seccomp 277
145 # elif defined(__csky__)
146 #  define __NR_seccomp 277
147 # elif defined(__loongarch__)
148 #  define __NR_seccomp 277
149 # elif defined(__hppa__)
150 #  define __NR_seccomp 338
151 # elif defined(__powerpc__)
152 #  define __NR_seccomp 358
153 # elif defined(__s390__)
154 #  define __NR_seccomp 348
155 # elif defined(__xtensa__)
156 #  define __NR_seccomp 337
157 # elif defined(__sh__)
158 #  define __NR_seccomp 372
159 # elif defined(__mc68000__)
160 #  define __NR_seccomp 380
161 # else
162 #  warning "seccomp syscall number unknown for this architecture"
163 #  define __NR_seccomp 0xffff
164 # endif
165 #endif
166 
167 #ifndef __NR_uretprobe
168 # if defined(__x86_64__)
169 #  define __NR_uretprobe 335
170 # endif
171 #endif
172 
173 #ifndef __NR_uprobe
174 # if defined(__x86_64__)
175 #  define __NR_uprobe 336
176 # endif
177 #endif
178 
179 #ifndef SECCOMP_SET_MODE_STRICT
180 #define SECCOMP_SET_MODE_STRICT 0
181 #endif
182 
183 #ifndef SECCOMP_SET_MODE_FILTER
184 #define SECCOMP_SET_MODE_FILTER 1
185 #endif
186 
187 #ifndef SECCOMP_GET_ACTION_AVAIL
188 #define SECCOMP_GET_ACTION_AVAIL 2
189 #endif
190 
191 #ifndef SECCOMP_GET_NOTIF_SIZES
192 #define SECCOMP_GET_NOTIF_SIZES 3
193 #endif
194 
195 #ifndef SECCOMP_FILTER_FLAG_TSYNC
196 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
197 #endif
198 
199 #ifndef SECCOMP_FILTER_FLAG_LOG
200 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
201 #endif
202 
203 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
204 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
205 #endif
206 
207 #ifndef PTRACE_SECCOMP_GET_METADATA
208 #define PTRACE_SECCOMP_GET_METADATA	0x420d
209 
210 struct seccomp_metadata {
211 	__u64 filter_off;       /* Input: which filter */
212 	__u64 flags;             /* Output: filter's flags */
213 };
214 #endif
215 
216 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
217 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
218 #endif
219 
220 #ifndef SECCOMP_RET_USER_NOTIF
221 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
222 
223 #define SECCOMP_IOC_MAGIC		'!'
224 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
225 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
226 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
227 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
228 
229 /* Flags for seccomp notification fd ioctl. */
230 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
231 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
232 						struct seccomp_notif_resp)
233 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
234 
235 struct seccomp_notif {
236 	__u64 id;
237 	__u32 pid;
238 	__u32 flags;
239 	struct seccomp_data data;
240 };
241 
242 struct seccomp_notif_resp {
243 	__u64 id;
244 	__s64 val;
245 	__s32 error;
246 	__u32 flags;
247 };
248 
249 struct seccomp_notif_sizes {
250 	__u16 seccomp_notif;
251 	__u16 seccomp_notif_resp;
252 	__u16 seccomp_data;
253 };
254 #endif
255 
256 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
257 /* On success, the return value is the remote process's added fd number */
258 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
259 						struct seccomp_notif_addfd)
260 
261 /* valid flags for seccomp_notif_addfd */
262 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
263 
264 struct seccomp_notif_addfd {
265 	__u64 id;
266 	__u32 flags;
267 	__u32 srcfd;
268 	__u32 newfd;
269 	__u32 newfd_flags;
270 };
271 #endif
272 
273 #ifndef SECCOMP_ADDFD_FLAG_SEND
274 #define SECCOMP_ADDFD_FLAG_SEND	(1UL << 1) /* Addfd and return it, atomically */
275 #endif
276 
277 struct seccomp_notif_addfd_small {
278 	__u64 id;
279 	char weird[4];
280 };
281 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
282 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
283 
284 struct seccomp_notif_addfd_big {
285 	union {
286 		struct seccomp_notif_addfd addfd;
287 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
288 	};
289 };
290 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
291 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
292 
293 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
294 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
295 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
296 #endif
297 
298 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
299 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
300 #endif
301 
302 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
303 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
304 #endif
305 
306 #ifndef SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
307 #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
308 #endif
309 
#ifndef seccomp
/*
 * Fallback wrapper for the seccomp(2) syscall, for C libraries that do
 * not provide one. errno is cleared first so callers can reliably tell
 * a fresh failure from a stale errno value.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);
	return rc;
}
#endif
317 
318 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
319 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
320 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
321 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
322 #else
323 #error "wut? Unknown __BYTE_ORDER__?!"
324 #endif
325 
326 #define SIBLING_EXIT_UNKILLED	0xbadbeef
327 #define SIBLING_EXIT_FAILURE	0xbadface
328 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
329 
/*
 * Compare two file descriptors across (possibly different) processes
 * via kcmp(2)/KCMP_FILE. Returns 0 when both refer to the same open
 * file description. On builds without __NR_kcmp it fails with
 * -1/ENOSYS so callers can degrade gracefully.
 */
static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
{
#ifndef __NR_kcmp
	errno = ENOSYS;
	return -1;
#else
	errno = 0;
	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
#endif
}
340 
/*
 * Have TH_LOG report actual location filecmp() is used.
 * Wraps __filecmp(): a missing kcmp() (ENOSYS) is logged and coerced to
 * "equal" so dependent tests keep running, just less accurately.
 */
#define filecmp(pid1, pid2, fd1, fd2)	({		\
	int _ret;					\
							\
	_ret = __filecmp(pid1, pid2, fd1, fd2);		\
	if (_ret != 0) {				\
		if (_ret < 0 && errno == ENOSYS) {	\
			TH_LOG("kcmp() syscall missing (test is less accurate)");\
			_ret = 0;			\
		}					\
	}						\
	_ret; })
353 
354 TEST(kcmp)
355 {
356 	int ret;
357 
358 	ret = __filecmp(getpid(), getpid(), 1, 1);
359 	EXPECT_EQ(ret, 0);
360 	if (ret != 0 && errno == ENOSYS)
361 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
362 }
363 
/*
 * Entering seccomp strict mode must succeed when CONFIG_SECCOMP is on.
 * The test leaves via a raw __NR_exit syscall; NOTE(review): strict mode
 * permits only a tiny syscall whitelist (read/write/exit/sigreturn per
 * prctl(2)), so the C library's normal exit path would be killed.
 */
TEST(mode_strict_support)
{
	long ret;

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support CONFIG_SECCOMP");
	}
	/* Raw exit(0): one of the few syscalls strict mode allows. */
	syscall(__NR_exit, 0);
}
374 
/* In strict mode, a disallowed syscall (here prctl) must SIGKILL us. */
TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
{
	long ret;

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support CONFIG_SECCOMP");
	}
	/* This raw prctl() is not allowed in strict mode: we die here. */
	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
		NULL, NULL, NULL);
	EXPECT_FALSE(true) {
		TH_LOG("Unreachable!");
	}
}
389 
/* Note! This doesn't test no new privs behavior */
TEST(no_new_privs_support)
{
	long ret;

	/* Only checks that the prctl is recognized, not its effect. */
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	EXPECT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}
}
400 
/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
TEST(mode_filter_support)
{
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}
	/*
	 * A NULL filter pointer must fail with EFAULT: the kernel tried
	 * to copy the program, proving CONFIG_SECCOMP_FILTER exists.
	 */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EFAULT, errno) {
		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
	}
}
416 
417 TEST(mode_filter_without_nnp)
418 {
419 	struct sock_filter filter[] = {
420 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
421 	};
422 	struct sock_fprog prog = {
423 		.len = (unsigned short)ARRAY_SIZE(filter),
424 		.filter = filter,
425 	};
426 	long ret;
427 	cap_t cap = cap_get_proc();
428 	cap_flag_value_t is_cap_sys_admin = 0;
429 
430 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
431 	ASSERT_LE(0, ret) {
432 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
433 	}
434 	errno = 0;
435 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
436 	/* Succeeds with CAP_SYS_ADMIN, fails without */
437 	cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);
438 	if (!is_cap_sys_admin) {
439 		EXPECT_EQ(-1, ret);
440 		EXPECT_EQ(EACCES, errno);
441 	} else {
442 		EXPECT_EQ(0, ret);
443 	}
444 }
445 
446 #define MAX_INSNS_PER_PATH 32768
447 
448 TEST(filter_size_limits)
449 {
450 	int i;
451 	int count = BPF_MAXINSNS + 1;
452 	struct sock_filter allow[] = {
453 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
454 	};
455 	struct sock_filter *filter;
456 	struct sock_fprog prog = { };
457 	long ret;
458 
459 	filter = calloc(count, sizeof(*filter));
460 	ASSERT_NE(NULL, filter);
461 
462 	for (i = 0; i < count; i++)
463 		filter[i] = allow[0];
464 
465 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
466 	ASSERT_EQ(0, ret);
467 
468 	prog.filter = filter;
469 	prog.len = count;
470 
471 	/* Too many filter instructions in a single filter. */
472 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
473 	ASSERT_NE(0, ret) {
474 		TH_LOG("Installing %d insn filter was allowed", prog.len);
475 	}
476 
477 	/* One less is okay, though. */
478 	prog.len -= 1;
479 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
480 	ASSERT_EQ(0, ret) {
481 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
482 	}
483 }
484 
485 TEST(filter_chain_limits)
486 {
487 	int i;
488 	int count = BPF_MAXINSNS;
489 	struct sock_filter allow[] = {
490 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
491 	};
492 	struct sock_filter *filter;
493 	struct sock_fprog prog = { };
494 	long ret;
495 
496 	filter = calloc(count, sizeof(*filter));
497 	ASSERT_NE(NULL, filter);
498 
499 	for (i = 0; i < count; i++)
500 		filter[i] = allow[0];
501 
502 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
503 	ASSERT_EQ(0, ret);
504 
505 	prog.filter = filter;
506 	prog.len = 1;
507 
508 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
509 	ASSERT_EQ(0, ret);
510 
511 	prog.len = count;
512 
513 	/* Too many total filter instructions. */
514 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
515 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
516 		if (ret != 0)
517 			break;
518 	}
519 	ASSERT_NE(0, ret) {
520 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
521 		       i, count, i * (count + 4));
522 	}
523 }
524 
/* Once in filter mode, a transition to strict mode must be rejected. */
TEST(mode_filter_cannot_move_to_strict)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* filter -> strict is an invalid mode change: -1/EINVAL. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
546 
547 
/* PR_GET_SECCOMP reports 0 before and 2 (filter mode) after install. */
TEST(mode_filter_get_seccomp)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* No filter installed yet: mode is 0 (disabled). */
	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
	EXPECT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* 2 == SECCOMP_MODE_FILTER. */
	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
	EXPECT_EQ(2, ret);
}
571 
572 
/* A trivial allow-everything filter must install cleanly. */
TEST(ALLOW_all)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);
}
590 
/* A zero-instruction filter must be rejected with EINVAL. */
TEST(empty_prog)
{
	struct sock_filter filter[] = {
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
608 
/* SECCOMP_RET_LOG behaves like ALLOW: syscalls still succeed. */
TEST(log_all)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);

	/* getppid() should succeed and be logged (no check for logging) */
	EXPECT_EQ(parent, syscall(__NR_getppid));
}
630 
/* An unrecognized return action (below ALLOW) must act like KILL. */
TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
{
	struct sock_filter filter[] = {
		/* 0x10000000 is not a defined SECCOMP_RET_* action. */
		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);
	EXPECT_EQ(0, syscall(__NR_getpid)) {
		TH_LOG("getpid() shouldn't ever return");
	}
}
651 
/* return code >= 0x80000000 is unused. */
TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
{
	struct sock_filter filter[] = {
		/* An action value above SECCOMP_RET_ALLOW: also fatal. */
		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);
	EXPECT_EQ(0, syscall(__NR_getpid)) {
		TH_LOG("getpid() shouldn't ever return");
	}
}
673 
/*
 * A kill-everything filter: the very next syscall after install (the
 * harness's own exit path) delivers the SIGSYS TEST_SIGNAL expects.
 */
TEST_SIGNAL(KILL_all, SIGSYS)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);
}
691 
/* Kill only getpid(); everything else (e.g. getppid()) still works. */
TEST_SIGNAL(KILL_one, SIGSYS)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
718 
/*
 * Kill times(2) only when its first argument equals &fatal_address;
 * any other argument value (or any other syscall) is allowed.
 */
TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
{
	void *fatal_address;
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with the lower 32 bits for now. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
			(unsigned long)&fatal_address, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;
	pid_t parent = getppid();
	struct tms timebuf;
	clock_t clock = times(&timebuf);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* A benign argument is still allowed through. */
	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
	/* times() should never return. */
	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
}
754 
/* Same idea as KILL_one_arg_one, but keying on the sixth argument. */
TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
{
#ifndef __NR_mmap2
	int sysno = __NR_mmap;
#else
	int sysno = __NR_mmap2;
#endif
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with the lower 32 bits for now. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;
	pid_t parent = getppid();
	int fd;
	void *map1, *map2;
	int page_size = sysconf(_SC_PAGESIZE);

	ASSERT_LT(0, page_size);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	ASSERT_EQ(0, ret);

	fd = open("/dev/zero", O_RDONLY);
	ASSERT_NE(-1, fd);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* A benign offset (arg 6) is still allowed through. */
	map1 = (void *)syscall(sysno,
		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
	EXPECT_NE(MAP_FAILED, map1);
	/* mmap2() should never return. */
	map2 = (void *)syscall(sysno,
		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
	EXPECT_EQ(MAP_FAILED, map2);

	/* The test failed, so clean up the resources. */
	munmap(map1, page_size);
	munmap(map2, page_size);
	close(fd);
}
808 
809 /* This is a thread task to die via seccomp filter violation. */
810 void *kill_thread(void *data)
811 {
812 	bool die = (bool)data;
813 
814 	if (die) {
815 		syscall(__NR_getpid);
816 		return (void *)SIBLING_EXIT_FAILURE;
817 	}
818 
819 	return (void *)SIBLING_EXIT_UNKILLED;
820 }
821 
/* How kill_thread_or_group() arranges for the victim thread to die. */
enum kill_t {
	KILL_THREAD,	/* filter returns SECCOMP_RET_KILL_THREAD */
	KILL_PROCESS,	/* filter returns SECCOMP_RET_KILL_PROCESS */
	RET_UNKNOWN	/* filter returns an unrecognized action value */
};
827 
/* Prepare a thread that will kill itself or both of us. */
void kill_thread_or_group(struct __test_metadata *_metadata,
			  enum kill_t kill_how)
{
	pthread_t thread;
	void *status;
	/* Kill only when calling __NR_getpid. */
	struct sock_filter filter_thread[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	/* RET_UNKNOWN uses a bogus 0xAAAAAAAA action value instead. */
	int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
	struct sock_filter filter_process[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, kill),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog_thread = {
		.len = (unsigned short)ARRAY_SIZE(filter_thread),
		.filter = filter_thread,
	};
	struct sock_fprog prog_process = {
		.len = (unsigned short)ARRAY_SIZE(filter_process),
		.filter = filter_process,
	};

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
			     kill_how == KILL_THREAD ? &prog_thread
						     : &prog_process));

	/*
	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
	 * flag cannot be downgraded by a new filter.
	 */
	if (kill_how == KILL_PROCESS)
		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));

	/* Start a thread that will exit immediately. */
	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
	ASSERT_EQ(0, pthread_join(thread, &status));
	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);

	/* Start a thread that will die immediately. */
	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
	ASSERT_EQ(0, pthread_join(thread, &status));
	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);

	/*
	 * If we get here, only the spawned thread died. Let the parent know
	 * the whole process didn't die (i.e. this thread, the spawner,
	 * stayed running).
	 */
	exit(42);
}
891 
/* KILL_THREAD must only kill the offending thread: child exits 42. */
TEST(KILL_thread)
{
	int status;
	pid_t child_pid;

	child_pid = fork();
	ASSERT_LE(0, child_pid);
	if (child_pid == 0) {
		kill_thread_or_group(_metadata, KILL_THREAD);
		/* kill_thread_or_group() exits itself; 38 marks a bug. */
		_exit(38);
	}

	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));

	/* If only the thread was killed, we'll see exit 42. */
	ASSERT_TRUE(WIFEXITED(status));
	ASSERT_EQ(42, WEXITSTATUS(status));
}
910 
/* KILL_PROCESS must take down the whole child process with SIGSYS. */
TEST(KILL_process)
{
	int status;
	pid_t child_pid;

	child_pid = fork();
	ASSERT_LE(0, child_pid);
	if (child_pid == 0) {
		kill_thread_or_group(_metadata, KILL_PROCESS);
		/* kill_thread_or_group() exits itself; 38 marks a bug. */
		_exit(38);
	}

	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));

	/* If the entire process was killed, we'll see SIGSYS. */
	ASSERT_TRUE(WIFSIGNALED(status));
	ASSERT_EQ(SIGSYS, WTERMSIG(status));
}
929 
/* An unknown filter action must behave like KILL_PROCESS. */
TEST(KILL_unknown)
{
	int status;
	pid_t child_pid;

	child_pid = fork();
	ASSERT_LE(0, child_pid);
	if (child_pid == 0) {
		kill_thread_or_group(_metadata, RET_UNKNOWN);
		/* kill_thread_or_group() exits itself; 38 marks a bug. */
		_exit(38);
	}

	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));

	/* If the entire process was killed, we'll see SIGSYS. */
	EXPECT_TRUE(WIFSIGNALED(status)) {
		TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
	}
	ASSERT_EQ(SIGSYS, WTERMSIG(status));
}
950 
951 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
/*
 * seccomp_data only has args[0..5]; a filter loading arg index 6 must
 * be rejected at install time with EINVAL.
 */
TEST(arg_out_of_range)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);
}
971 
/*
 * Declare a filter named prog_<name> that fails read(2) with the given
 * errno via SECCOMP_RET_ERRNO and allows every other syscall.
 */
#define ERRNO_FILTER(name, errno)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
		.filter = _read_filter_##name,				\
	}
984 
/* Make sure basic errno values are correctly passed through a filter. */
TEST(ERRNO_valid)
{
	ERRNO_FILTER(valid, E2BIG);
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* read() never reaches the kernel proper: -1/E2BIG from seccomp. */
	EXPECT_EQ(-1, read(-1, NULL, 0));
	EXPECT_EQ(E2BIG, errno);
}
1002 
/* Make sure an errno of zero is correctly handled by the arch code. */
TEST(ERRNO_zero)
{
	ERRNO_FILTER(zero, 0);
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* "errno" of 0 is ok: read() appears to succeed returning 0. */
	EXPECT_EQ(0, read(-1, NULL, 0));
}
1020 
/*
 * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
 * This tests that the errno value gets capped correctly, fixed by
 * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
 */
TEST(ERRNO_capped)
{
	/* 4096 is above MAX_ERRNO (4095); the kernel must clamp it. */
	ERRNO_FILTER(capped, 4096);
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	EXPECT_EQ(-1, read(-1, NULL, 0));
	EXPECT_EQ(4095, errno);
}
1042 
/*
 * Filters are processed in reverse order: last applied is executed first.
 * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
 * SECCOMP_RET_DATA mask results will follow the most recently applied
 * matching filter return (and not the lowest or highest value).
 */
TEST(ERRNO_order)
{
	ERRNO_FILTER(first,  11);
	ERRNO_FILTER(second, 13);
	ERRNO_FILTER(third,  12);
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	EXPECT_EQ(-1, read(-1, NULL, 0));
	/* Last-attached filter ("third", errno 12) wins. */
	EXPECT_EQ(12, errno);
}
1073 
/* Per-test state: a filter that TRAPs (raises SIGSYS) on getpid(). */
FIXTURE(TRAP) {
	struct sock_fprog prog;
};
1077 
FIXTURE_SETUP(TRAP)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};

	/* Heap-copy the stack-local filter so it outlives this setup. */
	memset(&self->prog, 0, sizeof(self->prog));
	self->prog.filter = malloc(sizeof(filter));
	ASSERT_NE(NULL, self->prog.filter);
	memcpy(self->prog.filter, filter, sizeof(filter));
	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
}
1094 
1095 FIXTURE_TEARDOWN(TRAP)
1096 {
1097 	if (self->prog.filter)
1098 		free(self->prog.filter);
1099 }
1100 
/* With default SIGSYS disposition, a TRAPped getpid() kills us. */
TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
{
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
	ASSERT_EQ(0, ret);
	/* Delivers SIGSYS via SECCOMP_RET_TRAP. */
	syscall(__NR_getpid);
}
1112 
/* Ensure that SIGSYS overrides SIG_IGN */
TEST_F_SIGNAL(TRAP, ign, SIGSYS)
{
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* Ignoring SIGSYS must not save us from a TRAP action. */
	signal(SIGSYS, SIG_IGN);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
	ASSERT_EQ(0, ret);
	syscall(__NR_getpid);
}
1127 
1128 static siginfo_t TRAP_info;
1129 static volatile int TRAP_nr;
1130 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1131 {
1132 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
1133 	TRAP_nr = nr;
1134 }
1135 
/*
 * Install a SIGSYS handler and verify the siginfo delivered for a
 * TRAPped getpid(): signal number, triggering syscall, arch, and the
 * faulting instruction address.
 */
TEST_F(TRAP, handler)
{
	int ret, test;
	struct sigaction act;
	sigset_t mask;

	memset(&act, 0, sizeof(act));
	sigemptyset(&mask);
	sigaddset(&mask, SIGSYS);

	act.sa_sigaction = &TRAP_action;
	act.sa_flags = SA_SIGINFO;
	ret = sigaction(SIGSYS, &act, NULL);
	ASSERT_EQ(0, ret) {
		TH_LOG("sigaction failed");
	}
	/* Make sure SIGSYS is deliverable even if it was blocked. */
	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
	ASSERT_EQ(0, ret) {
		TH_LOG("sigprocmask failed");
	}

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
	ASSERT_EQ(0, ret);
	TRAP_nr = 0;
	memset(&TRAP_info, 0, sizeof(TRAP_info));
	/* Expect the registers to be rolled back. (nr = error) may vary
	 * based on arch. */
	ret = syscall(__NR_getpid);
	/* Silence gcc warning about volatile. */
	test = TRAP_nr;
	EXPECT_EQ(SIGSYS, test);
	/*
	 * Mirror the kernel's SIGSYS siginfo layout; on libcs without
	 * si_syscall, the same fields live at the si_pid offset.
	 */
	struct local_sigsys {
		void *_call_addr;	/* calling user insn */
		int _syscall;		/* triggering system call number */
		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
	} *sigsys = (struct local_sigsys *)
#ifdef si_syscall
		&(TRAP_info.si_call_addr);
#else
		&TRAP_info.si_pid;
#endif
	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
	/* Make sure arch is non-zero. */
	EXPECT_NE(0, sigsys->_arch);
	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
}
1184 
/* One filter per seccomp return action, used to probe action precedence. */
FIXTURE(precedence) {
	struct sock_fprog allow;
	struct sock_fprog log;
	struct sock_fprog trace;
	struct sock_fprog error;
	struct sock_fprog trap;
	struct sock_fprog kill;
};
1193 
FIXTURE_SETUP(precedence)
{
	/* Unconditionally allow everything. */
	struct sock_filter allow_insns[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	/*
	 * Each of the following allows getpid and returns its namesake
	 * action for every other syscall (jt=1 skips the ALLOW).
	 */
	struct sock_filter log_insns[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
	};
	struct sock_filter trace_insns[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
	};
	struct sock_filter error_insns[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
	};
	struct sock_filter trap_insns[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
	};
	struct sock_filter kill_insns[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
	};

	memset(self, 0, sizeof(*self));
	/* Heap-copy each program into the fixture so teardown can free it. */
#define FILTER_ALLOC(_x) \
	self->_x.filter = malloc(sizeof(_x##_insns)); \
	ASSERT_NE(NULL, self->_x.filter); \
	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
	FILTER_ALLOC(allow);
	FILTER_ALLOC(log);
	FILTER_ALLOC(trace);
	FILTER_ALLOC(error);
	FILTER_ALLOC(trap);
	FILTER_ALLOC(kill);
}
1248 
1249 FIXTURE_TEARDOWN(precedence)
1250 {
1251 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1252 	FILTER_FREE(allow);
1253 	FILTER_FREE(log);
1254 	FILTER_FREE(trace);
1255 	FILTER_FREE(error);
1256 	FILTER_FREE(trap);
1257 	FILTER_FREE(kill);
1258 }
1259 
1260 TEST_F(precedence, allow_ok)
1261 {
1262 	pid_t parent, res = 0;
1263 	long ret;
1264 
1265 	parent = getppid();
1266 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1267 	ASSERT_EQ(0, ret);
1268 
1269 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1270 	ASSERT_EQ(0, ret);
1271 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1272 	ASSERT_EQ(0, ret);
1273 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1274 	ASSERT_EQ(0, ret);
1275 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1276 	ASSERT_EQ(0, ret);
1277 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1278 	ASSERT_EQ(0, ret);
1279 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1280 	ASSERT_EQ(0, ret);
1281 	/* Should work just fine. */
1282 	res = syscall(__NR_getppid);
1283 	EXPECT_EQ(parent, res);
1284 }
1285 
/* With all filters stacked, the KILL action wins for getpid (fatal SIGSYS). */
TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
{
	pid_t parent, res = 0;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	res = syscall(__NR_getppid);
	EXPECT_EQ(parent, res);
	/* getpid() should never return. */
	res = syscall(__NR_getpid);
	EXPECT_EQ(0, res);
}
1314 
/* KILL must win regardless of the order the filters were attached in. */
TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
1341 
/* Without a KILL filter, TRAP takes precedence (fatal SIGSYS expected). */
TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
1366 
/* TRAP precedence over LOG/TRACE/ERRNO must not depend on attach order. */
TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
1391 
1392 TEST_F(precedence, errno_is_third)
1393 {
1394 	pid_t parent;
1395 	long ret;
1396 
1397 	parent = getppid();
1398 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1399 	ASSERT_EQ(0, ret);
1400 
1401 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1402 	ASSERT_EQ(0, ret);
1403 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1404 	ASSERT_EQ(0, ret);
1405 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1406 	ASSERT_EQ(0, ret);
1407 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1408 	ASSERT_EQ(0, ret);
1409 	/* Should work just fine. */
1410 	EXPECT_EQ(parent, syscall(__NR_getppid));
1411 	EXPECT_EQ(0, syscall(__NR_getpid));
1412 }
1413 
/* ERRNO precedence over LOG and TRACE must not depend on attach order. */
TEST_F(precedence, errno_is_third_in_any_order)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	EXPECT_EQ(0, syscall(__NR_getpid));
}
1435 
/* With only ALLOW/LOG/TRACE filters, TRACE wins over LOG. */
TEST_F(precedence, trace_is_fourth)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* No ptracer attached, so the traced getpid() fails. */
	EXPECT_EQ(-1, syscall(__NR_getpid));
}
1456 
/* TRACE precedence over LOG must not depend on attach order. */
TEST_F(precedence, trace_is_fourth_in_any_order)
{
	pid_t parent;
	long ret;

	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* No ptracer attached, so the traced getpid() fails. */
	EXPECT_EQ(-1, syscall(__NR_getpid));
}
1477 
1478 TEST_F(precedence, log_is_fifth)
1479 {
1480 	pid_t mypid, parent;
1481 	long ret;
1482 
1483 	mypid = getpid();
1484 	parent = getppid();
1485 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1486 	ASSERT_EQ(0, ret);
1487 
1488 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1489 	ASSERT_EQ(0, ret);
1490 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1491 	ASSERT_EQ(0, ret);
1492 	/* Should work just fine. */
1493 	EXPECT_EQ(parent, syscall(__NR_getppid));
1494 	/* Should also work just fine */
1495 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1496 }
1497 
/* LOG behavior must not depend on attach order relative to ALLOW. */
TEST_F(precedence, log_is_fifth_in_any_order)
{
	pid_t mypid, parent;
	long ret;

	mypid = getpid();
	parent = getppid();
	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
	ASSERT_EQ(0, ret);
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
	ASSERT_EQ(0, ret);
	/* Should work just fine. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* Should also work just fine */
	EXPECT_EQ(mypid, syscall(__NR_getpid));
}
1517 
1518 #ifndef PTRACE_O_TRACESECCOMP
1519 #define PTRACE_O_TRACESECCOMP	0x00000080
1520 #endif
1521 
1522 /* Catch the Ubuntu 12.04 value error. */
1523 #if PTRACE_EVENT_SECCOMP != 7
1524 #undef PTRACE_EVENT_SECCOMP
1525 #endif
1526 
1527 #ifndef PTRACE_EVENT_SECCOMP
1528 #define PTRACE_EVENT_SECCOMP 7
1529 #endif
1530 
/* Extract the ptrace event code from a waitpid() status. */
#define PTRACE_EVENT_MASK(status) ((status) >> 16)
/* Cleared by the SIGUSR1 handler to end start_tracer()'s wait loop. */
bool tracer_running;
void tracer_stop(int sig)
{
	tracer_running = false;
}

/* Per-test callback invoked for each ptrace stop of the tracee. */
typedef void tracer_func_t(struct __test_metadata *_metadata,
			   pid_t tracee, int status, void *args);
1540 
1541 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1542 	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1543 {
1544 	int ret = -1;
1545 	struct sigaction action = {
1546 		.sa_handler = tracer_stop,
1547 	};
1548 
1549 	/* Allow external shutdown. */
1550 	tracer_running = true;
1551 	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1552 
1553 	errno = 0;
1554 	while (ret == -1 && errno != EINVAL)
1555 		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1556 	ASSERT_EQ(0, ret) {
1557 		kill(tracee, SIGKILL);
1558 	}
1559 	/* Wait for attach stop */
1560 	wait(NULL);
1561 
1562 	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1563 						      PTRACE_O_TRACESYSGOOD :
1564 						      PTRACE_O_TRACESECCOMP);
1565 	ASSERT_EQ(0, ret) {
1566 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1567 		kill(tracee, SIGKILL);
1568 	}
1569 	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1570 		     tracee, NULL, 0);
1571 	ASSERT_EQ(0, ret);
1572 
1573 	/* Unblock the tracee */
1574 	ASSERT_EQ(1, write(fd, "A", 1));
1575 	ASSERT_EQ(0, close(fd));
1576 
1577 	/* Run until we're shut down. Must assert to stop execution. */
1578 	while (tracer_running) {
1579 		int status;
1580 
1581 		if (wait(&status) != tracee)
1582 			continue;
1583 
1584 		if (WIFSIGNALED(status)) {
1585 			/* Child caught a fatal signal. */
1586 			return;
1587 		}
1588 		if (WIFEXITED(status)) {
1589 			/* Child exited with code. */
1590 			return;
1591 		}
1592 
1593 		/* Check if we got an expected event. */
1594 		ASSERT_EQ(WIFCONTINUED(status), false);
1595 		ASSERT_EQ(WIFSTOPPED(status), true);
1596 		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1597 			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1598 		}
1599 
1600 		tracer_func(_metadata, tracee, status, args);
1601 
1602 		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1603 			     tracee, NULL, 0);
1604 		ASSERT_EQ(0, ret);
1605 	}
1606 	/* Directly report the status of our test harness results. */
1607 	syscall(__NR_exit, _metadata->exit_code);
1608 }
1609 
/* Common tracer setup/teardown functions. */
/* No-op SIGALRM handler so an alarm interrupts blocking calls harmlessly. */
void cont_handler(int num)
{ }
/*
 * Fork a child that attaches to the current process as a tracer running
 * @func, and block on a pipe until the tracer reports it is ready.
 * Returns the tracer's pid for teardown_trace_fixture().
 */
pid_t setup_trace_fixture(struct __test_metadata *_metadata,
			  tracer_func_t func, void *args, bool ptrace_syscall)
{
	char sync;
	int pipefd[2];
	pid_t tracer_pid;
	pid_t tracee = getpid();

	/* Setup a pipe for clean synchronization. */
	ASSERT_EQ(0, pipe(pipefd));

	/* Fork a child which we'll promote to tracer */
	tracer_pid = fork();
	ASSERT_LE(0, tracer_pid);
	signal(SIGALRM, cont_handler);
	if (tracer_pid == 0) {
		close(pipefd[0]);
		start_tracer(_metadata, pipefd[1], tracee, func, args,
			     ptrace_syscall);
		syscall(__NR_exit, 0);
	}
	close(pipefd[1]);
	/* Permit the child to attach even under Yama ptrace restrictions. */
	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
	/* Wait for the tracer's single-byte ready signal. */
	read(pipefd[0], &sync, 1);
	close(pipefd[0]);

	return tracer_pid;
}
1641 
/* Shut down a tracer started by setup_trace_fixture() and reap it. */
void teardown_trace_fixture(struct __test_metadata *_metadata,
			    pid_t tracer)
{
	int status;

	/* A zero pid means no tracer was ever launched. */
	if (!tracer)
		return;

	/* SIGUSR1 makes start_tracer() leave its wait loop and exit. */
	ASSERT_EQ(0, kill(tracer, SIGUSR1));
	ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
}
1651 
/* "poke" tracer arguments and function. */
struct tracer_args_poke_t {
	unsigned long poke_addr;	/* tracee address written via PTRACE_POKEDATA */
};
1656 
/*
 * Seccomp-event tracer callback: verify the filter's 0x1001 event data,
 * then write 0x1001 into the tracee at the configured address.
 */
void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
		 void *args)
{
	int ret;
	unsigned long msg;
	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;

	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
	EXPECT_EQ(0, ret);
	/* If this fails, don't try to recover. */
	ASSERT_EQ(0x1001, msg) {
		kill(tracee, SIGKILL);
	}
	/*
	 * Poke in the message.
	 * Registers are not touched to try to keep this relatively arch
	 * agnostic.
	 */
	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
	EXPECT_EQ(0, ret);
}
1678 
/* Fixture for TRACE tests driven by the tracer_poke() callback. */
FIXTURE(TRACE_poke) {
	struct sock_fprog prog;	/* filter: SECCOMP_RET_TRACE|0x1001 on read */
	pid_t tracer;		/* pid returned by setup_trace_fixture() */
	long poked;		/* target the tracer writes 0x1001 into */
	struct tracer_args_poke_t tracer_args;
};
1685 
FIXTURE_SETUP(TRACE_poke)
{
	/* Trace (with data 0x1001) __NR_read, allow everything else. */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};

	self->poked = 0;
	memset(&self->prog, 0, sizeof(self->prog));
	self->prog.filter = malloc(sizeof(filter));
	ASSERT_NE(NULL, self->prog.filter);
	memcpy(self->prog.filter, filter, sizeof(filter));
	self->prog.len = (unsigned short)ARRAY_SIZE(filter);

	/* Set up tracer args. */
	self->tracer_args.poke_addr = (unsigned long)&self->poked;

	/* Launch tracer. */
	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
					   &self->tracer_args, false);
}
1710 
1711 FIXTURE_TEARDOWN(TRACE_poke)
1712 {
1713 	teardown_trace_fixture(_metadata, self->tracer);
1714 	if (self->prog.filter)
1715 		free(self->prog.filter);
1716 }
1717 
/* A traced read() triggers the tracer, which pokes 0x1001 into self->poked. */
TEST_F(TRACE_poke, read_has_side_effects)
{
	ssize_t ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(0, self->poked);
	/* The read itself fails (bad fd), but the trace stop still fires. */
	ret = read(-1, NULL, 0);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(0x1001, self->poked);
}
1733 
/* Untraced syscalls pass through the filter without tracer side effects. */
TEST_F(TRACE_poke, getpid_runs_normally)
{
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(0, self->poked);
	EXPECT_NE(0, syscall(__NR_getpid));
	/* getpid is allowed, so the tracer never ran and poked stays 0. */
	EXPECT_EQ(0, self->poked);
}
1748 
1749 #if defined(__x86_64__)
1750 # define ARCH_REGS		struct user_regs_struct
1751 # define SYSCALL_NUM(_regs)	(_regs).orig_rax
1752 # define SYSCALL_RET(_regs)	(_regs).rax
1753 #elif defined(__i386__)
1754 # define ARCH_REGS		struct user_regs_struct
1755 # define SYSCALL_NUM(_regs)	(_regs).orig_eax
1756 # define SYSCALL_RET(_regs)	(_regs).eax
1757 #elif defined(__arm__)
1758 # define ARCH_REGS		struct pt_regs
1759 # define SYSCALL_NUM(_regs)	(_regs).ARM_r7
1760 # ifndef PTRACE_SET_SYSCALL
1761 #  define PTRACE_SET_SYSCALL   23
1762 # endif
1763 # define SYSCALL_NUM_SET(_regs, _nr)	\
1764 		EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1765 # define SYSCALL_RET(_regs)	(_regs).ARM_r0
1766 #elif defined(__aarch64__)
1767 # define ARCH_REGS		struct user_pt_regs
1768 # define SYSCALL_NUM(_regs)	(_regs).regs[8]
1769 # ifndef NT_ARM_SYSTEM_CALL
1770 #  define NT_ARM_SYSTEM_CALL 0x404
1771 # endif
1772 # define SYSCALL_NUM_SET(_regs, _nr)				\
1773 	do {							\
1774 		struct iovec __v;				\
1775 		typeof(_nr) __nr = (_nr);			\
1776 		__v.iov_base = &__nr;				\
1777 		__v.iov_len = sizeof(__nr);			\
1778 		EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,	\
1779 				    NT_ARM_SYSTEM_CALL, &__v));	\
1780 	} while (0)
1781 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1782 #elif defined(__loongarch__)
1783 # define ARCH_REGS		struct user_pt_regs
1784 # define SYSCALL_NUM(_regs)	(_regs).regs[11]
1785 # define SYSCALL_RET(_regs)	(_regs).regs[4]
1786 #elif defined(__riscv) && __riscv_xlen == 64
1787 # define ARCH_REGS		struct user_regs_struct
1788 # define SYSCALL_NUM(_regs)	(_regs).a7
1789 # define SYSCALL_RET(_regs)	(_regs).a0
1790 #elif defined(__csky__)
1791 # define ARCH_REGS		struct pt_regs
1792 #  if defined(__CSKYABIV2__)
1793 #   define SYSCALL_NUM(_regs)	(_regs).regs[3]
1794 #  else
1795 #   define SYSCALL_NUM(_regs)	(_regs).regs[9]
1796 #  endif
1797 # define SYSCALL_RET(_regs)	(_regs).a0
1798 #elif defined(__hppa__)
1799 # define ARCH_REGS		struct user_regs_struct
1800 # define SYSCALL_NUM(_regs)	(_regs).gr[20]
1801 # define SYSCALL_RET(_regs)	(_regs).gr[28]
1802 #elif defined(__powerpc__)
1803 # define ARCH_REGS		struct pt_regs
1804 # define SYSCALL_NUM(_regs)	(_regs).gpr[0]
1805 # define SYSCALL_RET(_regs)	(_regs).gpr[3]
1806 # define SYSCALL_RET_SET(_regs, _val)				\
1807 	do {							\
1808 		typeof(_val) _result = (_val);			\
1809 		if ((_regs.trap & 0xfff0) == 0x3000) {		\
1810 			/*					\
1811 			 * scv 0 system call uses -ve result	\
1812 			 * for error, so no need to adjust.	\
1813 			 */					\
1814 			SYSCALL_RET(_regs) = _result;		\
1815 		} else {					\
1816 			/*					\
1817 			 * A syscall error is signaled by the	\
1818 			 * CR0 SO bit and the code is stored as	\
1819 			 * a positive value.			\
1820 			 */					\
1821 			if (_result < 0) {			\
1822 				SYSCALL_RET(_regs) = -_result;	\
1823 				(_regs).ccr |= 0x10000000;	\
1824 			} else {				\
1825 				SYSCALL_RET(_regs) = _result;	\
1826 				(_regs).ccr &= ~0x10000000;	\
1827 			}					\
1828 		}						\
1829 	} while (0)
1830 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1831 #elif defined(__s390__)
1832 # define ARCH_REGS		s390_regs
1833 # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
1834 # define SYSCALL_RET_SET(_regs, _val)			\
1835 		TH_LOG("Can't modify syscall return on this architecture")
1836 #elif defined(__mips__)
1837 # include <asm/unistd_nr_n32.h>
1838 # include <asm/unistd_nr_n64.h>
1839 # include <asm/unistd_nr_o32.h>
1840 # define ARCH_REGS		struct pt_regs
1841 # define SYSCALL_NUM(_regs)				\
1842 	({						\
1843 		typeof((_regs).regs[2]) _nr;		\
1844 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1845 			_nr = (_regs).regs[4];		\
1846 		else					\
1847 			_nr = (_regs).regs[2];		\
1848 		_nr;					\
1849 	})
1850 # define SYSCALL_NUM_SET(_regs, _nr)			\
1851 	do {						\
1852 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1853 			(_regs).regs[4] = _nr;		\
1854 		else					\
1855 			(_regs).regs[2] = _nr;		\
1856 	} while (0)
1857 # define SYSCALL_RET_SET(_regs, _val)			\
1858 		TH_LOG("Can't modify syscall return on this architecture")
1859 #elif defined(__xtensa__)
1860 # define ARCH_REGS		struct user_pt_regs
1861 # define SYSCALL_NUM(_regs)	(_regs).syscall
1862 /*
1863  * On xtensa syscall return value is in the register
1864  * a2 of the current window which is not fixed.
1865  */
1866 #define SYSCALL_RET(_regs)	(_regs).a[(_regs).windowbase * 4 + 2]
1867 #elif defined(__sh__)
1868 # define ARCH_REGS		struct pt_regs
1869 # define SYSCALL_NUM(_regs)	(_regs).regs[3]
1870 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1871 #elif defined(__mc68000__)
1872 # define ARCH_REGS		struct user_regs_struct
1873 # define SYSCALL_NUM(_regs)	(_regs).orig_d0
1874 # define SYSCALL_RET(_regs)	(_regs).d0
1875 #else
1876 # error "Do not know how to find your architecture's registers and syscalls"
1877 #endif
1878 
1879 /*
1880  * Most architectures can change the syscall by just updating the
1881  * associated register. This is the default if not defined above.
1882  */
1883 #ifndef SYSCALL_NUM_SET
1884 # define SYSCALL_NUM_SET(_regs, _nr)		\
1885 	do {					\
1886 		SYSCALL_NUM(_regs) = (_nr);	\
1887 	} while (0)
1888 #endif
1889 /*
1890  * Most architectures can change the syscall return value by just
1891  * writing to the SYSCALL_RET register. This is the default if not
1892  * defined above. If an architecture cannot set the return value
1893  * (for example when the syscall and return value register is
1894  * shared), report it with TH_LOG() in an arch-specific definition
1895  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1896  */
1897 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1898 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1899 #endif
1900 #ifndef SYSCALL_RET_SET
1901 # define SYSCALL_RET_SET(_regs, _val)		\
1902 	do {					\
1903 		SYSCALL_RET(_regs) = (_val);	\
1904 	} while (0)
1905 #endif
1906 
1907 /* When the syscall return can't be changed, stub out the tests for it. */
1908 #ifndef SYSCALL_RET
1909 # define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
1910 #else
1911 # define EXPECT_SYSCALL_RETURN(val, action)		\
1912 	do {						\
1913 		errno = 0;				\
1914 		if (val < 0) {				\
1915 			EXPECT_EQ(-1, action);		\
1916 			EXPECT_EQ(-(val), errno);	\
1917 		} else {				\
1918 			EXPECT_EQ(val, action);		\
1919 		}					\
1920 	} while (0)
1921 #endif
1922 
1923 /*
1924  * Some architectures (e.g. powerpc) can only set syscall
1925  * return values on syscall exit during ptrace.
1926  */
1927 const bool ptrace_entry_set_syscall_nr = true;
1928 const bool ptrace_entry_set_syscall_ret =
1929 #ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
1930 	true;
1931 #else
1932 	false;
1933 #endif
1934 
1935 /*
1936  * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1937  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1938  */
1939 #if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
1940 # define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
1941 # define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
1942 #else
1943 # define ARCH_GETREGS(_regs)	({					\
1944 		struct iovec __v;					\
1945 		__v.iov_base = &(_regs);				\
1946 		__v.iov_len = sizeof(_regs);				\
1947 		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);	\
1948 	})
1949 # define ARCH_SETREGS(_regs)	({					\
1950 		struct iovec __v;					\
1951 		__v.iov_base = &(_regs);				\
1952 		__v.iov_len = sizeof(_regs);				\
1953 		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);	\
1954 	})
1955 #endif
1956 
/* Architecture-specific syscall fetching routine. */
/* Read the tracee's current syscall number, or -1 on register-read failure. */
int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
{
	ARCH_REGS regs;

	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
		return -1;
	}

	return SYSCALL_NUM(regs);
}
1968 
/* Architecture-specific syscall changing routine. */
/*
 * Rewrite the tracee's syscall number and/or return value; either pointer
 * may be NULL to leave that piece untouched. Registers are only written
 * back when something actually changed.
 */
void __change_syscall(struct __test_metadata *_metadata,
		    pid_t tracee, long *syscall, long *ret)
{
	ARCH_REGS orig, regs;

	/* Do not get/set registers if we have nothing to do. */
	if (!syscall && !ret)
		return;

	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
		return;
	}
	orig = regs;

	if (syscall)
		SYSCALL_NUM_SET(regs, *syscall);

	if (ret)
		SYSCALL_RET_SET(regs, *ret);

	/* Flush any register changes made. */
	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
		EXPECT_EQ(0, ARCH_SETREGS(regs));
}
1994 
/* Change only syscall number. */
void change_syscall_nr(struct __test_metadata *_metadata,
		       pid_t tracee, long syscall)
{
	__change_syscall(_metadata, tracee, &syscall, NULL);
}
2001 
/* Change syscall return value (and set syscall number to -1). */
/* Syscall nr -1 skips execution so only the injected return is observed. */
void change_syscall_ret(struct __test_metadata *_metadata,
			pid_t tracee, long ret)
{
	long syscall = -1;

	__change_syscall(_metadata, tracee, &syscall, &ret);
}
2010 
/*
 * PTRACE_EVENT_SECCOMP callback: dispatch on the filter's event data
 * (0x1002..0x1005) and rewrite the tracee's syscall accordingly.
 */
void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
		    int status, void *args)
{
	int ret;
	unsigned long msg;

	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
		return;
	}

	/* Make sure we got the right message. */
	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
	EXPECT_EQ(0, ret);

	/* Validate and take action on expected syscalls. */
	switch (msg) {
	case 0x1002:
		/* change getpid to getppid. */
		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
		change_syscall_nr(_metadata, tracee, __NR_getppid);
		break;
	case 0x1003:
		/* skip gettid with valid return code. */
		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
		change_syscall_ret(_metadata, tracee, 45000);
		break;
	case 0x1004:
		/* skip openat with error. */
		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
		change_syscall_ret(_metadata, tracee, -ESRCH);
		break;
	case 0x1005:
		/* do nothing (allow getppid) */
		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
		break;
	default:
		EXPECT_EQ(0, msg) {
			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
			kill(tracee, SIGKILL);
		}
	}

}
2055 
/* Fixture shared by the seccomp-trace and plain-ptrace syscall tests. */
FIXTURE(TRACE_syscall) {
	struct sock_fprog prog;
	pid_t tracer, mytid, mypid, parent;	/* expected syscall results */
	long syscall_nr;	/* saved at syscall entry for use at exit */
};
2061 
/*
 * PTRACE_SYSCALL callback: at each syscall entry/exit stop, rewrite
 * getpid/gettid/openat the same way tracer_seccomp() does, honoring
 * which phase this architecture allows nr/ret changes in.
 */
void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
		   int status, void *args)
{
	int ret;
	unsigned long msg;
	static bool entry;	/* toggled per stop: entry vs. exit */
	long syscall_nr_val, syscall_ret_val;
	long *syscall_nr = NULL, *syscall_ret = NULL;
	FIXTURE_DATA(TRACE_syscall) *self = args;

	/* PTRACE_O_TRACESYSGOOD sets bit 7 of the stop signal. */
	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
		return;
	}

	/*
	 * The traditional way to tell PTRACE_SYSCALL entry/exit
	 * is by counting.
	 */
	entry = !entry;

	/* Make sure we got an appropriate message. */
	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
	EXPECT_EQ(0, ret);
	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);

	/*
	 * Some architectures only support setting return values during
	 * syscall exit under ptrace, and on exit the syscall number may
	 * no longer be available. Therefore, save the initial sycall
	 * number here, so it can be examined during both entry and exit
	 * phases.
	 */
	if (entry)
		self->syscall_nr = get_syscall(_metadata, tracee);

	/*
	 * Depending on the architecture's syscall setting abilities, we
	 * pick which things to set during this phase (entry or exit).
	 */
	if (entry == ptrace_entry_set_syscall_nr)
		syscall_nr = &syscall_nr_val;
	if (entry == ptrace_entry_set_syscall_ret)
		syscall_ret = &syscall_ret_val;

	/* Now handle the actual rewriting cases. */
	switch (self->syscall_nr) {
	case __NR_getpid:
		syscall_nr_val = __NR_getppid;
		/* Never change syscall return for this case. */
		syscall_ret = NULL;
		break;
	case __NR_gettid:
		syscall_nr_val = -1;
		syscall_ret_val = 45000;
		break;
	case __NR_openat:
		syscall_nr_val = -1;
		syscall_ret_val = -ESRCH;
		break;
	default:
		/* Unhandled, do nothing. */
		return;
	}

	__change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
}
2130 
FIXTURE_VARIANT(TRACE_syscall) {
	/*
	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
	 * ptrace (true). The two FIXTURE_VARIANT_ADDs below instantiate
	 * one variant for each mode.
	 */
	bool use_ptrace;
};
2140 
/* Variant: drive the tracee with plain ptrace() + PTRACE_SYSCALL. */
FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
	.use_ptrace = true,
};
2144 
/* Variant: drive the tracee via SECCOMP_RET_TRACE + PTRACE_CONT. */
FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
	.use_ptrace = false,
};
2148 
FIXTURE_SETUP(TRACE_syscall)
{
	/*
	 * Tag each syscall of interest with a distinct SECCOMP_RET_TRACE
	 * cookie (0x1002..0x1005) so the tracer can tell them apart;
	 * everything else is allowed through.
	 */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	/* Prepare some testable syscall results. */
	self->mytid = syscall(__NR_gettid);
	ASSERT_GT(self->mytid, 0);
	ASSERT_NE(self->mytid, 1) {
		TH_LOG("Running this test as init is not supported. :)");
	}

	/* Single-threaded process, so tid and pid must match. */
	self->mypid = getpid();
	ASSERT_GT(self->mypid, 0);
	ASSERT_EQ(self->mytid, self->mypid);

	self->parent = getppid();
	ASSERT_GT(self->parent, 0);
	ASSERT_NE(self->parent, self->mypid);

	/* Launch tracer (seccomp- or ptrace-driven, per variant). */
	self->tracer = setup_trace_fixture(_metadata,
					   variant->use_ptrace ? tracer_ptrace
							       : tracer_seccomp,
					   self, variant->use_ptrace);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
	if (variant->use_ptrace)
		return;

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
	ASSERT_EQ(0, ret);
}
2201 
/* Stop the tracer process launched by FIXTURE_SETUP(). */
FIXTURE_TEARDOWN(TRACE_syscall)
{
	teardown_trace_fixture(_metadata, self->tracer);
}
2206 
TEST(negative_ENOSYS)
{
	static const long bad_syscalls[] = { -1, -101 };
	size_t i;

#if defined(__arm__)
	SKIP(return, "arm32 does not support calling syscall -1");
#endif
	/*
	 * An "internal" skip and userspace invoking syscall "-1" (or any
	 * other invalid negative number) must be indistinguishable: both
	 * fail with ENOSYS.
	 */
	for (i = 0; i < ARRAY_SIZE(bad_syscalls); i++) {
		errno = 0;
		EXPECT_EQ(-1, syscall(bad_syscalls[i]));
		EXPECT_EQ(errno, ENOSYS);
	}
}
2224 
/* Negative syscall numbers must still yield ENOSYS while being traced. */
TEST_F(TRACE_syscall, negative_ENOSYS)
{
	negative_ENOSYS(_metadata);
}
2229 
2230 TEST_F(TRACE_syscall, syscall_allowed)
2231 {
2232 	/* getppid works as expected (no changes). */
2233 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
2234 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
2235 }
2236 
2237 TEST_F(TRACE_syscall, syscall_redirected)
2238 {
2239 	/* getpid has been redirected to getppid as expected. */
2240 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
2241 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2242 }
2243 
TEST_F(TRACE_syscall, syscall_errno)
{
	/* Tracer should skip the open syscall, resulting in ESRCH. */
	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2249 
TEST_F(TRACE_syscall, syscall_faked)
{
	/* Tracer skips the gettid syscall and stores an altered return value. */
	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2255 
TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
{
	/* BPF program: kill the thread on __NR_mknodat, allow all else. */
	struct sock_filter death_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog death_prog = {
		.len = (unsigned short)ARRAY_SIZE(death_filter),
		.filter = death_filter,
	};
	long rc;

	/* Attach the filter; no_new_privs was already set by fixture setup. */
	rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &death_prog, 0, 0);
	ASSERT_EQ(0, rc);

	/*
	 * Regardless of any attached tracer, the mknodat call must die
	 * immediately with SIGSYS (which TEST_F_SIGNAL expects).
	 */
	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
}
2278 
TEST_F(TRACE_syscall, skip_after)
{
	/* Additional filter: fail getppid with EPERM. */
	struct sock_filter errno_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog errno_prog = {
		.len = (unsigned short)ARRAY_SIZE(errno_filter),
		.filter = errno_filter,
	};
	long rc;

	/* Stack the EPERM filter on top of whatever the fixture installed. */
	rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &errno_prog, 0, 0);
	ASSERT_EQ(0, rc);

	/*
	 * The tracer rewrites getpid into getppid, which the new filter
	 * then rejects: the caller must observe EPERM.
	 */
	errno = 0;
	EXPECT_EQ(-1, syscall(__NR_getpid));
	EXPECT_EQ(EPERM, errno);
}
2303 
2304 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2305 {
2306 	struct sock_filter filter[] = {
2307 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2308 			offsetof(struct seccomp_data, nr)),
2309 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2310 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2311 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2312 	};
2313 	struct sock_fprog prog = {
2314 		.len = (unsigned short)ARRAY_SIZE(filter),
2315 		.filter = filter,
2316 	};
2317 	long ret;
2318 
2319 	/* Install additional "death on getppid" filter. */
2320 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2321 	ASSERT_EQ(0, ret);
2322 
2323 	/* Tracer will redirect getpid to getppid, and we should die. */
2324 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2325 }
2326 
/*
 * Exercise the seccomp(2) entry point directly: invalid op, flags, and
 * uargs combinations must be rejected with the documented errno values
 * before a trivial allow-all filter is finally accepted.
 *
 * NOTE(review): the errno-only checks below assume the file's seccomp()
 * helper (defined earlier, outside this view) zeroes errno before
 * issuing the raw syscall — confirm against that helper.
 */
TEST(seccomp_syscall)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Reject insane operation. */
	ret = seccomp(-1, 0, &prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject crazy op value!");
	}

	/* Reject strict with flags or pointer. */
	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject mode strict with flags!");
	}
	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject mode strict with uargs!");
	}

	/* Reject insane args for filter. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Did not reject crazy filter flags!");
	}
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
	EXPECT_EQ(EFAULT, errno) {
		TH_LOG("Did not reject NULL filter!");
	}

	/* A well-formed filter install must leave errno untouched (0). */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
	EXPECT_EQ(0, errno) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
			strerror(errno));
	}
}
2378 
/*
 * Once a filter is installed, neither the prctl() nor the seccomp()
 * entry point may downgrade the process to strict mode.
 */
TEST(seccomp_syscall_mode_lock)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Install a harmless allow-all filter to enter filter mode. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_EQ(0, ret) {
		TH_LOG("Could not install filter!");
	}

	/* Make sure neither entry point will switch to strict. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Switched to mode strict!");
	}

	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Switched to mode strict!");
	}
}
2414 
2415 /*
2416  * Test detection of known and unknown filter flags. Userspace needs to be able
2417  * to check if a filter flag is supported by the current kernel and a good way
2418  * of doing that is by attempting to enter filter mode, with the flag bit in
2419  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2420  * that the flag is valid and EINVAL indicates that the flag is invalid.
2421  */
2422 TEST(detect_seccomp_filter_flags)
2423 {
2424 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2425 				 SECCOMP_FILTER_FLAG_LOG,
2426 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2427 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2428 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2429 	unsigned int exclusive[] = {
2430 				SECCOMP_FILTER_FLAG_TSYNC,
2431 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2432 	unsigned int flag, all_flags, exclusive_mask;
2433 	int i;
2434 	long ret;
2435 
2436 	/* Test detection of individual known-good filter flags */
2437 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2438 		int bits = 0;
2439 
2440 		flag = flags[i];
2441 		/* Make sure the flag is a single bit! */
2442 		while (flag) {
2443 			if (flag & 0x1)
2444 				bits ++;
2445 			flag >>= 1;
2446 		}
2447 		ASSERT_EQ(1, bits);
2448 		flag = flags[i];
2449 
2450 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2451 		ASSERT_NE(ENOSYS, errno) {
2452 			TH_LOG("Kernel does not support seccomp syscall!");
2453 		}
2454 		EXPECT_EQ(-1, ret);
2455 		EXPECT_EQ(EFAULT, errno) {
2456 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2457 			       flag);
2458 		}
2459 
2460 		all_flags |= flag;
2461 	}
2462 
2463 	/*
2464 	 * Test detection of all known-good filter flags combined. But
2465 	 * for the exclusive flags we need to mask them out and try them
2466 	 * individually for the "all flags" testing.
2467 	 */
2468 	exclusive_mask = 0;
2469 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2470 		exclusive_mask |= exclusive[i];
2471 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2472 		flag = all_flags & ~exclusive_mask;
2473 		flag |= exclusive[i];
2474 
2475 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2476 		EXPECT_EQ(-1, ret);
2477 		EXPECT_EQ(EFAULT, errno) {
2478 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2479 			       flag);
2480 		}
2481 	}
2482 
2483 	/* Test detection of an unknown filter flags, without exclusives. */
2484 	flag = -1;
2485 	flag &= ~exclusive_mask;
2486 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2487 	EXPECT_EQ(-1, ret);
2488 	EXPECT_EQ(EINVAL, errno) {
2489 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2490 		       flag);
2491 	}
2492 
2493 	/*
2494 	 * Test detection of an unknown filter flag that may simply need to be
2495 	 * added to this test
2496 	 */
2497 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2498 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2499 	EXPECT_EQ(-1, ret);
2500 	EXPECT_EQ(EINVAL, errno) {
2501 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2502 		       flag);
2503 	}
2504 }
2505 
/*
 * Installing the very first filter with SECCOMP_FILTER_FLAG_TSYNC must
 * succeed even when no sibling threads exist yet.
 */
TEST(TSYNC_first)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_EQ(0, ret) {
		TH_LOG("Could not install initial filter with TSYNC!");
	}
}
2531 
/* Number of sibling threads participating in each TSYNC test. */
#define TSYNC_SIBLINGS 2
/* Per-sibling state shared between a sibling thread and the main thread. */
struct tsync_sibling {
	pthread_t tid;		/* pthread handle; 0 once joined or unused */
	pid_t system_tid;	/* kernel TID, reported by the sibling itself */
	sem_t *started;		/* posted once the sibling is up and locked in */
	pthread_cond_t *cond;	/* wakeup broadcast from the main thread */
	pthread_mutex_t *mutex;	/* guards cond and num_waits */
	int diverge;		/* non-zero: re-apply prog to split filter tree */
	int num_waits;		/* cond wakeups to consume before proceeding */
	struct sock_fprog *prog;	/* filter used when diverging */
	struct __test_metadata *metadata;
};
2544 
2545 /*
2546  * To avoid joining joined threads (which is not allowed by Bionic),
2547  * make sure we both successfully join and clear the tid to skip a
2548  * later join attempt during fixture teardown. Any remaining threads
2549  * will be directly killed during teardown.
2550  */
2551 #define PTHREAD_JOIN(tid, status)					\
2552 	do {								\
2553 		int _rc = pthread_join(tid, status);			\
2554 		if (_rc) {						\
2555 			TH_LOG("pthread_join of tid %u failed: %d\n",	\
2556 				(unsigned int)tid, _rc);		\
2557 		} else {						\
2558 			tid = 0;					\
2559 		}							\
2560 	} while (0)
2561 
FIXTURE(TSYNC) {
	struct sock_fprog root_prog, apply_prog;	/* allow-all / kill-on-read */
	struct tsync_sibling sibling[TSYNC_SIBLINGS];
	sem_t started;		/* counts sibling startup notifications */
	pthread_cond_t cond;	/* releases siblings from their wait loop */
	pthread_mutex_t mutex;	/* guards cond and sibling num_waits */
	int sibling_count;	/* siblings observed via sem_wait() so far */
};
2570 
2571 FIXTURE_SETUP(TSYNC)
2572 {
2573 	struct sock_filter root_filter[] = {
2574 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2575 	};
2576 	struct sock_filter apply_filter[] = {
2577 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2578 			offsetof(struct seccomp_data, nr)),
2579 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2580 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2581 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2582 	};
2583 
2584 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2585 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2586 	memset(&self->sibling, 0, sizeof(self->sibling));
2587 	self->root_prog.filter = malloc(sizeof(root_filter));
2588 	ASSERT_NE(NULL, self->root_prog.filter);
2589 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2590 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2591 
2592 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2593 	ASSERT_NE(NULL, self->apply_prog.filter);
2594 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2595 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2596 
2597 	self->sibling_count = 0;
2598 	pthread_mutex_init(&self->mutex, NULL);
2599 	pthread_cond_init(&self->cond, NULL);
2600 	sem_init(&self->started, 0, 0);
2601 	self->sibling[0].tid = 0;
2602 	self->sibling[0].cond = &self->cond;
2603 	self->sibling[0].started = &self->started;
2604 	self->sibling[0].mutex = &self->mutex;
2605 	self->sibling[0].diverge = 0;
2606 	self->sibling[0].num_waits = 1;
2607 	self->sibling[0].prog = &self->root_prog;
2608 	self->sibling[0].metadata = _metadata;
2609 	self->sibling[1].tid = 0;
2610 	self->sibling[1].cond = &self->cond;
2611 	self->sibling[1].started = &self->started;
2612 	self->sibling[1].mutex = &self->mutex;
2613 	self->sibling[1].diverge = 0;
2614 	self->sibling[1].prog = &self->root_prog;
2615 	self->sibling[1].num_waits = 1;
2616 	self->sibling[1].metadata = _metadata;
2617 }
2618 
2619 FIXTURE_TEARDOWN(TSYNC)
2620 {
2621 	int sib = 0;
2622 
2623 	if (self->root_prog.filter)
2624 		free(self->root_prog.filter);
2625 	if (self->apply_prog.filter)
2626 		free(self->apply_prog.filter);
2627 
2628 	for ( ; sib < self->sibling_count; ++sib) {
2629 		struct tsync_sibling *s = &self->sibling[sib];
2630 
2631 		if (!s->tid)
2632 			continue;
2633 		/*
2634 		 * If a thread is still running, it may be stuck, so hit
2635 		 * it over the head really hard.
2636 		 */
2637 		pthread_kill(s->tid, 9);
2638 	}
2639 	pthread_mutex_destroy(&self->mutex);
2640 	pthread_cond_destroy(&self->cond);
2641 	sem_destroy(&self->started);
2642 }
2643 
/*
 * Sibling thread body: publishes its kernel TID, optionally "diverges"
 * by re-applying the root filter (splitting the filter tree), then
 * waits on the shared condvar until the main thread releases it. Once
 * released it verifies no_new_privs and issues a read() that a
 * TSYNC-applied kill-on-read filter would terminate it for.
 */
void *tsync_sibling(void *data)
{
	long ret = 0;
	struct tsync_sibling *me = data;

	me->system_tid = syscall(__NR_gettid);

	pthread_mutex_lock(me->mutex);
	if (me->diverge) {
		/* Just re-apply the root prog to fork the tree */
		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
				me->prog, 0, 0);
	}
	sem_post(me->started);
	/* Return outside of started so parent notices failures. */
	if (ret) {
		pthread_mutex_unlock(me->mutex);
		return (void *)SIBLING_EXIT_FAILURE;
	}
	/* Consume num_waits broadcasts before moving on. */
	do {
		pthread_cond_wait(me->cond, me->mutex);
		me->num_waits = me->num_waits - 1;
	} while (me->num_waits);
	pthread_mutex_unlock(me->mutex);

	/* no_new_privs must be set, or TSYNC could never have applied. */
	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
	if (!ret)
		return (void *)SIBLING_EXIT_NEWPRIVS;
	/* If the kill-on-read filter reached this thread, never returns. */
	read(-1, NULL, 0);
	return (void *)SIBLING_EXIT_UNKILLED;
}
2675 
/*
 * Spawn one sibling thread running tsync_sibling().
 * NOTE(review): the pthread_create() return value is unchecked; a
 * failure would surface as a hang in the caller's sem_wait() loop —
 * consider asserting on it.
 */
void tsync_start_sibling(struct tsync_sibling *sibling)
{
	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
}
2680 
/*
 * A sibling whose startup prctl() is denied (by the ERRNO-on-prctl
 * filter installed below) must report failure through its exit status,
 * while the other sibling runs to completion unkilled.
 */
TEST_F(TSYNC, siblings_fail_prctl)
{
	long ret;
	void *status;
	/* Fail every prctl() with EINVAL; allow everything else. */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Check prctl failure detection by requesting sib 0 diverge. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("setting filter failed");
	}

	/* Sibling 0 will attempt (and fail) a prctl() during startup. */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	/* Wait for both siblings to report in. */
	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* Signal the threads to clean up. */
	pthread_mutex_lock(&self->mutex);
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);

	/* Ensure diverging sibling failed to call prctl. */
	PTHREAD_JOIN(self->sibling[0].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
	PTHREAD_JOIN(self->sibling[1].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
}
2732 
2733 TEST_F(TSYNC, two_siblings_with_ancestor)
2734 {
2735 	long ret;
2736 	void *status;
2737 
2738 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2739 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2740 	}
2741 
2742 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2743 	ASSERT_NE(ENOSYS, errno) {
2744 		TH_LOG("Kernel does not support seccomp syscall!");
2745 	}
2746 	ASSERT_EQ(0, ret) {
2747 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2748 	}
2749 	tsync_start_sibling(&self->sibling[0]);
2750 	tsync_start_sibling(&self->sibling[1]);
2751 
2752 	while (self->sibling_count < TSYNC_SIBLINGS) {
2753 		sem_wait(&self->started);
2754 		self->sibling_count++;
2755 	}
2756 
2757 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2758 		      &self->apply_prog);
2759 	ASSERT_EQ(0, ret) {
2760 		TH_LOG("Could install filter on all threads!");
2761 	}
2762 	/* Tell the siblings to test the policy */
2763 	pthread_mutex_lock(&self->mutex);
2764 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2765 		TH_LOG("cond broadcast non-zero");
2766 	}
2767 	pthread_mutex_unlock(&self->mutex);
2768 	/* Ensure they are both killed and don't exit cleanly. */
2769 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2770 	EXPECT_EQ(0x0, (long)status);
2771 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2772 	EXPECT_EQ(0x0, (long)status);
2773 }
2774 
2775 TEST_F(TSYNC, two_sibling_want_nnp)
2776 {
2777 	void *status;
2778 
2779 	/* start siblings before any prctl() operations */
2780 	tsync_start_sibling(&self->sibling[0]);
2781 	tsync_start_sibling(&self->sibling[1]);
2782 	while (self->sibling_count < TSYNC_SIBLINGS) {
2783 		sem_wait(&self->started);
2784 		self->sibling_count++;
2785 	}
2786 
2787 	/* Tell the siblings to test no policy */
2788 	pthread_mutex_lock(&self->mutex);
2789 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2790 		TH_LOG("cond broadcast non-zero");
2791 	}
2792 	pthread_mutex_unlock(&self->mutex);
2793 
2794 	/* Ensure they are both upset about lacking nnp. */
2795 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2796 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2797 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2798 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2799 }
2800 
2801 TEST_F(TSYNC, two_siblings_with_no_filter)
2802 {
2803 	long ret;
2804 	void *status;
2805 
2806 	/* start siblings before any prctl() operations */
2807 	tsync_start_sibling(&self->sibling[0]);
2808 	tsync_start_sibling(&self->sibling[1]);
2809 	while (self->sibling_count < TSYNC_SIBLINGS) {
2810 		sem_wait(&self->started);
2811 		self->sibling_count++;
2812 	}
2813 
2814 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2815 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2816 	}
2817 
2818 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2819 		      &self->apply_prog);
2820 	ASSERT_NE(ENOSYS, errno) {
2821 		TH_LOG("Kernel does not support seccomp syscall!");
2822 	}
2823 	ASSERT_EQ(0, ret) {
2824 		TH_LOG("Could install filter on all threads!");
2825 	}
2826 
2827 	/* Tell the siblings to test the policy */
2828 	pthread_mutex_lock(&self->mutex);
2829 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2830 		TH_LOG("cond broadcast non-zero");
2831 	}
2832 	pthread_mutex_unlock(&self->mutex);
2833 
2834 	/* Ensure they are both killed and don't exit cleanly. */
2835 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2836 	EXPECT_EQ(0x0, (long)status);
2837 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2838 	EXPECT_EQ(0x0, (long)status);
2839 }
2840 
/*
 * When one sibling has diverged from the shared filter tree, a TSYNC
 * apply must fail and (without TSYNC_ESRCH) return the kernel TID of
 * the thread that could not be synchronized. Both siblings survive.
 */
TEST_F(TSYNC, two_siblings_with_one_divergence)
{
	long ret;
	void *status;

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Shared ancestor filter, installed before the threads start. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	}
	/* Sibling 0 re-applies root_prog on startup, forking the tree. */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* TSYNC reports the TID of the first unsyncable thread. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(self->sibling[0].system_tid, ret) {
		TH_LOG("Did not fail on diverged sibling.");
	}

	/* Wake the threads */
	pthread_mutex_lock(&self->mutex);
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);

	/* Ensure they are both unkilled. */
	PTHREAD_JOIN(self->sibling[0].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	PTHREAD_JOIN(self->sibling[1].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
}
2885 
/*
 * Same divergence scenario as above, but with TSYNC_ESRCH set: the
 * failure is reported as -1/ESRCH instead of leaking the failing TID
 * through the return value.
 */
TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
{
	long ret, flags;
	void *status;

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Shared ancestor filter, installed before the threads start. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	}
	/* Sibling 0 re-applies root_prog on startup, forking the tree. */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	flags = SECCOMP_FILTER_FLAG_TSYNC | \
		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
	ASSERT_EQ(ESRCH, errno) {
		TH_LOG("Did not return ESRCH for diverged sibling.");
	}
	ASSERT_EQ(-1, ret) {
		TH_LOG("Did not fail on diverged sibling.");
	}

	/* Wake the threads */
	pthread_mutex_lock(&self->mutex);
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);

	/* Ensure they are both unkilled. */
	PTHREAD_JOIN(self->sibling[0].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	PTHREAD_JOIN(self->sibling[1].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
}
2934 
/*
 * One sibling diverges and the main thread only installs its own filter
 * after the siblings start, so neither sibling shares the tree. TSYNC
 * must fail against each in turn (returning the offender's TID), and
 * succeed only once the main thread is alone.
 */
TEST_F(TSYNC, two_siblings_not_under_filter)
{
	long ret, sib;
	void *status;
	/* 100ms poll interval used while waiting for real task death. */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/*
	 * Sibling 0 will have its own seccomp policy
	 * and Sibling 1 will not be under seccomp at
	 * all. Sibling 1 will enter seccomp and 0
	 * will cause failure.
	 */
	self->sibling[0].diverge = 1;
	tsync_start_sibling(&self->sibling[0]);
	tsync_start_sibling(&self->sibling[1]);

	while (self->sibling_count < TSYNC_SIBLINGS) {
		sem_wait(&self->started);
		self->sibling_count++;
	}

	/* Filter the main thread only, after the siblings are running. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	}

	/* TSYNC reports the TID of the first thread it cannot sync. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(ret, self->sibling[0].system_tid) {
		TH_LOG("Did not fail on diverged sibling.");
	}
	sib = 1;
	if (ret == self->sibling[0].system_tid)
		sib = 0;

	pthread_mutex_lock(&self->mutex);

	/* Increment the other sibling's num_waits so we can clean up
	 * the one we just saw.
	 */
	self->sibling[!sib].num_waits += 1;

	/* Signal the thread to clean up. */
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);
	PTHREAD_JOIN(self->sibling[sib].tid, &status);
	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	/* Poll for actual task death. pthread_join doesn't guarantee it. */
	while (!kill(self->sibling[sib].system_tid, 0))
		nanosleep(&delay, NULL);
	/* Switch to the remaining sibling */
	sib = !sib;

	/* Retry; the remaining sibling is still outside the tree. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(0, ret) {
		TH_LOG("Expected the remaining sibling to sync");
	};

	pthread_mutex_lock(&self->mutex);

	/* If remaining sibling didn't have a chance to wake up during
	 * the first broadcast, manually reduce the num_waits now.
	 */
	if (self->sibling[sib].num_waits > 1)
		self->sibling[sib].num_waits = 1;
	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
		TH_LOG("cond broadcast non-zero");
	}
	pthread_mutex_unlock(&self->mutex);
	PTHREAD_JOIN(self->sibling[sib].tid, &status);
	EXPECT_EQ(0, (long)status);
	/* Poll for actual task death. pthread_join doesn't guarantee it. */
	while (!kill(self->sibling[sib].system_tid, 0))
		nanosleep(&delay, NULL);

	/* All siblings gone; TSYNC apply on a lone thread must succeed. */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
		      &self->apply_prog);
	ASSERT_EQ(0, ret);  /* just us chickens */
}
3024 
3025 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
3026 TEST(syscall_restart)
3027 {
3028 	long ret;
3029 	unsigned long msg;
3030 	pid_t child_pid;
3031 	int pipefd[2];
3032 	int status;
3033 	siginfo_t info = { };
3034 	struct sock_filter filter[] = {
3035 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3036 			 offsetof(struct seccomp_data, nr)),
3037 
3038 #ifdef __NR_sigreturn
3039 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
3040 #endif
3041 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
3042 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
3043 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
3044 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
3045 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3046 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3047 
3048 		/* Allow __NR_write for easy logging. */
3049 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3050 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3051 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3052 		/* The nanosleep jump target. */
3053 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3054 		/* The restart_syscall jump target. */
3055 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3056 	};
3057 	struct sock_fprog prog = {
3058 		.len = (unsigned short)ARRAY_SIZE(filter),
3059 		.filter = filter,
3060 	};
3061 #if defined(__arm__)
3062 	struct utsname utsbuf;
3063 #endif
3064 
3065 	ASSERT_EQ(0, pipe(pipefd));
3066 
3067 	child_pid = fork();
3068 	ASSERT_LE(0, child_pid);
3069 	if (child_pid == 0) {
3070 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
3071 		char buf = ' ';
3072 		struct timespec timeout = { };
3073 
3074 		/* Attach parent as tracer and stop. */
3075 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3076 		EXPECT_EQ(0, raise(SIGSTOP));
3077 
3078 		EXPECT_EQ(0, close(pipefd[1]));
3079 
3080 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3081 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3082 		}
3083 
3084 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3085 		EXPECT_EQ(0, ret) {
3086 			TH_LOG("Failed to install filter!");
3087 		}
3088 
3089 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3090 			TH_LOG("Failed to read() sync from parent");
3091 		}
3092 		EXPECT_EQ('.', buf) {
3093 			TH_LOG("Failed to get sync data from read()");
3094 		}
3095 
3096 		/* Start nanosleep to be interrupted. */
3097 		timeout.tv_sec = 1;
3098 		errno = 0;
3099 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3100 			TH_LOG("Call to nanosleep() failed (errno %d: %s)",
3101 				errno, strerror(errno));
3102 		}
3103 
3104 		/* Read final sync from parent. */
3105 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3106 			TH_LOG("Failed final read() from parent");
3107 		}
3108 		EXPECT_EQ('!', buf) {
3109 			TH_LOG("Failed to get final data from read()");
3110 		}
3111 
3112 		/* Directly report the status of our test harness results. */
3113 		syscall(__NR_exit, _metadata->exit_code);
3114 	}
3115 	EXPECT_EQ(0, close(pipefd[0]));
3116 
3117 	/* Attach to child, setup options, and release. */
3118 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3119 	ASSERT_EQ(true, WIFSTOPPED(status));
3120 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3121 			    PTRACE_O_TRACESECCOMP));
3122 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3123 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
3124 
3125 	/* Wait for nanosleep() to start. */
3126 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3127 	ASSERT_EQ(true, WIFSTOPPED(status));
3128 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3129 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3130 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3131 	ASSERT_EQ(0x100, msg);
3132 	ret = get_syscall(_metadata, child_pid);
3133 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3134 
3135 	/* Might as well check siginfo for sanity while we're here. */
3136 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3137 	ASSERT_EQ(SIGTRAP, info.si_signo);
3138 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3139 	EXPECT_EQ(0, info.si_errno);
3140 	EXPECT_EQ(getuid(), info.si_uid);
3141 	/* Verify signal delivery came from child (seccomp-triggered). */
3142 	EXPECT_EQ(child_pid, info.si_pid);
3143 
3144 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3145 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3146 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3147 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3148 	ASSERT_EQ(true, WIFSTOPPED(status));
3149 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3150 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3151 	/*
3152 	 * There is no siginfo on SIGSTOP any more, so we can't verify
3153 	 * signal delivery came from parent now (getpid() == info.si_pid).
3154 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3155 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3156 	 */
3157 	EXPECT_EQ(SIGSTOP, info.si_signo);
3158 
3159 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3160 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
3161 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3162 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3163 	ASSERT_EQ(true, WIFSTOPPED(status));
3164 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3165 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3166 
3167 	/* Wait for restart_syscall() to start. */
3168 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3169 	ASSERT_EQ(true, WIFSTOPPED(status));
3170 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3171 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3172 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3173 
3174 	ASSERT_EQ(0x200, msg);
3175 	ret = get_syscall(_metadata, child_pid);
3176 #if defined(__arm__)
3177 	/*
3178 	 * - native ARM registers do NOT expose true syscall.
3179 	 * - compat ARM registers on ARM64 DO expose true syscall.
3180 	 * - values of utsbuf.machine include 'armv8l' or 'armv8b'
3181 	 *   for ARM64 running in compat mode.
3182 	 */
3183 	ASSERT_EQ(0, uname(&utsbuf));
3184 	if ((strncmp(utsbuf.machine, "arm", 3) == 0) &&
3185 	    (strncmp(utsbuf.machine, "armv8l", 6) != 0) &&
3186 	    (strncmp(utsbuf.machine, "armv8b", 6) != 0)) {
3187 		EXPECT_EQ(__NR_nanosleep, ret);
3188 	} else
3189 #endif
3190 	{
3191 		EXPECT_EQ(__NR_restart_syscall, ret);
3192 	}
3193 
3194 	/* Write again to end test. */
3195 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3196 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
3197 	EXPECT_EQ(0, close(pipefd[1]));
3198 
3199 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3200 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
3201 		_metadata->exit_code = KSFT_FAIL;
3202 }
3203 
/*
 * Verify SECCOMP_FILTER_FLAG_LOG handling: it must be rejected in strict
 * mode (EINVAL), accepted in filter mode, and work with a killing filter.
 * The harness expects this test to die with SIGSYS on the final getpid().
 */
TEST_SIGNAL(filter_flag_log, SIGSYS)
{
	/* Filter that allows every syscall. */
	struct sock_filter allow_filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	/* Filter that kills on getpid() and allows everything else. */
	struct sock_filter kill_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog allow_prog = {
		.len = (unsigned short)ARRAY_SIZE(allow_filter),
		.filter = allow_filter,
	};
	struct sock_fprog kill_prog = {
		.len = (unsigned short)ARRAY_SIZE(kill_filter),
		.filter = kill_filter,
	};
	long ret;
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_NE(0, ret) {
		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
	}
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
	}

	/* Verify that a simple, permissive filter can be added with no flags */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
	EXPECT_EQ(0, ret);

	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(EINVAL, errno) {
		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
	}
	EXPECT_EQ(0, ret);

	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &kill_prog);
	EXPECT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
3264 
/*
 * Check SECCOMP_GET_ACTION_AVAIL: every known return action must be
 * reported as available, and an unknown action value must be rejected
 * with EOPNOTSUPP.
 */
TEST(get_action_avail)
{
	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
	__u32 unknown_action = 0x10000000U;	/* not a defined SECCOMP_RET_* value */
	int i;
	long ret;

	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
	/* Distinguish "no seccomp syscall" from "no GET_ACTION_AVAIL op". */
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	ASSERT_NE(EINVAL, errno) {
		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
	}
	EXPECT_EQ(ret, 0);

	for (i = 0; i < ARRAY_SIZE(actions); i++) {
		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
		EXPECT_EQ(ret, 0) {
			TH_LOG("Expected action (0x%X) not available!",
			       actions[i]);
		}
	}

	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
	EXPECT_EQ(ret, -1);
	EXPECT_EQ(errno, EOPNOTSUPP);
}
3296 
/*
 * Check PTRACE_SECCOMP_GET_METADATA: attach to a child carrying two
 * filters (one installed with FILTER_FLAG_LOG, one without) and verify
 * the flags reported for each filter_off index. Requires real root.
 */
TEST(get_metadata)
{
	pid_t pid;
	int pipefd[2];
	char buf;
	struct seccomp_metadata md;
	long ret;

	/* Only real root can get metadata. */
	if (geteuid()) {
		SKIP(return, "get_metadata requires real root");
		return;
	}

	ASSERT_EQ(0, pipe(pipefd));

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		struct sock_filter filter[] = {
			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog prog = {
			.len = (unsigned short)ARRAY_SIZE(filter),
			.filter = filter,
		};

		/* one with log, one without */
		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
				     SECCOMP_FILTER_FLAG_LOG, &prog));
		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));

		/* Tell the parent both filters are installed. */
		EXPECT_EQ(0, close(pipefd[0]));
		ASSERT_EQ(1, write(pipefd[1], "1", 1));
		ASSERT_EQ(0, close(pipefd[1]));

		/* Park until the parent SIGKILLs us. */
		while (1)
			sleep(100);
	}

	ASSERT_EQ(0, close(pipefd[1]));
	ASSERT_EQ(1, read(pipefd[0], &buf, 1));

	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
	ASSERT_EQ(pid, waitpid(pid, NULL, 0));

	/* Past here must not use ASSERT or child process is never killed. */

	/* filter_off 0 is expected to report the LOG-flagged filter. */
	md.filter_off = 0;
	errno = 0;
	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
	EXPECT_EQ(sizeof(md), ret) {
		if (errno == EINVAL)
			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
	}

	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
	EXPECT_EQ(md.filter_off, 0);

	/* The other filter (index 1) was installed without flags. */
	md.filter_off = 1;
	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
	EXPECT_EQ(sizeof(md), ret);
	EXPECT_EQ(md.flags, 0);
	EXPECT_EQ(md.filter_off, 1);

skip:
	ASSERT_EQ(0, kill(pid, SIGKILL));
}
3365 
/*
 * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for
 * syscall @nr and SECCOMP_RET_ALLOW for everything else, using the
 * given seccomp filter @flags. Returns the seccomp(2) result (the
 * listener fd when SECCOMP_FILTER_FLAG_NEW_LISTENER is set).
 */
static int user_notif_syscall(int nr, unsigned int flags)
{
	struct sock_filter insns[] = {
		/* A = seccomp_data->nr */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		/* if (A != nr) skip the USER_NOTIF return */
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.len = (unsigned short)ARRAY_SIZE(insns),
		.filter = insns,
	};

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &fprog);
}
3383 
/* Magic value the supervisor injects as the intercepted syscall's result. */
#define USER_NOTIF_MAGIC INT_MAX
/*
 * Exercise the basic SECCOMP_RET_USER_NOTIF machinery: ENOSYS without a
 * listener, EBUSY when stacking a second listener, poll()-ability of the
 * listener fd, RECV/SEND argument validation, and injecting a return
 * value into the tracee's getppid().
 */
TEST(user_notification_basic)
{
	pid_t pid;
	long ret;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	struct pollfd pollfd;

	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	pid = fork();
	ASSERT_GE(pid, 0);

	/* Check that we get -ENOSYS with no listener attached */
	if (pid == 0) {
		if (user_notif_syscall(__NR_getppid, 0) < 0)
			exit(1);
		ret = syscall(__NR_getppid);
		exit(ret >= 0 || errno != ENOSYS);
	}

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* Add some no-op filters for grins. */
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);

	/* Check that the basic notification machinery works */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/* Installing a second listener in the chain should EBUSY */
	EXPECT_EQ(user_notif_syscall(__NR_getppid,
				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
		  -1);
	EXPECT_EQ(errno, EBUSY);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		/* Blocks in getppid() until the parent responds. */
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/* A pending notification makes the listener readable. */
	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLIN);

	/* Test that we can't pass garbage to the kernel. */
	memset(&req, 0, sizeof(req));
	req.pid = -1;
	errno = 0;
	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);

	if (ret) {
		/* Retry the receive with a sane (zeroed) request. */
		req.pid = 0;
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	}

	/* After RECV, the listener polls writable (a response is wanted). */
	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLOUT);

	EXPECT_EQ(req.data.nr,  __NR_getppid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	/* check that we make sure flags == 0 */
	resp.flags = 1;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.flags = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3490 
/*
 * NEW_LISTENER and TSYNC used to be mutually exclusive (EINVAL); with
 * the TSYNC_ESRCH flag they may be combined. Verify both behaviors.
 */
TEST(user_notification_with_tsync)
{
	int ret;
	unsigned int flags;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* these were exclusive */
	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
		SECCOMP_FILTER_FLAG_TSYNC;
	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
	ASSERT_EQ(EINVAL, errno);

	/* but now they're not */
	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
	ret = user_notif_syscall(__NR_getppid, flags);
	close(ret);	/* close before asserting so the fd never leaks; close(-1) is harmless */
	ASSERT_LE(0, ret);
}
3513 
/*
 * Check that SIGKILLing the tracee while its syscall is waiting on the
 * listener invalidates the notification: ID_VALID starts failing and a
 * SEND for the dead request returns ENOENT.
 */
TEST(user_notification_kill_in_middle)
{
	pid_t pid;
	long ret;
	int listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/*
	 * Check that nothing bad happens when we kill the task in the middle
	 * of a syscall.
	 */
	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/* While the child is blocked, its notification ID is valid. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);

	EXPECT_EQ(kill(pid, SIGKILL), 0);
	EXPECT_EQ(waitpid(pid, NULL, 0), pid);

	/* Once the task is gone, the ID must no longer validate. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);

	/* Responding to the dead notification must fail with ENOENT. */
	resp.id = req.id;
	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
	EXPECT_EQ(ret, -1);
	EXPECT_EQ(errno, ENOENT);
}
3556 
/* Write end of a socketpair used to report that the handler actually ran. */
static int handled = -1;

/* Signal handler: confirm delivery by writing one byte to @handled. */
static void signal_handler(int signal)
{
	if (write(handled, "c", 1) != 1)
		perror("write from signal");
}
3564 
/* Handler that does nothing; used where only the interruption matters. */
static void signal_handler_nop(int signal)
{
}
3568 
/*
 * Check signal delivery to a task blocked in a user notification: a
 * non-fatal signal aborts the pending notification (SEND -> ENOENT),
 * the restarted syscall generates a fresh notification, and a -512
 * (-ERESTARTSYS) response propagates that error code to the tracee.
 */
TEST(user_notification_signal)
{
	pid_t pid;
	long ret;
	int status, listener, sk_pair[2];
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	char c;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Socketpair lets the child's signal handler report delivery. */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(__NR_gettid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		handled = sk_pair[1];
		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
			perror("signal");
			exit(1);
		}
		/*
		 * ERESTARTSYS behavior is a bit hard to test, because we need
		 * to rely on a signal that has not yet been handled. Let's at
		 * least check that the error code gets propagated through, and
		 * hope that it doesn't break when there is actually a signal :)
		 */
		ret = syscall(__NR_gettid);
		exit(!(ret == -1 && errno == 512));	/* 512 == ERESTARTSYS */
	}

	close(sk_pair[1]);

	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/*
	 * Make sure the signal really is delivered, which means we're not
	 * stuck in the user notification code any more and the notification
	 * should be dead.
	 */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);

	resp.id = req.id;
	resp.error = -EPERM;
	resp.val = 0;

	/* The first notification died with the signal. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, ENOENT);

	/* The restarted syscall produces a new notification. */
	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	resp.id = req.id;
	resp.error = -512; /* -ERESTARTSYS */
	resp.val = 0;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3643 
3644 TEST(user_notification_closed_listener)
3645 {
3646 	pid_t pid;
3647 	long ret;
3648 	int status, listener;
3649 
3650 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3651 	ASSERT_EQ(0, ret) {
3652 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3653 	}
3654 
3655 	listener = user_notif_syscall(__NR_getppid,
3656 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3657 	ASSERT_GE(listener, 0);
3658 
3659 	/*
3660 	 * Check that we get an ENOSYS when the listener is closed.
3661 	 */
3662 	pid = fork();
3663 	ASSERT_GE(pid, 0);
3664 	if (pid == 0) {
3665 		close(listener);
3666 		ret = syscall(__NR_getppid);
3667 		exit(ret != -1 && errno != ENOSYS);
3668 	}
3669 
3670 	close(listener);
3671 
3672 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3673 	EXPECT_EQ(true, WIFEXITED(status));
3674 	EXPECT_EQ(0, WEXITSTATUS(status));
3675 }
3676 
/*
 * Check that a pid in a child namespace still shows up as valid in ours.
 */
TEST(user_notification_child_pid_ns)
{
	pid_t pid;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	/* A fresh user ns grants the privilege needed for CLONE_NEWPID. */
	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
		if (errno == EINVAL)
			SKIP(return, "kernel missing CLONE_NEWUSER support");
	};

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

	/* The child's pid must be translated into our pid namespace. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(req.pid, pid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
	close(listener);
}
3716 
/*
 * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
 * invalid.
 */
TEST(user_notification_sibling_pid_ns)
{
	pid_t pid, pid2;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/* First child: enters its own pid ns and forks the filtered task. */
	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
			if (errno == EPERM)
				SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
			else if (errno == EINVAL)
				SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
		}

		pid2 = fork();
		ASSERT_GE(pid2, 0);

		if (pid2 == 0)
			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
		EXPECT_EQ(true, WIFEXITED(status));
		EXPECT_EQ(0, WEXITSTATUS(status));
		exit(WEXITSTATUS(status));
	}

	/* Create the sibling ns, and sibling in it. */
	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
		if (errno == EPERM)
			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
		else if (errno == EINVAL)
			SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
	}
	ASSERT_EQ(errno, 0);

	/* Second child: the supervisor, living in the sibling pid ns. */
	pid2 = fork();
	ASSERT_GE(pid2, 0);

	if (pid2 == 0) {
		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
		/*
		 * The pid should be 0, i.e. the task is in some namespace that
		 * we can't "see".
		 */
		EXPECT_EQ(req.pid, 0);

		resp.id = req.id;
		resp.error = 0;
		resp.val = USER_NOTIF_MAGIC;

		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
		exit(0);
	}

	close(listener);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3797 
/*
 * Check that a faulting SECCOMP_IOCTL_NOTIF_RECV (NULL buffer -> EFAULT)
 * does not consume or corrupt the pending notification.
 */
TEST(user_notification_fault_recv)
{
	pid_t pid;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
		if (errno == EINVAL)
			SKIP(return, "kernel missing CLONE_NEWUSER support");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

	/* Do a bad recv() */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
	EXPECT_EQ(errno, EFAULT);

	/* We should still be able to receive this notification, though. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(req.pid, pid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3838 
/*
 * Check that SECCOMP_GET_NOTIF_SIZES reports sizes matching the UAPI
 * notification structs this test was built against.
 */
TEST(seccomp_get_notif_sizes)
{
	struct seccomp_notif_sizes sizes;

	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
}
3847 
/*
 * Check SECCOMP_USER_NOTIF_FLAG_CONTINUE: a CONTINUE response must carry
 * zero error/val (EINVAL otherwise), and on success the intercepted
 * dup() actually executes in the tracee.
 */
TEST(user_notification_continue)
{
	pid_t pid;
	long ret;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	struct pollfd pollfd;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		int dup_fd, pipe_fds[2];
		pid_t self;

		ASSERT_GE(pipe(pipe_fds), 0);

		/* Only valid if the continued dup() really executed. */
		dup_fd = dup(pipe_fds[0]);
		ASSERT_GE(dup_fd, 0);
		EXPECT_NE(pipe_fds[0], dup_fd);

		/* Prove both fds refer to the same open file description. */
		self = getpid();
		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
		exit(0);
	}

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLIN);

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLOUT);

	EXPECT_EQ(req.data.nr, __NR_dup);

	resp.id = req.id;
	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;

	/*
	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
	 * args be set to 0.
	 */
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.error = USER_NOTIF_MAGIC;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.error = 0;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
		if (errno == EINVAL)
			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
	}

skip:
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status)) {
		/* Exit status 2 presumably comes from the filecmp() helper
		 * when kcmp() is unavailable — verify against the helper. */
		if (WEXITSTATUS(status) == 2) {
			SKIP(return, "Kernel does not support kcmp() syscall");
			return;
		}
	}
}
3933 
/*
 * Check that a listener fd reports POLLHUP once the last task using its
 * filter has exited: a CLONE_FILES child installs the filter and parks
 * the listener at fd 200 in our shared fd table, then dies.
 */
TEST(user_notification_filter_empty)
{
	pid_t pid;
	long ret;
	int status;
	struct pollfd pollfd;
	struct __clone_args args = {
		.flags = CLONE_FILES,	/* share the fd table so fd 200 outlives the child */
		.exit_signal = SIGCHLD,
	};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	if (__NR_clone3 < 0)
		SKIP(return, "Test not built with clone3 support");

	pid = sys_clone3(&args, sizeof(args));
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		int listener;

		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0)
			_exit(EXIT_FAILURE);

		/* Park the listener at a well-known fd for the parent. */
		if (dup2(listener, 200) != 200)
			_exit(EXIT_FAILURE);

		close(listener);

		_exit(EXIT_SUCCESS);
	}

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/*
	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up task struct.
	 */
	pollfd.fd = 200;
	pollfd.events = POLLHUP;

	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
}
3985 
/*
 * Like user_notification_filter_empty, but check that NOTIF_RECV on the
 * orphaned listener fails with ENOENT instead of blocking forever.
 */
TEST(user_ioctl_notification_filter_empty)
{
	pid_t pid;
	long ret;
	int status, p[2];
	struct __clone_args args = {
		.flags = CLONE_FILES,	/* share the fd table so fd 200 outlives the child */
		.exit_signal = SIGCHLD,
	};
	struct seccomp_notif req = {};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	if (__NR_clone3 < 0)
		SKIP(return, "Test not built with clone3 support");

	ASSERT_EQ(0, pipe(p));

	pid = sys_clone3(&args, sizeof(args));
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		int listener;

		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0)
			_exit(EXIT_FAILURE);

		if (dup2(listener, 200) != 200)
			_exit(EXIT_FAILURE);
		/* With CLONE_FILES this close drops p[1] for both tasks. */
		close(p[1]);
		close(listener);
		sleep(1);

		_exit(EXIT_SUCCESS);
	}
	/* EOF on the pipe means the child has set up fd 200. */
	if (read(p[0], &status, 1) != 0)
		_exit(EXIT_SUCCESS);
	close(p[0]);
	/*
	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up task struct.
	 */
	EXPECT_EQ(ioctl(200, SECCOMP_IOCTL_NOTIF_RECV, &req), -1);
	EXPECT_EQ(errno, ENOENT);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
4039 
/* Minimal pthread entry point: ignores its argument and returns NULL. */
static void *do_thread(void *data)
{
	(void)data;	/* unused */
	return NULL;
}
4044 
/*
 * Check that the listener still gets POLLHUP when the filter task had
 * forked children and joined threads: none of them may keep the filter
 * alive after the clone3'd child and its descendants exit.
 */
TEST(user_notification_filter_empty_threaded)
{
	pid_t pid;
	long ret;
	int status;
	struct pollfd pollfd;
	struct __clone_args args = {
		.flags = CLONE_FILES,	/* share the fd table so fd 200 outlives the child */
		.exit_signal = SIGCHLD,
	};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	if (__NR_clone3 < 0)
		SKIP(return, "Test not built with clone3 support");

	pid = sys_clone3(&args, sizeof(args));
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		pid_t pid1, pid2;
		int listener, status;
		pthread_t thread;

		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0)
			_exit(EXIT_FAILURE);

		/* Park the listener at a well-known fd for the parent. */
		if (dup2(listener, 200) != 200)
			_exit(EXIT_FAILURE);

		close(listener);

		/* Two forked children inherit the filter and exit at once. */
		pid1 = fork();
		if (pid1 < 0)
			_exit(EXIT_FAILURE);

		if (pid1 == 0)
			_exit(EXIT_SUCCESS);

		pid2 = fork();
		if (pid2 < 0)
			_exit(EXIT_FAILURE);

		if (pid2 == 0)
			_exit(EXIT_SUCCESS);

		/* Two short-lived threads also share the filter. */
		if (pthread_create(&thread, NULL, do_thread, NULL) ||
		    pthread_join(thread, NULL))
			_exit(EXIT_FAILURE);

		if (pthread_create(&thread, NULL, do_thread, NULL) ||
		    pthread_join(thread, NULL))
			_exit(EXIT_FAILURE);

		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
		    WEXITSTATUS(status))
			_exit(EXIT_FAILURE);

		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
		    WEXITSTATUS(status))
			_exit(EXIT_FAILURE);

		exit(EXIT_SUCCESS);
	}

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/*
	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up task struct.
	 */
	pollfd.fd = 200;
	pollfd.events = POLLHUP;

	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
}
4128 
4129 
/*
 * Return the first file descriptor number after @prev_fd that is not
 * currently open (probed via fcntl(F_GETFD)). Exits the process with
 * failure if every fd up to FD_SETSIZE is in use.
 */
int get_next_fd(int prev_fd)
{
	int fd = prev_fd;

	while (++fd < FD_SETSIZE) {
		if (fcntl(fd, F_GETFD) < 0)
			return fd;
	}
	_exit(EXIT_FAILURE);
}
4138 
/*
 * Exercise SECCOMP_IOCTL_NOTIF_ADDFD: while the target is blocked in a
 * filtered getppid(), the supervisor injects fds into it, covering
 * invalid flag/size rejection, explicit fd placement (SETFD), struct
 * size compatibility (SMALL/BIG variants), and atomic addfd+send.
 */
TEST(user_notification_addfd)
{
	pid_t pid;
	long ret;
	int status, listener, memfd, fd, nextfd;
	struct seccomp_notif_addfd addfd = {};
	struct seccomp_notif_addfd_small small = {};
	struct seccomp_notif_addfd_big big = {};
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	/* There may be arbitrary already-open fds at test start. */
	memfd = memfd_create("test", 0);
	ASSERT_GE(memfd, 0);
	nextfd = get_next_fd(memfd);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* fd: 4 */
	/* Check that the basic notification machinery works */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	/* Listener must land on the predicted next free fd for the
	 * fd-numbering assertions below to hold. */
	ASSERT_EQ(listener, nextfd);
	nextfd = get_next_fd(nextfd);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		/* fds will be added and this value is expected */
		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
			exit(1);

		/* Atomic addfd+send is received here. Check it is a valid fd */
		if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
			exit(1);

		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
	}

	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	addfd.srcfd = memfd;
	addfd.newfd = 0;
	addfd.id = req.id;
	addfd.flags = 0x0;

	/* Verify bad newfd_flags cannot be set */
	addfd.newfd_flags = ~O_CLOEXEC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EINVAL);
	addfd.newfd_flags = O_CLOEXEC;

	/* Verify bad flags cannot be set */
	addfd.flags = 0xff;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EINVAL);
	addfd.flags = 0;

	/* Verify that remote_fd cannot be set without setting flags */
	addfd.newfd = 1;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EINVAL);
	addfd.newfd = 0;

	/* Verify small size cannot be set */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
	EXPECT_EQ(errno, EINVAL);

	/* Verify we can't send bits filled in unknown buffer area */
	memset(&big, 0xAA, sizeof(big));
	big.addfd = addfd;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
	EXPECT_EQ(errno, E2BIG);


	/* Verify we can set an arbitrary remote fd */
	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	EXPECT_EQ(fd, nextfd);
	nextfd = get_next_fd(nextfd);
	/* filecmp: both fds must refer to the same struct file */
	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);

	/* Verify we can set an arbitrary remote fd with large size */
	memset(&big, 0x0, sizeof(big));
	big.addfd = addfd;
	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
	EXPECT_EQ(fd, nextfd);
	nextfd = get_next_fd(nextfd);

	/* Verify we can set a specific remote fd */
	addfd.newfd = 42;
	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	EXPECT_EQ(fd, 42);
	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);

	/* Resume syscall */
	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/*
	 * This sets the ID of the ADD FD to the last request plus 1. The
	 * notification ID increments 1 per notification.
	 */
	addfd.id = req.id + 1;

	/* This spins until the underlying notification is generated */
	/*
	 * NOTE(review): errno is positive on failure, so the comparison
	 * against -EINPROGRESS can never be false on error — confirm the
	 * intended exit condition of this spin loop.
	 */
	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
	       errno != -EINPROGRESS)
		nanosleep(&delay, NULL);

	memset(&req, 0, sizeof(req));
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	ASSERT_EQ(addfd.id, req.id);

	/* Verify we can do an atomic addfd and send */
	addfd.newfd = 0;
	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	/*
	 * Child has earlier "low" fds and now 42, so we expect the next
	 * lowest available fd to be assigned here.
	 */
	EXPECT_EQ(fd, nextfd);
	nextfd = get_next_fd(nextfd);
	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);

	/*
	 * This sets the ID of the ADD FD to the last request plus 1. The
	 * notification ID increments 1 per notification.
	 */
	addfd.id = req.id + 1;

	/* This spins until the underlying notification is generated */
	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
	       errno != -EINPROGRESS)
		nanosleep(&delay, NULL);

	memset(&req, 0, sizeof(req));
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	ASSERT_EQ(addfd.id, req.id);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/* Wait for child to finish. */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	close(memfd);
}
4300 
/*
 * Verify ADDFD respects the target's fd limits: with the child's
 * RLIMIT_NOFILE forced to 0, fd injection must fail with EMFILE (or
 * EBADF when SETFD targets an fd past the limit) rather than succeed.
 */
TEST(user_notification_addfd_rlimit)
{
	pid_t pid;
	long ret;
	int status, listener, memfd;
	struct seccomp_notif_addfd addfd = {};
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	/* Zero fd limit applied to the child below */
	const struct rlimit lim = {
		.rlim_cur	= 0,
		.rlim_max	= 0,
	};

	memfd = memfd_create("test", 0);
	ASSERT_GE(memfd, 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Check that the basic notification machinery works */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);


	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	/* Clamp the child's fd limit while it is blocked in the notifier */
	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);

	addfd.srcfd = memfd;
	addfd.newfd_flags = O_CLOEXEC;
	addfd.newfd = 0;
	addfd.id = req.id;
	addfd.flags = 0;

	/* Should probably spot check /proc/sys/fs/file-nr */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EMFILE);

	/* Atomic addfd+send hits the same limit */
	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EMFILE);

	/* SETFD to a specific slot beyond the limit fails with EBADF */
	addfd.newfd = 100;
	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EBADF);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/* Wait for child to finish. */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	close(memfd);
}
4370 
4371 #ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
4372 #define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
4373 #define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
4374 #endif
4375 
/*
 * Exercise SECCOMP_IOCTL_NOTIF_SET_FLAGS with
 * SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP: invalid flag bits must be
 * rejected with EINVAL, and with the flag set a normal notify
 * round-trip (recv -> send) still completes.
 */
TEST(user_notification_sync)
{
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	int status, listener;
	pid_t pid;
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/* Try to set invalid flags. */
	EXPECT_SYSCALL_RETURN(-EINVAL,
		ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));

	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
			SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		ret = syscall(__NR_getppid);
		ASSERT_EQ(ret, USER_NOTIF_MAGIC) {
			_exit(1);
		}
		_exit(0);
	}

	req.pid = 0;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	ASSERT_EQ(req.data.nr,  __NR_getppid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	resp.flags = 0;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	ASSERT_EQ(waitpid(pid, &status, 0), pid);
	ASSERT_EQ(status, 0);
}
4424 
4425 
4426 /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
FIXTURE(O_SUSPEND_SECCOMP) {
	pid_t pid;	/* seccomp-confined child to trace; 0 until forked */
};
4430 
/*
 * Drop CAP_SYS_ADMIN, install a read-blocking seccomp filter on this
 * (parent) process, then fork a child that idles in pause() — the child
 * inherits the filter and becomes the tracee for the TEST_Fs below.
 */
FIXTURE_SETUP(O_SUSPEND_SECCOMP)
{
	/* Declares prog_block_read: filter returning ERRNO(E2BIG) for read */
	ERRNO_FILTER(block_read, E2BIG);
	cap_value_t cap_list[] = { CAP_SYS_ADMIN };
	cap_t caps;

	self->pid = 0;

	/* make sure we don't have CAP_SYS_ADMIN */
	caps = cap_get_proc();
	ASSERT_NE(NULL, caps);
	ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
	ASSERT_EQ(0, cap_set_proc(caps));
	cap_free(caps);

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
	ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));

	self->pid = fork();
	ASSERT_GE(self->pid, 0);

	if (self->pid == 0) {
		/* Idle until the fixture teardown kills us. */
		while (1)
			pause();
		_exit(127);
	}
}
4458 
/* Reap the idle child created in setup (if the fork succeeded). */
FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
{
	if (self->pid)
		kill(self->pid, SIGKILL);
}
4464 
/*
 * Without CAP_SYS_ADMIN, PTRACE_SETOPTIONS with PTRACE_O_SUSPEND_SECCOMP
 * must fail with EPERM (EINVAL means the kernel lacks the feature).
 */
TEST_F(O_SUSPEND_SECCOMP, setoptions)
{
	int wstatus;

	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
	/* Wait for the attach-stop before setting options */
	ASSERT_EQ(self->pid, wait(&wstatus));
	ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
	if (errno == EINVAL)
		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
	ASSERT_EQ(EPERM, errno);
}
4476 
/* Same privilege check as setoptions, via the PTRACE_SEIZE path. */
TEST_F(O_SUSPEND_SECCOMP, seize)
{
	int ret;

	ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
	ASSERT_EQ(-1, ret);
	if (errno == EINVAL)
		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
	ASSERT_EQ(EPERM, errno);
}
4487 
4488 /*
4489  * get_nth - Get the nth, space separated entry in a file.
4490  *
4491  * Returns the length of the read field.
4492  * Throws error if field is zero-lengthed.
4493  */
4494 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4495 		     const unsigned int position, char **entry)
4496 {
4497 	char *line = NULL;
4498 	unsigned int i;
4499 	ssize_t nread;
4500 	size_t len = 0;
4501 	FILE *f;
4502 
4503 	f = fopen(path, "r");
4504 	ASSERT_NE(f, NULL) {
4505 		TH_LOG("Could not open %s: %s", path, strerror(errno));
4506 	}
4507 
4508 	for (i = 0; i < position; i++) {
4509 		nread = getdelim(&line, &len, ' ', f);
4510 		ASSERT_GE(nread, 0) {
4511 			TH_LOG("Failed to read %d entry in file %s", i, path);
4512 		}
4513 	}
4514 	fclose(f);
4515 
4516 	ASSERT_GT(nread, 0) {
4517 		TH_LOG("Entry in file %s had zero length", path);
4518 	}
4519 
4520 	*entry = line;
4521 	return nread - 1;
4522 }
4523 
4524 /* For a given PID, get the task state (D, R, etc...) */
/* For a given PID, get the task state (D, R, etc...) */
static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
{
	char path[100] = {0};
	char *field;
	char state;

	/* Field 3 of /proc/<pid>/stat is the one-character state */
	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
	ASSERT_EQ(get_nth(_metadata, path, 3, &field), 1);

	state = *field;
	free(field);

	return state;
}
4539 
/*
 * Verify notifications are delivered in FIFO order: ids increase by one
 * per event and RECV returns pending notifications in the order the
 * children entered the filtered syscall.
 */
TEST(user_notification_fifo)
{
	struct seccomp_notif_resp resp = {};
	struct seccomp_notif req = {};
	int i, status, listener;
	pid_t pid, pids[3];
	__u64 baseid;
	long ret;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Setup a listener */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	/* Ids increment by one; predict the ids of the children below */
	baseid = req.id + 1;

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	/* check that we make sure flags == 0 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* Start children, and generate notifications */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		pid = fork();
		if (pid == 0) {
			ret = syscall(__NR_getppid);
			exit(ret != USER_NOTIF_MAGIC);
		}
		pids[i] = pid;
	}

	/* This spins until all of the children are sleeping */
restart_wait:
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		if (get_proc_stat(_metadata, pids[i]) != 'S') {
			nanosleep(&delay, NULL);
			goto restart_wait;
		}
	}

	/* Read the notifications in order (and respond) */
	/* resp.error/val persist from the assignment above */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		memset(&req, 0, sizeof(req));
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
		EXPECT_EQ(req.id, baseid + i);
		resp.id = req.id;
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	}

	/* Make sure notifications were received */
	for (i = 0; i < ARRAY_SIZE(pids); i++) {
		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
		EXPECT_EQ(true, WIFEXITED(status));
		EXPECT_EQ(0, WEXITSTATUS(status));
	}
}
4618 
4619 /* get_proc_syscall - Get the syscall in progress for a given pid
4620  *
4621  * Returns the current syscall number for a given process
4622  * Returns -1 if not in syscall (running or blocked)
4623  */
static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
{
	char proc_path[100] = {0};
	long ret = -1;
	ssize_t nread;
	char *line;

	/* First field of /proc/<pid>/syscall is the syscall number */
	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
	nread = get_nth(_metadata, proc_path, 1, &line);
	ASSERT_GT(nread, 0);

	/*
	 * The field reads "running" when the task is not inside a syscall;
	 * only parse a number when it differs. The previous "!strncmp"
	 * inverted this test, returning -1 for tasks blocked in a syscall
	 * (contradicting the documented contract above).
	 */
	if (strncmp("running", line, MIN(7, nread)))
		ret = strtol(line, NULL, 16);

	free(line);
	return ret;
}
4641 
4642 /* Ensure non-fatal signals prior to receive are unmodified */
/* Ensure non-fatal signals prior to receive are unmodified */
TEST(user_notification_wait_killable_pre_notification)
{
	struct sigaction new_action = {
		.sa_handler = signal_handler,
	};
	int listener, status, sk_pair[2];
	pid_t pid;
	long ret;
	char c;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* sk_pair lets the child's signal handler report back (via "handled") */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	/*
	 * Check that we can kill the process with SIGUSR1 prior to receiving
	 * the notification. SIGUSR1 is wired up to a custom signal handler,
	 * and make sure it gets called.
	 */
	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		handled = sk_pair[1];

		/* Setup the non-fatal sigaction without SA_RESTART */
		if (sigaction(SIGUSR1, &new_action, NULL)) {
			perror("sigaction");
			exit(1);
		}

		ret = syscall(__NR_getppid);
		/* Make sure we got a return from a signal interruption */
		exit(ret != -1 || errno != EINTR);
	}

	/*
	 * Make sure we've gotten to the seccomp user notification wait
	 * from getppid prior to sending any signals
	 */
	while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
	       get_proc_stat(_metadata, pid) != 'S')
		nanosleep(&delay, NULL);

	/* Send non-fatal kill signal */
	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/* wait for process to exit (exit checks for EINTR) */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* One byte written by the handler proves it actually ran */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
}
4711 
4712 /* Ensure non-fatal signals after receive are blocked */
/* Ensure non-fatal signals after receive are blocked */
TEST(user_notification_wait_killable)
{
	struct sigaction new_action = {
		.sa_handler = signal_handler,
	};
	struct seccomp_notif_resp resp = {};
	struct seccomp_notif req = {};
	int listener, status, sk_pair[2];
	pid_t pid;
	long ret;
	char c;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		handled = sk_pair[1];

		/* Setup the sigaction without SA_RESTART */
		if (sigaction(SIGUSR1, &new_action, NULL)) {
			perror("sigaction");
			exit(1);
		}

		/* Make sure that the syscall is completed (no EINTR) */
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/*
	 * Get the notification, to make move the notifying process into a
	 * non-preemptible (TASK_KILLABLE) state.
	 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	/* Send non-fatal kill signal */
	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/*
	 * Make sure the task enters moves to TASK_KILLABLE by waiting for
	 * D (Disk Sleep) state after receiving non-fatal signal.
	 */
	while (get_proc_stat(_metadata, pid) != 'D')
		nanosleep(&delay, NULL);

	resp.id = req.id;
	resp.val = USER_NOTIF_MAGIC;
	/* Make sure the notification is found and able to be replied to */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/*
	 * Make sure that the signal handler does get called once we're back in
	 * userspace.
	 */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
	/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
4790 
4791 /* Ensure fatal signals after receive are not blocked */
/* Ensure fatal signals after receive are not blocked */
TEST(user_notification_wait_killable_fatal)
{
	struct seccomp_notif req = {};
	int listener, status;
	pid_t pid;
	long ret;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		/* This should never complete as it should get a SIGTERM */
		syscall(__NR_getppid);
		exit(1);
	}

	/* Wait until the child is blocked (interruptible sleep) */
	while (get_proc_stat(_metadata, pid) != 'S')
		nanosleep(&delay, NULL);

	/*
	 * Get the notification, to make move the notifying process into a
	 * non-preemptible (TASK_KILLABLE) state.
	 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	/* Kill the process with a fatal signal */
	EXPECT_EQ(kill(pid, SIGTERM), 0);

	/*
	 * Wait for the process to exit, and make sure the process terminated
	 * due to the SIGTERM signal.
	 */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFSIGNALED(status));
	EXPECT_EQ(SIGTERM, WTERMSIG(status));
}
4840 
4841 /* Ensure signals after the reply do not interrupt */
/* Ensure signals after the reply do not interrupt */
TEST(user_notification_wait_killable_after_reply)
{
	int i, max_iter = 100000;
	int listener, status;
	int pipe_fds[2];
	pid_t pid;
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(
		__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER |
			  SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	/*
	 * Used to count invocations. One token is transferred from the child
	 * to the parent per syscall invocation, the parent tries to take
	 * one token per successful RECV. If the syscall is restarted after
	 * RECV the parent will try to get two tokens while the child only
	 * provided one.
	 */
	ASSERT_EQ(pipe(pipe_fds), 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		struct sigaction new_action = {
			.sa_handler = signal_handler_nop,
			.sa_flags = SA_RESTART,
		};
		/* 1ms repeating timer: hammer the child with SIGALRM */
		struct itimerval timer = {
			.it_value = { .tv_usec = 1000 },
			.it_interval = { .tv_usec = 1000 },
		};
		char c = 'a';

		close(pipe_fds[0]);

		/* Setup the sigaction with SA_RESTART */
		if (sigaction(SIGALRM, &new_action, NULL)) {
			perror("sigaction");
			exit(1);
		}

		/*
		 * Kill with SIGALRM repeatedly, to try to hit the race when
		 * handling the syscall.
		 */
		if (setitimer(ITIMER_REAL, &timer, NULL) < 0)
			perror("setitimer");

		for (i = 0; i < max_iter; ++i) {
			int fd;

			/* Send one token per iteration to catch repeats. */
			if (write(pipe_fds[1], &c, sizeof(c)) != 1) {
				perror("write");
				exit(1);
			}

			/* dup is the filtered syscall that notifies */
			fd = syscall(__NR_dup, 0);
			if (fd < 0) {
				perror("dup");
				exit(1);
			}
			close(fd);
		}

		exit(0);
	}

	close(pipe_fds[1]);

	for (i = 0; i < max_iter; ++i) {
		struct seccomp_notif req = {};
		struct seccomp_notif_addfd addfd = {};
		struct pollfd pfd = {
			.fd = pipe_fds[0],
			.events = POLLIN,
		};
		char c;

		/*
		 * Try to receive one token. If it failed, one child syscall
		 * was restarted after RECV and needed to be handled twice.
		 */
		ASSERT_EQ(poll(&pfd, 1, 1000), 1)
			kill(pid, SIGKILL);

		ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1)
			kill(pid, SIGKILL);

		/*
		 * Get the notification, reply to it as fast as possible to test
		 * whether the child wrongly skips going into the non-preemptible
		 * (TASK_KILLABLE) state.
		 */
		do
			ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
		while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */
		ASSERT_EQ(ret, 0)
			kill(pid, SIGKILL);

		/* Atomic addfd+send doubles as the dup() reply */
		addfd.id = req.id;
		addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
		addfd.srcfd = 0;
		ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0)
			kill(pid, SIGKILL);
	}

	/*
	 * Wait for the process to exit, and make sure the process terminated
	 * with a zero exit code..
	 */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
4966 
/* Handed to the sibling thread: the thread leader it must join first. */
struct tsync_vs_thread_leader_args {
	pthread_t leader;
};
4970 
/*
 * Sibling thread body: join the (exiting) leader, then install an
 * allow-all filter with TSYNC. Distinct exit codes identify which step
 * failed; exit(0) reports success for the whole child process.
 */
static void *tsync_vs_dead_thread_leader_sibling(void *_args)
{
	struct sock_filter allow_filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog allow_prog = {
		.len = (unsigned short)ARRAY_SIZE(allow_filter),
		.filter = allow_filter,
	};
	struct tsync_vs_thread_leader_args *args = _args;
	void *retval;
	long ret;

	ret = pthread_join(args->leader, &retval);
	if (ret)
		exit(1);
	/* Leader passes its args pointer back via pthread_exit() */
	if (retval != _args)
		exit(2);
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &allow_prog);
	if (ret)
		exit(3);

	exit(0);
}
4995 
4996 /*
4997  * Ensure that a dead thread leader doesn't prevent installing new filters with
4998  * SECCOMP_FILTER_FLAG_TSYNC from other threads.
4999  */
TEST(tsync_vs_dead_thread_leader)
{
	int status;
	pid_t pid;
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		struct sock_filter allow_filter[] = {
			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog allow_prog = {
			.len = (unsigned short)ARRAY_SIZE(allow_filter),
			.filter = allow_filter,
		};
		struct  tsync_vs_thread_leader_args *args;
		pthread_t sibling;

		/* Heap-allocated: must outlive this (leader) thread */
		args = malloc(sizeof(*args));
		ASSERT_NE(NULL, args);
		args->leader = pthread_self();

		ret = pthread_create(&sibling, NULL,
				     tsync_vs_dead_thread_leader_sibling, args);
		ASSERT_EQ(0, ret);

		/* Install a new filter just to the leader thread. */
		ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
		ASSERT_EQ(0, ret);
		/* Leader exits; sibling's TSYNC must still succeed */
		pthread_exit(args);
		exit(1);
	}

	/* status == 0 requires the sibling's exit(0): TSYNC worked */
	EXPECT_EQ(pid, waitpid(pid, &status, 0));
	EXPECT_EQ(0, status);
}
5043 
5044 #ifdef __x86_64__
5045 
5046 /*
5047  * We need naked probed_uprobe function. Using __nocf_check
5048  * check to skip possible endbr64 instruction and ignoring
5049  * -Wattributes, otherwise the compilation might fail.
5050  */
5051 #pragma GCC diagnostic push
5052 #pragma GCC diagnostic ignored "-Wattributes"
5053 
__naked __nocf_check noinline int probed_uprobe(void)
{
	/*
	 * Optimized uprobe is possible only on top of nop5 instruction.
	 * The .byte sequence is the 5-byte nop (0f 1f 44 00 00); the
	 * explicit ret is required because the function is naked (no
	 * compiler-generated prologue/epilogue).
	 */
	asm volatile ("                                 \n"
		".byte 0x0f, 0x1f, 0x44, 0x00, 0x00     \n"
		"ret                                    \n"
	);
}
5064 #pragma GCC diagnostic pop
5065 
5066 #else
/* Non-x86_64 fallback: plain probe target (no nop5 layout needed). */
noinline int probed_uprobe(void)
{
	return 1;
}
5071 #endif
5072 
/* Target for uretprobe attachment; noinline keeps a stable address. */
noinline int probed_uretprobe(void)
{
	return 1;
}
5077 
/*
 * Scan a single integer out of @file with scanf format @fmt.
 * Returns the value on success, otherwise fscanf's result (0 or EOF),
 * or -1 when the file cannot be opened.
 */
static int parse_uint_from_file(const char *file, const char *fmt)
{
	FILE *f = fopen(file, "re");
	int value, matched;

	if (!f)
		return -1;

	matched = fscanf(f, fmt, &value);
	fclose(f);

	return matched == 1 ? value : matched;
}
5090 
/* Dynamic PMU type id registered for the uprobe event source, or <0. */
static int determine_uprobe_perf_type(void)
{
	return parse_uint_from_file(
		"/sys/bus/event_source/devices/uprobe/type", "%d\n");
}
5097 
/* Bit position of the retprobe flag in perf_event_attr.config, or <0. */
static int determine_uprobe_retprobe_bit(void)
{
	return parse_uint_from_file(
		"/sys/bus/event_source/devices/uprobe/format/retprobe",
		"config:%d\n");
}
5104 
/*
 * Translate virtual address @addr into an offset within the mapped file
 * by scanning /proc/self/maps for the executable mapping containing it.
 * Returns the file offset, or -1 if no executable mapping matches.
 */
static ssize_t get_uprobe_offset(const void *addr)
{
	size_t start, base, end;
	bool found = false;
	char buf[256];
	FILE *f;

	f = fopen("/proc/self/maps", "r");
	if (!f)
		return -1;

	/*
	 * Width-limit the permissions token to the buffer size; an
	 * unbounded %s into a fixed buf[256] could overflow on a
	 * malformed/oversized field.
	 */
	while (fscanf(f, "%zx-%zx %255s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
		/* buf[2] == 'x' selects executable mappings */
		if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
			found = true;
			break;
		}
	}
	fclose(f);
	return found ? (uintptr_t)addr - start + base : -1;
}
5125 
FIXTURE(UPROBE) {
	int fd;	/* perf event fd for the attached u(ret)probe, if any */
};
5129 
FIXTURE_VARIANT(UPROBE) {
	/*
	 * All of the U(RET)PROBE behaviors can be tested with either
	 * u(ret)probe attached or not
	 */
	bool attach;
	/*
	 * Test both uprobe and uretprobe (only meaningful when attach
	 * is true).
	 */
	bool uretprobe;
};
5141 
/* Baseline: no probe attached at all */
FIXTURE_VARIANT_ADD(UPROBE, not_attached) {
	.attach = false,
	.uretprobe = false,
};

/* Entry uprobe attached to probed_uprobe() */
FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) {
	.attach = true,
	.uretprobe = false,
};

/* Return probe attached to probed_uretprobe() */
FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) {
	.attach = true,
	.uretprobe = true,
};
5156 
5157 FIXTURE_SETUP(UPROBE)
5158 {
5159 	const size_t attr_sz = sizeof(struct perf_event_attr);
5160 	struct perf_event_attr attr;
5161 	ssize_t offset;
5162 	int type, bit;
5163 
5164 #if !defined(__NR_uprobe) || !defined(__NR_uretprobe)
5165 	SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined");
5166 #endif
5167 
5168 	if (!variant->attach)
5169 		return;
5170 
5171 	memset(&attr, 0, attr_sz);
5172 
5173 	type = determine_uprobe_perf_type();
5174 	ASSERT_GE(type, 0);
5175 
5176 	if (variant->uretprobe) {
5177 		bit = determine_uprobe_retprobe_bit();
5178 		ASSERT_GE(bit, 0);
5179 	}
5180 
5181 	offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
5182 	ASSERT_GE(offset, 0);
5183 
5184 	if (variant->uretprobe)
5185 		attr.config |= 1 << bit;
5186 	attr.size = attr_sz;
5187 	attr.type = type;
5188 	attr.config1 = ptr_to_u64("/proc/self/exe");
5189 	attr.config2 = offset;
5190 
5191 	self->fd = syscall(__NR_perf_event_open, &attr,
5192 			   getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */,
5193 			   PERF_FLAG_FD_CLOEXEC);
5194 }
5195 
/* Intentionally empty — see comment below for why fd is leaked. */
FIXTURE_TEARDOWN(UPROBE)
{
	/* we could call close(self->fd), but we'd need extra filter for
	 * that and since we are calling _exit right away..
	 */
}
5202 
/*
 * Install @prog as the caller's seccomp filter (after NO_NEW_PRIVS) and
 * call both probed functions. Returns 0 once every call returns, -1 if
 * filter installation fails. If the filter kills the probe-induced
 * syscall, the calling test process dies before returning.
 */
static int run_probed_with_filter(struct sock_fprog *prog)
{
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) {
		return -1;
	}

	/*
	 * Uprobe is optimized after first hit, so let's hit twice.
	 */
	probed_uprobe();
	probed_uprobe();

	probed_uretprobe();
	return 0;
}
5219 
/* Allow-everything filter: probed calls must survive with probes attached. */
TEST_F(UPROBE, uprobe_default_allow)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, run_probed_with_filter(&prog));
}
5232 
/*
 * Default-kill filter that only allows exit_group: the probe-induced
 * u(ret)probe syscalls must still not be killed by it.
 */
TEST_F(UPROBE, uprobe_default_block)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, run_probed_with_filter(&prog));
}
5249 
/*
 * Filter that explicitly targets __NR_uprobe/__NR_uretprobe with KILL
 * and allows everything else. NOTE(review): the jt/jf offsets assume
 * both #ifdef branches are compiled in — confirm when only one syscall
 * number is defined.
 */
TEST_F(UPROBE, uprobe_block_syscall)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
#ifdef __NR_uprobe
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2),
#endif
#ifdef __NR_uretprobe
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
#endif
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, run_probed_with_filter(&prog));
}
5271 
/*
 * Default-kill filter that explicitly allows the u(ret)probe syscalls
 * (plus exit_group); probed calls must complete. NOTE(review): jump
 * offsets assume both #ifdef branches are live — confirm for configs
 * where only one syscall number exists.
 */
TEST_F(UPROBE, uprobe_default_block_with_syscall)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
#ifdef __NR_uprobe
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0),
#endif
#ifdef __NR_uretprobe
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
#endif
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ASSERT_EQ(0, run_probed_with_filter(&prog));
}
5294 
5295 /*
5296  * TODO:
5297  * - expand NNP testing
5298  * - better arch-specific TRACE and TRAP handlers.
5299  * - endianness checking when appropriate
5300  * - 64-bit arg prodding
5301  * - arch value testing (x86 modes especially)
5302  * - verify that FILTER_FLAG_LOG filters generate log messages
5303  * - verify that RET_LOG generates log messages
5304  */
5305 
5306 TEST_HARNESS_MAIN
5307