xref: /linux/tools/testing/selftests/seccomp/seccomp_bpf.c (revision f4a45f14cf6902a96d9805ba51829054940ef3e7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 #include <sys/capability.h>
50 #include <linux/perf_event.h>
51 
52 #include <unistd.h>
53 #include <sys/syscall.h>
54 #include <poll.h>
55 
56 #include "../kselftest_harness.h"
57 #include "../clone3/clone3_selftests.h"
58 
59 /* Attempt to de-conflict with the selftests tree. */
60 #ifndef SKIP
61 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
62 #endif
63 
64 #ifndef MIN
65 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
66 #endif
67 
68 #ifndef PR_SET_PTRACER
69 # define PR_SET_PTRACER 0x59616d61
70 #endif
71 
72 #ifndef noinline
73 #define noinline __attribute__((noinline))
74 #endif
75 
76 #ifndef PR_SET_NO_NEW_PRIVS
77 #define PR_SET_NO_NEW_PRIVS 38
78 #define PR_GET_NO_NEW_PRIVS 39
79 #endif
80 
81 #ifndef PR_SECCOMP_EXT
82 #define PR_SECCOMP_EXT 43
83 #endif
84 
85 #ifndef SECCOMP_EXT_ACT
86 #define SECCOMP_EXT_ACT 1
87 #endif
88 
89 #ifndef SECCOMP_EXT_ACT_TSYNC
90 #define SECCOMP_EXT_ACT_TSYNC 1
91 #endif
92 
93 #ifndef SECCOMP_MODE_STRICT
94 #define SECCOMP_MODE_STRICT 1
95 #endif
96 
97 #ifndef SECCOMP_MODE_FILTER
98 #define SECCOMP_MODE_FILTER 2
99 #endif
100 
101 #ifndef SECCOMP_RET_ALLOW
102 struct seccomp_data {
103 	int nr;
104 	__u32 arch;
105 	__u64 instruction_pointer;
106 	__u64 args[6];
107 };
108 #endif
109 
110 #ifndef SECCOMP_RET_KILL_PROCESS
111 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
112 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
113 #endif
114 #ifndef SECCOMP_RET_KILL
115 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
116 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
117 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
118 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
119 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
120 #endif
121 #ifndef SECCOMP_RET_LOG
122 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
123 #endif
124 
125 #ifndef __NR_seccomp
126 # if defined(__i386__)
127 #  define __NR_seccomp 354
128 # elif defined(__x86_64__)
129 #  define __NR_seccomp 317
130 # elif defined(__arm__)
131 #  define __NR_seccomp 383
132 # elif defined(__aarch64__)
133 #  define __NR_seccomp 277
134 # elif defined(__riscv)
135 #  define __NR_seccomp 277
136 # elif defined(__csky__)
137 #  define __NR_seccomp 277
138 # elif defined(__loongarch__)
139 #  define __NR_seccomp 277
140 # elif defined(__hppa__)
141 #  define __NR_seccomp 338
142 # elif defined(__powerpc__)
143 #  define __NR_seccomp 358
144 # elif defined(__s390__)
145 #  define __NR_seccomp 348
146 # elif defined(__xtensa__)
147 #  define __NR_seccomp 337
148 # elif defined(__sh__)
149 #  define __NR_seccomp 372
150 # elif defined(__mc68000__)
151 #  define __NR_seccomp 380
152 # else
153 #  warning "seccomp syscall number unknown for this architecture"
154 #  define __NR_seccomp 0xffff
155 # endif
156 #endif
157 
158 #ifndef SECCOMP_SET_MODE_STRICT
159 #define SECCOMP_SET_MODE_STRICT 0
160 #endif
161 
162 #ifndef SECCOMP_SET_MODE_FILTER
163 #define SECCOMP_SET_MODE_FILTER 1
164 #endif
165 
166 #ifndef SECCOMP_GET_ACTION_AVAIL
167 #define SECCOMP_GET_ACTION_AVAIL 2
168 #endif
169 
170 #ifndef SECCOMP_GET_NOTIF_SIZES
171 #define SECCOMP_GET_NOTIF_SIZES 3
172 #endif
173 
174 #ifndef SECCOMP_FILTER_FLAG_TSYNC
175 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
176 #endif
177 
178 #ifndef SECCOMP_FILTER_FLAG_LOG
179 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
180 #endif
181 
182 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
183 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
184 #endif
185 
186 #ifndef PTRACE_SECCOMP_GET_METADATA
187 #define PTRACE_SECCOMP_GET_METADATA	0x420d
188 
189 struct seccomp_metadata {
190 	__u64 filter_off;       /* Input: which filter */
191 	__u64 flags;             /* Output: filter's flags */
192 };
193 #endif
194 
195 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
196 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
197 #endif
198 
199 #ifndef SECCOMP_RET_USER_NOTIF
200 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
201 
202 #define SECCOMP_IOC_MAGIC		'!'
203 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
204 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
205 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
206 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
207 
208 /* Flags for seccomp notification fd ioctl. */
209 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
210 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
211 						struct seccomp_notif_resp)
212 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
213 
214 struct seccomp_notif {
215 	__u64 id;
216 	__u32 pid;
217 	__u32 flags;
218 	struct seccomp_data data;
219 };
220 
221 struct seccomp_notif_resp {
222 	__u64 id;
223 	__s64 val;
224 	__s32 error;
225 	__u32 flags;
226 };
227 
228 struct seccomp_notif_sizes {
229 	__u16 seccomp_notif;
230 	__u16 seccomp_notif_resp;
231 	__u16 seccomp_data;
232 };
233 #endif
234 
235 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
236 /* On success, the return value is the remote process's added fd number */
237 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
238 						struct seccomp_notif_addfd)
239 
240 /* valid flags for seccomp_notif_addfd */
241 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
242 
243 struct seccomp_notif_addfd {
244 	__u64 id;
245 	__u32 flags;
246 	__u32 srcfd;
247 	__u32 newfd;
248 	__u32 newfd_flags;
249 };
250 #endif
251 
252 #ifndef SECCOMP_ADDFD_FLAG_SEND
253 #define SECCOMP_ADDFD_FLAG_SEND	(1UL << 1) /* Addfd and return it, atomically */
254 #endif
255 
256 struct seccomp_notif_addfd_small {
257 	__u64 id;
258 	char weird[4];
259 };
260 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
261 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
262 
263 struct seccomp_notif_addfd_big {
264 	union {
265 		struct seccomp_notif_addfd addfd;
266 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
267 	};
268 };
269 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
270 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
271 
272 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
273 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
274 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
275 #endif
276 
277 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
278 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
279 #endif
280 
281 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
282 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
283 #endif
284 
285 #ifndef SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
286 #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
287 #endif
288 
289 #ifndef seccomp
/*
 * Thin wrapper that issues the raw seccomp syscall.
 *
 * errno is cleared before the call so that callers inspecting it afterwards
 * never see a stale value. The syscall result is returned unchanged.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return rc;
}
295 #endif
296 
297 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
298 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
299 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
300 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
301 #else
302 #error "wut? Unknown __BYTE_ORDER__?!"
303 #endif
304 
305 #define SIBLING_EXIT_UNKILLED	0xbadbeef
306 #define SIBLING_EXIT_FAILURE	0xbadface
307 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
308 
__filecmp(pid_t pid1,pid_t pid2,int fd1,int fd2)309 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
310 {
311 #ifdef __NR_kcmp
312 	errno = 0;
313 	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
314 #else
315 	errno = ENOSYS;
316 	return -1;
317 #endif
318 }
319 
/*
 * Macro (not a function) so that TH_LOG reports the location where
 * filecmp() is actually used. Evaluates to the __filecmp() result, except
 * that a missing kcmp() syscall (negative result with errno == ENOSYS) is
 * logged and reported as 0.
 */
#define filecmp(pid1, pid2, fd1, fd2)	({		\
	int _cmp_ret = __filecmp(pid1, pid2, fd1, fd2);	\
							\
	if (_cmp_ret < 0 && errno == ENOSYS) {		\
		TH_LOG("kcmp() syscall missing (test is less accurate)");\
		_cmp_ret = 0;				\
	}						\
	_cmp_ret; })
332 
TEST(kcmp)333 TEST(kcmp)
334 {
335 	int ret;
336 
337 	ret = __filecmp(getpid(), getpid(), 1, 1);
338 	EXPECT_EQ(ret, 0);
339 	if (ret != 0 && errno == ENOSYS)
340 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
341 }
342 
TEST(mode_strict_support)343 TEST(mode_strict_support)
344 {
345 	long ret;
346 
347 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
348 	ASSERT_EQ(0, ret) {
349 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
350 	}
351 	syscall(__NR_exit, 0);
352 }
353 
/*
 * Once strict mode is active, a further prctl() syscall must not return:
 * the test is registered to expect termination by SIGKILL.
 */
TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
{
	long ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);

	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support CONFIG_SECCOMP");
	}

	/* Raw syscall; the process is expected to be killed here. */
	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
		NULL, NULL, NULL);

	/* Only reached if the syscall above was allowed to return. */
	EXPECT_FALSE(true) {
		TH_LOG("Unreachable!");
	}
}
368 
369 /* Note! This doesn't test no new privs behavior */
TEST(no_new_privs_support)370 TEST(no_new_privs_support)
371 {
372 	long ret;
373 
374 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
375 	EXPECT_EQ(0, ret) {
376 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
377 	}
378 }
379 
380 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
TEST(mode_filter_support)381 TEST(mode_filter_support)
382 {
383 	long ret;
384 
385 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
386 	ASSERT_EQ(0, ret) {
387 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
388 	}
389 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
390 	EXPECT_EQ(-1, ret);
391 	EXPECT_EQ(EFAULT, errno) {
392 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
393 	}
394 }
395 
TEST(mode_filter_without_nnp)396 TEST(mode_filter_without_nnp)
397 {
398 	struct sock_filter filter[] = {
399 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
400 	};
401 	struct sock_fprog prog = {
402 		.len = (unsigned short)ARRAY_SIZE(filter),
403 		.filter = filter,
404 	};
405 	long ret;
406 	cap_t cap = cap_get_proc();
407 	cap_flag_value_t is_cap_sys_admin = 0;
408 
409 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
410 	ASSERT_LE(0, ret) {
411 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
412 	}
413 	errno = 0;
414 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
415 	/* Succeeds with CAP_SYS_ADMIN, fails without */
416 	cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);
417 	if (!is_cap_sys_admin) {
418 		EXPECT_EQ(-1, ret);
419 		EXPECT_EQ(EACCES, errno);
420 	} else {
421 		EXPECT_EQ(0, ret);
422 	}
423 }
424 
425 #define MAX_INSNS_PER_PATH 32768
426 
TEST(filter_size_limits)427 TEST(filter_size_limits)
428 {
429 	int i;
430 	int count = BPF_MAXINSNS + 1;
431 	struct sock_filter allow[] = {
432 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
433 	};
434 	struct sock_filter *filter;
435 	struct sock_fprog prog = { };
436 	long ret;
437 
438 	filter = calloc(count, sizeof(*filter));
439 	ASSERT_NE(NULL, filter);
440 
441 	for (i = 0; i < count; i++)
442 		filter[i] = allow[0];
443 
444 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
445 	ASSERT_EQ(0, ret);
446 
447 	prog.filter = filter;
448 	prog.len = count;
449 
450 	/* Too many filter instructions in a single filter. */
451 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
452 	ASSERT_NE(0, ret) {
453 		TH_LOG("Installing %d insn filter was allowed", prog.len);
454 	}
455 
456 	/* One less is okay, though. */
457 	prog.len -= 1;
458 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
459 	ASSERT_EQ(0, ret) {
460 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
461 	}
462 }
463 
TEST(filter_chain_limits)464 TEST(filter_chain_limits)
465 {
466 	int i;
467 	int count = BPF_MAXINSNS;
468 	struct sock_filter allow[] = {
469 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
470 	};
471 	struct sock_filter *filter;
472 	struct sock_fprog prog = { };
473 	long ret;
474 
475 	filter = calloc(count, sizeof(*filter));
476 	ASSERT_NE(NULL, filter);
477 
478 	for (i = 0; i < count; i++)
479 		filter[i] = allow[0];
480 
481 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
482 	ASSERT_EQ(0, ret);
483 
484 	prog.filter = filter;
485 	prog.len = 1;
486 
487 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
488 	ASSERT_EQ(0, ret);
489 
490 	prog.len = count;
491 
492 	/* Too many total filter instructions. */
493 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
494 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
495 		if (ret != 0)
496 			break;
497 	}
498 	ASSERT_NE(0, ret) {
499 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
500 		       i, count, i * (count + 4));
501 	}
502 }
503 
TEST(mode_filter_cannot_move_to_strict)504 TEST(mode_filter_cannot_move_to_strict)
505 {
506 	struct sock_filter filter[] = {
507 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
508 	};
509 	struct sock_fprog prog = {
510 		.len = (unsigned short)ARRAY_SIZE(filter),
511 		.filter = filter,
512 	};
513 	long ret;
514 
515 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
516 	ASSERT_EQ(0, ret);
517 
518 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
519 	ASSERT_EQ(0, ret);
520 
521 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
522 	EXPECT_EQ(-1, ret);
523 	EXPECT_EQ(EINVAL, errno);
524 }
525 
526 
TEST(mode_filter_get_seccomp)527 TEST(mode_filter_get_seccomp)
528 {
529 	struct sock_filter filter[] = {
530 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
531 	};
532 	struct sock_fprog prog = {
533 		.len = (unsigned short)ARRAY_SIZE(filter),
534 		.filter = filter,
535 	};
536 	long ret;
537 
538 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
539 	ASSERT_EQ(0, ret);
540 
541 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
542 	EXPECT_EQ(0, ret);
543 
544 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
545 	ASSERT_EQ(0, ret);
546 
547 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
548 	EXPECT_EQ(2, ret);
549 }
550 
551 
TEST(ALLOW_all)552 TEST(ALLOW_all)
553 {
554 	struct sock_filter filter[] = {
555 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
556 	};
557 	struct sock_fprog prog = {
558 		.len = (unsigned short)ARRAY_SIZE(filter),
559 		.filter = filter,
560 	};
561 	long ret;
562 
563 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
564 	ASSERT_EQ(0, ret);
565 
566 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
567 	ASSERT_EQ(0, ret);
568 }
569 
TEST(empty_prog)570 TEST(empty_prog)
571 {
572 	struct sock_filter filter[] = {
573 	};
574 	struct sock_fprog prog = {
575 		.len = (unsigned short)ARRAY_SIZE(filter),
576 		.filter = filter,
577 	};
578 	long ret;
579 
580 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
581 	ASSERT_EQ(0, ret);
582 
583 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
584 	EXPECT_EQ(-1, ret);
585 	EXPECT_EQ(EINVAL, errno);
586 }
587 
TEST(log_all)588 TEST(log_all)
589 {
590 	struct sock_filter filter[] = {
591 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
592 	};
593 	struct sock_fprog prog = {
594 		.len = (unsigned short)ARRAY_SIZE(filter),
595 		.filter = filter,
596 	};
597 	long ret;
598 	pid_t parent = getppid();
599 
600 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
601 	ASSERT_EQ(0, ret);
602 
603 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
604 	ASSERT_EQ(0, ret);
605 
606 	/* getppid() should succeed and be logged (no check for logging) */
607 	EXPECT_EQ(parent, syscall(__NR_getppid));
608 }
609 
/*
 * A filter returning the unrecognized action value 0x10000000 must be
 * fatal: the test is registered to expect SIGSYS.
 */
TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
{
	struct sock_filter unknown_action[] = {
		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
	};
	struct sock_fprog fprog = {
		.filter = unknown_action,
		.len = (unsigned short)ARRAY_SIZE(unknown_action),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(0, syscall(__NR_getpid)) {
		TH_LOG("getpid() shouldn't ever return");
	}
}
630 
631 /* return code >= 0x80000000 is unused. */
TEST_SIGNAL(unknown_ret_is_kill_above_allow,SIGSYS)632 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
633 {
634 	struct sock_filter filter[] = {
635 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
636 	};
637 	struct sock_fprog prog = {
638 		.len = (unsigned short)ARRAY_SIZE(filter),
639 		.filter = filter,
640 	};
641 	long ret;
642 
643 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
644 	ASSERT_EQ(0, ret);
645 
646 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
647 	ASSERT_EQ(0, ret);
648 	EXPECT_EQ(0, syscall(__NR_getpid)) {
649 		TH_LOG("getpid() shouldn't ever return");
650 	}
651 }
652 
/*
 * Install a filter that returns SECCOMP_RET_KILL for every syscall; the
 * test is registered to expect SIGSYS.
 */
TEST_SIGNAL(KILL_all, SIGSYS)
{
	struct sock_filter kill_everything[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
	};
	struct sock_fprog fprog = {
		.filter = kill_everything,
		.len = (unsigned short)ARRAY_SIZE(kill_everything),
	};
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);
}
670 
/*
 * Kill only on getpid(): getppid() must keep working under the filter,
 * then getpid() must terminate the test with SIGSYS.
 */
TEST_SIGNAL(KILL_one, SIGSYS)
{
	struct sock_filter kill_getpid[] = {
		/* Load the syscall number. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		/* getpid -> KILL, anything else -> ALLOW. */
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = kill_getpid,
		.len = (unsigned short)ARRAY_SIZE(kill_getpid),
	};
	/* Sampled before the filter goes in. */
	pid_t parent = getppid();
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
697 
/*
 * Kill on times() only when its first argument equals the address of
 * fatal_address. A times() call with a different buffer must still work;
 * the call passing &fatal_address must end the test with SIGSYS.
 */
TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
{
	/* Never read or written; only its address is used as the trigger. */
	void *fatal_address;
	struct sock_filter kill_on_arg[] = {
		/* Anything other than times() is allowed outright. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with lower 32-bit for now. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
			(unsigned long)&fatal_address, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = kill_on_arg,
		.len = (unsigned short)ARRAY_SIZE(kill_on_arg),
	};
	long ret;
	/* Baselines sampled before the filter goes in. */
	pid_t parent = getppid();
	struct tms timebuf;
	clock_t clock = times(&timebuf);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);

	EXPECT_EQ(parent, syscall(__NR_getppid));
	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
	/* times() should never return. */
	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
}
733 
/*
 * Kill on mmap (mmap2 where available) only when the sixth argument equals
 * 0x0C0FFEE. A mapping with a different sixth argument must still work; the
 * one passing 0x0C0FFEE must end the test with SIGSYS.
 */
TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
{
#ifdef __NR_mmap2
	int sysno = __NR_mmap2;
#else
	int sysno = __NR_mmap;
#endif
	struct sock_filter kill_on_arg[] = {
		/* Anything other than the mmap syscall is allowed outright. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Only bother with lower 32-bit for now. */
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog fprog = {
		.filter = kill_on_arg,
		.len = (unsigned short)ARRAY_SIZE(kill_on_arg),
	};
	long ret;
	/* Sampled before the filter goes in. */
	pid_t parent = getppid();
	int fd;
	void *map1, *map2;
	int page_size = sysconf(_SC_PAGESIZE);

	ASSERT_LT(0, page_size);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
	ASSERT_EQ(0, ret);

	fd = open("/dev/zero", O_RDONLY);
	ASSERT_NE(-1, fd);

	EXPECT_EQ(parent, syscall(__NR_getppid));

	/* Sixth argument differs from the trigger value: must be allowed. */
	map1 = (void *)syscall(sysno,
		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
	EXPECT_NE(MAP_FAILED, map1);

	/* mmap2() should never return. */
	map2 = (void *)syscall(sysno,
		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
	EXPECT_EQ(MAP_FAILED, map2);

	/* The test failed, so clean up the resources. */
	munmap(map1, page_size);
	munmap(map2, page_size);
	close(fd);
}
787 
788 /* This is a thread task to die via seccomp filter violation. */
kill_thread(void * data)789 void *kill_thread(void *data)
790 {
791 	bool die = (bool)data;
792 
793 	if (die) {
794 		syscall(__NR_getpid);
795 		return (void *)SIBLING_EXIT_FAILURE;
796 	}
797 
798 	return (void *)SIBLING_EXIT_UNKILLED;
799 }
800 
/* Selects which filter action kill_thread_or_group() installs for getpid. */
enum kill_t {
	KILL_THREAD,	/* SECCOMP_RET_KILL_THREAD */
	KILL_PROCESS,	/* SECCOMP_RET_KILL_PROCESS */
	RET_UNKNOWN,	/* an unrecognized action value */
};
806 
807 /* Prepare a thread that will kill itself or both of us. */
kill_thread_or_group(struct __test_metadata * _metadata,enum kill_t kill_how)808 void kill_thread_or_group(struct __test_metadata *_metadata,
809 			  enum kill_t kill_how)
810 {
811 	pthread_t thread;
812 	void *status;
813 	/* Kill only when calling __NR_getpid. */
814 	struct sock_filter filter_thread[] = {
815 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
816 			offsetof(struct seccomp_data, nr)),
817 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
818 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
819 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
820 	};
821 	struct sock_fprog prog_thread = {
822 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
823 		.filter = filter_thread,
824 	};
825 	int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
826 	struct sock_filter filter_process[] = {
827 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
828 			offsetof(struct seccomp_data, nr)),
829 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
830 		BPF_STMT(BPF_RET|BPF_K, kill),
831 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
832 	};
833 	struct sock_fprog prog_process = {
834 		.len = (unsigned short)ARRAY_SIZE(filter_process),
835 		.filter = filter_process,
836 	};
837 
838 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
839 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
840 	}
841 
842 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
843 			     kill_how == KILL_THREAD ? &prog_thread
844 						     : &prog_process));
845 
846 	/*
847 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
848 	 * flag cannot be downgraded by a new filter.
849 	 */
850 	if (kill_how == KILL_PROCESS)
851 		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
852 
853 	/* Start a thread that will exit immediately. */
854 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
855 	ASSERT_EQ(0, pthread_join(thread, &status));
856 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
857 
858 	/* Start a thread that will die immediately. */
859 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
860 	ASSERT_EQ(0, pthread_join(thread, &status));
861 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
862 
863 	/*
864 	 * If we get here, only the spawned thread died. Let the parent know
865 	 * the whole process didn't die (i.e. this thread, the spawner,
866 	 * stayed running).
867 	 */
868 	exit(42);
869 }
870 
TEST(KILL_thread)871 TEST(KILL_thread)
872 {
873 	int status;
874 	pid_t child_pid;
875 
876 	child_pid = fork();
877 	ASSERT_LE(0, child_pid);
878 	if (child_pid == 0) {
879 		kill_thread_or_group(_metadata, KILL_THREAD);
880 		_exit(38);
881 	}
882 
883 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
884 
885 	/* If only the thread was killed, we'll see exit 42. */
886 	ASSERT_TRUE(WIFEXITED(status));
887 	ASSERT_EQ(42, WEXITSTATUS(status));
888 }
889 
TEST(KILL_process)890 TEST(KILL_process)
891 {
892 	int status;
893 	pid_t child_pid;
894 
895 	child_pid = fork();
896 	ASSERT_LE(0, child_pid);
897 	if (child_pid == 0) {
898 		kill_thread_or_group(_metadata, KILL_PROCESS);
899 		_exit(38);
900 	}
901 
902 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
903 
904 	/* If the entire process was killed, we'll see SIGSYS. */
905 	ASSERT_TRUE(WIFSIGNALED(status));
906 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
907 }
908 
TEST(KILL_unknown)909 TEST(KILL_unknown)
910 {
911 	int status;
912 	pid_t child_pid;
913 
914 	child_pid = fork();
915 	ASSERT_LE(0, child_pid);
916 	if (child_pid == 0) {
917 		kill_thread_or_group(_metadata, RET_UNKNOWN);
918 		_exit(38);
919 	}
920 
921 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
922 
923 	/* If the entire process was killed, we'll see SIGSYS. */
924 	EXPECT_TRUE(WIFSIGNALED(status)) {
925 		TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
926 	}
927 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
928 }
929 
930 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
TEST(arg_out_of_range)931 TEST(arg_out_of_range)
932 {
933 	struct sock_filter filter[] = {
934 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
935 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
936 	};
937 	struct sock_fprog prog = {
938 		.len = (unsigned short)ARRAY_SIZE(filter),
939 		.filter = filter,
940 	};
941 	long ret;
942 
943 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
944 	ASSERT_EQ(0, ret);
945 
946 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
947 	EXPECT_EQ(-1, ret);
948 	EXPECT_EQ(EINVAL, errno);
949 }
950 
/*
 * Declare, in the current scope, a filter _read_filter_<name> and a program
 * prog_<name> that make read() return SECCOMP_RET_ERRNO | err_val and
 * allow every other syscall.
 *
 * The value parameter is deliberately not called "errno", since that name
 * is itself a macro from <errno.h>; it is parenthesized so that any
 * expression can be passed safely.
 */
#define ERRNO_FILTER(name, err_val)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | (err_val)),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
		.filter = _read_filter_##name,				\
	}
963 
964 /* Make sure basic errno values are correctly passed through a filter. */
TEST(ERRNO_valid)965 TEST(ERRNO_valid)
966 {
967 	ERRNO_FILTER(valid, E2BIG);
968 	long ret;
969 	pid_t parent = getppid();
970 
971 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
972 	ASSERT_EQ(0, ret);
973 
974 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
975 	ASSERT_EQ(0, ret);
976 
977 	EXPECT_EQ(parent, syscall(__NR_getppid));
978 	EXPECT_EQ(-1, read(-1, NULL, 0));
979 	EXPECT_EQ(E2BIG, errno);
980 }
981 
982 /* Make sure an errno of zero is correctly handled by the arch code. */
TEST(ERRNO_zero)983 TEST(ERRNO_zero)
984 {
985 	ERRNO_FILTER(zero, 0);
986 	long ret;
987 	pid_t parent = getppid();
988 
989 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
990 	ASSERT_EQ(0, ret);
991 
992 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
993 	ASSERT_EQ(0, ret);
994 
995 	EXPECT_EQ(parent, syscall(__NR_getppid));
996 	/* "errno" of 0 is ok. */
997 	EXPECT_EQ(0, read(-1, NULL, 0));
998 }
999 
1000 /*
1001  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
1002  * This tests that the errno value gets capped correctly, fixed by
1003  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
1004  */
TEST(ERRNO_capped)1005 TEST(ERRNO_capped)
1006 {
1007 	ERRNO_FILTER(capped, 4096);
1008 	long ret;
1009 	pid_t parent = getppid();
1010 
1011 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1012 	ASSERT_EQ(0, ret);
1013 
1014 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
1015 	ASSERT_EQ(0, ret);
1016 
1017 	EXPECT_EQ(parent, syscall(__NR_getppid));
1018 	EXPECT_EQ(-1, read(-1, NULL, 0));
1019 	EXPECT_EQ(4095, errno);
1020 }
1021 
1022 /*
1023  * Filters are processed in reverse order: last applied is executed first.
1024  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1025  * SECCOMP_RET_DATA mask results will follow the most recently applied
1026  * matching filter return (and not the lowest or highest value).
1027  */
TEST(ERRNO_order)1028 TEST(ERRNO_order)
1029 {
1030 	ERRNO_FILTER(first,  11);
1031 	ERRNO_FILTER(second, 13);
1032 	ERRNO_FILTER(third,  12);
1033 	long ret;
1034 	pid_t parent = getppid();
1035 
1036 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1037 	ASSERT_EQ(0, ret);
1038 
1039 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1040 	ASSERT_EQ(0, ret);
1041 
1042 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1043 	ASSERT_EQ(0, ret);
1044 
1045 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1046 	ASSERT_EQ(0, ret);
1047 
1048 	EXPECT_EQ(parent, syscall(__NR_getppid));
1049 	EXPECT_EQ(-1, read(-1, NULL, 0));
1050 	EXPECT_EQ(12, errno);
1051 }
1052 
/* Per-test state for the TRAP tests: the filter program to install. */
FIXTURE(TRAP) {
	/* Filled in by the fixture setup; .filter is heap-allocated there. */
	struct sock_fprog prog;
};
1056 
FIXTURE_SETUP(TRAP)1057 FIXTURE_SETUP(TRAP)
1058 {
1059 	struct sock_filter filter[] = {
1060 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1061 			offsetof(struct seccomp_data, nr)),
1062 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1063 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1064 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1065 	};
1066 
1067 	memset(&self->prog, 0, sizeof(self->prog));
1068 	self->prog.filter = malloc(sizeof(filter));
1069 	ASSERT_NE(NULL, self->prog.filter);
1070 	memcpy(self->prog.filter, filter, sizeof(filter));
1071 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1072 }
1073 
FIXTURE_TEARDOWN(TRAP)1074 FIXTURE_TEARDOWN(TRAP)
1075 {
1076 	if (self->prog.filter)
1077 		free(self->prog.filter);
1078 }
1079 
/*
 * With no SIGSYS handler set by this test, a trapped getpid() must
 * terminate the test with SIGSYS.
 */
TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
{
	long ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	ASSERT_EQ(0, ret);

	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
	ASSERT_EQ(0, ret);

	/* Trips the fixture's TRAP rule. */
	syscall(__NR_getpid);
}
1091 
1092 /* Ensure that SIGSYS overrides SIG_IGN */
TEST_F_SIGNAL(TRAP,ign,SIGSYS)1093 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1094 {
1095 	long ret;
1096 
1097 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1098 	ASSERT_EQ(0, ret);
1099 
1100 	signal(SIGSYS, SIG_IGN);
1101 
1102 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1103 	ASSERT_EQ(0, ret);
1104 	syscall(__NR_getpid);
1105 }
1106 
/* Most recent signal seen by TRAP_action(): its siginfo and its number. */
static siginfo_t TRAP_info;
static volatile int TRAP_nr;

/*
 * SA_SIGINFO-style handler: copy the delivered siginfo into TRAP_info and
 * record the signal number in TRAP_nr. The context argument is unused.
 */
static void TRAP_action(int nr, siginfo_t *info, void *void_context)
{
	TRAP_info = *info;
	TRAP_nr = nr;
}
1114 
TEST_F(TRAP,handler)1115 TEST_F(TRAP, handler)
1116 {
1117 	int ret, test;
1118 	struct sigaction act;
1119 	sigset_t mask;
1120 
1121 	memset(&act, 0, sizeof(act));
1122 	sigemptyset(&mask);
1123 	sigaddset(&mask, SIGSYS);
1124 
1125 	act.sa_sigaction = &TRAP_action;
1126 	act.sa_flags = SA_SIGINFO;
1127 	ret = sigaction(SIGSYS, &act, NULL);
1128 	ASSERT_EQ(0, ret) {
1129 		TH_LOG("sigaction failed");
1130 	}
1131 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1132 	ASSERT_EQ(0, ret) {
1133 		TH_LOG("sigprocmask failed");
1134 	}
1135 
1136 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1137 	ASSERT_EQ(0, ret);
1138 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1139 	ASSERT_EQ(0, ret);
1140 	TRAP_nr = 0;
1141 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1142 	/* Expect the registers to be rolled back. (nr = error) may vary
1143 	 * based on arch. */
1144 	ret = syscall(__NR_getpid);
1145 	/* Silence gcc warning about volatile. */
1146 	test = TRAP_nr;
1147 	EXPECT_EQ(SIGSYS, test);
1148 	struct local_sigsys {
1149 		void *_call_addr;	/* calling user insn */
1150 		int _syscall;		/* triggering system call number */
1151 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1152 	} *sigsys = (struct local_sigsys *)
1153 #ifdef si_syscall
1154 		&(TRAP_info.si_call_addr);
1155 #else
1156 		&TRAP_info.si_pid;
1157 #endif
1158 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1159 	/* Make sure arch is non-zero. */
1160 	EXPECT_NE(0, sigsys->_arch);
1161 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1162 }
1163 
/*
 * Per-test state for the precedence tests: one filter program per seccomp
 * return action, each built by FIXTURE_SETUP(precedence).
 */
FIXTURE(precedence) {
	struct sock_fprog allow;	/* always SECCOMP_RET_ALLOW */
	struct sock_fprog log;		/* SECCOMP_RET_LOG on getpid */
	struct sock_fprog trace;	/* SECCOMP_RET_TRACE on getpid */
	struct sock_fprog error;	/* SECCOMP_RET_ERRNO on getpid */
	struct sock_fprog trap;		/* SECCOMP_RET_TRAP on getpid */
	struct sock_fprog kill;		/* SECCOMP_RET_KILL on getpid */
};
1172 
FIXTURE_SETUP(precedence)1173 FIXTURE_SETUP(precedence)
1174 {
1175 	struct sock_filter allow_insns[] = {
1176 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1177 	};
1178 	struct sock_filter log_insns[] = {
1179 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1180 			offsetof(struct seccomp_data, nr)),
1181 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1182 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1183 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1184 	};
1185 	struct sock_filter trace_insns[] = {
1186 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1187 			offsetof(struct seccomp_data, nr)),
1188 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1189 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1190 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1191 	};
1192 	struct sock_filter error_insns[] = {
1193 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1194 			offsetof(struct seccomp_data, nr)),
1195 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1196 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1197 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1198 	};
1199 	struct sock_filter trap_insns[] = {
1200 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1201 			offsetof(struct seccomp_data, nr)),
1202 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1203 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1204 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1205 	};
1206 	struct sock_filter kill_insns[] = {
1207 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1208 			offsetof(struct seccomp_data, nr)),
1209 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1210 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1211 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1212 	};
1213 
1214 	memset(self, 0, sizeof(*self));
1215 #define FILTER_ALLOC(_x) \
1216 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1217 	ASSERT_NE(NULL, self->_x.filter); \
1218 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1219 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1220 	FILTER_ALLOC(allow);
1221 	FILTER_ALLOC(log);
1222 	FILTER_ALLOC(trace);
1223 	FILTER_ALLOC(error);
1224 	FILTER_ALLOC(trap);
1225 	FILTER_ALLOC(kill);
1226 }
1227 
FIXTURE_TEARDOWN(precedence)1228 FIXTURE_TEARDOWN(precedence)
1229 {
1230 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1231 	FILTER_FREE(allow);
1232 	FILTER_FREE(log);
1233 	FILTER_FREE(trace);
1234 	FILTER_FREE(error);
1235 	FILTER_FREE(trap);
1236 	FILTER_FREE(kill);
1237 }
1238 
TEST_F(precedence,allow_ok)1239 TEST_F(precedence, allow_ok)
1240 {
1241 	pid_t parent, res = 0;
1242 	long ret;
1243 
1244 	parent = getppid();
1245 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1246 	ASSERT_EQ(0, ret);
1247 
1248 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1249 	ASSERT_EQ(0, ret);
1250 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1251 	ASSERT_EQ(0, ret);
1252 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1253 	ASSERT_EQ(0, ret);
1254 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1255 	ASSERT_EQ(0, ret);
1256 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1257 	ASSERT_EQ(0, ret);
1258 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1259 	ASSERT_EQ(0, ret);
1260 	/* Should work just fine. */
1261 	res = syscall(__NR_getppid);
1262 	EXPECT_EQ(parent, res);
1263 }
1264 
TEST_F_SIGNAL(precedence,kill_is_highest,SIGSYS)1265 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1266 {
1267 	pid_t parent, res = 0;
1268 	long ret;
1269 
1270 	parent = getppid();
1271 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1272 	ASSERT_EQ(0, ret);
1273 
1274 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1275 	ASSERT_EQ(0, ret);
1276 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1277 	ASSERT_EQ(0, ret);
1278 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1279 	ASSERT_EQ(0, ret);
1280 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1281 	ASSERT_EQ(0, ret);
1282 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1283 	ASSERT_EQ(0, ret);
1284 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1285 	ASSERT_EQ(0, ret);
1286 	/* Should work just fine. */
1287 	res = syscall(__NR_getppid);
1288 	EXPECT_EQ(parent, res);
1289 	/* getpid() should never return. */
1290 	res = syscall(__NR_getpid);
1291 	EXPECT_EQ(0, res);
1292 }
1293 
TEST_F_SIGNAL(precedence,kill_is_highest_in_any_order,SIGSYS)1294 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1295 {
1296 	pid_t parent;
1297 	long ret;
1298 
1299 	parent = getppid();
1300 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1301 	ASSERT_EQ(0, ret);
1302 
1303 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1304 	ASSERT_EQ(0, ret);
1305 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1306 	ASSERT_EQ(0, ret);
1307 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1308 	ASSERT_EQ(0, ret);
1309 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1310 	ASSERT_EQ(0, ret);
1311 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1312 	ASSERT_EQ(0, ret);
1313 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1314 	ASSERT_EQ(0, ret);
1315 	/* Should work just fine. */
1316 	EXPECT_EQ(parent, syscall(__NR_getppid));
1317 	/* getpid() should never return. */
1318 	EXPECT_EQ(0, syscall(__NR_getpid));
1319 }
1320 
TEST_F_SIGNAL(precedence,trap_is_second,SIGSYS)1321 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1322 {
1323 	pid_t parent;
1324 	long ret;
1325 
1326 	parent = getppid();
1327 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1328 	ASSERT_EQ(0, ret);
1329 
1330 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1331 	ASSERT_EQ(0, ret);
1332 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1333 	ASSERT_EQ(0, ret);
1334 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1335 	ASSERT_EQ(0, ret);
1336 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1337 	ASSERT_EQ(0, ret);
1338 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1339 	ASSERT_EQ(0, ret);
1340 	/* Should work just fine. */
1341 	EXPECT_EQ(parent, syscall(__NR_getppid));
1342 	/* getpid() should never return. */
1343 	EXPECT_EQ(0, syscall(__NR_getpid));
1344 }
1345 
TEST_F_SIGNAL(precedence,trap_is_second_in_any_order,SIGSYS)1346 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1347 {
1348 	pid_t parent;
1349 	long ret;
1350 
1351 	parent = getppid();
1352 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1353 	ASSERT_EQ(0, ret);
1354 
1355 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1356 	ASSERT_EQ(0, ret);
1357 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1358 	ASSERT_EQ(0, ret);
1359 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1360 	ASSERT_EQ(0, ret);
1361 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1362 	ASSERT_EQ(0, ret);
1363 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1364 	ASSERT_EQ(0, ret);
1365 	/* Should work just fine. */
1366 	EXPECT_EQ(parent, syscall(__NR_getppid));
1367 	/* getpid() should never return. */
1368 	EXPECT_EQ(0, syscall(__NR_getpid));
1369 }
1370 
TEST_F(precedence,errno_is_third)1371 TEST_F(precedence, errno_is_third)
1372 {
1373 	pid_t parent;
1374 	long ret;
1375 
1376 	parent = getppid();
1377 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1378 	ASSERT_EQ(0, ret);
1379 
1380 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1381 	ASSERT_EQ(0, ret);
1382 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1383 	ASSERT_EQ(0, ret);
1384 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1385 	ASSERT_EQ(0, ret);
1386 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1387 	ASSERT_EQ(0, ret);
1388 	/* Should work just fine. */
1389 	EXPECT_EQ(parent, syscall(__NR_getppid));
1390 	EXPECT_EQ(0, syscall(__NR_getpid));
1391 }
1392 
TEST_F(precedence,errno_is_third_in_any_order)1393 TEST_F(precedence, errno_is_third_in_any_order)
1394 {
1395 	pid_t parent;
1396 	long ret;
1397 
1398 	parent = getppid();
1399 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1400 	ASSERT_EQ(0, ret);
1401 
1402 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1403 	ASSERT_EQ(0, ret);
1404 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1405 	ASSERT_EQ(0, ret);
1406 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1407 	ASSERT_EQ(0, ret);
1408 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1409 	ASSERT_EQ(0, ret);
1410 	/* Should work just fine. */
1411 	EXPECT_EQ(parent, syscall(__NR_getppid));
1412 	EXPECT_EQ(0, syscall(__NR_getpid));
1413 }
1414 
TEST_F(precedence,trace_is_fourth)1415 TEST_F(precedence, trace_is_fourth)
1416 {
1417 	pid_t parent;
1418 	long ret;
1419 
1420 	parent = getppid();
1421 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1422 	ASSERT_EQ(0, ret);
1423 
1424 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1425 	ASSERT_EQ(0, ret);
1426 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1427 	ASSERT_EQ(0, ret);
1428 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1429 	ASSERT_EQ(0, ret);
1430 	/* Should work just fine. */
1431 	EXPECT_EQ(parent, syscall(__NR_getppid));
1432 	/* No ptracer */
1433 	EXPECT_EQ(-1, syscall(__NR_getpid));
1434 }
1435 
TEST_F(precedence,trace_is_fourth_in_any_order)1436 TEST_F(precedence, trace_is_fourth_in_any_order)
1437 {
1438 	pid_t parent;
1439 	long ret;
1440 
1441 	parent = getppid();
1442 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1443 	ASSERT_EQ(0, ret);
1444 
1445 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1446 	ASSERT_EQ(0, ret);
1447 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1448 	ASSERT_EQ(0, ret);
1449 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1450 	ASSERT_EQ(0, ret);
1451 	/* Should work just fine. */
1452 	EXPECT_EQ(parent, syscall(__NR_getppid));
1453 	/* No ptracer */
1454 	EXPECT_EQ(-1, syscall(__NR_getpid));
1455 }
1456 
TEST_F(precedence,log_is_fifth)1457 TEST_F(precedence, log_is_fifth)
1458 {
1459 	pid_t mypid, parent;
1460 	long ret;
1461 
1462 	mypid = getpid();
1463 	parent = getppid();
1464 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1465 	ASSERT_EQ(0, ret);
1466 
1467 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1468 	ASSERT_EQ(0, ret);
1469 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1470 	ASSERT_EQ(0, ret);
1471 	/* Should work just fine. */
1472 	EXPECT_EQ(parent, syscall(__NR_getppid));
1473 	/* Should also work just fine */
1474 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1475 }
1476 
TEST_F(precedence,log_is_fifth_in_any_order)1477 TEST_F(precedence, log_is_fifth_in_any_order)
1478 {
1479 	pid_t mypid, parent;
1480 	long ret;
1481 
1482 	mypid = getpid();
1483 	parent = getppid();
1484 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1485 	ASSERT_EQ(0, ret);
1486 
1487 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1488 	ASSERT_EQ(0, ret);
1489 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1490 	ASSERT_EQ(0, ret);
1491 	/* Should work just fine. */
1492 	EXPECT_EQ(parent, syscall(__NR_getppid));
1493 	/* Should also work just fine */
1494 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1495 }
1496 
/* Fallback for headers that do not provide PTRACE_O_TRACESECCOMP. */
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

/* (Re)define the event number when it was missing or dropped just above. */
#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/* The bits of a wait status above bit 15, compared against PTRACE_EVENT_*. */
#define PTRACE_EVENT_MASK(status) ((status) >> 16)
/*
 * Run flag for the tracer's event loop in start_tracer(). It is written
 * from the tracer_stop() signal handler, so it is declared as
 * volatile sig_atomic_t, the type C defines for that use.
 */
volatile sig_atomic_t tracer_running;

/*
 * Signal handler that requests tracer shutdown by clearing tracer_running.
 * @sig: delivered signal number (unused).
 */
void tracer_stop(int sig)
{
	(void)sig;
	tracer_running = false;
}
1516 
/*
 * Per-stop callback run by the tracer: receives the harness metadata, the
 * tracee pid, the wait status of the stop and the caller-supplied args.
 */
typedef void tracer_func_t(struct __test_metadata *_metadata,
			   pid_t tracee, int status, void *args);
1519 
/*
 * Body of the tracer process: attach to @tracee, set the ptrace options,
 * release the tracee through the pipe, then hand every ptrace stop to
 * @tracer_func until the tracee dies or tracer_running is cleared.
 *
 * @_metadata:      harness metadata used by the ASSERT/EXPECT macros.
 * @fd:             pipe end; one byte is written to it, then it is closed.
 * @tracee:         pid to attach to.
 * @tracer_func:    called as tracer_func(_metadata, tracee, status, args).
 * @args:           opaque pointer forwarded to @tracer_func.
 * @ptrace_syscall: true  -> PTRACE_O_TRACESYSGOOD and PTRACE_SYSCALL,
 *                  false -> PTRACE_O_TRACESECCOMP and PTRACE_CONT.
 *
 * Returns only when the tracee exited or was killed by a signal; when the
 * loop ends because tracer_running was cleared, the process exits directly
 * with _metadata->exit_code.
 */
void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
{
	int ret = -1;
	struct sigaction action = {
		.sa_handler = tracer_stop,
	};

	/* Allow external shutdown. */
	tracer_running = true;
	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));

	/*
	 * Keep retrying the attach while it fails. EINVAL and ESRCH cannot
	 * clear up on their own (ESRCH: the tracee no longer exists), so stop
	 * instead of spinning forever and let the assert below report it.
	 */
	errno = 0;
	while (ret == -1 && errno != EINVAL && errno != ESRCH)
		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
	ASSERT_EQ(0, ret) {
		kill(tracee, SIGKILL);
	}
	/* Wait for attach stop */
	wait(NULL);

	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
						      PTRACE_O_TRACESYSGOOD :
						      PTRACE_O_TRACESECCOMP);
	ASSERT_EQ(0, ret) {
		/* Name the option that was actually requested. */
		TH_LOG("Failed to set %s", ptrace_syscall ?
		       "PTRACE_O_TRACESYSGOOD" : "PTRACE_O_TRACESECCOMP");
		kill(tracee, SIGKILL);
	}
	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
		     tracee, NULL, 0);
	ASSERT_EQ(0, ret);

	/* Unblock the tracee */
	ASSERT_EQ(1, write(fd, "A", 1));
	ASSERT_EQ(0, close(fd));

	/* Run until we're shut down. Must assert to stop execution. */
	while (tracer_running) {
		int status;

		/* Ignore anything that is not a report about the tracee. */
		if (wait(&status) != tracee)
			continue;

		if (WIFSIGNALED(status)) {
			/* Child caught a fatal signal. */
			return;
		}
		if (WIFEXITED(status)) {
			/* Child exited with code. */
			return;
		}

		/* Check if we got an expected event. */
		ASSERT_EQ(WIFCONTINUED(status), false);
		ASSERT_EQ(WIFSTOPPED(status), true);
		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
		}

		tracer_func(_metadata, tracee, status, args);

		/* Resume the tracee until its next stop. */
		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
			     tracee, NULL, 0);
		ASSERT_EQ(0, ret);
	}
	/* Directly report the status of our test harness results. */
	syscall(__NR_exit, _metadata->exit_code);
}
1588 
/* Common tracer setup/teardown functions. */

/* Signal handler that intentionally does nothing. */
void cont_handler(int num)
{ }
/*
 * Fork a tracer for the calling process and wait until it signals through
 * a pipe that tracing is set up.
 *
 * @func, @args and @ptrace_syscall are passed straight to start_tracer()
 * in the child. Returns the tracer's pid to the caller (the tracee).
 */
pid_t setup_trace_fixture(struct __test_metadata *_metadata,
			  tracer_func_t func, void *args, bool ptrace_syscall)
{
	const pid_t tracee = getpid();
	int pipefd[2];
	pid_t tracer_pid;
	char sync;

	/* Setup a pipe for clean synchronization. */
	ASSERT_EQ(0, pipe(pipefd));

	/* Fork a child which we'll promote to tracer */
	tracer_pid = fork();
	ASSERT_LE(0, tracer_pid);
	signal(SIGALRM, cont_handler);
	if (!tracer_pid) {
		/* Tracer side: only the write end is needed. */
		close(pipefd[0]);
		start_tracer(_metadata, pipefd[1], tracee, func, args,
			     ptrace_syscall);
		syscall(__NR_exit, 0);
	}

	/* Tracee side: name the tracer, then block until it writes. */
	close(pipefd[1]);
	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
	read(pipefd[0], &sync, 1);
	close(pipefd[0]);

	return tracer_pid;
}
1620 
teardown_trace_fixture(struct __test_metadata * _metadata,pid_t tracer)1621 void teardown_trace_fixture(struct __test_metadata *_metadata,
1622 			    pid_t tracer)
1623 {
1624 	if (tracer) {
1625 		int status;
1626 		/*
1627 		 * Extract the exit code from the other process and
1628 		 * adopt it for ourselves in case its asserts failed.
1629 		 */
1630 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1631 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1632 		if (WEXITSTATUS(status))
1633 			_metadata->exit_code = KSFT_FAIL;
1634 	}
1635 }
1636 
/* "poke" tracer arguments and function. */
struct tracer_args_poke_t {
	/* Address in the tracee that tracer_poke() writes to. */
	unsigned long poke_addr;
};
1641 
tracer_poke(struct __test_metadata * _metadata,pid_t tracee,int status,void * args)1642 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1643 		 void *args)
1644 {
1645 	int ret;
1646 	unsigned long msg;
1647 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1648 
1649 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1650 	EXPECT_EQ(0, ret);
1651 	/* If this fails, don't try to recover. */
1652 	ASSERT_EQ(0x1001, msg) {
1653 		kill(tracee, SIGKILL);
1654 	}
1655 	/*
1656 	 * Poke in the message.
1657 	 * Registers are not touched to try to keep this relatively arch
1658 	 * agnostic.
1659 	 */
1660 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1661 	EXPECT_EQ(0, ret);
1662 }
1663 
/* Per-test state for the TRACE_poke tests. */
FIXTURE(TRACE_poke) {
	struct sock_fprog prog;		/* filter built by the fixture setup */
	pid_t tracer;			/* pid returned by setup_trace_fixture() */
	long poked;			/* word whose address is handed to the tracer */
	struct tracer_args_poke_t tracer_args;	/* args passed to tracer_poke() */
};
1670 
FIXTURE_SETUP(TRACE_poke)1671 FIXTURE_SETUP(TRACE_poke)
1672 {
1673 	struct sock_filter filter[] = {
1674 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1675 			offsetof(struct seccomp_data, nr)),
1676 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1677 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1678 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1679 	};
1680 
1681 	self->poked = 0;
1682 	memset(&self->prog, 0, sizeof(self->prog));
1683 	self->prog.filter = malloc(sizeof(filter));
1684 	ASSERT_NE(NULL, self->prog.filter);
1685 	memcpy(self->prog.filter, filter, sizeof(filter));
1686 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1687 
1688 	/* Set up tracer args. */
1689 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1690 
1691 	/* Launch tracer. */
1692 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1693 					   &self->tracer_args, false);
1694 }
1695 
FIXTURE_TEARDOWN(TRACE_poke)1696 FIXTURE_TEARDOWN(TRACE_poke)
1697 {
1698 	teardown_trace_fixture(_metadata, self->tracer);
1699 	if (self->prog.filter)
1700 		free(self->prog.filter);
1701 }
1702 
TEST_F(TRACE_poke,read_has_side_effects)1703 TEST_F(TRACE_poke, read_has_side_effects)
1704 {
1705 	ssize_t ret;
1706 
1707 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1708 	ASSERT_EQ(0, ret);
1709 
1710 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1711 	ASSERT_EQ(0, ret);
1712 
1713 	EXPECT_EQ(0, self->poked);
1714 	ret = read(-1, NULL, 0);
1715 	EXPECT_EQ(-1, ret);
1716 	EXPECT_EQ(0x1001, self->poked);
1717 }
1718 
TEST_F(TRACE_poke,getpid_runs_normally)1719 TEST_F(TRACE_poke, getpid_runs_normally)
1720 {
1721 	long ret;
1722 
1723 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1724 	ASSERT_EQ(0, ret);
1725 
1726 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1727 	ASSERT_EQ(0, ret);
1728 
1729 	EXPECT_EQ(0, self->poked);
1730 	EXPECT_NE(0, syscall(__NR_getpid));
1731 	EXPECT_EQ(0, self->poked);
1732 }
1733 
/*
 * Per-architecture register access.
 *
 *   ARCH_REGS           type of the register set read from the tracee
 *   SYSCALL_NUM(_regs)  where the syscall number lives in that set
 *   SYSCALL_RET(_regs)  where the syscall return value lives in that set
 *
 * An architecture may also supply its own SYSCALL_NUM_SET() and/or
 * SYSCALL_RET_SET(); otherwise the generic versions defined after this
 * table are used.
 */
#if defined(__x86_64__)
# define ARCH_REGS		struct user_regs_struct
# define SYSCALL_NUM(_regs)	(_regs).orig_rax
# define SYSCALL_RET(_regs)	(_regs).rax
#elif defined(__i386__)
# define ARCH_REGS		struct user_regs_struct
# define SYSCALL_NUM(_regs)	(_regs).orig_eax
# define SYSCALL_RET(_regs)	(_regs).eax
#elif defined(__arm__)
# define ARCH_REGS		struct pt_regs
# define SYSCALL_NUM(_regs)	(_regs).ARM_r7
# ifndef PTRACE_SET_SYSCALL
#  define PTRACE_SET_SYSCALL   23
# endif
/* The syscall number is changed with a dedicated ptrace request. */
# define SYSCALL_NUM_SET(_regs, _nr)	\
		EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
# define SYSCALL_RET(_regs)	(_regs).ARM_r0
#elif defined(__aarch64__)
# define ARCH_REGS		struct user_pt_regs
# define SYSCALL_NUM(_regs)	(_regs).regs[8]
# ifndef NT_ARM_SYSTEM_CALL
#  define NT_ARM_SYSTEM_CALL 0x404
# endif
/* The syscall number is changed through the NT_ARM_SYSTEM_CALL regset. */
# define SYSCALL_NUM_SET(_regs, _nr)				\
	do {							\
		struct iovec __v;				\
		typeof(_nr) __nr = (_nr);			\
		__v.iov_base = &__nr;				\
		__v.iov_len = sizeof(__nr);			\
		EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,	\
				    NT_ARM_SYSTEM_CALL, &__v));	\
	} while (0)
# define SYSCALL_RET(_regs)	(_regs).regs[0]
#elif defined(__loongarch__)
# define ARCH_REGS		struct user_pt_regs
# define SYSCALL_NUM(_regs)	(_regs).regs[11]
# define SYSCALL_RET(_regs)	(_regs).regs[4]
#elif defined(__riscv) && __riscv_xlen == 64
# define ARCH_REGS		struct user_regs_struct
# define SYSCALL_NUM(_regs)	(_regs).a7
# define SYSCALL_RET(_regs)	(_regs).a0
#elif defined(__csky__)
# define ARCH_REGS		struct pt_regs
#  if defined(__CSKYABIV2__)
#   define SYSCALL_NUM(_regs)	(_regs).regs[3]
#  else
#   define SYSCALL_NUM(_regs)	(_regs).regs[9]
#  endif
# define SYSCALL_RET(_regs)	(_regs).a0
#elif defined(__hppa__)
# define ARCH_REGS		struct user_regs_struct
# define SYSCALL_NUM(_regs)	(_regs).gr[20]
# define SYSCALL_RET(_regs)	(_regs).gr[28]
#elif defined(__powerpc__)
# define ARCH_REGS		struct pt_regs
# define SYSCALL_NUM(_regs)	(_regs).gpr[0]
# define SYSCALL_RET(_regs)	(_regs).gpr[3]
# define SYSCALL_RET_SET(_regs, _val)				\
	do {							\
		typeof(_val) _result = (_val);			\
		if ((_regs.trap & 0xfff0) == 0x3000) {		\
			/*					\
			 * scv 0 system call uses -ve result	\
			 * for error, so no need to adjust.	\
			 */					\
			SYSCALL_RET(_regs) = _result;		\
		} else {					\
			/*					\
			 * A syscall error is signaled by the	\
			 * CR0 SO bit and the code is stored as	\
			 * a positive value.			\
			 */					\
			if (_result < 0) {			\
				SYSCALL_RET(_regs) = -_result;	\
				(_regs).ccr |= 0x10000000;	\
			} else {				\
				SYSCALL_RET(_regs) = _result;	\
				(_regs).ccr &= ~0x10000000;	\
			}					\
		}						\
	} while (0)
# define SYSCALL_RET_SET_ON_PTRACE_EXIT
#elif defined(__s390__)
# define ARCH_REGS		s390_regs
# define SYSCALL_NUM(_regs)	(_regs).gprs[2]
/* No SYSCALL_RET here: setting the return value is only logged. */
# define SYSCALL_RET_SET(_regs, _val)			\
		TH_LOG("Can't modify syscall return on this architecture")
#elif defined(__mips__)
# include <asm/unistd_nr_n32.h>
# include <asm/unistd_nr_n64.h>
# include <asm/unistd_nr_o32.h>
# define ARCH_REGS		struct pt_regs
/* When regs[2] holds __NR_O32_Linux the number is taken from regs[4]. */
# define SYSCALL_NUM(_regs)				\
	({						\
		typeof((_regs).regs[2]) _nr;		\
		if ((_regs).regs[2] == __NR_O32_Linux)	\
			_nr = (_regs).regs[4];		\
		else					\
			_nr = (_regs).regs[2];		\
		_nr;					\
	})
# define SYSCALL_NUM_SET(_regs, _nr)			\
	do {						\
		if ((_regs).regs[2] == __NR_O32_Linux)	\
			(_regs).regs[4] = _nr;		\
		else					\
			(_regs).regs[2] = _nr;		\
	} while (0)
/* No SYSCALL_RET here: setting the return value is only logged. */
# define SYSCALL_RET_SET(_regs, _val)			\
		TH_LOG("Can't modify syscall return on this architecture")
#elif defined(__xtensa__)
# define ARCH_REGS		struct user_pt_regs
# define SYSCALL_NUM(_regs)	(_regs).syscall
/*
 * On xtensa syscall return value is in the register
 * a2 of the current window which is not fixed.
 */
#define SYSCALL_RET(_regs)	(_regs).a[(_regs).windowbase * 4 + 2]
#elif defined(__sh__)
# define ARCH_REGS		struct pt_regs
# define SYSCALL_NUM(_regs)	(_regs).regs[3]
# define SYSCALL_RET(_regs)	(_regs).regs[0]
#elif defined(__mc68000__)
# define ARCH_REGS		struct user_regs_struct
# define SYSCALL_NUM(_regs)	(_regs).orig_d0
# define SYSCALL_RET(_regs)	(_regs).d0
#else
# error "Do not know how to find your architecture's registers and syscalls"
#endif
1863 
/*
 * Most architectures can change the syscall by just updating the
 * associated register. This is the default if not defined above.
 * Note that this only edits the local register copy.
 */
#ifndef SYSCALL_NUM_SET
# define SYSCALL_NUM_SET(_regs, _nr)		\
	do {					\
		SYSCALL_NUM(_regs) = (_nr);	\
	} while (0)
#endif
/*
 * Most architectures can change the syscall return value by just
 * writing to the SYSCALL_RET register. This is the default if not
 * defined above. If an architecture cannot set the return value
 * (for example when the syscall and return value register is
 * shared), report it with TH_LOG() in an arch-specific definition
 * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
 */
#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
#endif
/* Generic setter: a plain store into the local register copy. */
#ifndef SYSCALL_RET_SET
# define SYSCALL_RET_SET(_regs, _val)		\
	do {					\
		SYSCALL_RET(_regs) = (_val);	\
	} while (0)
#endif
1891 
/*
 * EXPECT_SYSCALL_RETURN(val, action): run @action and check its result
 * against @val. A negative @val means "expect -1 with errno == -val";
 * any other @val is compared with the result directly.
 *
 * When the syscall return can't be changed, stub out the tests for it:
 * @val is ignored and @action is simply expected to return -1.
 */
#ifndef SYSCALL_RET
# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
#else
/* @val is parenthesized so that expression arguments compare correctly. */
# define EXPECT_SYSCALL_RETURN(val, action)		\
	do {						\
		errno = 0;				\
		if ((val) < 0) {			\
			EXPECT_EQ(-1, action);		\
			EXPECT_EQ(-(val), errno);	\
		} else {				\
			EXPECT_EQ(val, action);		\
		}					\
	} while (0)
#endif
1907 
/*
 * Some architectures (e.g. powerpc) can only set syscall
 * return values on syscall exit during ptrace.
 *
 * ptrace_entry_set_syscall_nr is always true; ptrace_entry_set_syscall_ret
 * is false only when the architecture defined
 * SYSCALL_RET_SET_ON_PTRACE_EXIT.
 */
const bool ptrace_entry_set_syscall_nr = true;
const bool ptrace_entry_set_syscall_ret =
#ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
	true;
#else
	false;
#endif
1919 
/*
 * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
 * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
 *
 * Both macros use a variable named `tracee` from the calling scope and
 * evaluate to the ptrace() return value.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
# define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
# define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
#else
/* Everything else goes through the NT_PRSTATUS regset with an iovec. */
# define ARCH_GETREGS(_regs)	({					\
		struct iovec __v;					\
		__v.iov_base = &(_regs);				\
		__v.iov_len = sizeof(_regs);				\
		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);	\
	})
# define ARCH_SETREGS(_regs)	({					\
		struct iovec __v;					\
		__v.iov_base = &(_regs);				\
		__v.iov_len = sizeof(_regs);				\
		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);	\
	})
#endif
1941 
1942 /* Architecture-specific syscall fetching routine. */
get_syscall(struct __test_metadata * _metadata,pid_t tracee)1943 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1944 {
1945 	ARCH_REGS regs;
1946 
1947 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1948 		return -1;
1949 	}
1950 
1951 	return SYSCALL_NUM(regs);
1952 }
1953 
/*
 * Architecture-specific syscall changing routine.
 *
 * Reads the tracee's registers, applies *syscall as the new syscall number
 * and/or *ret as the new return value, and writes the registers back only
 * when the local copy actually changed. Either pointer may be NULL to leave
 * that part alone.
 */
void __change_syscall(struct __test_metadata *_metadata,
		    pid_t tracee, long *syscall, long *ret)
{
	ARCH_REGS orig, regs;

	/* Do not get/set registers if we have nothing to do. */
	if (!syscall && !ret)
		return;

	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
		return;
	}
	/* Keep a snapshot for the change check at the end. */
	orig = regs;

	if (syscall)
		SYSCALL_NUM_SET(regs, *syscall);

	if (ret)
		SYSCALL_RET_SET(regs, *ret);

	/* Flush any register changes made. */
	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
		EXPECT_EQ(0, ARCH_SETREGS(regs));
}
1979 
/* Change only syscall number. */
void change_syscall_nr(struct __test_metadata *_metadata,
		       pid_t tracee, long syscall)
{
	long new_nr = syscall;

	/* A NULL return pointer leaves the return value untouched. */
	__change_syscall(_metadata, tracee, &new_nr, NULL);
}
1986 
/* Change syscall return value (and set syscall number to -1). */
void change_syscall_ret(struct __test_metadata *_metadata,
			pid_t tracee, long ret)
{
	long skip_nr = -1;

	__change_syscall(_metadata, tracee, &skip_nr, &ret);
}
1995 
tracer_seccomp(struct __test_metadata * _metadata,pid_t tracee,int status,void * args)1996 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1997 		    int status, void *args)
1998 {
1999 	int ret;
2000 	unsigned long msg;
2001 
2002 	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
2003 		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
2004 		return;
2005 	}
2006 
2007 	/* Make sure we got the right message. */
2008 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2009 	EXPECT_EQ(0, ret);
2010 
2011 	/* Validate and take action on expected syscalls. */
2012 	switch (msg) {
2013 	case 0x1002:
2014 		/* change getpid to getppid. */
2015 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
2016 		change_syscall_nr(_metadata, tracee, __NR_getppid);
2017 		break;
2018 	case 0x1003:
2019 		/* skip gettid with valid return code. */
2020 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
2021 		change_syscall_ret(_metadata, tracee, 45000);
2022 		break;
2023 	case 0x1004:
2024 		/* skip openat with error. */
2025 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
2026 		change_syscall_ret(_metadata, tracee, -ESRCH);
2027 		break;
2028 	case 0x1005:
2029 		/* do nothing (allow getppid) */
2030 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2031 		break;
2032 	default:
2033 		EXPECT_EQ(0, msg) {
2034 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2035 			kill(tracee, SIGKILL);
2036 		}
2037 	}
2038 
2039 }
2040 
/* Per-test state for the TRACE_syscall tests. */
FIXTURE(TRACE_syscall) {
	struct sock_fprog prog;
	pid_t tracer, mytid, mypid, parent;
	/* Syscall number saved by tracer_ptrace() at syscall entry. */
	long syscall_nr;
};
2046 
tracer_ptrace(struct __test_metadata * _metadata,pid_t tracee,int status,void * args)2047 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2048 		   int status, void *args)
2049 {
2050 	int ret;
2051 	unsigned long msg;
2052 	static bool entry;
2053 	long syscall_nr_val, syscall_ret_val;
2054 	long *syscall_nr = NULL, *syscall_ret = NULL;
2055 	FIXTURE_DATA(TRACE_syscall) *self = args;
2056 
2057 	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2058 		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2059 		return;
2060 	}
2061 
2062 	/*
2063 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
2064 	 * is by counting.
2065 	 */
2066 	entry = !entry;
2067 
2068 	/* Make sure we got an appropriate message. */
2069 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2070 	EXPECT_EQ(0, ret);
2071 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2072 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2073 
2074 	/*
2075 	 * Some architectures only support setting return values during
2076 	 * syscall exit under ptrace, and on exit the syscall number may
2077 	 * no longer be available. Therefore, save the initial sycall
2078 	 * number here, so it can be examined during both entry and exit
2079 	 * phases.
2080 	 */
2081 	if (entry)
2082 		self->syscall_nr = get_syscall(_metadata, tracee);
2083 
2084 	/*
2085 	 * Depending on the architecture's syscall setting abilities, we
2086 	 * pick which things to set during this phase (entry or exit).
2087 	 */
2088 	if (entry == ptrace_entry_set_syscall_nr)
2089 		syscall_nr = &syscall_nr_val;
2090 	if (entry == ptrace_entry_set_syscall_ret)
2091 		syscall_ret = &syscall_ret_val;
2092 
2093 	/* Now handle the actual rewriting cases. */
2094 	switch (self->syscall_nr) {
2095 	case __NR_getpid:
2096 		syscall_nr_val = __NR_getppid;
2097 		/* Never change syscall return for this case. */
2098 		syscall_ret = NULL;
2099 		break;
2100 	case __NR_gettid:
2101 		syscall_nr_val = -1;
2102 		syscall_ret_val = 45000;
2103 		break;
2104 	case __NR_openat:
2105 		syscall_nr_val = -1;
2106 		syscall_ret_val = -ESRCH;
2107 		break;
2108 	default:
2109 		/* Unhandled, do nothing. */
2110 		return;
2111 	}
2112 
2113 	__change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2114 }
2115 
FIXTURE_VARIANT(TRACE_syscall)2116 FIXTURE_VARIANT(TRACE_syscall) {
2117 	/*
2118 	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
2119 	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
2120 	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
2121 	 * ptrace (true).
2122 	 */
2123 	bool use_ptrace;
2124 };
2125 
FIXTURE_VARIANT_ADD(TRACE_syscall,ptrace)2126 FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
2127 	.use_ptrace = true,
2128 };
2129 
FIXTURE_VARIANT_ADD(TRACE_syscall,seccomp)2130 FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
2131 	.use_ptrace = false,
2132 };
2133 
FIXTURE_SETUP(TRACE_syscall)2134 FIXTURE_SETUP(TRACE_syscall)
2135 {
2136 	struct sock_filter filter[] = {
2137 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2138 			offsetof(struct seccomp_data, nr)),
2139 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2140 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2141 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2142 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2143 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2144 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2145 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2146 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2147 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2148 	};
2149 	struct sock_fprog prog = {
2150 		.len = (unsigned short)ARRAY_SIZE(filter),
2151 		.filter = filter,
2152 	};
2153 	long ret;
2154 
2155 	/* Prepare some testable syscall results. */
2156 	self->mytid = syscall(__NR_gettid);
2157 	ASSERT_GT(self->mytid, 0);
2158 	ASSERT_NE(self->mytid, 1) {
2159 		TH_LOG("Running this test as init is not supported. :)");
2160 	}
2161 
2162 	self->mypid = getpid();
2163 	ASSERT_GT(self->mypid, 0);
2164 	ASSERT_EQ(self->mytid, self->mypid);
2165 
2166 	self->parent = getppid();
2167 	ASSERT_GT(self->parent, 0);
2168 	ASSERT_NE(self->parent, self->mypid);
2169 
2170 	/* Launch tracer. */
2171 	self->tracer = setup_trace_fixture(_metadata,
2172 					   variant->use_ptrace ? tracer_ptrace
2173 							       : tracer_seccomp,
2174 					   self, variant->use_ptrace);
2175 
2176 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2177 	ASSERT_EQ(0, ret);
2178 
2179 	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2180 	if (variant->use_ptrace)
2181 		return;
2182 
2183 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2184 	ASSERT_EQ(0, ret);
2185 }
2186 
FIXTURE_TEARDOWN(TRACE_syscall)2187 FIXTURE_TEARDOWN(TRACE_syscall)
2188 {
2189 	teardown_trace_fixture(_metadata, self->tracer);
2190 }
2191 
/* Negative syscall numbers must fail with -1 and errno set to ENOSYS. */
TEST(negative_ENOSYS)
{
#ifdef __arm__
	SKIP(return, "arm32 does not support calling syscall -1");
#endif
	/*
	 * Userspace asking for syscall "-1" should look exactly like an
	 * "internal" skip.
	 */
	errno = 0;
	EXPECT_EQ(-1, syscall(-1));
	EXPECT_EQ(errno, ENOSYS);

	/* The same holds for "still not valid but not -1". */
	errno = 0;
	EXPECT_EQ(-1, syscall(-101));
	EXPECT_EQ(errno, ENOSYS);
}
2209 
TEST_F(TRACE_syscall,negative_ENOSYS)2210 TEST_F(TRACE_syscall, negative_ENOSYS)
2211 {
2212 	negative_ENOSYS(_metadata);
2213 }
2214 
TEST_F(TRACE_syscall,syscall_allowed)2215 TEST_F(TRACE_syscall, syscall_allowed)
2216 {
2217 	/* getppid works as expected (no changes). */
2218 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
2219 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
2220 }
2221 
TEST_F(TRACE_syscall,syscall_redirected)2222 TEST_F(TRACE_syscall, syscall_redirected)
2223 {
2224 	/* getpid has been redirected to getppid as expected. */
2225 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
2226 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2227 }
2228 
/* A skipped openat must surface the tracer-supplied error. */
TEST_F(TRACE_syscall, syscall_errno)
{
	/* The tracer skips openat and substitutes -ESRCH. */
	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2234 
/* A skipped gettid must surface the tracer-supplied return value. */
TEST_F(TRACE_syscall, syscall_faked)
{
	/* The tracer skips gettid and stores 45000 as its result. */
	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2240 
/* A KILL_THREAD filter result must win even while a tracer is attached. */
TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
{
	struct sock_filter kill_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog kill_prog = {
		.filter = kill_filter,
		.len = (unsigned short)ARRAY_SIZE(kill_filter),
	};
	long ret;

	/* Stack a "kill on mknodat" filter on top of the fixture's setup. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &kill_prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* This should immediately die with SIGSYS, regardless of tracer. */
	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
}
2263 
/* A syscall rewritten by the tracer must still hit an ERRNO filter. */
TEST_F(TRACE_syscall, skip_after)
{
	struct sock_filter errno_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog errno_prog = {
		.filter = errno_filter,
		.len = (unsigned short)ARRAY_SIZE(errno_filter),
	};
	long ret;

	/* Stack an additional "errno on getppid" filter. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &errno_prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
	errno = 0;
	EXPECT_EQ(-1, syscall(__NR_getpid));
	EXPECT_EQ(EPERM, errno);
}
2288 
/* A syscall rewritten by the tracer must still hit a KILL filter. */
TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
{
	struct sock_filter kill_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog kill_prog = {
		.filter = kill_filter,
		.len = (unsigned short)ARRAY_SIZE(kill_filter),
	};
	long ret;

	/* Stack an additional "death on getppid" filter. */
	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &kill_prog, 0, 0);
	ASSERT_EQ(0, ret);

	/* Tracer will redirect getpid to getppid, and we should die. */
	EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2311 
TEST(seccomp_syscall)2312 TEST(seccomp_syscall)
2313 {
2314 	struct sock_filter filter[] = {
2315 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2316 	};
2317 	struct sock_fprog prog = {
2318 		.len = (unsigned short)ARRAY_SIZE(filter),
2319 		.filter = filter,
2320 	};
2321 	long ret;
2322 
2323 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2324 	ASSERT_EQ(0, ret) {
2325 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2326 	}
2327 
2328 	/* Reject insane operation. */
2329 	ret = seccomp(-1, 0, &prog);
2330 	ASSERT_NE(ENOSYS, errno) {
2331 		TH_LOG("Kernel does not support seccomp syscall!");
2332 	}
2333 	EXPECT_EQ(EINVAL, errno) {
2334 		TH_LOG("Did not reject crazy op value!");
2335 	}
2336 
2337 	/* Reject strict with flags or pointer. */
2338 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2339 	EXPECT_EQ(EINVAL, errno) {
2340 		TH_LOG("Did not reject mode strict with flags!");
2341 	}
2342 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2343 	EXPECT_EQ(EINVAL, errno) {
2344 		TH_LOG("Did not reject mode strict with uargs!");
2345 	}
2346 
2347 	/* Reject insane args for filter. */
2348 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2349 	EXPECT_EQ(EINVAL, errno) {
2350 		TH_LOG("Did not reject crazy filter flags!");
2351 	}
2352 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2353 	EXPECT_EQ(EFAULT, errno) {
2354 		TH_LOG("Did not reject NULL filter!");
2355 	}
2356 
2357 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2358 	EXPECT_EQ(0, errno) {
2359 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2360 			strerror(errno));
2361 	}
2362 }
2363 
TEST(seccomp_syscall_mode_lock)2364 TEST(seccomp_syscall_mode_lock)
2365 {
2366 	struct sock_filter filter[] = {
2367 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2368 	};
2369 	struct sock_fprog prog = {
2370 		.len = (unsigned short)ARRAY_SIZE(filter),
2371 		.filter = filter,
2372 	};
2373 	long ret;
2374 
2375 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2376 	ASSERT_EQ(0, ret) {
2377 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2378 	}
2379 
2380 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2381 	ASSERT_NE(ENOSYS, errno) {
2382 		TH_LOG("Kernel does not support seccomp syscall!");
2383 	}
2384 	EXPECT_EQ(0, ret) {
2385 		TH_LOG("Could not install filter!");
2386 	}
2387 
2388 	/* Make sure neither entry point will switch to strict. */
2389 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2390 	EXPECT_EQ(EINVAL, errno) {
2391 		TH_LOG("Switched to mode strict!");
2392 	}
2393 
2394 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2395 	EXPECT_EQ(EINVAL, errno) {
2396 		TH_LOG("Switched to mode strict!");
2397 	}
2398 }
2399 
2400 /*
2401  * Test detection of known and unknown filter flags. Userspace needs to be able
2402  * to check if a filter flag is supported by the current kernel and a good way
2403  * of doing that is by attempting to enter filter mode, with the flag bit in
2404  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2405  * that the flag is valid and EINVAL indicates that the flag is invalid.
2406  */
TEST(detect_seccomp_filter_flags)2407 TEST(detect_seccomp_filter_flags)
2408 {
2409 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2410 				 SECCOMP_FILTER_FLAG_LOG,
2411 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2412 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2413 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2414 	unsigned int exclusive[] = {
2415 				SECCOMP_FILTER_FLAG_TSYNC,
2416 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2417 	unsigned int flag, all_flags, exclusive_mask;
2418 	int i;
2419 	long ret;
2420 
2421 	/* Test detection of individual known-good filter flags */
2422 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2423 		int bits = 0;
2424 
2425 		flag = flags[i];
2426 		/* Make sure the flag is a single bit! */
2427 		while (flag) {
2428 			if (flag & 0x1)
2429 				bits ++;
2430 			flag >>= 1;
2431 		}
2432 		ASSERT_EQ(1, bits);
2433 		flag = flags[i];
2434 
2435 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2436 		ASSERT_NE(ENOSYS, errno) {
2437 			TH_LOG("Kernel does not support seccomp syscall!");
2438 		}
2439 		EXPECT_EQ(-1, ret);
2440 		EXPECT_EQ(EFAULT, errno) {
2441 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2442 			       flag);
2443 		}
2444 
2445 		all_flags |= flag;
2446 	}
2447 
2448 	/*
2449 	 * Test detection of all known-good filter flags combined. But
2450 	 * for the exclusive flags we need to mask them out and try them
2451 	 * individually for the "all flags" testing.
2452 	 */
2453 	exclusive_mask = 0;
2454 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2455 		exclusive_mask |= exclusive[i];
2456 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2457 		flag = all_flags & ~exclusive_mask;
2458 		flag |= exclusive[i];
2459 
2460 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2461 		EXPECT_EQ(-1, ret);
2462 		EXPECT_EQ(EFAULT, errno) {
2463 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2464 			       flag);
2465 		}
2466 	}
2467 
2468 	/* Test detection of an unknown filter flags, without exclusives. */
2469 	flag = -1;
2470 	flag &= ~exclusive_mask;
2471 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2472 	EXPECT_EQ(-1, ret);
2473 	EXPECT_EQ(EINVAL, errno) {
2474 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2475 		       flag);
2476 	}
2477 
2478 	/*
2479 	 * Test detection of an unknown filter flag that may simply need to be
2480 	 * added to this test
2481 	 */
2482 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2483 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2484 	EXPECT_EQ(-1, ret);
2485 	EXPECT_EQ(EINVAL, errno) {
2486 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2487 		       flag);
2488 	}
2489 }
2490 
TEST(TSYNC_first)2491 TEST(TSYNC_first)
2492 {
2493 	struct sock_filter filter[] = {
2494 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2495 	};
2496 	struct sock_fprog prog = {
2497 		.len = (unsigned short)ARRAY_SIZE(filter),
2498 		.filter = filter,
2499 	};
2500 	long ret;
2501 
2502 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2503 	ASSERT_EQ(0, ret) {
2504 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2505 	}
2506 
2507 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2508 		      &prog);
2509 	ASSERT_NE(ENOSYS, errno) {
2510 		TH_LOG("Kernel does not support seccomp syscall!");
2511 	}
2512 	EXPECT_EQ(0, ret) {
2513 		TH_LOG("Could not install initial filter with TSYNC!");
2514 	}
2515 }
2516 
/* Number of sibling threads the TSYNC fixture manages. */
#define TSYNC_SIBLINGS 2

/* State shared between one sibling thread and the test that drives it. */
struct tsync_sibling {
	/* Thread handle filled in by pthread_create(). */
	pthread_t tid;
	/* gettid result, stored by the thread itself on startup. */
	pid_t system_tid;
	/* Posted by the thread once its startup work is done. */
	sem_t *started;
	/* Condition and mutex the thread waits on before testing policy. */
	pthread_cond_t *cond;
	pthread_mutex_t *mutex;
	/* Non-zero: the thread installs @prog on itself before waiting. */
	int diverge;
	/* How many condition wakeups the thread consumes before going on. */
	int num_waits;
	struct sock_fprog *prog;
	struct __test_metadata *metadata;
};
2529 
/*
 * To avoid joining joined threads (which is not allowed by Bionic),
 * make sure we both successfully join and clear the tid to skip a
 * later join attempt during fixture teardown. Any remaining threads
 * will be directly killed during teardown.
 *
 * @tid must be a modifiable lvalue; it is evaluated more than once and
 * is reset to 0 only when pthread_join() succeeds. On failure the error
 * code is logged and @tid is left untouched.
 */
#define PTHREAD_JOIN(tid, status)					\
	do {								\
		int _rc = pthread_join((tid), (status));		\
		if (_rc) {						\
			TH_LOG("pthread_join of tid %u failed: %d",	\
				(unsigned int)(tid), _rc);		\
		} else {						\
			(tid) = 0;					\
		}							\
	} while (0)
2546 
FIXTURE(TSYNC)2547 FIXTURE(TSYNC) {
2548 	struct sock_fprog root_prog, apply_prog;
2549 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2550 	sem_t started;
2551 	pthread_cond_t cond;
2552 	pthread_mutex_t mutex;
2553 	int sibling_count;
2554 };
2555 
FIXTURE_SETUP(TSYNC)2556 FIXTURE_SETUP(TSYNC)
2557 {
2558 	struct sock_filter root_filter[] = {
2559 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2560 	};
2561 	struct sock_filter apply_filter[] = {
2562 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2563 			offsetof(struct seccomp_data, nr)),
2564 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2565 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2566 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2567 	};
2568 
2569 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2570 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2571 	memset(&self->sibling, 0, sizeof(self->sibling));
2572 	self->root_prog.filter = malloc(sizeof(root_filter));
2573 	ASSERT_NE(NULL, self->root_prog.filter);
2574 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2575 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2576 
2577 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2578 	ASSERT_NE(NULL, self->apply_prog.filter);
2579 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2580 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2581 
2582 	self->sibling_count = 0;
2583 	pthread_mutex_init(&self->mutex, NULL);
2584 	pthread_cond_init(&self->cond, NULL);
2585 	sem_init(&self->started, 0, 0);
2586 	self->sibling[0].tid = 0;
2587 	self->sibling[0].cond = &self->cond;
2588 	self->sibling[0].started = &self->started;
2589 	self->sibling[0].mutex = &self->mutex;
2590 	self->sibling[0].diverge = 0;
2591 	self->sibling[0].num_waits = 1;
2592 	self->sibling[0].prog = &self->root_prog;
2593 	self->sibling[0].metadata = _metadata;
2594 	self->sibling[1].tid = 0;
2595 	self->sibling[1].cond = &self->cond;
2596 	self->sibling[1].started = &self->started;
2597 	self->sibling[1].mutex = &self->mutex;
2598 	self->sibling[1].diverge = 0;
2599 	self->sibling[1].prog = &self->root_prog;
2600 	self->sibling[1].num_waits = 1;
2601 	self->sibling[1].metadata = _metadata;
2602 }
2603 
FIXTURE_TEARDOWN(TSYNC)2604 FIXTURE_TEARDOWN(TSYNC)
2605 {
2606 	int sib = 0;
2607 
2608 	if (self->root_prog.filter)
2609 		free(self->root_prog.filter);
2610 	if (self->apply_prog.filter)
2611 		free(self->apply_prog.filter);
2612 
2613 	for ( ; sib < self->sibling_count; ++sib) {
2614 		struct tsync_sibling *s = &self->sibling[sib];
2615 
2616 		if (!s->tid)
2617 			continue;
2618 		/*
2619 		 * If a thread is still running, it may be stuck, so hit
2620 		 * it over the head really hard.
2621 		 */
2622 		pthread_kill(s->tid, 9);
2623 	}
2624 	pthread_mutex_destroy(&self->mutex);
2625 	pthread_cond_destroy(&self->cond);
2626 	sem_destroy(&self->started);
2627 }
2628 
tsync_sibling(void * data)2629 void *tsync_sibling(void *data)
2630 {
2631 	long ret = 0;
2632 	struct tsync_sibling *me = data;
2633 
2634 	me->system_tid = syscall(__NR_gettid);
2635 
2636 	pthread_mutex_lock(me->mutex);
2637 	if (me->diverge) {
2638 		/* Just re-apply the root prog to fork the tree */
2639 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2640 				me->prog, 0, 0);
2641 	}
2642 	sem_post(me->started);
2643 	/* Return outside of started so parent notices failures. */
2644 	if (ret) {
2645 		pthread_mutex_unlock(me->mutex);
2646 		return (void *)SIBLING_EXIT_FAILURE;
2647 	}
2648 	do {
2649 		pthread_cond_wait(me->cond, me->mutex);
2650 		me->num_waits = me->num_waits - 1;
2651 	} while (me->num_waits);
2652 	pthread_mutex_unlock(me->mutex);
2653 
2654 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2655 	if (!ret)
2656 		return (void *)SIBLING_EXIT_NEWPRIVS;
2657 	read(-1, NULL, 0);
2658 	return (void *)SIBLING_EXIT_UNKILLED;
2659 }
2660 
tsync_start_sibling(struct tsync_sibling * sibling)2661 void tsync_start_sibling(struct tsync_sibling *sibling)
2662 {
2663 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2664 }
2665 
TEST_F(TSYNC,siblings_fail_prctl)2666 TEST_F(TSYNC, siblings_fail_prctl)
2667 {
2668 	long ret;
2669 	void *status;
2670 	struct sock_filter filter[] = {
2671 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2672 			offsetof(struct seccomp_data, nr)),
2673 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2674 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2675 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2676 	};
2677 	struct sock_fprog prog = {
2678 		.len = (unsigned short)ARRAY_SIZE(filter),
2679 		.filter = filter,
2680 	};
2681 
2682 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2683 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2684 	}
2685 
2686 	/* Check prctl failure detection by requesting sib 0 diverge. */
2687 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2688 	ASSERT_NE(ENOSYS, errno) {
2689 		TH_LOG("Kernel does not support seccomp syscall!");
2690 	}
2691 	ASSERT_EQ(0, ret) {
2692 		TH_LOG("setting filter failed");
2693 	}
2694 
2695 	self->sibling[0].diverge = 1;
2696 	tsync_start_sibling(&self->sibling[0]);
2697 	tsync_start_sibling(&self->sibling[1]);
2698 
2699 	while (self->sibling_count < TSYNC_SIBLINGS) {
2700 		sem_wait(&self->started);
2701 		self->sibling_count++;
2702 	}
2703 
2704 	/* Signal the threads to clean up*/
2705 	pthread_mutex_lock(&self->mutex);
2706 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2707 		TH_LOG("cond broadcast non-zero");
2708 	}
2709 	pthread_mutex_unlock(&self->mutex);
2710 
2711 	/* Ensure diverging sibling failed to call prctl. */
2712 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2713 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2714 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2715 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2716 }
2717 
TEST_F(TSYNC,two_siblings_with_ancestor)2718 TEST_F(TSYNC, two_siblings_with_ancestor)
2719 {
2720 	long ret;
2721 	void *status;
2722 
2723 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2724 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2725 	}
2726 
2727 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2728 	ASSERT_NE(ENOSYS, errno) {
2729 		TH_LOG("Kernel does not support seccomp syscall!");
2730 	}
2731 	ASSERT_EQ(0, ret) {
2732 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2733 	}
2734 	tsync_start_sibling(&self->sibling[0]);
2735 	tsync_start_sibling(&self->sibling[1]);
2736 
2737 	while (self->sibling_count < TSYNC_SIBLINGS) {
2738 		sem_wait(&self->started);
2739 		self->sibling_count++;
2740 	}
2741 
2742 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2743 		      &self->apply_prog);
2744 	ASSERT_EQ(0, ret) {
2745 		TH_LOG("Could install filter on all threads!");
2746 	}
2747 	/* Tell the siblings to test the policy */
2748 	pthread_mutex_lock(&self->mutex);
2749 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2750 		TH_LOG("cond broadcast non-zero");
2751 	}
2752 	pthread_mutex_unlock(&self->mutex);
2753 	/* Ensure they are both killed and don't exit cleanly. */
2754 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2755 	EXPECT_EQ(0x0, (long)status);
2756 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2757 	EXPECT_EQ(0x0, (long)status);
2758 }
2759 
TEST_F(TSYNC,two_sibling_want_nnp)2760 TEST_F(TSYNC, two_sibling_want_nnp)
2761 {
2762 	void *status;
2763 
2764 	/* start siblings before any prctl() operations */
2765 	tsync_start_sibling(&self->sibling[0]);
2766 	tsync_start_sibling(&self->sibling[1]);
2767 	while (self->sibling_count < TSYNC_SIBLINGS) {
2768 		sem_wait(&self->started);
2769 		self->sibling_count++;
2770 	}
2771 
2772 	/* Tell the siblings to test no policy */
2773 	pthread_mutex_lock(&self->mutex);
2774 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2775 		TH_LOG("cond broadcast non-zero");
2776 	}
2777 	pthread_mutex_unlock(&self->mutex);
2778 
2779 	/* Ensure they are both upset about lacking nnp. */
2780 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2781 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2782 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2783 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2784 }
2785 
TEST_F(TSYNC,two_siblings_with_no_filter)2786 TEST_F(TSYNC, two_siblings_with_no_filter)
2787 {
2788 	long ret;
2789 	void *status;
2790 
2791 	/* start siblings before any prctl() operations */
2792 	tsync_start_sibling(&self->sibling[0]);
2793 	tsync_start_sibling(&self->sibling[1]);
2794 	while (self->sibling_count < TSYNC_SIBLINGS) {
2795 		sem_wait(&self->started);
2796 		self->sibling_count++;
2797 	}
2798 
2799 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2800 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2801 	}
2802 
2803 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2804 		      &self->apply_prog);
2805 	ASSERT_NE(ENOSYS, errno) {
2806 		TH_LOG("Kernel does not support seccomp syscall!");
2807 	}
2808 	ASSERT_EQ(0, ret) {
2809 		TH_LOG("Could install filter on all threads!");
2810 	}
2811 
2812 	/* Tell the siblings to test the policy */
2813 	pthread_mutex_lock(&self->mutex);
2814 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2815 		TH_LOG("cond broadcast non-zero");
2816 	}
2817 	pthread_mutex_unlock(&self->mutex);
2818 
2819 	/* Ensure they are both killed and don't exit cleanly. */
2820 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2821 	EXPECT_EQ(0x0, (long)status);
2822 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2823 	EXPECT_EQ(0x0, (long)status);
2824 }
2825 
TEST_F(TSYNC,two_siblings_with_one_divergence)2826 TEST_F(TSYNC, two_siblings_with_one_divergence)
2827 {
2828 	long ret;
2829 	void *status;
2830 
2831 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2832 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2833 	}
2834 
2835 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2836 	ASSERT_NE(ENOSYS, errno) {
2837 		TH_LOG("Kernel does not support seccomp syscall!");
2838 	}
2839 	ASSERT_EQ(0, ret) {
2840 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2841 	}
2842 	self->sibling[0].diverge = 1;
2843 	tsync_start_sibling(&self->sibling[0]);
2844 	tsync_start_sibling(&self->sibling[1]);
2845 
2846 	while (self->sibling_count < TSYNC_SIBLINGS) {
2847 		sem_wait(&self->started);
2848 		self->sibling_count++;
2849 	}
2850 
2851 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2852 		      &self->apply_prog);
2853 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2854 		TH_LOG("Did not fail on diverged sibling.");
2855 	}
2856 
2857 	/* Wake the threads */
2858 	pthread_mutex_lock(&self->mutex);
2859 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2860 		TH_LOG("cond broadcast non-zero");
2861 	}
2862 	pthread_mutex_unlock(&self->mutex);
2863 
2864 	/* Ensure they are both unkilled. */
2865 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2866 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2867 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2868 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2869 }
2870 
TEST_F(TSYNC,two_siblings_with_one_divergence_no_tid_in_err)2871 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2872 {
2873 	long ret, flags;
2874 	void *status;
2875 
2876 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2877 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2878 	}
2879 
2880 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2881 	ASSERT_NE(ENOSYS, errno) {
2882 		TH_LOG("Kernel does not support seccomp syscall!");
2883 	}
2884 	ASSERT_EQ(0, ret) {
2885 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2886 	}
2887 	self->sibling[0].diverge = 1;
2888 	tsync_start_sibling(&self->sibling[0]);
2889 	tsync_start_sibling(&self->sibling[1]);
2890 
2891 	while (self->sibling_count < TSYNC_SIBLINGS) {
2892 		sem_wait(&self->started);
2893 		self->sibling_count++;
2894 	}
2895 
2896 	flags = SECCOMP_FILTER_FLAG_TSYNC | \
2897 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2898 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2899 	ASSERT_EQ(ESRCH, errno) {
2900 		TH_LOG("Did not return ESRCH for diverged sibling.");
2901 	}
2902 	ASSERT_EQ(-1, ret) {
2903 		TH_LOG("Did not fail on diverged sibling.");
2904 	}
2905 
2906 	/* Wake the threads */
2907 	pthread_mutex_lock(&self->mutex);
2908 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2909 		TH_LOG("cond broadcast non-zero");
2910 	}
2911 	pthread_mutex_unlock(&self->mutex);
2912 
2913 	/* Ensure they are both unkilled. */
2914 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2915 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2916 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2917 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2918 }
2919 
TEST_F(TSYNC,two_siblings_not_under_filter)2920 TEST_F(TSYNC, two_siblings_not_under_filter)
2921 {
2922 	long ret, sib;
2923 	void *status;
2924 	struct timespec delay = { .tv_nsec = 100000000 };
2925 
2926 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2927 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2928 	}
2929 
2930 	/*
2931 	 * Sibling 0 will have its own seccomp policy
2932 	 * and Sibling 1 will not be under seccomp at
2933 	 * all. Sibling 1 will enter seccomp and 0
2934 	 * will cause failure.
2935 	 */
2936 	self->sibling[0].diverge = 1;
2937 	tsync_start_sibling(&self->sibling[0]);
2938 	tsync_start_sibling(&self->sibling[1]);
2939 
2940 	while (self->sibling_count < TSYNC_SIBLINGS) {
2941 		sem_wait(&self->started);
2942 		self->sibling_count++;
2943 	}
2944 
2945 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2946 	ASSERT_NE(ENOSYS, errno) {
2947 		TH_LOG("Kernel does not support seccomp syscall!");
2948 	}
2949 	ASSERT_EQ(0, ret) {
2950 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2951 	}
2952 
2953 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2954 		      &self->apply_prog);
2955 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2956 		TH_LOG("Did not fail on diverged sibling.");
2957 	}
2958 	sib = 1;
2959 	if (ret == self->sibling[0].system_tid)
2960 		sib = 0;
2961 
2962 	pthread_mutex_lock(&self->mutex);
2963 
2964 	/* Increment the other siblings num_waits so we can clean up
2965 	 * the one we just saw.
2966 	 */
2967 	self->sibling[!sib].num_waits += 1;
2968 
2969 	/* Signal the thread to clean up*/
2970 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2971 		TH_LOG("cond broadcast non-zero");
2972 	}
2973 	pthread_mutex_unlock(&self->mutex);
2974 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2975 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2976 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2977 	while (!kill(self->sibling[sib].system_tid, 0))
2978 		nanosleep(&delay, NULL);
2979 	/* Switch to the remaining sibling */
2980 	sib = !sib;
2981 
2982 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2983 		      &self->apply_prog);
2984 	ASSERT_EQ(0, ret) {
2985 		TH_LOG("Expected the remaining sibling to sync");
2986 	};
2987 
2988 	pthread_mutex_lock(&self->mutex);
2989 
2990 	/* If remaining sibling didn't have a chance to wake up during
2991 	 * the first broadcast, manually reduce the num_waits now.
2992 	 */
2993 	if (self->sibling[sib].num_waits > 1)
2994 		self->sibling[sib].num_waits = 1;
2995 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2996 		TH_LOG("cond broadcast non-zero");
2997 	}
2998 	pthread_mutex_unlock(&self->mutex);
2999 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
3000 	EXPECT_EQ(0, (long)status);
3001 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
3002 	while (!kill(self->sibling[sib].system_tid, 0))
3003 		nanosleep(&delay, NULL);
3004 
3005 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
3006 		      &self->apply_prog);
3007 	ASSERT_EQ(0, ret);  /* just us chickens */
3008 }
3009 
3010 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
TEST(syscall_restart)
{
	/*
	 * Overview: the forked child makes itself a tracee, installs the
	 * filter below and sleeps in nanosleep(). The parent interrupts the
	 * sleep with SIGSTOP, resumes it with SIGCONT, and uses the two
	 * PTRACE_EVENT_SECCOMP stops (event messages 0x100 and 0x200) to
	 * check which syscall number the filter saw before and after the
	 * restart.
	 */
	long ret;
	unsigned long msg;
	pid_t child_pid;
	int pipefd[2];
	int status;
	siginfo_t info = { };
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			 offsetof(struct seccomp_data, nr)),

		/*
		 * The jump offsets below are relative to the following
		 * instruction: sigreturn/read/exit/rt_sigreturn land on
		 * RET_ALLOW, nanosleep/clock_nanosleep on TRACE|0x100 and
		 * restart_syscall on TRACE|0x200.
		 */
#ifdef __NR_sigreturn
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
#endif
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),

		/* Allow __NR_write for easy logging. */
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		/* Any syscall not matched above is killed. */
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		/* The nanosleep jump target. */
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
		/* The restart_syscall jump target. */
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};
#if defined(__arm__)
	struct utsname utsbuf;
#endif

	/* The pipe lets the parent pace the child: '.' to start, '!' to end. */
	ASSERT_EQ(0, pipe(pipefd));

	child_pid = fork();
	ASSERT_LE(0, child_pid);
	if (child_pid == 0) {
		/* Child uses EXPECT not ASSERT to deliver status correctly. */
		char buf = ' ';
		struct timespec timeout = { };

		/* Attach parent as tracer and stop. */
		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
		EXPECT_EQ(0, raise(SIGSTOP));

		EXPECT_EQ(0, close(pipefd[1]));

		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
		}

		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
		EXPECT_EQ(0, ret) {
			TH_LOG("Failed to install filter!");
		}

		/* Block until the parent has set its ptrace options. */
		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
			TH_LOG("Failed to read() sync from parent");
		}
		EXPECT_EQ('.', buf) {
			TH_LOG("Failed to get sync data from read()");
		}

		/* Start nanosleep to be interrupted. */
		timeout.tv_sec = 1;
		errno = 0;
		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
			TH_LOG("Call to nanosleep() failed (errno %d: %s)",
				errno, strerror(errno));
		}

		/* Read final sync from parent. */
		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
			TH_LOG("Failed final read() from parent");
		}
		EXPECT_EQ('!', buf) {
			TH_LOG("Failed to get final data from read()");
		}

		/* Directly report the status of our test harness results. */
		syscall(__NR_exit, _metadata->exit_code);
	}
	/* Parent only writes to the pipe, so drop the read end. */
	EXPECT_EQ(0, close(pipefd[0]));

	/* Attach to child, setup options, and release. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
			    PTRACE_O_TRACESECCOMP));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(1, write(pipefd[1], ".", 1));

	/* Wait for nanosleep() to start. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
	/* 0x100 is the data value of the filter's nanosleep target. */
	ASSERT_EQ(0x100, msg);
	ret = get_syscall(_metadata, child_pid);
	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);

	/* Might as well check siginfo for sanity while we're here. */
	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
	ASSERT_EQ(SIGTRAP, info.si_signo);
	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
	EXPECT_EQ(0, info.si_errno);
	EXPECT_EQ(getuid(), info.si_uid);
	/* Verify signal delivery came from child (seccomp-triggered). */
	EXPECT_EQ(child_pid, info.si_pid);

	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
	/*
	 * There is no siginfo on SIGSTOP any more, so we can't verify
	 * signal delivery came from parent now (getpid() == info.si_pid).
	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
	 */
	EXPECT_EQ(SIGSTOP, info.si_signo);

	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
	ASSERT_EQ(0, kill(child_pid, SIGCONT));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));

	/* Wait for restart_syscall() to start. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	ASSERT_EQ(true, WIFSTOPPED(status));
	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));

	/* 0x200 is the data value of the filter's restart_syscall target. */
	ASSERT_EQ(0x200, msg);
	ret = get_syscall(_metadata, child_pid);
#if defined(__arm__)
	/*
	 * FIXME:
	 * - native ARM registers do NOT expose true syscall.
	 * - compat ARM registers on ARM64 DO expose true syscall.
	 */
	ASSERT_EQ(0, uname(&utsbuf));
	if (strncmp(utsbuf.machine, "arm", 3) == 0) {
		EXPECT_EQ(__NR_nanosleep, ret);
	} else
#endif
	{
		EXPECT_EQ(__NR_restart_syscall, ret);
	}

	/* Write again to end test. */
	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
	ASSERT_EQ(1, write(pipefd[1], "!", 1));
	EXPECT_EQ(0, close(pipefd[1]));

	/* A child killed by a signal or exiting non-zero fails the test. */
	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	if (WIFSIGNALED(status) || WEXITSTATUS(status))
		_metadata->exit_code = KSFT_FAIL;
}
3185 
/*
 * Check SECCOMP_FILTER_FLAG_LOG handling: the flag is rejected with EINVAL
 * in strict mode, accepted when installing filters, and a kill filter
 * installed with it still kills on getpid().
 * NOTE(review): TEST_SIGNAL(..., SIGSYS) presumably tells the harness the
 * test is expected to die from SIGSYS - confirm in kselftest_harness.h.
 */
TEST_SIGNAL(filter_flag_log, SIGSYS)
{
	struct sock_filter allow_filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	/* Kills on getpid(), allows every other syscall. */
	struct sock_filter kill_filter[] = {
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog allow_prog = {
		.len = (unsigned short)ARRAY_SIZE(allow_filter),
		.filter = allow_filter,
	};
	struct sock_fprog kill_prog = {
		.len = (unsigned short)ARRAY_SIZE(kill_filter),
		.filter = kill_filter,
	};
	long ret;
	/* Captured before any filter is installed, compared again below. */
	pid_t parent = getppid();

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret);

	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(ENOSYS, errno) {
		TH_LOG("Kernel does not support seccomp syscall!");
	}
	EXPECT_NE(0, ret) {
		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
	}
	EXPECT_EQ(EINVAL, errno) {
		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
	}

	/* Verify that a simple, permissive filter can be added with no flags */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
	EXPECT_EQ(0, ret);

	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &allow_prog);
	ASSERT_NE(EINVAL, errno) {
		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
	}
	EXPECT_EQ(0, ret);

	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
		      &kill_prog);
	EXPECT_EQ(0, ret);

	/* getppid() is not matched by the kill filter and must still work. */
	EXPECT_EQ(parent, syscall(__NR_getppid));
	/* getpid() should never return. */
	EXPECT_EQ(0, syscall(__NR_getpid));
}
3246 
TEST(get_action_avail)3247 TEST(get_action_avail)
3248 {
3249 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3250 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3251 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3252 	__u32 unknown_action = 0x10000000U;
3253 	int i;
3254 	long ret;
3255 
3256 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3257 	ASSERT_NE(ENOSYS, errno) {
3258 		TH_LOG("Kernel does not support seccomp syscall!");
3259 	}
3260 	ASSERT_NE(EINVAL, errno) {
3261 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3262 	}
3263 	EXPECT_EQ(ret, 0);
3264 
3265 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3266 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3267 		EXPECT_EQ(ret, 0) {
3268 			TH_LOG("Expected action (0x%X) not available!",
3269 			       actions[i]);
3270 		}
3271 	}
3272 
3273 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3274 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3275 	EXPECT_EQ(ret, -1);
3276 	EXPECT_EQ(errno, EOPNOTSUPP);
3277 }
3278 
TEST(get_metadata)3279 TEST(get_metadata)
3280 {
3281 	pid_t pid;
3282 	int pipefd[2];
3283 	char buf;
3284 	struct seccomp_metadata md;
3285 	long ret;
3286 
3287 	/* Only real root can get metadata. */
3288 	if (geteuid()) {
3289 		SKIP(return, "get_metadata requires real root");
3290 		return;
3291 	}
3292 
3293 	ASSERT_EQ(0, pipe(pipefd));
3294 
3295 	pid = fork();
3296 	ASSERT_GE(pid, 0);
3297 	if (pid == 0) {
3298 		struct sock_filter filter[] = {
3299 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3300 		};
3301 		struct sock_fprog prog = {
3302 			.len = (unsigned short)ARRAY_SIZE(filter),
3303 			.filter = filter,
3304 		};
3305 
3306 		/* one with log, one without */
3307 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3308 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3309 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3310 
3311 		EXPECT_EQ(0, close(pipefd[0]));
3312 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3313 		ASSERT_EQ(0, close(pipefd[1]));
3314 
3315 		while (1)
3316 			sleep(100);
3317 	}
3318 
3319 	ASSERT_EQ(0, close(pipefd[1]));
3320 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3321 
3322 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3323 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3324 
3325 	/* Past here must not use ASSERT or child process is never killed. */
3326 
3327 	md.filter_off = 0;
3328 	errno = 0;
3329 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3330 	EXPECT_EQ(sizeof(md), ret) {
3331 		if (errno == EINVAL)
3332 			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3333 	}
3334 
3335 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3336 	EXPECT_EQ(md.filter_off, 0);
3337 
3338 	md.filter_off = 1;
3339 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3340 	EXPECT_EQ(sizeof(md), ret);
3341 	EXPECT_EQ(md.flags, 0);
3342 	EXPECT_EQ(md.filter_off, 1);
3343 
3344 skip:
3345 	ASSERT_EQ(0, kill(pid, SIGKILL));
3346 }
3347 
user_notif_syscall(int nr,unsigned int flags)3348 static int user_notif_syscall(int nr, unsigned int flags)
3349 {
3350 	struct sock_filter filter[] = {
3351 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3352 			offsetof(struct seccomp_data, nr)),
3353 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3354 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3355 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3356 	};
3357 
3358 	struct sock_fprog prog = {
3359 		.len = (unsigned short)ARRAY_SIZE(filter),
3360 		.filter = filter,
3361 	};
3362 
3363 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3364 }
3365 
/*
 * Value the user-notification tests send back as the intercepted syscall's
 * return value; the children exit 0 only if the syscall returned it.
 */
#define USER_NOTIF_MAGIC INT_MAX
/*
 * Basic user-notification round trip: getppid() fails with ENOSYS when the
 * filter has no listener, a second listener in the chain is refused with
 * EBUSY, malformed RECV/SEND requests get EINVAL, and a well-formed
 * response makes the child's getppid() return USER_NOTIF_MAGIC.
 */
TEST(user_notification_basic)
{
	pid_t pid;
	long ret;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	struct pollfd pollfd;

	struct sock_filter filter[] = {
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	pid = fork();
	ASSERT_GE(pid, 0);

	/* Check that we get -ENOSYS with no listener attached */
	if (pid == 0) {
		if (user_notif_syscall(__NR_getppid, 0) < 0)
			exit(1);
		ret = syscall(__NR_getppid);
		/* Exit 0 only if the call failed and errno is ENOSYS. */
		exit(ret >= 0 || errno != ENOSYS);
	}

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* Add some no-op filters for grins. */
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);

	/* Check that the basic notification machinery works */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/* Installing a second listener in the chain should EBUSY */
	EXPECT_EQ(user_notif_syscall(__NR_getppid,
				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
		  -1);
	EXPECT_EQ(errno, EBUSY);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		/* Exit 0 only if getppid() returned the value we reply with. */
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	/* Before the notification is received only POLLIN is expected. */
	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLIN);

	/* Test that we can't pass garbage to the kernel. */
	memset(&req, 0, sizeof(req));
	req.pid = -1;
	errno = 0;
	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
	EXPECT_EQ(-1, ret);
	EXPECT_EQ(EINVAL, errno);

	/* If the garbage request was rejected, receive with a zeroed pid. */
	if (ret) {
		req.pid = 0;
		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	}

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	/* After the receive, only POLLOUT is expected. */
	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLOUT);

	EXPECT_EQ(req.data.nr,  __NR_getppid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	/* check that we make sure flags == 0 */
	resp.flags = 1;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.flags = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3472 
TEST(user_notification_with_tsync)3473 TEST(user_notification_with_tsync)
3474 {
3475 	int ret;
3476 	unsigned int flags;
3477 
3478 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3479 	ASSERT_EQ(0, ret) {
3480 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3481 	}
3482 
3483 	/* these were exclusive */
3484 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3485 		SECCOMP_FILTER_FLAG_TSYNC;
3486 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3487 	ASSERT_EQ(EINVAL, errno);
3488 
3489 	/* but now they're not */
3490 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3491 	ret = user_notif_syscall(__NR_getppid, flags);
3492 	close(ret);
3493 	ASSERT_LE(0, ret);
3494 }
3495 
/*
 * Kill a task while its notification is outstanding: the notification id
 * must be valid before the kill, invalid afterwards, and a late response
 * must fail with ENOENT.
 */
TEST(user_notification_kill_in_middle)
{
	pid_t pid;
	long ret;
	int listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/*
	 * Check that nothing bad happens when we kill the task in the middle
	 * of a syscall.
	 */
	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/* While the child is alive, its notification id must validate. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);

	EXPECT_EQ(kill(pid, SIGKILL), 0);
	EXPECT_EQ(waitpid(pid, NULL, 0), pid);

	/* Once the child has been reaped the same id must be rejected. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);

	resp.id = req.id;
	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
	EXPECT_EQ(ret, -1);
	EXPECT_EQ(errno, ENOENT);
}
3538 
/* File descriptor signal_handler() writes to; -1 until a test sets it. */
static int handled = -1;

/*
 * Signal handler that writes a single 'c' byte to @handled and reports a
 * failed or short write with perror().
 */
static void signal_handler(int signal)
{
	ssize_t written = write(handled, "c", 1);

	if (written == 1)
		return;

	perror("write from signal");
}
3546 
/*
 * Signal a task that is blocked in a user notification: the first
 * notification must be gone afterwards (SEND fails with ENOENT), a new one
 * must arrive, and an error of -512 (-ERESTARTSYS) sent in the response
 * must show up in the child as errno 512.
 */
TEST(user_notification_signal)
{
	pid_t pid;
	long ret;
	int status, listener, sk_pair[2];
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	char c;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* The child's signal handler reports delivery over this socket pair. */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(__NR_gettid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		handled = sk_pair[1];
		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
			perror("signal");
			exit(1);
		}
		/*
		 * ERESTARTSYS behavior is a bit hard to test, because we need
		 * to rely on a signal that has not yet been handled. Let's at
		 * least check that the error code gets propagated through, and
		 * hope that it doesn't break when there is actually a signal :)
		 */
		ret = syscall(__NR_gettid);
		/* 512 is ERESTARTSYS, the error the parent sends last. */
		exit(!(ret == -1 && errno == 512));
	}

	close(sk_pair[1]);

	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/*
	 * Make sure the signal really is delivered, which means we're not
	 * stuck in the user notification code any more and the notification
	 * should be dead.
	 */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);

	resp.id = req.id;
	resp.error = -EPERM;
	resp.val = 0;

	/* Answering the interrupted notification must fail. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, ENOENT);

	/* A fresh notification is expected after the signal was handled. */
	memset(&req, 0, sizeof(req));
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	resp.id = req.id;
	resp.error = -512; /* -ERESTARTSYS */
	resp.val = 0;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3621 
TEST(user_notification_closed_listener)3622 TEST(user_notification_closed_listener)
3623 {
3624 	pid_t pid;
3625 	long ret;
3626 	int status, listener;
3627 
3628 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3629 	ASSERT_EQ(0, ret) {
3630 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3631 	}
3632 
3633 	listener = user_notif_syscall(__NR_getppid,
3634 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3635 	ASSERT_GE(listener, 0);
3636 
3637 	/*
3638 	 * Check that we get an ENOSYS when the listener is closed.
3639 	 */
3640 	pid = fork();
3641 	ASSERT_GE(pid, 0);
3642 	if (pid == 0) {
3643 		close(listener);
3644 		ret = syscall(__NR_getppid);
3645 		exit(ret != -1 && errno != ENOSYS);
3646 	}
3647 
3648 	close(listener);
3649 
3650 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3651 	EXPECT_EQ(true, WIFEXITED(status));
3652 	EXPECT_EQ(0, WEXITSTATUS(status));
3653 }
3654 
3655 /*
3656  * Check that a pid in a child namespace still shows up as valid in ours.
3657  */
TEST(user_notification_child_pid_ns)
{
	pid_t pid;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	/* New user + pid namespaces; the child forked below is what gets notified on. */
	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
		if (errno == EINVAL)
			SKIP(return, "kernel missing CLONE_NEWUSER support");
	};

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	/* Child exits 0 only if getppid() returned the value we reply with. */
	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

	/* The notification must carry the pid that fork() gave us. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(req.pid, pid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
	close(listener);
}
3694 
3695 /*
3696  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3697  * invalid.
3698  */
TEST(user_notification_sibling_pid_ns)
{
	pid_t pid, pid2;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	/*
	 * First child: unshares its own pid namespace and forks the task
	 * that actually issues the intercepted getppid().
	 */
	if (pid == 0) {
		ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
			if (errno == EPERM)
				SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
			else if (errno == EINVAL)
				SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
		}

		pid2 = fork();
		ASSERT_GE(pid2, 0);

		if (pid2 == 0)
			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

		/* Pass the grandchild's exit status up to the test process. */
		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
		EXPECT_EQ(true, WIFEXITED(status));
		EXPECT_EQ(0, WEXITSTATUS(status));
		exit(WEXITSTATUS(status));
	}

	/* Create the sibling ns, and sibling in it. */
	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
		if (errno == EPERM)
			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
		else if (errno == EINVAL)
			SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
	}
	/*
	 * NOTE(review): this relies on errno still being 0 at this point; a
	 * successful unshare() does not reset errno - confirm nothing
	 * earlier in the test process can leave it set.
	 */
	ASSERT_EQ(errno, 0);

	pid2 = fork();
	ASSERT_GE(pid2, 0);

	/* Second child: services the notification from the sibling side. */
	if (pid2 == 0) {
		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
		/*
		 * The pid should be 0, i.e. the task is in some namespace that
		 * we can't "see".
		 */
		EXPECT_EQ(req.pid, 0);

		resp.id = req.id;
		resp.error = 0;
		resp.val = USER_NOTIF_MAGIC;

		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
		exit(0);
	}

	/* The test process itself does not use the listener. */
	close(listener);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3775 
/*
 * A SECCOMP_IOCTL_NOTIF_RECV with a NULL buffer must fail with EFAULT, and
 * the same notification must still be receivable and answerable afterwards.
 */
TEST(user_notification_fault_recv)
{
	pid_t pid;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};

	ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
		if (errno == EINVAL)
			SKIP(return, "kernel missing CLONE_NEWUSER support");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	/* Child exits 0 only if getppid() returned the value we reply with. */
	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);

	/* Do a bad recv() */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
	EXPECT_EQ(errno, EFAULT);

	/* We should still be able to receive this notification, though. */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	EXPECT_EQ(req.pid, pid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
3816 
/*
 * The sizes reported by SECCOMP_GET_NOTIF_SIZES must equal the sizes of the
 * struct seccomp_notif and struct seccomp_notif_resp this test was built
 * against.
 */
TEST(seccomp_get_notif_sizes)
{
	struct seccomp_notif_sizes sizes;

	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
}
3825 
/*
 * Intercept dup() in a child and answer with
 * SECCOMP_USER_NOTIF_FLAG_CONTINUE. Responses that combine the flag with a
 * non-zero val or error must be rejected with EINVAL; the all-zero response
 * must be accepted, after which the child checks its dup() result.
 */
TEST(user_notification_continue)
{
	pid_t pid;
	long ret;
	int status, listener;
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	struct pollfd pollfd;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		int dup_fd, pipe_fds[2];
		pid_t self;

		ASSERT_GE(pipe(pipe_fds), 0);

		/* This dup() is the call the parent is notified about. */
		dup_fd = dup(pipe_fds[0]);
		ASSERT_GE(dup_fd, 0);
		EXPECT_NE(pipe_fds[0], dup_fd);

		/* filecmp() is defined outside this chunk; 0 is the expected result. */
		self = getpid();
		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
		exit(0);
	}

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	/* Before the notification is received only POLLIN is expected. */
	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLIN);

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	pollfd.fd = listener;
	pollfd.events = POLLIN | POLLOUT;

	/* After the receive, only POLLOUT is expected. */
	EXPECT_GT(poll(&pollfd, 1, -1), 0);
	EXPECT_EQ(pollfd.revents, POLLOUT);

	EXPECT_EQ(req.data.nr, __NR_dup);

	resp.id = req.id;
	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;

	/*
	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
	 * args be set to 0.
	 */
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	resp.error = USER_NOTIF_MAGIC;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	EXPECT_EQ(errno, EINVAL);

	/* With val and error both zero the response must go through. */
	resp.error = 0;
	resp.val = 0;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
		if (errno == EINVAL)
			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
	}

skip:
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	/*
	 * Exit status 2 is treated as "kcmp() unavailable".
	 * NOTE(review): presumably produced by filecmp(), which is defined
	 * outside this chunk - confirm.
	 */
	EXPECT_EQ(0, WEXITSTATUS(status)) {
		if (WEXITSTATUS(status) == 2) {
			SKIP(return, "Kernel does not support kcmp() syscall");
			return;
		}
	}
}
3911 
TEST(user_notification_filter_empty)3912 TEST(user_notification_filter_empty)
3913 {
3914 	pid_t pid;
3915 	long ret;
3916 	int status;
3917 	struct pollfd pollfd;
3918 	struct __clone_args args = {
3919 		.flags = CLONE_FILES,
3920 		.exit_signal = SIGCHLD,
3921 	};
3922 
3923 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3924 	ASSERT_EQ(0, ret) {
3925 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3926 	}
3927 
3928 	if (__NR_clone3 < 0)
3929 		SKIP(return, "Test not built with clone3 support");
3930 
3931 	pid = sys_clone3(&args, sizeof(args));
3932 	ASSERT_GE(pid, 0);
3933 
3934 	if (pid == 0) {
3935 		int listener;
3936 
3937 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3938 		if (listener < 0)
3939 			_exit(EXIT_FAILURE);
3940 
3941 		if (dup2(listener, 200) != 200)
3942 			_exit(EXIT_FAILURE);
3943 
3944 		close(listener);
3945 
3946 		_exit(EXIT_SUCCESS);
3947 	}
3948 
3949 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3950 	EXPECT_EQ(true, WIFEXITED(status));
3951 	EXPECT_EQ(0, WEXITSTATUS(status));
3952 
3953 	/*
3954 	 * The seccomp filter has become unused so we should be notified once
3955 	 * the kernel gets around to cleaning up task struct.
3956 	 */
3957 	pollfd.fd = 200;
3958 	pollfd.events = POLLHUP;
3959 
3960 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3961 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3962 }
3963 
TEST(user_ioctl_notification_filter_empty)3964 TEST(user_ioctl_notification_filter_empty)
3965 {
3966 	pid_t pid;
3967 	long ret;
3968 	int status, p[2];
3969 	struct __clone_args args = {
3970 		.flags = CLONE_FILES,
3971 		.exit_signal = SIGCHLD,
3972 	};
3973 	struct seccomp_notif req = {};
3974 
3975 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3976 	ASSERT_EQ(0, ret) {
3977 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3978 	}
3979 
3980 	if (__NR_clone3 < 0)
3981 		SKIP(return, "Test not built with clone3 support");
3982 
3983 	ASSERT_EQ(0, pipe(p));
3984 
3985 	pid = sys_clone3(&args, sizeof(args));
3986 	ASSERT_GE(pid, 0);
3987 
3988 	if (pid == 0) {
3989 		int listener;
3990 
3991 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3992 		if (listener < 0)
3993 			_exit(EXIT_FAILURE);
3994 
3995 		if (dup2(listener, 200) != 200)
3996 			_exit(EXIT_FAILURE);
3997 		close(p[1]);
3998 		close(listener);
3999 		sleep(1);
4000 
4001 		_exit(EXIT_SUCCESS);
4002 	}
4003 	if (read(p[0], &status, 1) != 0)
4004 		_exit(EXIT_SUCCESS);
4005 	close(p[0]);
4006 	/*
4007 	 * The seccomp filter has become unused so we should be notified once
4008 	 * the kernel gets around to cleaning up task struct.
4009 	 */
4010 	EXPECT_EQ(ioctl(200, SECCOMP_IOCTL_NOTIF_RECV, &req), -1);
4011 	EXPECT_EQ(errno, ENOENT);
4012 
4013 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4014 	EXPECT_EQ(true, WIFEXITED(status));
4015 	EXPECT_EQ(0, WEXITSTATUS(status));
4016 }
4017 
/* No-op thread body: ignores its argument and finishes immediately. */
static void *do_thread(void *data)
{
	(void)data;

	return NULL;
}
4022 
/*
 * The child (created with CLONE_FILES) installs a user-notification filter,
 * publishes the listener as fd 200, then forks two short-lived processes and
 * runs two short-lived threads before exiting. Once the child has been
 * reaped, the parent expects POLLHUP on fd 200.
 */
TEST(user_notification_filter_empty_threaded)
{
	pid_t pid;
	long ret;
	int status;
	struct pollfd pollfd;
	struct __clone_args args = {
		.flags = CLONE_FILES,
		.exit_signal = SIGCHLD,
	};

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	if (__NR_clone3 < 0)
		SKIP(return, "Test not built with clone3 support");

	pid = sys_clone3(&args, sizeof(args));
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		pid_t pid1, pid2;
		int listener, status;
		pthread_t thread;

		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0)
			_exit(EXIT_FAILURE);

		/* Park the listener on a well-known number for the parent. */
		if (dup2(listener, 200) != 200)
			_exit(EXIT_FAILURE);

		close(listener);

		/* Both forked processes exit immediately. */
		pid1 = fork();
		if (pid1 < 0)
			_exit(EXIT_FAILURE);

		if (pid1 == 0)
			_exit(EXIT_SUCCESS);

		pid2 = fork();
		if (pid2 < 0)
			_exit(EXIT_FAILURE);

		if (pid2 == 0)
			_exit(EXIT_SUCCESS);

		/* Two threads, each created and joined in turn. */
		if (pthread_create(&thread, NULL, do_thread, NULL) ||
		    pthread_join(thread, NULL))
			_exit(EXIT_FAILURE);

		if (pthread_create(&thread, NULL, do_thread, NULL) ||
		    pthread_join(thread, NULL))
			_exit(EXIT_FAILURE);

		/* Reap both forked processes; each must have exited with 0. */
		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
		    WEXITSTATUS(status))
			_exit(EXIT_FAILURE);

		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
		    WEXITSTATUS(status))
			_exit(EXIT_FAILURE);

		exit(EXIT_SUCCESS);
	}

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/*
	 * The seccomp filter has become unused so we should be notified once
	 * the kernel gets around to cleaning up task struct.
	 */
	pollfd.fd = 200;
	pollfd.events = POLLHUP;

	/* Allow up to 2000 ms for the hang-up to be reported. */
	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
}
4106 
4107 
/*
 * Find the lowest descriptor number greater than prev_fd for which
 * fcntl(F_GETFD) fails, i.e. that is not open in this process.
 *
 * Terminates the process with EXIT_FAILURE when no such number exists below
 * FD_SETSIZE.
 */
int get_next_fd(int prev_fd)
{
	int candidate = prev_fd + 1;

	while (candidate < FD_SETSIZE) {
		if (fcntl(candidate, F_GETFD) == -1)
			return candidate;
		++candidate;
	}

	_exit(EXIT_FAILURE);
}
4116 
TEST(user_notification_addfd)4117 TEST(user_notification_addfd)
4118 {
4119 	pid_t pid;
4120 	long ret;
4121 	int status, listener, memfd, fd, nextfd;
4122 	struct seccomp_notif_addfd addfd = {};
4123 	struct seccomp_notif_addfd_small small = {};
4124 	struct seccomp_notif_addfd_big big = {};
4125 	struct seccomp_notif req = {};
4126 	struct seccomp_notif_resp resp = {};
4127 	/* 100 ms */
4128 	struct timespec delay = { .tv_nsec = 100000000 };
4129 
4130 	/* There may be arbitrary already-open fds at test start. */
4131 	memfd = memfd_create("test", 0);
4132 	ASSERT_GE(memfd, 0);
4133 	nextfd = get_next_fd(memfd);
4134 
4135 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4136 	ASSERT_EQ(0, ret) {
4137 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4138 	}
4139 
4140 	/* fd: 4 */
4141 	/* Check that the basic notification machinery works */
4142 	listener = user_notif_syscall(__NR_getppid,
4143 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4144 	ASSERT_EQ(listener, nextfd);
4145 	nextfd = get_next_fd(nextfd);
4146 
4147 	pid = fork();
4148 	ASSERT_GE(pid, 0);
4149 
4150 	if (pid == 0) {
4151 		/* fds will be added and this value is expected */
4152 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4153 			exit(1);
4154 
4155 		/* Atomic addfd+send is received here. Check it is a valid fd */
4156 		if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4157 			exit(1);
4158 
4159 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4160 	}
4161 
4162 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4163 
4164 	addfd.srcfd = memfd;
4165 	addfd.newfd = 0;
4166 	addfd.id = req.id;
4167 	addfd.flags = 0x0;
4168 
4169 	/* Verify bad newfd_flags cannot be set */
4170 	addfd.newfd_flags = ~O_CLOEXEC;
4171 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4172 	EXPECT_EQ(errno, EINVAL);
4173 	addfd.newfd_flags = O_CLOEXEC;
4174 
4175 	/* Verify bad flags cannot be set */
4176 	addfd.flags = 0xff;
4177 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4178 	EXPECT_EQ(errno, EINVAL);
4179 	addfd.flags = 0;
4180 
4181 	/* Verify that remote_fd cannot be set without setting flags */
4182 	addfd.newfd = 1;
4183 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4184 	EXPECT_EQ(errno, EINVAL);
4185 	addfd.newfd = 0;
4186 
4187 	/* Verify small size cannot be set */
4188 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4189 	EXPECT_EQ(errno, EINVAL);
4190 
4191 	/* Verify we can't send bits filled in unknown buffer area */
4192 	memset(&big, 0xAA, sizeof(big));
4193 	big.addfd = addfd;
4194 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4195 	EXPECT_EQ(errno, E2BIG);
4196 
4197 
4198 	/* Verify we can set an arbitrary remote fd */
4199 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4200 	EXPECT_EQ(fd, nextfd);
4201 	nextfd = get_next_fd(nextfd);
4202 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4203 
4204 	/* Verify we can set an arbitrary remote fd with large size */
4205 	memset(&big, 0x0, sizeof(big));
4206 	big.addfd = addfd;
4207 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4208 	EXPECT_EQ(fd, nextfd);
4209 	nextfd = get_next_fd(nextfd);
4210 
4211 	/* Verify we can set a specific remote fd */
4212 	addfd.newfd = 42;
4213 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4214 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4215 	EXPECT_EQ(fd, 42);
4216 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4217 
4218 	/* Resume syscall */
4219 	resp.id = req.id;
4220 	resp.error = 0;
4221 	resp.val = USER_NOTIF_MAGIC;
4222 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4223 
4224 	/*
4225 	 * This sets the ID of the ADD FD to the last request plus 1. The
4226 	 * notification ID increments 1 per notification.
4227 	 */
4228 	addfd.id = req.id + 1;
4229 
4230 	/* This spins until the underlying notification is generated */
4231 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4232 	       errno != -EINPROGRESS)
4233 		nanosleep(&delay, NULL);
4234 
4235 	memset(&req, 0, sizeof(req));
4236 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4237 	ASSERT_EQ(addfd.id, req.id);
4238 
4239 	/* Verify we can do an atomic addfd and send */
4240 	addfd.newfd = 0;
4241 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4242 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4243 	/*
4244 	 * Child has earlier "low" fds and now 42, so we expect the next
4245 	 * lowest available fd to be assigned here.
4246 	 */
4247 	EXPECT_EQ(fd, nextfd);
4248 	nextfd = get_next_fd(nextfd);
4249 	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4250 
4251 	/*
4252 	 * This sets the ID of the ADD FD to the last request plus 1. The
4253 	 * notification ID increments 1 per notification.
4254 	 */
4255 	addfd.id = req.id + 1;
4256 
4257 	/* This spins until the underlying notification is generated */
4258 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4259 	       errno != -EINPROGRESS)
4260 		nanosleep(&delay, NULL);
4261 
4262 	memset(&req, 0, sizeof(req));
4263 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4264 	ASSERT_EQ(addfd.id, req.id);
4265 
4266 	resp.id = req.id;
4267 	resp.error = 0;
4268 	resp.val = USER_NOTIF_MAGIC;
4269 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4270 
4271 	/* Wait for child to finish. */
4272 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4273 	EXPECT_EQ(true, WIFEXITED(status));
4274 	EXPECT_EQ(0, WEXITSTATUS(status));
4275 
4276 	close(memfd);
4277 }
4278 
/*
 * Check SECCOMP_IOCTL_NOTIF_ADDFD against a notifying child whose
 * RLIMIT_NOFILE has been lowered to 0: plain and SEND-flagged requests are
 * expected to fail with EMFILE, a SETFD request for descriptor 100 with EBADF.
 */
TEST(user_notification_addfd_rlimit)
{
	pid_t pid;
	long ret;
	int status, listener, memfd;
	struct seccomp_notif_addfd addfd = {};
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	/* Limit applied to the child: no descriptors allowed at all. */
	const struct rlimit lim = {
		.rlim_cur	= 0,
		.rlim_max	= 0,
	};

	memfd = memfd_create("test", 0);
	ASSERT_GE(memfd, 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Check that the basic notification machinery works */
	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0)
		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);


	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	/* The limit is lowered only after the child's notification arrived. */
	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);

	addfd.srcfd = memfd;
	addfd.newfd_flags = O_CLOEXEC;
	addfd.newfd = 0;
	addfd.id = req.id;
	addfd.flags = 0;

	/* Should probably spot check /proc/sys/fs/file-nr */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EMFILE);

	/* Same request, but combined with the response. */
	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EMFILE);

	/* Asking for one specific descriptor number. */
	addfd.newfd = 100;
	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	EXPECT_EQ(errno, EBADF);

	/* Let the child's getppid() complete with the magic value. */
	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/* Wait for child to finish. */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	close(memfd);
}
4348 
/* Fallback definitions for builds whose headers do not provide them. */
#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
#define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
#endif

/*
 * Check SECCOMP_IOCTL_NOTIF_SET_FLAGS: an invalid flag word must be rejected
 * with EINVAL, and with SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP set a plain
 * receive/respond round trip with a child must still work.
 */
TEST(user_notification_sync)
{
	struct seccomp_notif req = {};
	struct seccomp_notif_resp resp = {};
	int status, listener;
	pid_t pid;
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(__NR_getppid,
				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	/* Try to set invalid flags. */
	EXPECT_SYSCALL_RETURN(-EINVAL,
		ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));

	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
			SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		/* The child succeeds only if getppid() yields the magic value. */
		ret = syscall(__NR_getppid);
		ASSERT_EQ(ret, USER_NOTIF_MAGIC) {
			_exit(1);
		}
		_exit(0);
	}

	req.pid = 0;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);

	ASSERT_EQ(req.data.nr,  __NR_getppid);

	resp.id = req.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;
	resp.flags = 0;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/* A raw wait status of 0 means the child exited normally with code 0. */
	ASSERT_EQ(waitpid(pid, &status, 0), pid);
	ASSERT_EQ(status, 0);
}
4402 
4403 
/*
 * Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN.
 *
 * Fixture state: pid of the child process the tests ptrace.
 */
FIXTURE(O_SUSPEND_SECCOMP) {
	pid_t pid;
};
4408 
/*
 * Clear CAP_SYS_ADMIN from the effective set, install the block_read seccomp
 * filter, then fork a child that does nothing but pause(). The child's pid is
 * stored in self->pid.
 */
FIXTURE_SETUP(O_SUSPEND_SECCOMP)
{
	/* Provides prog_block_read, which is installed below. */
	ERRNO_FILTER(block_read, E2BIG);
	cap_value_t cap_list[] = { CAP_SYS_ADMIN };
	cap_t caps;

	self->pid = 0;

	/* make sure we don't have CAP_SYS_ADMIN */
	caps = cap_get_proc();
	ASSERT_NE(NULL, caps);
	ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
	ASSERT_EQ(0, cap_set_proc(caps));
	cap_free(caps);

	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
	ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));

	self->pid = fork();
	ASSERT_GE(self->pid, 0);

	if (self->pid == 0) {
		/* Child: idle until it is killed. */
		while (1)
			pause();
		_exit(127);
	}
}
4436 
FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)4437 FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
4438 {
4439 	if (self->pid)
4440 		kill(self->pid, SIGKILL);
4441 }
4442 
TEST_F(O_SUSPEND_SECCOMP,setoptions)4443 TEST_F(O_SUSPEND_SECCOMP, setoptions)
4444 {
4445 	int wstatus;
4446 
4447 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
4448 	ASSERT_EQ(self->pid, wait(&wstatus));
4449 	ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
4450 	if (errno == EINVAL)
4451 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4452 	ASSERT_EQ(EPERM, errno);
4453 }
4454 
TEST_F(O_SUSPEND_SECCOMP,seize)4455 TEST_F(O_SUSPEND_SECCOMP, seize)
4456 {
4457 	int ret;
4458 
4459 	ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
4460 	ASSERT_EQ(-1, ret);
4461 	if (errno == EINVAL)
4462 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4463 	ASSERT_EQ(EPERM, errno);
4464 }
4465 
4466 /*
4467  * get_nth - Get the nth, space separated entry in a file.
4468  *
4469  * Returns the length of the read field.
4470  * Throws error if field is zero-lengthed.
4471  */
get_nth(struct __test_metadata * _metadata,const char * path,const unsigned int position,char ** entry)4472 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4473 		     const unsigned int position, char **entry)
4474 {
4475 	char *line = NULL;
4476 	unsigned int i;
4477 	ssize_t nread;
4478 	size_t len = 0;
4479 	FILE *f;
4480 
4481 	f = fopen(path, "r");
4482 	ASSERT_NE(f, NULL) {
4483 		TH_LOG("Could not open %s: %s", path, strerror(errno));
4484 	}
4485 
4486 	for (i = 0; i < position; i++) {
4487 		nread = getdelim(&line, &len, ' ', f);
4488 		ASSERT_GE(nread, 0) {
4489 			TH_LOG("Failed to read %d entry in file %s", i, path);
4490 		}
4491 	}
4492 	fclose(f);
4493 
4494 	ASSERT_GT(nread, 0) {
4495 		TH_LOG("Entry in file %s had zero length", path);
4496 	}
4497 
4498 	*entry = line;
4499 	return nread - 1;
4500 }
4501 
/*
 * Return the task state character (D, R, S, ...) of @pid, taken from the
 * third space separated entry of /proc/<pid>/stat. That entry must be exactly
 * one character long.
 */
static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
{
	char stat_path[100] = {0};
	char *field;
	char state;

	snprintf(stat_path, sizeof(stat_path), "/proc/%d/stat", pid);
	ASSERT_EQ(get_nth(_metadata, stat_path, 3, &field), 1);

	state = field[0];
	free(field);

	return state;
}
4517 
TEST(user_notification_fifo)4518 TEST(user_notification_fifo)
4519 {
4520 	struct seccomp_notif_resp resp = {};
4521 	struct seccomp_notif req = {};
4522 	int i, status, listener;
4523 	pid_t pid, pids[3];
4524 	__u64 baseid;
4525 	long ret;
4526 	/* 100 ms */
4527 	struct timespec delay = { .tv_nsec = 100000000 };
4528 
4529 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4530 	ASSERT_EQ(0, ret) {
4531 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4532 	}
4533 
4534 	/* Setup a listener */
4535 	listener = user_notif_syscall(__NR_getppid,
4536 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4537 	ASSERT_GE(listener, 0);
4538 
4539 	pid = fork();
4540 	ASSERT_GE(pid, 0);
4541 
4542 	if (pid == 0) {
4543 		ret = syscall(__NR_getppid);
4544 		exit(ret != USER_NOTIF_MAGIC);
4545 	}
4546 
4547 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4548 	baseid = req.id + 1;
4549 
4550 	resp.id = req.id;
4551 	resp.error = 0;
4552 	resp.val = USER_NOTIF_MAGIC;
4553 
4554 	/* check that we make sure flags == 0 */
4555 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4556 
4557 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4558 	EXPECT_EQ(true, WIFEXITED(status));
4559 	EXPECT_EQ(0, WEXITSTATUS(status));
4560 
4561 	/* Start children, and generate notifications */
4562 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4563 		pid = fork();
4564 		if (pid == 0) {
4565 			ret = syscall(__NR_getppid);
4566 			exit(ret != USER_NOTIF_MAGIC);
4567 		}
4568 		pids[i] = pid;
4569 	}
4570 
4571 	/* This spins until all of the children are sleeping */
4572 restart_wait:
4573 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4574 		if (get_proc_stat(_metadata, pids[i]) != 'S') {
4575 			nanosleep(&delay, NULL);
4576 			goto restart_wait;
4577 		}
4578 	}
4579 
4580 	/* Read the notifications in order (and respond) */
4581 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4582 		memset(&req, 0, sizeof(req));
4583 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4584 		EXPECT_EQ(req.id, baseid + i);
4585 		resp.id = req.id;
4586 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4587 	}
4588 
4589 	/* Make sure notifications were received */
4590 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4591 		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
4592 		EXPECT_EQ(true, WIFEXITED(status));
4593 		EXPECT_EQ(0, WEXITSTATUS(status));
4594 	}
4595 }
4596 
/* get_proc_syscall - Get the syscall in progress for a given pid
 *
 * Reads the first space separated entry of /proc/<pid>/syscall. That entry
 * is either the word "running" or a decimal syscall number.
 *
 * Returns the current syscall number for a given process
 * Returns -1 if not in syscall (running or blocked)
 */
static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
{
	char proc_path[100] = {0};
	long ret = -1;
	ssize_t nread;
	char *line;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
	nread = get_nth(_metadata, proc_path, 1, &line);
	ASSERT_GT(nread, 0);

	/*
	 * Only parse a number when the entry is NOT "running"; in that case
	 * ret keeps its -1 default.
	 */
	if (strncmp("running", line, MIN(7, nread)))
		ret = strtol(line, NULL, 10);

	free(line);
	return ret;
}
4619 
/*
 * Ensure non-fatal signals prior to receive are unmodified
 *
 * With SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV set, a child blocked in the
 * notifying syscall whose notification has NOT been received yet is sent
 * SIGUSR1: the syscall must fail with EINTR and the handler must have run.
 */
TEST(user_notification_wait_killable_pre_notification)
{
	struct sigaction new_action = {
		.sa_handler = signal_handler,
	};
	int listener, status, sk_pair[2];
	pid_t pid;
	long ret;
	char c;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Channel over which the child reports that its handler ran. */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	/*
	 * Check that we can kill the process with SIGUSR1 prior to receiving
	 * the notification. SIGUSR1 is wired up to a custom signal handler,
	 * and make sure it gets called.
	 */
	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		/*
		 * NOTE(review): handled and signal_handler are defined
		 * elsewhere in this file; presumably the handler writes a
		 * byte to this descriptor - confirm.
		 */
		handled = sk_pair[1];

		/* Setup the non-fatal sigaction without SA_RESTART */
		if (sigaction(SIGUSR1, &new_action, NULL)) {
			perror("sigaction");
			exit(1);
		}

		ret = syscall(__NR_getppid);
		/* Make sure we got a return from a signal interruption */
		exit(ret != -1 || errno != EINTR);
	}

	/*
	 * Make sure we've gotten to the seccomp user notification wait
	 * from getppid prior to sending any signals
	 *
	 * Note that the loop ends as soon as either condition is met.
	 */
	while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
	       get_proc_stat(_metadata, pid) != 'S')
		nanosleep(&delay, NULL);

	/* Send non-fatal kill signal */
	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/* wait for process to exit (exit checks for EINTR) */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));

	/* Exactly one byte is expected from the child's end of the socket. */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
}
4689 
/*
 * Ensure non-fatal signals after receive are blocked
 *
 * With SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV set, SIGUSR1 sent after the
 * notification has been received must not interrupt the syscall: the child
 * has to see USER_NOTIF_MAGIC, and the handler must still run afterwards.
 */
TEST(user_notification_wait_killable)
{
	struct sigaction new_action = {
		.sa_handler = signal_handler,
	};
	struct seccomp_notif_resp resp = {};
	struct seccomp_notif req = {};
	int listener, status, sk_pair[2];
	pid_t pid;
	long ret;
	char c;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	/* Channel over which the child reports that its handler ran. */
	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		close(sk_pair[0]);
		/*
		 * NOTE(review): handled and signal_handler are defined
		 * elsewhere in this file; presumably the handler writes a
		 * byte to this descriptor - confirm.
		 */
		handled = sk_pair[1];

		/* Setup the sigaction without SA_RESTART */
		if (sigaction(SIGUSR1, &new_action, NULL)) {
			perror("sigaction");
			exit(1);
		}

		/* Make sure that the syscall is completed (no EINTR) */
		ret = syscall(__NR_getppid);
		exit(ret != USER_NOTIF_MAGIC);
	}

	/*
	 * Get the notification, to move the notifying process into a
	 * non-preemptible (TASK_KILLABLE) state.
	 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	/* Send non-fatal kill signal */
	EXPECT_EQ(kill(pid, SIGUSR1), 0);

	/*
	 * Make sure the task moves to TASK_KILLABLE by waiting for
	 * D (Disk Sleep) state after receiving non-fatal signal.
	 */
	while (get_proc_stat(_metadata, pid) != 'D')
		nanosleep(&delay, NULL);

	resp.id = req.id;
	resp.val = USER_NOTIF_MAGIC;
	/* Make sure the notification is found and able to be replied to */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	/*
	 * Make sure that the signal handler does get called once we're back in
	 * userspace.
	 */
	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
	/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}
4768 
/*
 * Ensure fatal signals after receive are not blocked
 *
 * With SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV set, SIGTERM sent after the
 * notification has been received must still terminate the child.
 */
TEST(user_notification_wait_killable_fatal)
{
	struct seccomp_notif req = {};
	int listener, status;
	pid_t pid;
	long ret;
	/* 100 ms */
	struct timespec delay = { .tv_nsec = 100000000 };

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret)
	{
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_notif_syscall(
		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
	ASSERT_GE(listener, 0);

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		/* This should never complete as it should get a SIGTERM */
		syscall(__NR_getppid);
		exit(1);
	}

	/* Poll every 100 ms until the child is reported as sleeping. */
	while (get_proc_stat(_metadata, pid) != 'S')
		nanosleep(&delay, NULL);

	/*
	 * Get the notification, to move the notifying process into a
	 * non-preemptible (TASK_KILLABLE) state.
	 */
	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	/* Kill the process with a fatal signal */
	EXPECT_EQ(kill(pid, SIGTERM), 0);

	/*
	 * Wait for the process to exit, and make sure the process terminated
	 * due to the SIGTERM signal.
	 */
	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFSIGNALED(status));
	EXPECT_EQ(SIGTERM, WTERMSIG(status));
}
4818 
/* Argument handed to the sibling thread of the dead-thread-leader test. */
struct tsync_vs_thread_leader_args {
	/* Thread the sibling joins before installing its filter. */
	pthread_t leader;
};
4822 
tsync_vs_dead_thread_leader_sibling(void * _args)4823 static void *tsync_vs_dead_thread_leader_sibling(void *_args)
4824 {
4825 	struct sock_filter allow_filter[] = {
4826 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
4827 	};
4828 	struct sock_fprog allow_prog = {
4829 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
4830 		.filter = allow_filter,
4831 	};
4832 	struct tsync_vs_thread_leader_args *args = _args;
4833 	void *retval;
4834 	long ret;
4835 
4836 	ret = pthread_join(args->leader, &retval);
4837 	if (ret)
4838 		exit(1);
4839 	if (retval != _args)
4840 		exit(2);
4841 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &allow_prog);
4842 	if (ret)
4843 		exit(3);
4844 
4845 	exit(0);
4846 }
4847 
/*
 * Ensure that a dead thread leader doesn't prevent installing new filters with
 * SECCOMP_FILTER_FLAG_TSYNC from other threads.
 *
 * The forked child starts a sibling thread, installs a filter on itself only
 * and leaves through pthread_exit(); the sibling decides the process exit
 * status, which the parent expects to be 0.
 */
TEST(tsync_vs_dead_thread_leader)
{
	int status;
	pid_t pid;
	long ret;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		struct sock_filter allow_filter[] = {
			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
		};
		struct sock_fprog allow_prog = {
			.len = (unsigned short)ARRAY_SIZE(allow_filter),
			.filter = allow_filter,
		};
		struct  tsync_vs_thread_leader_args *args;
		pthread_t sibling;

		args = malloc(sizeof(*args));
		ASSERT_NE(NULL, args);
		args->leader = pthread_self();

		ret = pthread_create(&sibling, NULL,
				     tsync_vs_dead_thread_leader_sibling, args);
		ASSERT_EQ(0, ret);

		/* Install a new filter just to the leader thread. */
		ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
		ASSERT_EQ(0, ret);
		/* args doubles as the exit value the sibling checks for. */
		pthread_exit(args);
		exit(1);
	}

	/* A raw wait status of 0 means a normal exit with code 0. */
	EXPECT_EQ(pid, waitpid(pid, &status, 0));
	EXPECT_EQ(0, status);
}
4895 
probed(void)4896 noinline int probed(void)
4897 {
4898 	return 1;
4899 }
4900 
/*
 * Read one integer from @file using the fscanf() format @fmt, which must
 * contain exactly one "%d" conversion.
 *
 * Returns the parsed value, or -1 if the file cannot be opened or its
 * content does not match @fmt. A non-matching file is reported as -1 rather
 * than 0 so that callers checking for ">= 0" do not mistake a parse failure
 * for a valid value.
 */
static int parse_uint_from_file(const char *file, const char *fmt)
{
	int matched, value;
	FILE *f;

	f = fopen(file, "re");
	if (!f)
		return -1;

	matched = fscanf(f, fmt, &value);
	fclose(f);

	return matched == 1 ? value : -1;
}
4913 
/*
 * Return the integer stored in the uprobe event source's sysfs "type" file,
 * as parsed by parse_uint_from_file().
 */
static int determine_uprobe_perf_type(void)
{
	return parse_uint_from_file("/sys/bus/event_source/devices/uprobe/type",
				    "%d\n");
}
4920 
/*
 * Return the bit number from the "config:<n>" line of the uprobe event
 * source's sysfs "format/retprobe" file, as parsed by parse_uint_from_file().
 */
static int determine_uprobe_retprobe_bit(void)
{
	return parse_uint_from_file("/sys/bus/event_source/devices/uprobe/format/retprobe",
				    "config:%d\n");
}
4927 
/*
 * Translate @addr into an offset within the file backing it, using the first
 * executable mapping in /proc/self/maps that contains the address.
 *
 * Returns (addr - mapping start + mapping file offset), or -1 if the maps
 * file cannot be opened or no executable mapping contains @addr.
 */
static ssize_t get_uprobe_offset(const void *addr)
{
	size_t start, base, end;
	bool found = false;
	char buf[256];
	FILE *f;

	f = fopen("/proc/self/maps", "r");
	if (!f)
		return -1;

	/*
	 * Per line: "<start>-<end> <perms> <offset> ...". The perms field is
	 * read with a width limit so it can never overrun buf.
	 */
	while (fscanf(f, "%zx-%zx %255s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
		/* buf[2] is the third permission character ('x' = executable). */
		if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
			found = true;
			break;
		}
	}
	fclose(f);
	return found ? (uintptr_t)addr - start + base : -1;
}
4948 
/* Fixture state for the uretprobe tests. */
FIXTURE(URETPROBE) {
	/* Result of the perf_event_open call made by the fixture setup. */
	int fd;
};
4952 
/* Per-variant parameters of the URETPROBE fixture. */
FIXTURE_VARIANT(URETPROBE) {
	/*
	 * All of the URETPROBE behaviors can be tested with either
	 * uretprobe attached or not
	 */
	bool attach;
};
4960 
/* Variant with attach enabled. */
FIXTURE_VARIANT_ADD(URETPROBE, attached) {
	.attach = true,
};

/* Variant with attach disabled; the fixture setup returns early for it. */
FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
	.attach = false,
};
4968 
FIXTURE_SETUP(URETPROBE)4969 FIXTURE_SETUP(URETPROBE)
4970 {
4971 	const size_t attr_sz = sizeof(struct perf_event_attr);
4972 	struct perf_event_attr attr;
4973 	ssize_t offset;
4974 	int type, bit;
4975 
4976 #ifndef __NR_uretprobe
4977 	SKIP(return, "__NR_uretprobe syscall not defined");
4978 #endif
4979 
4980 	if (!variant->attach)
4981 		return;
4982 
4983 	memset(&attr, 0, attr_sz);
4984 
4985 	type = determine_uprobe_perf_type();
4986 	ASSERT_GE(type, 0);
4987 	bit = determine_uprobe_retprobe_bit();
4988 	ASSERT_GE(bit, 0);
4989 	offset = get_uprobe_offset(probed);
4990 	ASSERT_GE(offset, 0);
4991 
4992 	attr.config |= 1 << bit;
4993 	attr.size = attr_sz;
4994 	attr.type = type;
4995 	attr.config1 = ptr_to_u64("/proc/self/exe");
4996 	attr.config2 = offset;
4997 
4998 	self->fd = syscall(__NR_perf_event_open, &attr,
4999 			   getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */,
5000 			   PERF_FLAG_FD_CLOEXEC);
5001 }
5002 
/* Intentionally empty: self->fd is left open. */
FIXTURE_TEARDOWN(URETPROBE)
{
	/* we could call close(self->fd), but we'd need extra filter for
	 * that and since we are calling _exit right away..
	 */
}
5009 
run_probed_with_filter(struct sock_fprog * prog)5010 static int run_probed_with_filter(struct sock_fprog *prog)
5011 {
5012 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
5013 	    seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) {
5014 		return -1;
5015 	}
5016 
5017 	probed();
5018 	return 0;
5019 }
5020 
TEST_F(URETPROBE,uretprobe_default_allow)5021 TEST_F(URETPROBE, uretprobe_default_allow)
5022 {
5023 	struct sock_filter filter[] = {
5024 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5025 	};
5026 	struct sock_fprog prog = {
5027 		.len = (unsigned short)ARRAY_SIZE(filter),
5028 		.filter = filter,
5029 	};
5030 
5031 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5032 }
5033 
TEST_F(URETPROBE,uretprobe_default_block)5034 TEST_F(URETPROBE, uretprobe_default_block)
5035 {
5036 	struct sock_filter filter[] = {
5037 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5038 			offsetof(struct seccomp_data, nr)),
5039 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5040 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5041 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5042 	};
5043 	struct sock_fprog prog = {
5044 		.len = (unsigned short)ARRAY_SIZE(filter),
5045 		.filter = filter,
5046 	};
5047 
5048 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5049 }
5050 
TEST_F(URETPROBE,uretprobe_block_uretprobe_syscall)5051 TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
5052 {
5053 	struct sock_filter filter[] = {
5054 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5055 			offsetof(struct seccomp_data, nr)),
5056 #ifdef __NR_uretprobe
5057 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
5058 #endif
5059 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5060 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5061 	};
5062 	struct sock_fprog prog = {
5063 		.len = (unsigned short)ARRAY_SIZE(filter),
5064 		.filter = filter,
5065 	};
5066 
5067 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5068 }
5069 
TEST_F(URETPROBE,uretprobe_default_block_with_uretprobe_syscall)5070 TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
5071 {
5072 	struct sock_filter filter[] = {
5073 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5074 			offsetof(struct seccomp_data, nr)),
5075 #ifdef __NR_uretprobe
5076 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
5077 #endif
5078 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5079 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5080 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5081 	};
5082 	struct sock_fprog prog = {
5083 		.len = (unsigned short)ARRAY_SIZE(filter),
5084 		.filter = filter,
5085 	};
5086 
5087 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5088 }
5089 
5090 /*
5091  * TODO:
5092  * - expand NNP testing
5093  * - better arch-specific TRACE and TRAP handlers.
5094  * - endianness checking when appropriate
5095  * - 64-bit arg prodding
5096  * - arch value testing (x86 modes especially)
5097  * - verify that FILTER_FLAG_LOG filters generate log messages
5098  * - verify that RET_LOG generates log messages
5099  */
5100 
5101 TEST_HARNESS_MAIN
5102