xref: /freebsd/contrib/capsicum-test/linux.cc (revision 8ac5aef8f354384d727d3d25b8d4c195596647c1)
1*8ac5aef8SEnji Cooper // Tests of Linux-specific functionality
2*8ac5aef8SEnji Cooper #ifdef __linux__
3*8ac5aef8SEnji Cooper 
4*8ac5aef8SEnji Cooper #include <sys/types.h>
5*8ac5aef8SEnji Cooper #include <sys/stat.h>
6*8ac5aef8SEnji Cooper #include <sys/socket.h>
7*8ac5aef8SEnji Cooper #include <sys/timerfd.h>
8*8ac5aef8SEnji Cooper #include <sys/signalfd.h>
9*8ac5aef8SEnji Cooper #include <sys/eventfd.h>
10*8ac5aef8SEnji Cooper #include <sys/epoll.h>
11*8ac5aef8SEnji Cooper #include <sys/inotify.h>
12*8ac5aef8SEnji Cooper #include <sys/fanotify.h>
13*8ac5aef8SEnji Cooper #include <sys/mman.h>
14*8ac5aef8SEnji Cooper #include <sys/capability.h>  // Requires e.g. libcap-dev package for POSIX.1e capabilities headers
15*8ac5aef8SEnji Cooper #include <linux/aio_abi.h>
16*8ac5aef8SEnji Cooper #include <linux/filter.h>
17*8ac5aef8SEnji Cooper #include <linux/seccomp.h>
18*8ac5aef8SEnji Cooper #include <linux/version.h>
19*8ac5aef8SEnji Cooper #include <poll.h>
20*8ac5aef8SEnji Cooper #include <sched.h>
21*8ac5aef8SEnji Cooper #include <signal.h>
22*8ac5aef8SEnji Cooper #include <fcntl.h>
23*8ac5aef8SEnji Cooper #include <unistd.h>
24*8ac5aef8SEnji Cooper 
25*8ac5aef8SEnji Cooper #include <string>
26*8ac5aef8SEnji Cooper 
27*8ac5aef8SEnji Cooper #include "capsicum.h"
28*8ac5aef8SEnji Cooper #include "syscalls.h"
29*8ac5aef8SEnji Cooper #include "capsicum-test.h"
30*8ac5aef8SEnji Cooper 
31*8ac5aef8SEnji Cooper TEST(Linux, TimerFD) {
32*8ac5aef8SEnji Cooper   int fd = timerfd_create(CLOCK_MONOTONIC, 0);
33*8ac5aef8SEnji Cooper 
34*8ac5aef8SEnji Cooper   cap_rights_t r_ro;
35*8ac5aef8SEnji Cooper   cap_rights_init(&r_ro, CAP_READ);
36*8ac5aef8SEnji Cooper   cap_rights_t r_wo;
37*8ac5aef8SEnji Cooper   cap_rights_init(&r_wo, CAP_WRITE);
38*8ac5aef8SEnji Cooper   cap_rights_t r_rw;
39*8ac5aef8SEnji Cooper   cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
40*8ac5aef8SEnji Cooper   cap_rights_t r_rwpoll;
41*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwpoll, CAP_READ, CAP_WRITE, CAP_EVENT);
42*8ac5aef8SEnji Cooper 
43*8ac5aef8SEnji Cooper   int cap_fd_ro = dup(fd);
44*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_ro);
45*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_ro));
46*8ac5aef8SEnji Cooper   int cap_fd_wo = dup(fd);
47*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_wo);
48*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_wo));
49*8ac5aef8SEnji Cooper   int cap_fd_rw = dup(fd);
50*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_rw);
51*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rw));
52*8ac5aef8SEnji Cooper   int cap_fd_all = dup(fd);
53*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_all);
54*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwpoll));
55*8ac5aef8SEnji Cooper 
56*8ac5aef8SEnji Cooper   struct itimerspec old_ispec;
57*8ac5aef8SEnji Cooper   struct itimerspec ispec;
58*8ac5aef8SEnji Cooper   ispec.it_interval.tv_sec = 0;
59*8ac5aef8SEnji Cooper   ispec.it_interval.tv_nsec = 0;
60*8ac5aef8SEnji Cooper   ispec.it_value.tv_sec = 0;
61*8ac5aef8SEnji Cooper   ispec.it_value.tv_nsec = 100000000;  // 100ms
62*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_ro, 0, &ispec, NULL));
63*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_wo, 0, &ispec, &old_ispec));
64*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_settime(cap_fd_wo, 0, &ispec, NULL));
65*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_settime(cap_fd_rw, 0, &ispec, NULL));
66*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_settime(cap_fd_all, 0, &ispec, NULL));
67*8ac5aef8SEnji Cooper 
68*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(timerfd_gettime(cap_fd_wo, &old_ispec));
69*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_gettime(cap_fd_ro, &old_ispec));
70*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_gettime(cap_fd_rw, &old_ispec));
71*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
72*8ac5aef8SEnji Cooper 
73*8ac5aef8SEnji Cooper   // To be able to poll() for the timer pop, still need CAP_EVENT.
74*8ac5aef8SEnji Cooper   struct pollfd poll_fd;
75*8ac5aef8SEnji Cooper   for (int ii = 0; ii < 3; ii++) {
76*8ac5aef8SEnji Cooper     poll_fd.revents = 0;
77*8ac5aef8SEnji Cooper     poll_fd.events = POLLIN;
78*8ac5aef8SEnji Cooper     switch (ii) {
79*8ac5aef8SEnji Cooper     case 0: poll_fd.fd = cap_fd_ro; break;
80*8ac5aef8SEnji Cooper     case 1: poll_fd.fd = cap_fd_wo; break;
81*8ac5aef8SEnji Cooper     case 2: poll_fd.fd = cap_fd_rw; break;
82*8ac5aef8SEnji Cooper     }
83*8ac5aef8SEnji Cooper     // Poll immediately returns with POLLNVAL
84*8ac5aef8SEnji Cooper     EXPECT_OK(poll(&poll_fd, 1, 400));
85*8ac5aef8SEnji Cooper     EXPECT_EQ(0, (poll_fd.revents & POLLIN));
86*8ac5aef8SEnji Cooper     EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
87*8ac5aef8SEnji Cooper   }
88*8ac5aef8SEnji Cooper 
89*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_all;
90*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 400));
91*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLIN));
92*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
93*8ac5aef8SEnji Cooper 
94*8ac5aef8SEnji Cooper   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
95*8ac5aef8SEnji Cooper   EXPECT_EQ(0, old_ispec.it_value.tv_sec);
96*8ac5aef8SEnji Cooper   EXPECT_EQ(0, old_ispec.it_value.tv_nsec);
97*8ac5aef8SEnji Cooper   EXPECT_EQ(0, old_ispec.it_interval.tv_sec);
98*8ac5aef8SEnji Cooper   EXPECT_EQ(0, old_ispec.it_interval.tv_nsec);
99*8ac5aef8SEnji Cooper 
100*8ac5aef8SEnji Cooper   close(cap_fd_all);
101*8ac5aef8SEnji Cooper   close(cap_fd_rw);
102*8ac5aef8SEnji Cooper   close(cap_fd_wo);
103*8ac5aef8SEnji Cooper   close(cap_fd_ro);
104*8ac5aef8SEnji Cooper   close(fd);
105*8ac5aef8SEnji Cooper }
106*8ac5aef8SEnji Cooper 
107*8ac5aef8SEnji Cooper FORK_TEST(Linux, SignalFD) {
108*8ac5aef8SEnji Cooper   if (force_mt) {
109*8ac5aef8SEnji Cooper     TEST_SKIPPED("multi-threaded run clashes with signals");
110*8ac5aef8SEnji Cooper     return;
111*8ac5aef8SEnji Cooper   }
112*8ac5aef8SEnji Cooper   pid_t me = getpid();
113*8ac5aef8SEnji Cooper   sigset_t mask;
114*8ac5aef8SEnji Cooper   sigemptyset(&mask);
115*8ac5aef8SEnji Cooper   sigaddset(&mask, SIGUSR1);
116*8ac5aef8SEnji Cooper 
117*8ac5aef8SEnji Cooper   // Block signals before registering against a new signal FD.
118*8ac5aef8SEnji Cooper   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
119*8ac5aef8SEnji Cooper   int fd = signalfd(-1, &mask, 0);
120*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
121*8ac5aef8SEnji Cooper 
122*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
123*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
124*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
125*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
126*8ac5aef8SEnji Cooper   cap_rights_t r_sig;
127*8ac5aef8SEnji Cooper   cap_rights_init(&r_sig, CAP_FSIGNAL);
128*8ac5aef8SEnji Cooper   cap_rights_t r_rssig;
129*8ac5aef8SEnji Cooper   cap_rights_init(&r_rssig, CAP_FSIGNAL, CAP_READ, CAP_SEEK);
130*8ac5aef8SEnji Cooper   cap_rights_t r_rssig_poll;
131*8ac5aef8SEnji Cooper   cap_rights_init(&r_rssig_poll, CAP_FSIGNAL, CAP_READ, CAP_SEEK, CAP_EVENT);
132*8ac5aef8SEnji Cooper 
133*8ac5aef8SEnji Cooper   // Various capability variants.
134*8ac5aef8SEnji Cooper   int cap_fd_none = dup(fd);
135*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_none);
136*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_none, &r_ws));
137*8ac5aef8SEnji Cooper   int cap_fd_read = dup(fd);
138*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_read);
139*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_read, &r_rs));
140*8ac5aef8SEnji Cooper   int cap_fd_sig = dup(fd);
141*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_sig);
142*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_sig, &r_sig));
143*8ac5aef8SEnji Cooper   int cap_fd_sig_read = dup(fd);
144*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_sig_read);
145*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_sig_read, &r_rssig));
146*8ac5aef8SEnji Cooper   int cap_fd_all = dup(fd);
147*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_all);
148*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rssig_poll));
149*8ac5aef8SEnji Cooper 
150*8ac5aef8SEnji Cooper   struct signalfd_siginfo fdsi;
151*8ac5aef8SEnji Cooper 
152*8ac5aef8SEnji Cooper   // Need CAP_READ to read the signal information
153*8ac5aef8SEnji Cooper   kill(me, SIGUSR1);
154*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(read(cap_fd_none, &fdsi, sizeof(struct signalfd_siginfo)));
155*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(read(cap_fd_sig, &fdsi, sizeof(struct signalfd_siginfo)));
156*8ac5aef8SEnji Cooper   int len = read(cap_fd_read, &fdsi, sizeof(struct signalfd_siginfo));
157*8ac5aef8SEnji Cooper   EXPECT_OK(len);
158*8ac5aef8SEnji Cooper   EXPECT_EQ(sizeof(struct signalfd_siginfo), (size_t)len);
159*8ac5aef8SEnji Cooper   EXPECT_EQ(SIGUSR1, (int)fdsi.ssi_signo);
160*8ac5aef8SEnji Cooper 
161*8ac5aef8SEnji Cooper   // Need CAP_FSIGNAL to modify the signal mask.
162*8ac5aef8SEnji Cooper   sigemptyset(&mask);
163*8ac5aef8SEnji Cooper   sigaddset(&mask, SIGUSR1);
164*8ac5aef8SEnji Cooper   sigaddset(&mask, SIGUSR2);
165*8ac5aef8SEnji Cooper   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
166*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(signalfd(cap_fd_none, &mask, 0));
167*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(signalfd(cap_fd_read, &mask, 0));
168*8ac5aef8SEnji Cooper   EXPECT_EQ(cap_fd_sig, signalfd(cap_fd_sig, &mask, 0));
169*8ac5aef8SEnji Cooper 
170*8ac5aef8SEnji Cooper   // Need CAP_EVENT to get notification of a signal in poll(2).
171*8ac5aef8SEnji Cooper   kill(me, SIGUSR2);
172*8ac5aef8SEnji Cooper 
173*8ac5aef8SEnji Cooper   struct pollfd poll_fd;
174*8ac5aef8SEnji Cooper   poll_fd.revents = 0;
175*8ac5aef8SEnji Cooper   poll_fd.events = POLLIN;
176*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_sig_read;
177*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 400));
178*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
179*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
180*8ac5aef8SEnji Cooper 
181*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_all;
182*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 400));
183*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLIN));
184*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
185*8ac5aef8SEnji Cooper }
186*8ac5aef8SEnji Cooper 
187*8ac5aef8SEnji Cooper TEST(Linux, EventFD) {
188*8ac5aef8SEnji Cooper   int fd = eventfd(0, 0);
189*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
190*8ac5aef8SEnji Cooper 
191*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
192*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
193*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
194*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
195*8ac5aef8SEnji Cooper   cap_rights_t r_rws;
196*8ac5aef8SEnji Cooper   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
197*8ac5aef8SEnji Cooper   cap_rights_t r_rwspoll;
198*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
199*8ac5aef8SEnji Cooper 
200*8ac5aef8SEnji Cooper   int cap_ro = dup(fd);
201*8ac5aef8SEnji Cooper   EXPECT_OK(cap_ro);
202*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
203*8ac5aef8SEnji Cooper   int cap_wo = dup(fd);
204*8ac5aef8SEnji Cooper   EXPECT_OK(cap_wo);
205*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
206*8ac5aef8SEnji Cooper   int cap_rw = dup(fd);
207*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rw);
208*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_rw, &r_rws));
209*8ac5aef8SEnji Cooper   int cap_all = dup(fd);
210*8ac5aef8SEnji Cooper   EXPECT_OK(cap_all);
211*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_all, &r_rwspoll));
212*8ac5aef8SEnji Cooper 
213*8ac5aef8SEnji Cooper   pid_t child = fork();
214*8ac5aef8SEnji Cooper   if (child == 0) {
215*8ac5aef8SEnji Cooper     // Child: write counter to eventfd
216*8ac5aef8SEnji Cooper     uint64_t u = 42;
217*8ac5aef8SEnji Cooper     EXPECT_NOTCAPABLE(write(cap_ro, &u, sizeof(u)));
218*8ac5aef8SEnji Cooper     EXPECT_OK(write(cap_wo, &u, sizeof(u)));
219*8ac5aef8SEnji Cooper     exit(HasFailure());
220*8ac5aef8SEnji Cooper   }
221*8ac5aef8SEnji Cooper 
222*8ac5aef8SEnji Cooper   sleep(1);  // Allow child to write
223*8ac5aef8SEnji Cooper 
224*8ac5aef8SEnji Cooper   struct pollfd poll_fd;
225*8ac5aef8SEnji Cooper   poll_fd.revents = 0;
226*8ac5aef8SEnji Cooper   poll_fd.events = POLLIN;
227*8ac5aef8SEnji Cooper   poll_fd.fd = cap_rw;
228*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 400));
229*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
230*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
231*8ac5aef8SEnji Cooper 
232*8ac5aef8SEnji Cooper   poll_fd.fd = cap_all;
233*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 400));
234*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLIN));
235*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
236*8ac5aef8SEnji Cooper 
237*8ac5aef8SEnji Cooper   uint64_t u;
238*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(read(cap_wo, &u, sizeof(u)));
239*8ac5aef8SEnji Cooper   EXPECT_OK(read(cap_ro, &u, sizeof(u)));
240*8ac5aef8SEnji Cooper   EXPECT_EQ(42, (int)u);
241*8ac5aef8SEnji Cooper 
242*8ac5aef8SEnji Cooper   // Wait for the child.
243*8ac5aef8SEnji Cooper   int status;
244*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, 0));
245*8ac5aef8SEnji Cooper   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
246*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
247*8ac5aef8SEnji Cooper 
248*8ac5aef8SEnji Cooper   close(cap_all);
249*8ac5aef8SEnji Cooper   close(cap_rw);
250*8ac5aef8SEnji Cooper   close(cap_wo);
251*8ac5aef8SEnji Cooper   close(cap_ro);
252*8ac5aef8SEnji Cooper   close(fd);
253*8ac5aef8SEnji Cooper }
254*8ac5aef8SEnji Cooper 
255*8ac5aef8SEnji Cooper FORK_TEST(Linux, epoll) {
256*8ac5aef8SEnji Cooper   int sock_fds[2];
257*8ac5aef8SEnji Cooper   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds));
258*8ac5aef8SEnji Cooper   // Queue some data.
259*8ac5aef8SEnji Cooper   char buffer[4] = {1, 2, 3, 4};
260*8ac5aef8SEnji Cooper   EXPECT_OK(write(sock_fds[1], buffer, sizeof(buffer)));
261*8ac5aef8SEnji Cooper 
262*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());  // Enter capability mode.
263*8ac5aef8SEnji Cooper 
264*8ac5aef8SEnji Cooper   int epoll_fd = epoll_create(1);
265*8ac5aef8SEnji Cooper   EXPECT_OK(epoll_fd);
266*8ac5aef8SEnji Cooper 
267*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
268*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
269*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
270*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
271*8ac5aef8SEnji Cooper   cap_rights_t r_rws;
272*8ac5aef8SEnji Cooper   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
273*8ac5aef8SEnji Cooper   cap_rights_t r_rwspoll;
274*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
275*8ac5aef8SEnji Cooper   cap_rights_t r_epoll;
276*8ac5aef8SEnji Cooper   cap_rights_init(&r_epoll, CAP_EPOLL_CTL);
277*8ac5aef8SEnji Cooper 
278*8ac5aef8SEnji Cooper   int cap_epoll_wo = dup(epoll_fd);
279*8ac5aef8SEnji Cooper   EXPECT_OK(cap_epoll_wo);
280*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_epoll_wo, &r_ws));
281*8ac5aef8SEnji Cooper   int cap_epoll_ro = dup(epoll_fd);
282*8ac5aef8SEnji Cooper   EXPECT_OK(cap_epoll_ro);
283*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_epoll_ro, &r_rs));
284*8ac5aef8SEnji Cooper   int cap_epoll_rw = dup(epoll_fd);
285*8ac5aef8SEnji Cooper   EXPECT_OK(cap_epoll_rw);
286*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_epoll_rw, &r_rws));
287*8ac5aef8SEnji Cooper   int cap_epoll_poll = dup(epoll_fd);
288*8ac5aef8SEnji Cooper   EXPECT_OK(cap_epoll_poll);
289*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_epoll_poll, &r_rwspoll));
290*8ac5aef8SEnji Cooper   int cap_epoll_ctl = dup(epoll_fd);
291*8ac5aef8SEnji Cooper   EXPECT_OK(cap_epoll_ctl);
292*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_epoll_ctl, &r_epoll));
293*8ac5aef8SEnji Cooper 
294*8ac5aef8SEnji Cooper   // Can only modify the FDs being monitored if the CAP_EPOLL_CTL right is present.
295*8ac5aef8SEnji Cooper   struct epoll_event eev;
296*8ac5aef8SEnji Cooper   memset(&eev, 0, sizeof(eev));
297*8ac5aef8SEnji Cooper   eev.events = EPOLLIN|EPOLLOUT|EPOLLPRI;
298*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_ADD, sock_fds[0], &eev));
299*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_ADD, sock_fds[0], &eev));
300*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_ADD, sock_fds[0], &eev));
301*8ac5aef8SEnji Cooper   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_ADD, sock_fds[0], &eev));
302*8ac5aef8SEnji Cooper   eev.events = EPOLLIN|EPOLLOUT;
303*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_MOD, sock_fds[0], &eev));
304*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_MOD, sock_fds[0], &eev));
305*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_MOD, sock_fds[0], &eev));
306*8ac5aef8SEnji Cooper   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_MOD, sock_fds[0], &eev));
307*8ac5aef8SEnji Cooper 
308*8ac5aef8SEnji Cooper   // Running epoll_pwait(2) requires CAP_EVENT.
309*8ac5aef8SEnji Cooper   eev.events = 0;
310*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_ro, &eev, 1, 100, NULL));
311*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_wo, &eev, 1, 100, NULL));
312*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_rw, &eev, 1, 100, NULL));
313*8ac5aef8SEnji Cooper   EXPECT_OK(epoll_pwait(cap_epoll_poll, &eev, 1, 100, NULL));
314*8ac5aef8SEnji Cooper   EXPECT_EQ(EPOLLIN, eev.events & EPOLLIN);
315*8ac5aef8SEnji Cooper 
316*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_DEL, sock_fds[0], &eev));
317*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_DEL, sock_fds[0], &eev));
318*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_DEL, sock_fds[0], &eev));
319*8ac5aef8SEnji Cooper   EXPECT_OK(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, sock_fds[0], &eev));
320*8ac5aef8SEnji Cooper 
321*8ac5aef8SEnji Cooper   close(cap_epoll_ctl);
322*8ac5aef8SEnji Cooper   close(cap_epoll_poll);
323*8ac5aef8SEnji Cooper   close(cap_epoll_rw);
324*8ac5aef8SEnji Cooper   close(cap_epoll_ro);
325*8ac5aef8SEnji Cooper   close(cap_epoll_wo);
326*8ac5aef8SEnji Cooper   close(epoll_fd);
327*8ac5aef8SEnji Cooper   close(sock_fds[1]);
328*8ac5aef8SEnji Cooper   close(sock_fds[0]);
329*8ac5aef8SEnji Cooper }
330*8ac5aef8SEnji Cooper 
331*8ac5aef8SEnji Cooper TEST(Linux, fstatat) {
332*8ac5aef8SEnji Cooper   int fd = open(TmpFile("cap_fstatat"), O_CREAT|O_RDWR, 0644);
333*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
334*8ac5aef8SEnji Cooper   unsigned char buffer[] = {1, 2, 3, 4};
335*8ac5aef8SEnji Cooper   EXPECT_OK(write(fd, buffer, sizeof(buffer)));
336*8ac5aef8SEnji Cooper   cap_rights_t rights;
337*8ac5aef8SEnji Cooper   int cap_rf = dup(fd);
338*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rf);
339*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
340*8ac5aef8SEnji Cooper   int cap_ro = dup(fd);
341*8ac5aef8SEnji Cooper   EXPECT_OK(cap_ro);
342*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_ro, cap_rights_init(&rights, CAP_READ)));
343*8ac5aef8SEnji Cooper 
344*8ac5aef8SEnji Cooper   struct stat info;
345*8ac5aef8SEnji Cooper   EXPECT_OK(fstatat(fd, "", &info, AT_EMPTY_PATH));
346*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fstatat(cap_ro, "", &info, AT_EMPTY_PATH));
347*8ac5aef8SEnji Cooper   EXPECT_OK(fstatat(cap_rf, "", &info, AT_EMPTY_PATH));
348*8ac5aef8SEnji Cooper 
349*8ac5aef8SEnji Cooper   close(cap_ro);
350*8ac5aef8SEnji Cooper   close(cap_rf);
351*8ac5aef8SEnji Cooper   close(fd);
352*8ac5aef8SEnji Cooper 
353*8ac5aef8SEnji Cooper   int dir = open(tmpdir.c_str(), O_RDONLY);
354*8ac5aef8SEnji Cooper   EXPECT_OK(dir);
355*8ac5aef8SEnji Cooper   int dir_rf = dup(dir);
356*8ac5aef8SEnji Cooper   EXPECT_OK(dir_rf);
357*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(dir_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
358*8ac5aef8SEnji Cooper   int dir_ro = dup(fd);
359*8ac5aef8SEnji Cooper   EXPECT_OK(dir_ro);
360*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(dir_ro, cap_rights_init(&rights, CAP_READ)));
361*8ac5aef8SEnji Cooper 
362*8ac5aef8SEnji Cooper   EXPECT_OK(fstatat(dir, "cap_fstatat", &info, AT_EMPTY_PATH));
363*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fstatat(dir_ro, "cap_fstatat", &info, AT_EMPTY_PATH));
364*8ac5aef8SEnji Cooper   EXPECT_OK(fstatat(dir_rf, "cap_fstatat", &info, AT_EMPTY_PATH));
365*8ac5aef8SEnji Cooper 
366*8ac5aef8SEnji Cooper   close(dir_ro);
367*8ac5aef8SEnji Cooper   close(dir_rf);
368*8ac5aef8SEnji Cooper   close(dir);
369*8ac5aef8SEnji Cooper 
370*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_fstatat"));
371*8ac5aef8SEnji Cooper }
372*8ac5aef8SEnji Cooper 
373*8ac5aef8SEnji Cooper // fanotify support may not be available at compile-time
374*8ac5aef8SEnji Cooper #ifdef __NR_fanotify_init
375*8ac5aef8SEnji Cooper TEST(Linux, fanotify) {
376*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
377*8ac5aef8SEnji Cooper   int fa_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDWR);
378*8ac5aef8SEnji Cooper   EXPECT_OK(fa_fd);
379*8ac5aef8SEnji Cooper   if (fa_fd < 0) return;  // May not be enabled
380*8ac5aef8SEnji Cooper 
381*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
382*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
383*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
384*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
385*8ac5aef8SEnji Cooper   cap_rights_t r_rws;
386*8ac5aef8SEnji Cooper   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
387*8ac5aef8SEnji Cooper   cap_rights_t r_rwspoll;
388*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
389*8ac5aef8SEnji Cooper   cap_rights_t r_rwsnotify;
390*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
391*8ac5aef8SEnji Cooper   cap_rights_t r_rsl;
392*8ac5aef8SEnji Cooper   cap_rights_init(&r_rsl, CAP_READ, CAP_SEEK, CAP_LOOKUP);
393*8ac5aef8SEnji Cooper   cap_rights_t r_rslstat;
394*8ac5aef8SEnji Cooper   cap_rights_init(&r_rslstat, CAP_READ, CAP_SEEK, CAP_LOOKUP, CAP_FSTAT);
395*8ac5aef8SEnji Cooper   cap_rights_t r_rsstat;
396*8ac5aef8SEnji Cooper   cap_rights_init(&r_rsstat, CAP_READ, CAP_SEEK, CAP_FSTAT);
397*8ac5aef8SEnji Cooper 
398*8ac5aef8SEnji Cooper   int cap_fd_ro = dup(fa_fd);
399*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_ro);
400*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
401*8ac5aef8SEnji Cooper   int cap_fd_wo = dup(fa_fd);
402*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_wo);
403*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
404*8ac5aef8SEnji Cooper   int cap_fd_rw = dup(fa_fd);
405*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_rw);
406*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
407*8ac5aef8SEnji Cooper   int cap_fd_poll = dup(fa_fd);
408*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_poll);
409*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_poll, &r_rwspoll));
410*8ac5aef8SEnji Cooper   int cap_fd_not = dup(fa_fd);
411*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_not);
412*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_not, &r_rwsnotify));
413*8ac5aef8SEnji Cooper 
414*8ac5aef8SEnji Cooper   int rc = mkdir(TmpFile("cap_notify"), 0755);
415*8ac5aef8SEnji Cooper   EXPECT_TRUE(rc == 0 || errno == EEXIST);
416*8ac5aef8SEnji Cooper   int dfd = open(TmpFile("cap_notify"), O_RDONLY);
417*8ac5aef8SEnji Cooper   EXPECT_OK(dfd);
418*8ac5aef8SEnji Cooper   int fd = open(TmpFile("cap_notify/file"), O_CREAT|O_RDWR, 0644);
419*8ac5aef8SEnji Cooper   close(fd);
420*8ac5aef8SEnji Cooper   int cap_dfd = dup(dfd);
421*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd);
422*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_dfd, &r_rslstat));
423*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd);
424*8ac5aef8SEnji Cooper   int cap_dfd_rs = dup(dfd);
425*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rs);
426*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_dfd_rs, &r_rs));
427*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rs);
428*8ac5aef8SEnji Cooper   int cap_dfd_rsstat = dup(dfd);
429*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rsstat);
430*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_dfd_rsstat, &r_rsstat));
431*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rsstat);
432*8ac5aef8SEnji Cooper   int cap_dfd_rsl = dup(dfd);
433*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rsl);
434*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_dfd_rsl, &r_rsl));
435*8ac5aef8SEnji Cooper   EXPECT_OK(cap_dfd_rsl);
436*8ac5aef8SEnji Cooper 
437*8ac5aef8SEnji Cooper   // Need CAP_NOTIFY to change what's monitored.
438*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_ro, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
439*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_wo, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
440*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_rw, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
441*8ac5aef8SEnji Cooper   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
442*8ac5aef8SEnji Cooper 
443*8ac5aef8SEnji Cooper   // Need CAP_FSTAT on the thing monitored.
444*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rs, NULL));
445*8ac5aef8SEnji Cooper   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rsstat, NULL));
446*8ac5aef8SEnji Cooper 
447*8ac5aef8SEnji Cooper   // Too add monitoring of a file under a dfd, need CAP_LOOKUP|CAP_FSTAT on the dfd.
448*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsstat, "file"));
449*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsl, "file"));
450*8ac5aef8SEnji Cooper   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd, "file"));
451*8ac5aef8SEnji Cooper 
452*8ac5aef8SEnji Cooper   pid_t child = fork();
453*8ac5aef8SEnji Cooper   if (child == 0) {
454*8ac5aef8SEnji Cooper     // Child: Perform activity in the directory under notify.
455*8ac5aef8SEnji Cooper     sleep(1);
456*8ac5aef8SEnji Cooper     unlink(TmpFile("cap_notify/temp"));
457*8ac5aef8SEnji Cooper     int fd = open(TmpFile("cap_notify/temp"), O_CREAT|O_RDWR, 0644);
458*8ac5aef8SEnji Cooper     close(fd);
459*8ac5aef8SEnji Cooper     exit(0);
460*8ac5aef8SEnji Cooper   }
461*8ac5aef8SEnji Cooper 
462*8ac5aef8SEnji Cooper   // Need CAP_EVENT to poll.
463*8ac5aef8SEnji Cooper   struct pollfd poll_fd;
464*8ac5aef8SEnji Cooper   poll_fd.revents = 0;
465*8ac5aef8SEnji Cooper   poll_fd.events = POLLIN;
466*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_rw;
467*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 1400));
468*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
469*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
470*8ac5aef8SEnji Cooper 
471*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_not;
472*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 1400));
473*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
474*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
475*8ac5aef8SEnji Cooper 
476*8ac5aef8SEnji Cooper   poll_fd.fd = cap_fd_poll;
477*8ac5aef8SEnji Cooper   EXPECT_OK(poll(&poll_fd, 1, 1400));
478*8ac5aef8SEnji Cooper   EXPECT_NE(0, (poll_fd.revents & POLLIN));
479*8ac5aef8SEnji Cooper   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
480*8ac5aef8SEnji Cooper 
481*8ac5aef8SEnji Cooper   // Need CAP_READ to read.
482*8ac5aef8SEnji Cooper   struct fanotify_event_metadata ev;
483*8ac5aef8SEnji Cooper   memset(&ev, 0, sizeof(ev));
484*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(read(cap_fd_wo, &ev, sizeof(ev)));
485*8ac5aef8SEnji Cooper   rc = read(fa_fd, &ev, sizeof(ev));
486*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
487*8ac5aef8SEnji Cooper   EXPECT_EQ((int)sizeof(struct fanotify_event_metadata), rc);
488*8ac5aef8SEnji Cooper   EXPECT_EQ(child, ev.pid);
489*8ac5aef8SEnji Cooper   EXPECT_NE(0, ev.fd);
490*8ac5aef8SEnji Cooper 
491*8ac5aef8SEnji Cooper   // TODO(drysdale): reinstate if/when capsicum-linux propagates rights
492*8ac5aef8SEnji Cooper   // to fanotify-generated FDs.
493*8ac5aef8SEnji Cooper #ifdef OMIT
494*8ac5aef8SEnji Cooper   // fanotify(7) gives us a FD for the changed file.  This should
495*8ac5aef8SEnji Cooper   // only have rights that are a subset of those for the original
496*8ac5aef8SEnji Cooper   // monitored directory file descriptor.
497*8ac5aef8SEnji Cooper   cap_rights_t rights;
498*8ac5aef8SEnji Cooper   CAP_SET_ALL(&rights);
499*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_get(ev.fd, &rights));
500*8ac5aef8SEnji Cooper   EXPECT_RIGHTS_IN(&rights, &r_rslstat);
501*8ac5aef8SEnji Cooper #endif
502*8ac5aef8SEnji Cooper 
503*8ac5aef8SEnji Cooper   // Wait for the child.
504*8ac5aef8SEnji Cooper   int status;
505*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, 0));
506*8ac5aef8SEnji Cooper   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
507*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
508*8ac5aef8SEnji Cooper 
509*8ac5aef8SEnji Cooper   close(cap_dfd_rsstat);
510*8ac5aef8SEnji Cooper   close(cap_dfd_rsl);
511*8ac5aef8SEnji Cooper   close(cap_dfd_rs);
512*8ac5aef8SEnji Cooper   close(cap_dfd);
513*8ac5aef8SEnji Cooper   close(dfd);
514*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_notify/file"));
515*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_notify/temp"));
516*8ac5aef8SEnji Cooper   rmdir(TmpFile("cap_notify"));
517*8ac5aef8SEnji Cooper   close(cap_fd_not);
518*8ac5aef8SEnji Cooper   close(cap_fd_poll);
519*8ac5aef8SEnji Cooper   close(cap_fd_rw);
520*8ac5aef8SEnji Cooper   close(cap_fd_wo);
521*8ac5aef8SEnji Cooper   close(cap_fd_ro);
522*8ac5aef8SEnji Cooper   close(fa_fd);
523*8ac5aef8SEnji Cooper }
524*8ac5aef8SEnji Cooper #endif
525*8ac5aef8SEnji Cooper 
526*8ac5aef8SEnji Cooper TEST(Linux, inotify) {
527*8ac5aef8SEnji Cooper   int i_fd = inotify_init();
528*8ac5aef8SEnji Cooper   EXPECT_OK(i_fd);
529*8ac5aef8SEnji Cooper 
530*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
531*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
532*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
533*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
534*8ac5aef8SEnji Cooper   cap_rights_t r_rws;
535*8ac5aef8SEnji Cooper   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
536*8ac5aef8SEnji Cooper   cap_rights_t r_rwsnotify;
537*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
538*8ac5aef8SEnji Cooper 
539*8ac5aef8SEnji Cooper   int cap_fd_ro = dup(i_fd);
540*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_ro);
541*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
542*8ac5aef8SEnji Cooper   int cap_fd_wo = dup(i_fd);
543*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_wo);
544*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
545*8ac5aef8SEnji Cooper   int cap_fd_rw = dup(i_fd);
546*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_rw);
547*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
548*8ac5aef8SEnji Cooper   int cap_fd_all = dup(i_fd);
549*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_all);
550*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwsnotify));
551*8ac5aef8SEnji Cooper 
552*8ac5aef8SEnji Cooper   int fd = open(TmpFile("cap_inotify"), O_CREAT|O_RDWR, 0644);
553*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(inotify_add_watch(cap_fd_rw, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY));
554*8ac5aef8SEnji Cooper   int wd = inotify_add_watch(i_fd, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY);
555*8ac5aef8SEnji Cooper   EXPECT_OK(wd);
556*8ac5aef8SEnji Cooper 
557*8ac5aef8SEnji Cooper   unsigned char buffer[] = {1, 2, 3, 4};
558*8ac5aef8SEnji Cooper   EXPECT_OK(write(fd, buffer, sizeof(buffer)));
559*8ac5aef8SEnji Cooper 
560*8ac5aef8SEnji Cooper   struct inotify_event iev;
561*8ac5aef8SEnji Cooper   memset(&iev, 0, sizeof(iev));
562*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(read(cap_fd_wo, &iev, sizeof(iev)));
563*8ac5aef8SEnji Cooper   int rc = read(cap_fd_ro, &iev, sizeof(iev));
564*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
565*8ac5aef8SEnji Cooper   EXPECT_EQ((int)sizeof(iev), rc);
566*8ac5aef8SEnji Cooper   EXPECT_EQ(wd, iev.wd);
567*8ac5aef8SEnji Cooper 
568*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(inotify_rm_watch(cap_fd_wo, wd));
569*8ac5aef8SEnji Cooper   EXPECT_OK(inotify_rm_watch(cap_fd_all, wd));
570*8ac5aef8SEnji Cooper 
571*8ac5aef8SEnji Cooper   close(fd);
572*8ac5aef8SEnji Cooper   close(cap_fd_all);
573*8ac5aef8SEnji Cooper   close(cap_fd_rw);
574*8ac5aef8SEnji Cooper   close(cap_fd_wo);
575*8ac5aef8SEnji Cooper   close(cap_fd_ro);
576*8ac5aef8SEnji Cooper   close(i_fd);
577*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_inotify"));
578*8ac5aef8SEnji Cooper }
579*8ac5aef8SEnji Cooper 
580*8ac5aef8SEnji Cooper TEST(Linux, ArchChange) {
581*8ac5aef8SEnji Cooper   const char* prog_candidates[] = {"./mini-me.32", "./mini-me.x32", "./mini-me.64"};
582*8ac5aef8SEnji Cooper   const char* progs[] = {NULL, NULL, NULL};
583*8ac5aef8SEnji Cooper   char* argv_pass[] = {(char*)"to-come", (char*)"--capmode", NULL};
584*8ac5aef8SEnji Cooper   char* null_envp[] = {NULL};
585*8ac5aef8SEnji Cooper   int fds[3];
586*8ac5aef8SEnji Cooper   int count = 0;
587*8ac5aef8SEnji Cooper 
588*8ac5aef8SEnji Cooper   for (int ii = 0; ii < 3; ii++) {
589*8ac5aef8SEnji Cooper     fds[count] = open(prog_candidates[ii], O_RDONLY);
590*8ac5aef8SEnji Cooper     if (fds[count] >= 0) {
591*8ac5aef8SEnji Cooper       progs[count] = prog_candidates[ii];
592*8ac5aef8SEnji Cooper       count++;
593*8ac5aef8SEnji Cooper     }
594*8ac5aef8SEnji Cooper   }
595*8ac5aef8SEnji Cooper   if (count == 0) {
596*8ac5aef8SEnji Cooper     TEST_SKIPPED("no different-architecture programs available");
597*8ac5aef8SEnji Cooper     return;
598*8ac5aef8SEnji Cooper   }
599*8ac5aef8SEnji Cooper 
600*8ac5aef8SEnji Cooper   for (int ii = 0; ii < count; ii++) {
601*8ac5aef8SEnji Cooper     // Fork-and-exec a binary of this architecture.
602*8ac5aef8SEnji Cooper     pid_t child = fork();
603*8ac5aef8SEnji Cooper     if (child == 0) {
604*8ac5aef8SEnji Cooper       EXPECT_OK(cap_enter());  // Enter capability mode
605*8ac5aef8SEnji Cooper       if (verbose) fprintf(stderr, "[%d] call fexecve(%s, %s)\n",
606*8ac5aef8SEnji Cooper                            getpid_(), progs[ii], argv_pass[1]);
607*8ac5aef8SEnji Cooper       argv_pass[0] = (char *)progs[ii];
608*8ac5aef8SEnji Cooper       int rc = fexecve_(fds[ii], argv_pass, null_envp);
609*8ac5aef8SEnji Cooper       fprintf(stderr, "fexecve(%s) returned %d errno %d\n", progs[ii], rc, errno);
610*8ac5aef8SEnji Cooper       exit(99);  // Should not reach here.
611*8ac5aef8SEnji Cooper     }
612*8ac5aef8SEnji Cooper     int status;
613*8ac5aef8SEnji Cooper     EXPECT_EQ(child, waitpid(child, &status, 0));
614*8ac5aef8SEnji Cooper     int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
615*8ac5aef8SEnji Cooper     EXPECT_EQ(0, rc);
616*8ac5aef8SEnji Cooper     close(fds[ii]);
617*8ac5aef8SEnji Cooper   }
618*8ac5aef8SEnji Cooper }
619*8ac5aef8SEnji Cooper 
620*8ac5aef8SEnji Cooper FORK_TEST(Linux, Namespace) {
621*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
622*8ac5aef8SEnji Cooper   pid_t me = getpid_();
623*8ac5aef8SEnji Cooper 
624*8ac5aef8SEnji Cooper   // Create a new UTS namespace.
625*8ac5aef8SEnji Cooper   EXPECT_OK(unshare(CLONE_NEWUTS));
626*8ac5aef8SEnji Cooper   // Open an FD to its symlink.
627*8ac5aef8SEnji Cooper   char buffer[256];
628*8ac5aef8SEnji Cooper   sprintf(buffer, "/proc/%d/ns/uts", me);
629*8ac5aef8SEnji Cooper   int ns_fd = open(buffer, O_RDONLY);
630*8ac5aef8SEnji Cooper 
631*8ac5aef8SEnji Cooper   cap_rights_t r_rwlstat;
632*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwlstat, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT);
633*8ac5aef8SEnji Cooper   cap_rights_t r_rwlstatns;
634*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwlstatns, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT, CAP_SETNS);
635*8ac5aef8SEnji Cooper 
636*8ac5aef8SEnji Cooper   int cap_fd = dup(ns_fd);
637*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd);
638*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd, &r_rwlstat));
639*8ac5aef8SEnji Cooper   int cap_fd_setns = dup(ns_fd);
640*8ac5aef8SEnji Cooper   EXPECT_OK(cap_fd_setns);
641*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_fd_setns, &r_rwlstatns));
642*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(setns(cap_fd, CLONE_NEWUTS));
643*8ac5aef8SEnji Cooper   EXPECT_OK(setns(cap_fd_setns, CLONE_NEWUTS));
644*8ac5aef8SEnji Cooper 
645*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());  // Enter capability mode.
646*8ac5aef8SEnji Cooper 
647*8ac5aef8SEnji Cooper   // No setns(2) but unshare(2) is allowed.
648*8ac5aef8SEnji Cooper   EXPECT_CAPMODE(setns(ns_fd, CLONE_NEWUTS));
649*8ac5aef8SEnji Cooper   EXPECT_OK(unshare(CLONE_NEWUTS));
650*8ac5aef8SEnji Cooper }
651*8ac5aef8SEnji Cooper 
652*8ac5aef8SEnji Cooper static void SendFD(int fd, int over) {
653*8ac5aef8SEnji Cooper   struct msghdr mh;
654*8ac5aef8SEnji Cooper   mh.msg_name = NULL;  // No address needed
655*8ac5aef8SEnji Cooper   mh.msg_namelen = 0;
656*8ac5aef8SEnji Cooper   char buffer1[1024];
657*8ac5aef8SEnji Cooper   struct iovec iov[1];
658*8ac5aef8SEnji Cooper   iov[0].iov_base = buffer1;
659*8ac5aef8SEnji Cooper   iov[0].iov_len = sizeof(buffer1);
660*8ac5aef8SEnji Cooper   mh.msg_iov = iov;
661*8ac5aef8SEnji Cooper   mh.msg_iovlen = 1;
662*8ac5aef8SEnji Cooper   char buffer2[1024];
663*8ac5aef8SEnji Cooper   mh.msg_control = buffer2;
664*8ac5aef8SEnji Cooper   mh.msg_controllen = CMSG_LEN(sizeof(int));
665*8ac5aef8SEnji Cooper   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
666*8ac5aef8SEnji Cooper   cmptr->cmsg_level = SOL_SOCKET;
667*8ac5aef8SEnji Cooper   cmptr->cmsg_type = SCM_RIGHTS;
668*8ac5aef8SEnji Cooper   cmptr->cmsg_len = CMSG_LEN(sizeof(int));
669*8ac5aef8SEnji Cooper   *(int *)CMSG_DATA(cmptr) = fd;
670*8ac5aef8SEnji Cooper   buffer1[0] = 0;
671*8ac5aef8SEnji Cooper   iov[0].iov_len = 1;
672*8ac5aef8SEnji Cooper   int rc = sendmsg(over, &mh, 0);
673*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
674*8ac5aef8SEnji Cooper }
675*8ac5aef8SEnji Cooper 
676*8ac5aef8SEnji Cooper static int ReceiveFD(int over) {
677*8ac5aef8SEnji Cooper   struct msghdr mh;
678*8ac5aef8SEnji Cooper   mh.msg_name = NULL;  // No address needed
679*8ac5aef8SEnji Cooper   mh.msg_namelen = 0;
680*8ac5aef8SEnji Cooper   char buffer1[1024];
681*8ac5aef8SEnji Cooper   struct iovec iov[1];
682*8ac5aef8SEnji Cooper   iov[0].iov_base = buffer1;
683*8ac5aef8SEnji Cooper   iov[0].iov_len = sizeof(buffer1);
684*8ac5aef8SEnji Cooper   mh.msg_iov = iov;
685*8ac5aef8SEnji Cooper   mh.msg_iovlen = 1;
686*8ac5aef8SEnji Cooper   char buffer2[1024];
687*8ac5aef8SEnji Cooper   mh.msg_control = buffer2;
688*8ac5aef8SEnji Cooper   mh.msg_controllen = sizeof(buffer2);
689*8ac5aef8SEnji Cooper   int rc = recvmsg(over, &mh, 0);
690*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
691*8ac5aef8SEnji Cooper   EXPECT_LE(CMSG_LEN(sizeof(int)), mh.msg_controllen);
692*8ac5aef8SEnji Cooper   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
693*8ac5aef8SEnji Cooper   int fd = *(int*)CMSG_DATA(cmptr);
694*8ac5aef8SEnji Cooper   EXPECT_EQ(CMSG_LEN(sizeof(int)), cmptr->cmsg_len);
695*8ac5aef8SEnji Cooper   cmptr = CMSG_NXTHDR(&mh, cmptr);
696*8ac5aef8SEnji Cooper   EXPECT_TRUE(cmptr == NULL);
697*8ac5aef8SEnji Cooper   return fd;
698*8ac5aef8SEnji Cooper }
699*8ac5aef8SEnji Cooper 
700*8ac5aef8SEnji Cooper static int shared_pd = -1;
701*8ac5aef8SEnji Cooper static int shared_sock_fds[2];
702*8ac5aef8SEnji Cooper 
703*8ac5aef8SEnji Cooper static int ChildFunc(void *arg) {
704*8ac5aef8SEnji Cooper   // This function is running in a new PID namespace, and so is pid 1.
705*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
706*8ac5aef8SEnji Cooper   EXPECT_EQ(1, getpid_());
707*8ac5aef8SEnji Cooper   EXPECT_EQ(0, getppid());
708*8ac5aef8SEnji Cooper 
709*8ac5aef8SEnji Cooper   // The shared process descriptor is outside our namespace, so we cannot
710*8ac5aef8SEnji Cooper   // get its pid.
711*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: shared_pd=%d\n", shared_pd);
712*8ac5aef8SEnji Cooper   pid_t shared_child = -1;
713*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(shared_pd, &shared_child));
714*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: corresponding pid=%d\n", shared_child);
715*8ac5aef8SEnji Cooper   EXPECT_EQ(0, shared_child);
716*8ac5aef8SEnji Cooper 
717*8ac5aef8SEnji Cooper   // But we can pdkill() it even so.
718*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: call pdkill(pd=%d)\n", shared_pd);
719*8ac5aef8SEnji Cooper   EXPECT_OK(pdkill(shared_pd, SIGINT));
720*8ac5aef8SEnji Cooper 
721*8ac5aef8SEnji Cooper   int pd;
722*8ac5aef8SEnji Cooper   pid_t child = pdfork(&pd, 0);
723*8ac5aef8SEnji Cooper   EXPECT_OK(child);
724*8ac5aef8SEnji Cooper   if (child == 0) {
725*8ac5aef8SEnji Cooper     // Child: expect pid 2.
726*8ac5aef8SEnji Cooper     if (verbose) fprintf(stderr, "      child of ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
727*8ac5aef8SEnji Cooper     EXPECT_EQ(2, getpid_());
728*8ac5aef8SEnji Cooper     EXPECT_EQ(1, getppid());
729*8ac5aef8SEnji Cooper     while (true) {
730*8ac5aef8SEnji Cooper       if (verbose) fprintf(stderr, "      child of ChildFunc: \"I aten't dead\"\n");
731*8ac5aef8SEnji Cooper       sleep(1);
732*8ac5aef8SEnji Cooper     }
733*8ac5aef8SEnji Cooper     exit(0);
734*8ac5aef8SEnji Cooper   }
735*8ac5aef8SEnji Cooper   EXPECT_EQ(2, child);
736*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(child);
737*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
738*8ac5aef8SEnji Cooper                        pd, child, ProcessState(child));
739*8ac5aef8SEnji Cooper 
740*8ac5aef8SEnji Cooper   pid_t pid;
741*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(pd, &pid));
742*8ac5aef8SEnji Cooper   EXPECT_EQ(child, pid);
743*8ac5aef8SEnji Cooper 
744*8ac5aef8SEnji Cooper   sleep(2);
745*8ac5aef8SEnji Cooper 
746*8ac5aef8SEnji Cooper   // Send the process descriptor over UNIX domain socket back to parent.
747*8ac5aef8SEnji Cooper   SendFD(pd, shared_sock_fds[1]);
748*8ac5aef8SEnji Cooper 
749*8ac5aef8SEnji Cooper   // Wait for death of (grand)child, killed by our parent.
750*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: wait on pid=%d\n", child);
751*8ac5aef8SEnji Cooper   int status;
752*8ac5aef8SEnji Cooper   EXPECT_EQ(child, wait4(child, &status, __WALL, NULL));
753*8ac5aef8SEnji Cooper 
754*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "    ChildFunc: return 0\n");
755*8ac5aef8SEnji Cooper   return 0;
756*8ac5aef8SEnji Cooper }
757*8ac5aef8SEnji Cooper 
758*8ac5aef8SEnji Cooper #define STACK_SIZE (1024 * 1024)
759*8ac5aef8SEnji Cooper static char child_stack[STACK_SIZE];
760*8ac5aef8SEnji Cooper 
761*8ac5aef8SEnji Cooper // TODO(drysdale): fork into a user namespace first so REQUIRE_ROOT can be removed.
762*8ac5aef8SEnji Cooper TEST(Linux, PidNamespacePdFork) {
763*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
764*8ac5aef8SEnji Cooper   // Pass process descriptors in both directions across a PID namespace boundary.
765*8ac5aef8SEnji Cooper   // pdfork() off a child before we start, holding its process descriptor in a global
766*8ac5aef8SEnji Cooper   // variable that's accessible to children.
767*8ac5aef8SEnji Cooper   pid_t firstborn = pdfork(&shared_pd, 0);
768*8ac5aef8SEnji Cooper   EXPECT_OK(firstborn);
769*8ac5aef8SEnji Cooper   if (firstborn == 0) {
770*8ac5aef8SEnji Cooper     while (true) {
771*8ac5aef8SEnji Cooper       if (verbose) fprintf(stderr, "  Firstborn: \"I aten't dead\"\n");
772*8ac5aef8SEnji Cooper       sleep(1);
773*8ac5aef8SEnji Cooper     }
774*8ac5aef8SEnji Cooper     exit(0);
775*8ac5aef8SEnji Cooper   }
776*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(firstborn);
777*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: pre-pdfork()ed pd=%d, pid=%d state='%c'\n",
778*8ac5aef8SEnji Cooper                        shared_pd, firstborn, ProcessState(firstborn));
779*8ac5aef8SEnji Cooper   sleep(2);
780*8ac5aef8SEnji Cooper 
781*8ac5aef8SEnji Cooper   // Prepare sockets to communicate with child process.
782*8ac5aef8SEnji Cooper   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
783*8ac5aef8SEnji Cooper 
784*8ac5aef8SEnji Cooper   // Clone into a child process with a new pid namespace.
785*8ac5aef8SEnji Cooper   pid_t child = clone(ChildFunc, child_stack + STACK_SIZE,
786*8ac5aef8SEnji Cooper                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
787*8ac5aef8SEnji Cooper   EXPECT_OK(child);
788*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(child);
789*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
790*8ac5aef8SEnji Cooper 
791*8ac5aef8SEnji Cooper   // Ensure the child runs.  First thing it does is to kill our firstborn, using shared_pd.
792*8ac5aef8SEnji Cooper   sleep(1);
793*8ac5aef8SEnji Cooper   EXPECT_PID_DEAD(firstborn);
794*8ac5aef8SEnji Cooper 
795*8ac5aef8SEnji Cooper   // But we can still retrieve firstborn's PID, as it's not been reaped yet.
796*8ac5aef8SEnji Cooper   pid_t child0;
797*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(shared_pd, &child0));
798*8ac5aef8SEnji Cooper   EXPECT_EQ(firstborn, child0);
799*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: check on firstborn: pdgetpid(pd=%d) -> child=%d state='%c'\n",
800*8ac5aef8SEnji Cooper                        shared_pd, child0, ProcessState(child0));
801*8ac5aef8SEnji Cooper 
802*8ac5aef8SEnji Cooper   // Now reap it.
803*8ac5aef8SEnji Cooper   int status;
804*8ac5aef8SEnji Cooper   EXPECT_EQ(firstborn, waitpid(firstborn, &status, __WALL));
805*8ac5aef8SEnji Cooper 
806*8ac5aef8SEnji Cooper   // Get the process descriptor of the child-of-child via socket transfer.
807*8ac5aef8SEnji Cooper   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
808*8ac5aef8SEnji Cooper 
809*8ac5aef8SEnji Cooper   // Our notion of the pid associated with the grandchild is in the main PID namespace.
810*8ac5aef8SEnji Cooper   pid_t grandchild;
811*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
812*8ac5aef8SEnji Cooper   EXPECT_NE(2, grandchild);
813*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: pre-pdkill:  pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
814*8ac5aef8SEnji Cooper                        grandchild_pd, grandchild, ProcessState(grandchild));
815*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(grandchild);
816*8ac5aef8SEnji Cooper 
817*8ac5aef8SEnji Cooper   // Kill the grandchild via the process descriptor.
818*8ac5aef8SEnji Cooper   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
819*8ac5aef8SEnji Cooper   usleep(10000);
820*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: post-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
821*8ac5aef8SEnji Cooper                        grandchild_pd, grandchild, ProcessState(grandchild));
822*8ac5aef8SEnji Cooper   EXPECT_PID_DEAD(grandchild);
823*8ac5aef8SEnji Cooper 
824*8ac5aef8SEnji Cooper   sleep(2);
825*8ac5aef8SEnji Cooper 
826*8ac5aef8SEnji Cooper   // Wait for the child.
827*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
828*8ac5aef8SEnji Cooper   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
829*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
830*8ac5aef8SEnji Cooper 
831*8ac5aef8SEnji Cooper   close(shared_sock_fds[0]);
832*8ac5aef8SEnji Cooper   close(shared_sock_fds[1]);
833*8ac5aef8SEnji Cooper   close(shared_pd);
834*8ac5aef8SEnji Cooper   close(grandchild_pd);
835*8ac5aef8SEnji Cooper }
836*8ac5aef8SEnji Cooper 
837*8ac5aef8SEnji Cooper int NSInit(void *data) {
838*8ac5aef8SEnji Cooper   // This function is running in a new PID namespace, and so is pid 1.
839*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "  NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
840*8ac5aef8SEnji Cooper   EXPECT_EQ(1, getpid_());
841*8ac5aef8SEnji Cooper   EXPECT_EQ(0, getppid());
842*8ac5aef8SEnji Cooper 
843*8ac5aef8SEnji Cooper   int pd;
844*8ac5aef8SEnji Cooper   pid_t child = pdfork(&pd, 0);
845*8ac5aef8SEnji Cooper   EXPECT_OK(child);
846*8ac5aef8SEnji Cooper   if (child == 0) {
847*8ac5aef8SEnji Cooper     // Child: loop forever until terminated.
848*8ac5aef8SEnji Cooper     if (verbose) fprintf(stderr, "    child of NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
849*8ac5aef8SEnji Cooper     while (true) {
850*8ac5aef8SEnji Cooper       if (verbose) fprintf(stderr, "    child of NSInit: \"I aten't dead\"\n");
851*8ac5aef8SEnji Cooper       usleep(100000);
852*8ac5aef8SEnji Cooper     }
853*8ac5aef8SEnji Cooper     exit(0);
854*8ac5aef8SEnji Cooper   }
855*8ac5aef8SEnji Cooper   EXPECT_EQ(2, child);
856*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(child);
857*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "  NSInit: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
858*8ac5aef8SEnji Cooper                        pd, child, ProcessState(child));
859*8ac5aef8SEnji Cooper   sleep(1);
860*8ac5aef8SEnji Cooper 
861*8ac5aef8SEnji Cooper   // Send the process descriptor over UNIX domain socket back to parent.
862*8ac5aef8SEnji Cooper   SendFD(pd, shared_sock_fds[1]);
863*8ac5aef8SEnji Cooper   close(pd);
864*8ac5aef8SEnji Cooper 
865*8ac5aef8SEnji Cooper   // Wait for a byte back in the other direction.
866*8ac5aef8SEnji Cooper   int value;
867*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "  NSInit: block waiting for value\n");
868*8ac5aef8SEnji Cooper   read(shared_sock_fds[1], &value, sizeof(value));
869*8ac5aef8SEnji Cooper 
870*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "  NSInit: return 0\n");
871*8ac5aef8SEnji Cooper   return 0;
872*8ac5aef8SEnji Cooper }
873*8ac5aef8SEnji Cooper 
874*8ac5aef8SEnji Cooper TEST(Linux, DeadNSInit) {
875*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
876*8ac5aef8SEnji Cooper 
877*8ac5aef8SEnji Cooper   // Prepare sockets to communicate with child process.
878*8ac5aef8SEnji Cooper   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
879*8ac5aef8SEnji Cooper 
880*8ac5aef8SEnji Cooper   // Clone into a child process with a new pid namespace.
881*8ac5aef8SEnji Cooper   pid_t child = clone(NSInit, child_stack + STACK_SIZE,
882*8ac5aef8SEnji Cooper                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
883*8ac5aef8SEnji Cooper   usleep(10000);
884*8ac5aef8SEnji Cooper   EXPECT_OK(child);
885*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(child);
886*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
887*8ac5aef8SEnji Cooper 
888*8ac5aef8SEnji Cooper   // Get the process descriptor of the child-of-child via socket transfer.
889*8ac5aef8SEnji Cooper   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
890*8ac5aef8SEnji Cooper   pid_t grandchild;
891*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
892*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));
893*8ac5aef8SEnji Cooper 
894*8ac5aef8SEnji Cooper   // Send an int to the child to trigger its termination.  Grandchild should also
895*8ac5aef8SEnji Cooper   // go, as its init process is gone.
896*8ac5aef8SEnji Cooper   int zero = 0;
897*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
898*8ac5aef8SEnji Cooper   write(shared_sock_fds[0], &zero, sizeof(zero));
899*8ac5aef8SEnji Cooper   EXPECT_PID_ZOMBIE(child);
900*8ac5aef8SEnji Cooper   EXPECT_PID_GONE(grandchild);
901*8ac5aef8SEnji Cooper 
902*8ac5aef8SEnji Cooper   // Wait for the child.
903*8ac5aef8SEnji Cooper   int status;
904*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
905*8ac5aef8SEnji Cooper   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
906*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
907*8ac5aef8SEnji Cooper   EXPECT_PID_GONE(child);
908*8ac5aef8SEnji Cooper 
909*8ac5aef8SEnji Cooper   close(shared_sock_fds[0]);
910*8ac5aef8SEnji Cooper   close(shared_sock_fds[1]);
911*8ac5aef8SEnji Cooper   close(grandchild_pd);
912*8ac5aef8SEnji Cooper 
913*8ac5aef8SEnji Cooper   if (verbose) {
914*8ac5aef8SEnji Cooper     fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
915*8ac5aef8SEnji Cooper     fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
916*8ac5aef8SEnji Cooper   }
917*8ac5aef8SEnji Cooper }
918*8ac5aef8SEnji Cooper 
919*8ac5aef8SEnji Cooper TEST(Linux, DeadNSInit2) {
920*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
921*8ac5aef8SEnji Cooper 
922*8ac5aef8SEnji Cooper   // Prepare sockets to communicate with child process.
923*8ac5aef8SEnji Cooper   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
924*8ac5aef8SEnji Cooper 
925*8ac5aef8SEnji Cooper   // Clone into a child process with a new pid namespace.
926*8ac5aef8SEnji Cooper   pid_t child = clone(NSInit, child_stack + STACK_SIZE,
927*8ac5aef8SEnji Cooper                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
928*8ac5aef8SEnji Cooper   usleep(10000);
929*8ac5aef8SEnji Cooper   EXPECT_OK(child);
930*8ac5aef8SEnji Cooper   EXPECT_PID_ALIVE(child);
931*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
932*8ac5aef8SEnji Cooper 
933*8ac5aef8SEnji Cooper   // Get the process descriptor of the child-of-child via socket transfer.
934*8ac5aef8SEnji Cooper   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
935*8ac5aef8SEnji Cooper   pid_t grandchild;
936*8ac5aef8SEnji Cooper   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
937*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));
938*8ac5aef8SEnji Cooper 
939*8ac5aef8SEnji Cooper   // Kill the grandchild
940*8ac5aef8SEnji Cooper   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
941*8ac5aef8SEnji Cooper   usleep(10000);
942*8ac5aef8SEnji Cooper   EXPECT_PID_ZOMBIE(grandchild);
943*8ac5aef8SEnji Cooper   // Close the process descriptor, so there are now no procdesc references to grandchild.
944*8ac5aef8SEnji Cooper   close(grandchild_pd);
945*8ac5aef8SEnji Cooper 
946*8ac5aef8SEnji Cooper   // Send an int to the child to trigger its termination.  Grandchild should also
947*8ac5aef8SEnji Cooper   // go, as its init process is gone.
948*8ac5aef8SEnji Cooper   int zero = 0;
949*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
950*8ac5aef8SEnji Cooper   write(shared_sock_fds[0], &zero, sizeof(zero));
951*8ac5aef8SEnji Cooper   EXPECT_PID_ZOMBIE(child);
952*8ac5aef8SEnji Cooper   EXPECT_PID_GONE(grandchild);
953*8ac5aef8SEnji Cooper 
954*8ac5aef8SEnji Cooper   // Wait for the child.
955*8ac5aef8SEnji Cooper   int status;
956*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
957*8ac5aef8SEnji Cooper   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
958*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
959*8ac5aef8SEnji Cooper 
960*8ac5aef8SEnji Cooper   close(shared_sock_fds[0]);
961*8ac5aef8SEnji Cooper   close(shared_sock_fds[1]);
962*8ac5aef8SEnji Cooper 
963*8ac5aef8SEnji Cooper   if (verbose) {
964*8ac5aef8SEnji Cooper     fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
965*8ac5aef8SEnji Cooper     fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
966*8ac5aef8SEnji Cooper   }
967*8ac5aef8SEnji Cooper }
968*8ac5aef8SEnji Cooper 
969*8ac5aef8SEnji Cooper #ifdef __x86_64__
970*8ac5aef8SEnji Cooper FORK_TEST(Linux, CheckHighWord) {
971*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());  // Enter capability mode.
972*8ac5aef8SEnji Cooper 
973*8ac5aef8SEnji Cooper   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
974*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
975*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // no_new_privs = 1
976*8ac5aef8SEnji Cooper 
977*8ac5aef8SEnji Cooper   // Set some of the high 32-bits of argument zero.
978*8ac5aef8SEnji Cooper   uint64_t big_cmd = PR_GET_NO_NEW_PRIVS | 0x100000000LL;
979*8ac5aef8SEnji Cooper   EXPECT_CAPMODE(syscall(__NR_prctl, big_cmd, 0, 0, 0, 0));
980*8ac5aef8SEnji Cooper }
981*8ac5aef8SEnji Cooper #endif
982*8ac5aef8SEnji Cooper 
983*8ac5aef8SEnji Cooper FORK_TEST(Linux, PrctlOpenatBeneath) {
984*8ac5aef8SEnji Cooper   // Set no_new_privs = 1
985*8ac5aef8SEnji Cooper   EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
986*8ac5aef8SEnji Cooper   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
987*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
988*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // no_new_privs = 1
989*8ac5aef8SEnji Cooper 
990*8ac5aef8SEnji Cooper   // Set openat-beneath mode
991*8ac5aef8SEnji Cooper   EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 1, 0, 0, 0));
992*8ac5aef8SEnji Cooper   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
993*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
994*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // openat_beneath = 1
995*8ac5aef8SEnji Cooper 
996*8ac5aef8SEnji Cooper   // Clear openat-beneath mode
997*8ac5aef8SEnji Cooper   EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
998*8ac5aef8SEnji Cooper   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
999*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
1000*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);  // openat_beneath = 0
1001*8ac5aef8SEnji Cooper 
1002*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());  // Enter capability mode
1003*8ac5aef8SEnji Cooper 
1004*8ac5aef8SEnji Cooper   // Expect to be in openat_beneath mode
1005*8ac5aef8SEnji Cooper   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1006*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
1007*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // openat_beneath = 1
1008*8ac5aef8SEnji Cooper 
1009*8ac5aef8SEnji Cooper   // Expect this to be immutable.
1010*8ac5aef8SEnji Cooper   EXPECT_CAPMODE(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
1011*8ac5aef8SEnji Cooper   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1012*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
1013*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // openat_beneath = 1
1014*8ac5aef8SEnji Cooper 
1015*8ac5aef8SEnji Cooper }
1016*8ac5aef8SEnji Cooper 
1017*8ac5aef8SEnji Cooper FORK_TEST(Linux, NoNewPrivs) {
1018*8ac5aef8SEnji Cooper   if (getuid() == 0) {
1019*8ac5aef8SEnji Cooper     // If root, drop CAP_SYS_ADMIN POSIX.1e capability.
1020*8ac5aef8SEnji Cooper     struct __user_cap_header_struct hdr;
1021*8ac5aef8SEnji Cooper     hdr.version = _LINUX_CAPABILITY_VERSION_3;
1022*8ac5aef8SEnji Cooper     hdr.pid = getpid_();
1023*8ac5aef8SEnji Cooper     struct __user_cap_data_struct data[3];
1024*8ac5aef8SEnji Cooper     EXPECT_OK(capget(&hdr, &data[0]));
1025*8ac5aef8SEnji Cooper     data[0].effective &= ~(1 << CAP_SYS_ADMIN);
1026*8ac5aef8SEnji Cooper     data[0].permitted &= ~(1 << CAP_SYS_ADMIN);
1027*8ac5aef8SEnji Cooper     data[0].inheritable &= ~(1 << CAP_SYS_ADMIN);
1028*8ac5aef8SEnji Cooper     EXPECT_OK(capset(&hdr, &data[0]));
1029*8ac5aef8SEnji Cooper   }
1030*8ac5aef8SEnji Cooper   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1031*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
1032*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);  // no_new_privs == 0
1033*8ac5aef8SEnji Cooper 
1034*8ac5aef8SEnji Cooper   // Can't enter seccomp-bpf mode with no_new_privs == 0
1035*8ac5aef8SEnji Cooper   struct sock_filter filter[] = {
1036*8ac5aef8SEnji Cooper     BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
1037*8ac5aef8SEnji Cooper   };
1038*8ac5aef8SEnji Cooper   struct sock_fprog bpf;
1039*8ac5aef8SEnji Cooper   bpf.len = (sizeof(filter) / sizeof(filter[0]));
1040*8ac5aef8SEnji Cooper   bpf.filter = filter;
1041*8ac5aef8SEnji Cooper   rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0);
1042*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, rc);
1043*8ac5aef8SEnji Cooper   EXPECT_EQ(EACCES, errno);
1044*8ac5aef8SEnji Cooper 
1045*8ac5aef8SEnji Cooper   // Set no_new_privs = 1
1046*8ac5aef8SEnji Cooper   EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1047*8ac5aef8SEnji Cooper   rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1048*8ac5aef8SEnji Cooper   EXPECT_OK(rc);
1049*8ac5aef8SEnji Cooper   EXPECT_EQ(1, rc);  // no_new_privs = 1
1050*8ac5aef8SEnji Cooper 
1051*8ac5aef8SEnji Cooper   // Can now turn on seccomp mode
1052*8ac5aef8SEnji Cooper   EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1053*8ac5aef8SEnji Cooper }
1054*8ac5aef8SEnji Cooper 
/* Macros for BPF generation */
/* Return instruction that fails the syscall; only the low 16 bits of `err` are
 * used.  The argument is parenthesized so that a compound expression is masked
 * as a whole. */
#define BPF_RETURN_ERRNO(err) \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | ((err) & 0xFFFF))
/* Return instruction carrying SECCOMP_RET_KILL. */
#define BPF_KILL_PROCESS \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
/* Return instruction carrying SECCOMP_RET_ALLOW. */
#define BPF_ALLOW \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/* Load the syscall number from struct seccomp_data into the accumulator. */
#define EXAMINE_SYSCALL \
  BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr))
/* Each of the following expands to two instructions: a comparison of the
 * accumulator against __NR_<name> that skips the next instruction on mismatch,
 * followed by the action taken on a match. */
#define ALLOW_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_ALLOW
#define KILL_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_KILL_PROCESS
#define FAIL_SYSCALL(name, err) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_RETURN_ERRNO(err)
1073*8ac5aef8SEnji Cooper 
1074*8ac5aef8SEnji Cooper TEST(Linux, CapModeWithBPF) {
1075*8ac5aef8SEnji Cooper   pid_t child = fork();
1076*8ac5aef8SEnji Cooper   EXPECT_OK(child);
1077*8ac5aef8SEnji Cooper   if (child == 0) {
1078*8ac5aef8SEnji Cooper     int fd = open(TmpFile("cap_bpf_capmode"), O_CREAT|O_RDWR, 0644);
1079*8ac5aef8SEnji Cooper     cap_rights_t rights;
1080*8ac5aef8SEnji Cooper     cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1081*8ac5aef8SEnji Cooper     EXPECT_OK(cap_rights_limit(fd, &rights));
1082*8ac5aef8SEnji Cooper 
1083*8ac5aef8SEnji Cooper     struct sock_filter filter[] = { EXAMINE_SYSCALL,
1084*8ac5aef8SEnji Cooper                                     FAIL_SYSCALL(fchmod, ENOMEM),
1085*8ac5aef8SEnji Cooper                                     FAIL_SYSCALL(fstat, ENOEXEC),
1086*8ac5aef8SEnji Cooper                                     ALLOW_SYSCALL(close),
1087*8ac5aef8SEnji Cooper                                     KILL_SYSCALL(fsync),
1088*8ac5aef8SEnji Cooper                                     BPF_ALLOW };
1089*8ac5aef8SEnji Cooper     struct sock_fprog bpf = {.len = (sizeof(filter) / sizeof(filter[0])),
1090*8ac5aef8SEnji Cooper                              .filter = filter};
1091*8ac5aef8SEnji Cooper     // Set up seccomp-bpf first.
1092*8ac5aef8SEnji Cooper     EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1093*8ac5aef8SEnji Cooper     EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1094*8ac5aef8SEnji Cooper 
1095*8ac5aef8SEnji Cooper     EXPECT_OK(cap_enter());  // Enter capability mode.
1096*8ac5aef8SEnji Cooper 
1097*8ac5aef8SEnji Cooper     // fchmod is allowed by Capsicum, but failed by BPF.
1098*8ac5aef8SEnji Cooper     EXPECT_SYSCALL_FAIL(ENOMEM, fchmod(fd, 0644));
1099*8ac5aef8SEnji Cooper     // open is allowed by BPF, but failed by Capsicum
1100*8ac5aef8SEnji Cooper     EXPECT_SYSCALL_FAIL(ECAPMODE, open(TmpFile("cap_bpf_capmode"), O_RDONLY));
1101*8ac5aef8SEnji Cooper     // fstat is failed by both BPF and Capsicum; tie-break is on errno
1102*8ac5aef8SEnji Cooper     struct stat buf;
1103*8ac5aef8SEnji Cooper     EXPECT_SYSCALL_FAIL(ENOEXEC, fstat(fd, &buf));
1104*8ac5aef8SEnji Cooper     // fsync is allowed by Capsicum, but BPF's SIGSYS generation take precedence
1105*8ac5aef8SEnji Cooper     fsync(fd);  // terminate with unhandled SIGSYS
1106*8ac5aef8SEnji Cooper     exit(0);
1107*8ac5aef8SEnji Cooper   }
1108*8ac5aef8SEnji Cooper   int status;
1109*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, 0));
1110*8ac5aef8SEnji Cooper   EXPECT_TRUE(WIFSIGNALED(status));
1111*8ac5aef8SEnji Cooper   EXPECT_EQ(SIGSYS, WTERMSIG(status));
1112*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_bpf_capmode"));
1113*8ac5aef8SEnji Cooper }
1114*8ac5aef8SEnji Cooper 
1115*8ac5aef8SEnji Cooper TEST(Linux, AIO) {
1116*8ac5aef8SEnji Cooper   int fd = open(TmpFile("cap_aio"), O_CREAT|O_RDWR, 0644);
1117*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1118*8ac5aef8SEnji Cooper 
1119*8ac5aef8SEnji Cooper   cap_rights_t r_rs;
1120*8ac5aef8SEnji Cooper   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
1121*8ac5aef8SEnji Cooper   cap_rights_t r_ws;
1122*8ac5aef8SEnji Cooper   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
1123*8ac5aef8SEnji Cooper   cap_rights_t r_rwssync;
1124*8ac5aef8SEnji Cooper   cap_rights_init(&r_rwssync, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1125*8ac5aef8SEnji Cooper 
1126*8ac5aef8SEnji Cooper   int cap_ro = dup(fd);
1127*8ac5aef8SEnji Cooper   EXPECT_OK(cap_ro);
1128*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
1129*8ac5aef8SEnji Cooper   EXPECT_OK(cap_ro);
1130*8ac5aef8SEnji Cooper   int cap_wo = dup(fd);
1131*8ac5aef8SEnji Cooper   EXPECT_OK(cap_wo);
1132*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
1133*8ac5aef8SEnji Cooper   EXPECT_OK(cap_wo);
1134*8ac5aef8SEnji Cooper   int cap_all = dup(fd);
1135*8ac5aef8SEnji Cooper   EXPECT_OK(cap_all);
1136*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap_all, &r_rwssync));
1137*8ac5aef8SEnji Cooper   EXPECT_OK(cap_all);
1138*8ac5aef8SEnji Cooper 
1139*8ac5aef8SEnji Cooper   // Linux: io_setup, io_submit, io_getevents, io_cancel, io_destroy
1140*8ac5aef8SEnji Cooper   aio_context_t ctx = 0;
1141*8ac5aef8SEnji Cooper   EXPECT_OK(syscall(__NR_io_setup, 10, &ctx));
1142*8ac5aef8SEnji Cooper 
1143*8ac5aef8SEnji Cooper   unsigned char buffer[32] = {1, 2, 3, 4};
1144*8ac5aef8SEnji Cooper   struct iocb req;
1145*8ac5aef8SEnji Cooper   memset(&req, 0, sizeof(req));
1146*8ac5aef8SEnji Cooper   req.aio_reqprio = 0;
1147*8ac5aef8SEnji Cooper   req.aio_fildes = fd;
1148*8ac5aef8SEnji Cooper   uintptr_t bufaddr = (uintptr_t)buffer;
1149*8ac5aef8SEnji Cooper   req.aio_buf = (__u64)bufaddr;
1150*8ac5aef8SEnji Cooper   req.aio_nbytes = 4;
1151*8ac5aef8SEnji Cooper   req.aio_offset = 0;
1152*8ac5aef8SEnji Cooper   struct iocb* reqs[1] = {&req};
1153*8ac5aef8SEnji Cooper 
1154*8ac5aef8SEnji Cooper   // Write operation
1155*8ac5aef8SEnji Cooper   req.aio_lio_opcode = IOCB_CMD_PWRITE;
1156*8ac5aef8SEnji Cooper   req.aio_fildes = cap_ro;
1157*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1158*8ac5aef8SEnji Cooper   req.aio_fildes = cap_wo;
1159*8ac5aef8SEnji Cooper   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1160*8ac5aef8SEnji Cooper 
1161*8ac5aef8SEnji Cooper   // Sync operation
1162*8ac5aef8SEnji Cooper   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1163*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1164*8ac5aef8SEnji Cooper   req.aio_lio_opcode = IOCB_CMD_FDSYNC;
1165*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1166*8ac5aef8SEnji Cooper   // Even with CAP_FSYNC, turns out fsync/fdsync aren't implemented
1167*8ac5aef8SEnji Cooper   req.aio_fildes = cap_all;
1168*8ac5aef8SEnji Cooper   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1169*8ac5aef8SEnji Cooper   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1170*8ac5aef8SEnji Cooper   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1171*8ac5aef8SEnji Cooper 
1172*8ac5aef8SEnji Cooper   // Read operation
1173*8ac5aef8SEnji Cooper   req.aio_lio_opcode = IOCB_CMD_PREAD;
1174*8ac5aef8SEnji Cooper   req.aio_fildes = cap_wo;
1175*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1176*8ac5aef8SEnji Cooper   req.aio_fildes = cap_ro;
1177*8ac5aef8SEnji Cooper   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1178*8ac5aef8SEnji Cooper 
1179*8ac5aef8SEnji Cooper   EXPECT_OK(syscall(__NR_io_destroy, ctx));
1180*8ac5aef8SEnji Cooper 
1181*8ac5aef8SEnji Cooper   close(cap_all);
1182*8ac5aef8SEnji Cooper   close(cap_wo);
1183*8ac5aef8SEnji Cooper   close(cap_ro);
1184*8ac5aef8SEnji Cooper   close(fd);
1185*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_aio"));
1186*8ac5aef8SEnji Cooper }
1187*8ac5aef8SEnji Cooper 
1188*8ac5aef8SEnji Cooper #ifndef KCMP_FILE
1189*8ac5aef8SEnji Cooper #define KCMP_FILE 0
1190*8ac5aef8SEnji Cooper #endif
1191*8ac5aef8SEnji Cooper TEST(Linux, Kcmp) {
1192*8ac5aef8SEnji Cooper   // This requires CONFIG_CHECKPOINT_RESTORE in kernel config.
1193*8ac5aef8SEnji Cooper   int fd = open("/etc/passwd", O_RDONLY);
1194*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1195*8ac5aef8SEnji Cooper   pid_t parent = getpid_();
1196*8ac5aef8SEnji Cooper 
1197*8ac5aef8SEnji Cooper   errno = 0;
1198*8ac5aef8SEnji Cooper   int rc = syscall(__NR_kcmp, parent, parent, KCMP_FILE, fd, fd);
1199*8ac5aef8SEnji Cooper   if (rc == -1 && errno == ENOSYS) {
1200*8ac5aef8SEnji Cooper     TEST_SKIPPED("kcmp(2) gives -ENOSYS");
1201*8ac5aef8SEnji Cooper     return;
1202*8ac5aef8SEnji Cooper   }
1203*8ac5aef8SEnji Cooper 
1204*8ac5aef8SEnji Cooper   pid_t child = fork();
1205*8ac5aef8SEnji Cooper   if (child == 0) {
1206*8ac5aef8SEnji Cooper     // Child: limit rights on FD.
1207*8ac5aef8SEnji Cooper     child = getpid_();
1208*8ac5aef8SEnji Cooper     EXPECT_OK(syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1209*8ac5aef8SEnji Cooper     cap_rights_t rights;
1210*8ac5aef8SEnji Cooper     cap_rights_init(&rights, CAP_READ, CAP_WRITE);
1211*8ac5aef8SEnji Cooper     EXPECT_OK(cap_rights_limit(fd, &rights));
1212*8ac5aef8SEnji Cooper     // A capability wrapping a normal FD is different (from a kcmp(2) perspective)
1213*8ac5aef8SEnji Cooper     // than the original file.
1214*8ac5aef8SEnji Cooper     EXPECT_NE(0, syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1215*8ac5aef8SEnji Cooper     exit(HasFailure());
1216*8ac5aef8SEnji Cooper   }
1217*8ac5aef8SEnji Cooper   // Wait for the child.
1218*8ac5aef8SEnji Cooper   int status;
1219*8ac5aef8SEnji Cooper   EXPECT_EQ(child, waitpid(child, &status, 0));
1220*8ac5aef8SEnji Cooper   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
1221*8ac5aef8SEnji Cooper   EXPECT_EQ(0, rc);
1222*8ac5aef8SEnji Cooper 
1223*8ac5aef8SEnji Cooper   close(fd);
1224*8ac5aef8SEnji Cooper }
1225*8ac5aef8SEnji Cooper 
1226*8ac5aef8SEnji Cooper TEST(Linux, ProcFS) {
1227*8ac5aef8SEnji Cooper   cap_rights_t rights;
1228*8ac5aef8SEnji Cooper   cap_rights_init(&rights, CAP_READ, CAP_SEEK);
1229*8ac5aef8SEnji Cooper   int fd = open("/etc/passwd", O_RDONLY);
1230*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1231*8ac5aef8SEnji Cooper   lseek(fd, 4, SEEK_SET);
1232*8ac5aef8SEnji Cooper   int cap = dup(fd);
1233*8ac5aef8SEnji Cooper   EXPECT_OK(cap);
1234*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(cap, &rights));
1235*8ac5aef8SEnji Cooper   pid_t me = getpid_();
1236*8ac5aef8SEnji Cooper 
1237*8ac5aef8SEnji Cooper   char buffer[1024];
1238*8ac5aef8SEnji Cooper   sprintf(buffer, "/proc/%d/fdinfo/%d", me, cap);
1239*8ac5aef8SEnji Cooper   int procfd = open(buffer, O_RDONLY);
1240*8ac5aef8SEnji Cooper   EXPECT_OK(procfd) << " failed to open " << buffer;
1241*8ac5aef8SEnji Cooper   if (procfd < 0) return;
1242*8ac5aef8SEnji Cooper   int proccap = dup(procfd);
1243*8ac5aef8SEnji Cooper   EXPECT_OK(proccap);
1244*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(proccap, &rights));
1245*8ac5aef8SEnji Cooper 
1246*8ac5aef8SEnji Cooper   EXPECT_OK(read(proccap, buffer, sizeof(buffer)));
1247*8ac5aef8SEnji Cooper   // The fdinfo should include the file pos of the underlying file
1248*8ac5aef8SEnji Cooper   EXPECT_NE((char*)NULL, strstr(buffer, "pos:\t4"));
1249*8ac5aef8SEnji Cooper   // ...and the rights of the Capsicum capability.
1250*8ac5aef8SEnji Cooper   EXPECT_NE((char*)NULL, strstr(buffer, "rights:\t0x"));
1251*8ac5aef8SEnji Cooper 
1252*8ac5aef8SEnji Cooper   close(procfd);
1253*8ac5aef8SEnji Cooper   close(proccap);
1254*8ac5aef8SEnji Cooper   close(cap);
1255*8ac5aef8SEnji Cooper   close(fd);
1256*8ac5aef8SEnji Cooper }
1257*8ac5aef8SEnji Cooper 
1258*8ac5aef8SEnji Cooper FORK_TEST(Linux, ProcessClocks) {
1259*8ac5aef8SEnji Cooper   pid_t self = getpid_();
1260*8ac5aef8SEnji Cooper   pid_t child = fork();
1261*8ac5aef8SEnji Cooper   EXPECT_OK(child);
1262*8ac5aef8SEnji Cooper   if (child == 0) {
1263*8ac5aef8SEnji Cooper     child = getpid_();
1264*8ac5aef8SEnji Cooper     usleep(100000);
1265*8ac5aef8SEnji Cooper     exit(0);
1266*8ac5aef8SEnji Cooper   }
1267*8ac5aef8SEnji Cooper 
1268*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());  // Enter capability mode.
1269*8ac5aef8SEnji Cooper 
1270*8ac5aef8SEnji Cooper   // Nefariously build a clock ID for the child's CPU time.
1271*8ac5aef8SEnji Cooper   // This relies on knowledge of the internal layout of clock IDs.
1272*8ac5aef8SEnji Cooper   clockid_t child_clock;
1273*8ac5aef8SEnji Cooper   child_clock = ((~child) << 3) | 0x0;
1274*8ac5aef8SEnji Cooper   struct timespec ts;
1275*8ac5aef8SEnji Cooper   memset(&ts, 0, sizeof(ts));
1276*8ac5aef8SEnji Cooper 
1277*8ac5aef8SEnji Cooper   // TODO(drysdale): Should not be possible to retrieve info about a
1278*8ac5aef8SEnji Cooper   // different process, as the PID global namespace should be locked
1279*8ac5aef8SEnji Cooper   // down.
1280*8ac5aef8SEnji Cooper   EXPECT_OK(clock_gettime(child_clock, &ts));
1281*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(child=%d->0x%08x) is %ld.%09ld \n",
1282*8ac5aef8SEnji Cooper                        self, child, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1283*8ac5aef8SEnji Cooper 
1284*8ac5aef8SEnji Cooper   child_clock = ((~1) << 3) | 0x0;
1285*8ac5aef8SEnji Cooper   memset(&ts, 0, sizeof(ts));
1286*8ac5aef8SEnji Cooper   EXPECT_OK(clock_gettime(child_clock, &ts));
1287*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(init=1->0x%08x) is %ld.%09ld \n",
1288*8ac5aef8SEnji Cooper                        self, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1289*8ac5aef8SEnji Cooper 
1290*8ac5aef8SEnji Cooper   // Orphan the child.
1291*8ac5aef8SEnji Cooper }
1292*8ac5aef8SEnji Cooper 
1293*8ac5aef8SEnji Cooper TEST(Linux, SetLease) {
1294*8ac5aef8SEnji Cooper   int fd_all = open(TmpFile("cap_lease"), O_CREAT|O_RDWR, 0644);
1295*8ac5aef8SEnji Cooper   EXPECT_OK(fd_all);
1296*8ac5aef8SEnji Cooper   int fd_rw = dup(fd_all);
1297*8ac5aef8SEnji Cooper   EXPECT_OK(fd_rw);
1298*8ac5aef8SEnji Cooper 
1299*8ac5aef8SEnji Cooper   cap_rights_t r_all;
1300*8ac5aef8SEnji Cooper   cap_rights_init(&r_all, CAP_READ, CAP_WRITE, CAP_FLOCK, CAP_FSIGNAL);
1301*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(fd_all, &r_all));
1302*8ac5aef8SEnji Cooper 
1303*8ac5aef8SEnji Cooper   cap_rights_t r_rw;
1304*8ac5aef8SEnji Cooper   cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
1305*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(fd_rw, &r_rw));
1306*8ac5aef8SEnji Cooper 
1307*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fcntl(fd_rw, F_SETLEASE, F_WRLCK));
1308*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fcntl(fd_rw, F_GETLEASE));
1309*8ac5aef8SEnji Cooper 
1310*8ac5aef8SEnji Cooper   if (!tmpdir_on_tmpfs) {  // tmpfs doesn't support leases
1311*8ac5aef8SEnji Cooper     EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_WRLCK));
1312*8ac5aef8SEnji Cooper     EXPECT_EQ(F_WRLCK, fcntl(fd_all, F_GETLEASE));
1313*8ac5aef8SEnji Cooper 
1314*8ac5aef8SEnji Cooper     EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_UNLCK, 0));
1315*8ac5aef8SEnji Cooper     EXPECT_EQ(F_UNLCK, fcntl(fd_all, F_GETLEASE));
1316*8ac5aef8SEnji Cooper   }
1317*8ac5aef8SEnji Cooper   close(fd_all);
1318*8ac5aef8SEnji Cooper   close(fd_rw);
1319*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_lease"));
1320*8ac5aef8SEnji Cooper }
1321*8ac5aef8SEnji Cooper 
1322*8ac5aef8SEnji Cooper TEST(Linux, InvalidRightsSyscall) {
1323*8ac5aef8SEnji Cooper   int fd = open(TmpFile("cap_invalid_rights"), O_RDONLY|O_CREAT, 0644);
1324*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1325*8ac5aef8SEnji Cooper 
1326*8ac5aef8SEnji Cooper   cap_rights_t rights;
1327*8ac5aef8SEnji Cooper   cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FCHMOD, CAP_FSTAT);
1328*8ac5aef8SEnji Cooper 
1329*8ac5aef8SEnji Cooper   // Use the raw syscall throughout.
1330*8ac5aef8SEnji Cooper   EXPECT_EQ(0, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1331*8ac5aef8SEnji Cooper 
1332*8ac5aef8SEnji Cooper   // Directly access the syscall, and find all unseemly manner of use for it.
1333*8ac5aef8SEnji Cooper   //  - Invalid flags
1334*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 1));
1335*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1336*8ac5aef8SEnji Cooper   //  - Specify an fcntl subright, but no CAP_FCNTL set
1337*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, CAP_FCNTL_GETFL, 0, NULL, 0));
1338*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1339*8ac5aef8SEnji Cooper   //  - Specify an ioctl subright, but no CAP_IOCTL set
1340*8ac5aef8SEnji Cooper   unsigned int ioctl1 = 1;
1341*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, &ioctl1, 0));
1342*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1343*8ac5aef8SEnji Cooper   //  - N ioctls, but null pointer passed
1344*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, NULL, 0));
1345*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1346*8ac5aef8SEnji Cooper   //  - Invalid nioctls
1347*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, -2, NULL, 0));
1348*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1349*8ac5aef8SEnji Cooper   //  - Null primary rights
1350*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, NULL, 0, 0, NULL, 0));
1351*8ac5aef8SEnji Cooper   EXPECT_EQ(EFAULT, errno);
1352*8ac5aef8SEnji Cooper   //  - Invalid index bitmask
1353*8ac5aef8SEnji Cooper   rights.cr_rights[0] |= 3ULL << 57;
1354*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1355*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1356*8ac5aef8SEnji Cooper   //  - Invalid version
1357*8ac5aef8SEnji Cooper   rights.cr_rights[0] |= 2ULL << 62;
1358*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1359*8ac5aef8SEnji Cooper   EXPECT_EQ(EINVAL, errno);
1360*8ac5aef8SEnji Cooper 
1361*8ac5aef8SEnji Cooper   close(fd);
1362*8ac5aef8SEnji Cooper   unlink(TmpFile("cap_invalid_rights"));
1363*8ac5aef8SEnji Cooper }
1364*8ac5aef8SEnji Cooper 
1365*8ac5aef8SEnji Cooper FORK_TEST_ON(Linux, OpenByHandleAt, TmpFile("cap_openbyhandle_testfile")) {
1366*8ac5aef8SEnji Cooper   REQUIRE_ROOT();
1367*8ac5aef8SEnji Cooper   int dir = open(tmpdir.c_str(), O_RDONLY);
1368*8ac5aef8SEnji Cooper   EXPECT_OK(dir);
1369*8ac5aef8SEnji Cooper   int fd = openat(dir, "cap_openbyhandle_testfile", O_RDWR|O_CREAT, 0644);
1370*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1371*8ac5aef8SEnji Cooper   const char* message = "Saved text";
1372*8ac5aef8SEnji Cooper   EXPECT_OK(write(fd, message, strlen(message)));
1373*8ac5aef8SEnji Cooper   close(fd);
1374*8ac5aef8SEnji Cooper 
1375*8ac5aef8SEnji Cooper   struct file_handle* fhandle = (struct file_handle*)malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
1376*8ac5aef8SEnji Cooper   fhandle->handle_bytes = MAX_HANDLE_SZ;
1377*8ac5aef8SEnji Cooper   int mount_id;
1378*8ac5aef8SEnji Cooper   EXPECT_OK(name_to_handle_at(dir, "cap_openbyhandle_testfile", fhandle,  &mount_id, 0));
1379*8ac5aef8SEnji Cooper 
1380*8ac5aef8SEnji Cooper   fd = open_by_handle_at(dir, fhandle, O_RDONLY);
1381*8ac5aef8SEnji Cooper   EXPECT_OK(fd);
1382*8ac5aef8SEnji Cooper   char buffer[200];
1383*8ac5aef8SEnji Cooper   EXPECT_OK(read(fd, buffer, 199));
1384*8ac5aef8SEnji Cooper   EXPECT_EQ(std::string(message), std::string(buffer));
1385*8ac5aef8SEnji Cooper   close(fd);
1386*8ac5aef8SEnji Cooper 
1387*8ac5aef8SEnji Cooper   // Cannot issue open_by_handle_at after entering capability mode.
1388*8ac5aef8SEnji Cooper   cap_enter();
1389*8ac5aef8SEnji Cooper   EXPECT_CAPMODE(open_by_handle_at(dir, fhandle, O_RDONLY));
1390*8ac5aef8SEnji Cooper 
1391*8ac5aef8SEnji Cooper   close(dir);
1392*8ac5aef8SEnji Cooper }
1393*8ac5aef8SEnji Cooper 
// Thin wrapper around the raw getrandom system call.
// Returns the syscall's result; when the build headers provide no
// __NR_getrandom it fails with -1 and errno set to ENOSYS.
int getrandom_(void *buf, size_t buflen, unsigned int flags) {
#ifndef __NR_getrandom
  // No syscall number available at build time.
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_getrandom, buf, buflen, flags);
#endif
}
1402*8ac5aef8SEnji Cooper 
1403*8ac5aef8SEnji Cooper #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1404*8ac5aef8SEnji Cooper #include <linux/random.h>  // Requires 3.17 kernel
1405*8ac5aef8SEnji Cooper FORK_TEST(Linux, GetRandom) {
1406*8ac5aef8SEnji Cooper   EXPECT_OK(cap_enter());
1407*8ac5aef8SEnji Cooper   unsigned char buffer[1024];
1408*8ac5aef8SEnji Cooper   unsigned char buffer2[1024];
1409*8ac5aef8SEnji Cooper   EXPECT_OK(getrandom_(buffer, sizeof(buffer), GRND_NONBLOCK));
1410*8ac5aef8SEnji Cooper   EXPECT_OK(getrandom_(buffer2, sizeof(buffer2), GRND_NONBLOCK));
1411*8ac5aef8SEnji Cooper   EXPECT_NE(0, memcmp(buffer, buffer2, sizeof(buffer)));
1412*8ac5aef8SEnji Cooper }
1413*8ac5aef8SEnji Cooper #endif
1414*8ac5aef8SEnji Cooper 
// Thin wrapper around the raw memfd_create system call.
// Returns the syscall's result; when the build headers provide no
// __NR_memfd_create it fails with -1 and errno set to ENOSYS.
int memfd_create_(const char *name, unsigned int flags) {
#ifndef __NR_memfd_create
  // No syscall number available at build time.
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_memfd_create, name, flags);
#endif
}
1423*8ac5aef8SEnji Cooper 
1424*8ac5aef8SEnji Cooper #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1425*8ac5aef8SEnji Cooper #include <linux/memfd.h>  // Requires 3.17 kernel
1426*8ac5aef8SEnji Cooper TEST(Linux, MemFDDeathTest) {
1427*8ac5aef8SEnji Cooper   int memfd = memfd_create_("capsicum-test", MFD_ALLOW_SEALING);
1428*8ac5aef8SEnji Cooper   if (memfd == -1 && errno == ENOSYS) {
1429*8ac5aef8SEnji Cooper     TEST_SKIPPED("memfd_create(2) gives -ENOSYS");
1430*8ac5aef8SEnji Cooper     return;
1431*8ac5aef8SEnji Cooper   }
1432*8ac5aef8SEnji Cooper   const int LEN = 16;
1433*8ac5aef8SEnji Cooper   EXPECT_OK(ftruncate(memfd, LEN));
1434*8ac5aef8SEnji Cooper   int memfd_ro = dup(memfd);
1435*8ac5aef8SEnji Cooper   int memfd_rw = dup(memfd);
1436*8ac5aef8SEnji Cooper   EXPECT_OK(memfd_ro);
1437*8ac5aef8SEnji Cooper   EXPECT_OK(memfd_rw);
1438*8ac5aef8SEnji Cooper   cap_rights_t rights;
1439*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(memfd_ro, cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT)));
1440*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW, CAP_FCHMOD)));
1441*8ac5aef8SEnji Cooper 
1442*8ac5aef8SEnji Cooper   unsigned char *p_ro = (unsigned char *)mmap(NULL, LEN, PROT_READ, MAP_SHARED, memfd_ro, 0);
1443*8ac5aef8SEnji Cooper   EXPECT_NE((unsigned char *)MAP_FAILED, p_ro);
1444*8ac5aef8SEnji Cooper   unsigned char *p_rw = (unsigned char *)mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_rw, 0);
1445*8ac5aef8SEnji Cooper   EXPECT_NE((unsigned char *)MAP_FAILED, p_rw);
1446*8ac5aef8SEnji Cooper   EXPECT_EQ(MAP_FAILED,
1447*8ac5aef8SEnji Cooper             mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_ro, 0));
1448*8ac5aef8SEnji Cooper 
1449*8ac5aef8SEnji Cooper   *p_rw = 42;
1450*8ac5aef8SEnji Cooper   EXPECT_EQ(42, *p_ro);
1451*8ac5aef8SEnji Cooper   EXPECT_DEATH(*p_ro = 42, "");
1452*8ac5aef8SEnji Cooper 
1453*8ac5aef8SEnji Cooper #ifndef F_ADD_SEALS
1454*8ac5aef8SEnji Cooper   // Hack for when libc6 does not yet include the updated linux/fcntl.h from kernel 3.17
1455*8ac5aef8SEnji Cooper #define _F_LINUX_SPECIFIC_BASE F_SETLEASE
1456*8ac5aef8SEnji Cooper #define F_ADD_SEALS	(_F_LINUX_SPECIFIC_BASE + 9)
1457*8ac5aef8SEnji Cooper #define F_GET_SEALS	(_F_LINUX_SPECIFIC_BASE + 10)
1458*8ac5aef8SEnji Cooper #define F_SEAL_SEAL	0x0001	/* prevent further seals from being set */
1459*8ac5aef8SEnji Cooper #define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
1460*8ac5aef8SEnji Cooper #define F_SEAL_GROW	0x0004	/* prevent file from growing */
1461*8ac5aef8SEnji Cooper #define F_SEAL_WRITE	0x0008	/* prevent writes */
1462*8ac5aef8SEnji Cooper #endif
1463*8ac5aef8SEnji Cooper 
1464*8ac5aef8SEnji Cooper   // Reading the seal information requires CAP_FSTAT.
1465*8ac5aef8SEnji Cooper   int seals = fcntl(memfd, F_GET_SEALS);
1466*8ac5aef8SEnji Cooper   EXPECT_OK(seals);
1467*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1468*8ac5aef8SEnji Cooper   int seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1469*8ac5aef8SEnji Cooper   EXPECT_EQ(seals, seals_ro);
1470*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1471*8ac5aef8SEnji Cooper   int seals_rw = fcntl(memfd_rw, F_GET_SEALS);
1472*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(seals_rw);
1473*8ac5aef8SEnji Cooper 
1474*8ac5aef8SEnji Cooper   // Fail to seal as a writable mapping exists.
1475*8ac5aef8SEnji Cooper   EXPECT_EQ(-1, fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1476*8ac5aef8SEnji Cooper   EXPECT_EQ(EBUSY, errno);
1477*8ac5aef8SEnji Cooper   *p_rw = 42;
1478*8ac5aef8SEnji Cooper 
1479*8ac5aef8SEnji Cooper   // Seal the rw version; need to unmap first.
1480*8ac5aef8SEnji Cooper   munmap(p_rw, LEN);
1481*8ac5aef8SEnji Cooper   munmap(p_ro, LEN);
1482*8ac5aef8SEnji Cooper   EXPECT_OK(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1483*8ac5aef8SEnji Cooper 
1484*8ac5aef8SEnji Cooper   seals = fcntl(memfd, F_GET_SEALS);
1485*8ac5aef8SEnji Cooper   EXPECT_OK(seals);
1486*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1487*8ac5aef8SEnji Cooper   seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1488*8ac5aef8SEnji Cooper   EXPECT_EQ(seals, seals_ro);
1489*8ac5aef8SEnji Cooper   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1490*8ac5aef8SEnji Cooper 
1491*8ac5aef8SEnji Cooper   // Remove the CAP_FCHMOD right, can no longer add seals.
1492*8ac5aef8SEnji Cooper   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW)));
1493*8ac5aef8SEnji Cooper   EXPECT_NOTCAPABLE(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1494*8ac5aef8SEnji Cooper 
1495*8ac5aef8SEnji Cooper   close(memfd);
1496*8ac5aef8SEnji Cooper   close(memfd_ro);
1497*8ac5aef8SEnji Cooper   close(memfd_rw);
1498*8ac5aef8SEnji Cooper }
1499*8ac5aef8SEnji Cooper #endif
1500*8ac5aef8SEnji Cooper 
1501*8ac5aef8SEnji Cooper #else
// Empty function that sits in the #else branch of the file's __linux__ guard.
// NOTE(review): presumably here so non-Linux builds still compile a non-empty
// translation unit - confirm.
void noop() {
}
1503*8ac5aef8SEnji Cooper #endif
1504