xref: /freebsd/tests/sys/capsicum/linux.cc (revision 670b568ec1c36464c6d55e400382c290b0391ccf)
1*670b568eSEd Maste // Tests of Linux-specific functionality
2*670b568eSEd Maste #ifdef __linux__
3*670b568eSEd Maste 
4*670b568eSEd Maste #include <sys/types.h>
5*670b568eSEd Maste #include <sys/stat.h>
6*670b568eSEd Maste #include <sys/socket.h>
7*670b568eSEd Maste #include <sys/timerfd.h>
8*670b568eSEd Maste #include <sys/signalfd.h>
9*670b568eSEd Maste #include <sys/eventfd.h>
10*670b568eSEd Maste #include <sys/epoll.h>
11*670b568eSEd Maste #include <sys/inotify.h>
12*670b568eSEd Maste #include <sys/fanotify.h>
13*670b568eSEd Maste #include <sys/mman.h>
14*670b568eSEd Maste #include <sys/capability.h>  // Requires e.g. libcap-dev package for POSIX.1e capabilities headers
15*670b568eSEd Maste #include <linux/aio_abi.h>
16*670b568eSEd Maste #include <linux/filter.h>
17*670b568eSEd Maste #include <linux/seccomp.h>
18*670b568eSEd Maste #include <linux/version.h>
19*670b568eSEd Maste #include <poll.h>
20*670b568eSEd Maste #include <sched.h>
21*670b568eSEd Maste #include <signal.h>
22*670b568eSEd Maste #include <fcntl.h>
23*670b568eSEd Maste #include <unistd.h>
24*670b568eSEd Maste 
25*670b568eSEd Maste #include <string>
26*670b568eSEd Maste 
27*670b568eSEd Maste #include "capsicum.h"
28*670b568eSEd Maste #include "syscalls.h"
29*670b568eSEd Maste #include "capsicum-test.h"
30*670b568eSEd Maste 
TEST(Linux,TimerFD)31*670b568eSEd Maste TEST(Linux, TimerFD) {
32*670b568eSEd Maste   int fd = timerfd_create(CLOCK_MONOTONIC, 0);
33*670b568eSEd Maste 
34*670b568eSEd Maste   cap_rights_t r_ro;
35*670b568eSEd Maste   cap_rights_init(&r_ro, CAP_READ);
36*670b568eSEd Maste   cap_rights_t r_wo;
37*670b568eSEd Maste   cap_rights_init(&r_wo, CAP_WRITE);
38*670b568eSEd Maste   cap_rights_t r_rw;
39*670b568eSEd Maste   cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
40*670b568eSEd Maste   cap_rights_t r_rwpoll;
41*670b568eSEd Maste   cap_rights_init(&r_rwpoll, CAP_READ, CAP_WRITE, CAP_EVENT);
42*670b568eSEd Maste 
43*670b568eSEd Maste   int cap_fd_ro = dup(fd);
44*670b568eSEd Maste   EXPECT_OK(cap_fd_ro);
45*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_ro));
46*670b568eSEd Maste   int cap_fd_wo = dup(fd);
47*670b568eSEd Maste   EXPECT_OK(cap_fd_wo);
48*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_wo));
49*670b568eSEd Maste   int cap_fd_rw = dup(fd);
50*670b568eSEd Maste   EXPECT_OK(cap_fd_rw);
51*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rw));
52*670b568eSEd Maste   int cap_fd_all = dup(fd);
53*670b568eSEd Maste   EXPECT_OK(cap_fd_all);
54*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwpoll));
55*670b568eSEd Maste 
56*670b568eSEd Maste   struct itimerspec old_ispec;
57*670b568eSEd Maste   struct itimerspec ispec;
58*670b568eSEd Maste   ispec.it_interval.tv_sec = 0;
59*670b568eSEd Maste   ispec.it_interval.tv_nsec = 0;
60*670b568eSEd Maste   ispec.it_value.tv_sec = 0;
61*670b568eSEd Maste   ispec.it_value.tv_nsec = 100000000;  // 100ms
62*670b568eSEd Maste   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_ro, 0, &ispec, NULL));
63*670b568eSEd Maste   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_wo, 0, &ispec, &old_ispec));
64*670b568eSEd Maste   EXPECT_OK(timerfd_settime(cap_fd_wo, 0, &ispec, NULL));
65*670b568eSEd Maste   EXPECT_OK(timerfd_settime(cap_fd_rw, 0, &ispec, NULL));
66*670b568eSEd Maste   EXPECT_OK(timerfd_settime(cap_fd_all, 0, &ispec, NULL));
67*670b568eSEd Maste 
68*670b568eSEd Maste   EXPECT_NOTCAPABLE(timerfd_gettime(cap_fd_wo, &old_ispec));
69*670b568eSEd Maste   EXPECT_OK(timerfd_gettime(cap_fd_ro, &old_ispec));
70*670b568eSEd Maste   EXPECT_OK(timerfd_gettime(cap_fd_rw, &old_ispec));
71*670b568eSEd Maste   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
72*670b568eSEd Maste 
73*670b568eSEd Maste   // To be able to poll() for the timer pop, still need CAP_EVENT.
74*670b568eSEd Maste   struct pollfd poll_fd;
75*670b568eSEd Maste   for (int ii = 0; ii < 3; ii++) {
76*670b568eSEd Maste     poll_fd.revents = 0;
77*670b568eSEd Maste     poll_fd.events = POLLIN;
78*670b568eSEd Maste     switch (ii) {
79*670b568eSEd Maste     case 0: poll_fd.fd = cap_fd_ro; break;
80*670b568eSEd Maste     case 1: poll_fd.fd = cap_fd_wo; break;
81*670b568eSEd Maste     case 2: poll_fd.fd = cap_fd_rw; break;
82*670b568eSEd Maste     }
83*670b568eSEd Maste     // Poll immediately returns with POLLNVAL
84*670b568eSEd Maste     EXPECT_OK(poll(&poll_fd, 1, 400));
85*670b568eSEd Maste     EXPECT_EQ(0, (poll_fd.revents & POLLIN));
86*670b568eSEd Maste     EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
87*670b568eSEd Maste   }
88*670b568eSEd Maste 
89*670b568eSEd Maste   poll_fd.fd = cap_fd_all;
90*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 400));
91*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLIN));
92*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
93*670b568eSEd Maste 
94*670b568eSEd Maste   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
95*670b568eSEd Maste   EXPECT_EQ(0, old_ispec.it_value.tv_sec);
96*670b568eSEd Maste   EXPECT_EQ(0, old_ispec.it_value.tv_nsec);
97*670b568eSEd Maste   EXPECT_EQ(0, old_ispec.it_interval.tv_sec);
98*670b568eSEd Maste   EXPECT_EQ(0, old_ispec.it_interval.tv_nsec);
99*670b568eSEd Maste 
100*670b568eSEd Maste   close(cap_fd_all);
101*670b568eSEd Maste   close(cap_fd_rw);
102*670b568eSEd Maste   close(cap_fd_wo);
103*670b568eSEd Maste   close(cap_fd_ro);
104*670b568eSEd Maste   close(fd);
105*670b568eSEd Maste }
106*670b568eSEd Maste 
FORK_TEST(Linux,SignalFDIfSingleThreaded)107*670b568eSEd Maste FORK_TEST(Linux, SignalFDIfSingleThreaded) {
108*670b568eSEd Maste   if (force_mt) {
109*670b568eSEd Maste     GTEST_SKIP() << "multi-threaded run clashes with signals";
110*670b568eSEd Maste   }
111*670b568eSEd Maste   pid_t me = getpid();
112*670b568eSEd Maste   sigset_t mask;
113*670b568eSEd Maste   sigemptyset(&mask);
114*670b568eSEd Maste   sigaddset(&mask, SIGUSR1);
115*670b568eSEd Maste 
116*670b568eSEd Maste   // Block signals before registering against a new signal FD.
117*670b568eSEd Maste   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
118*670b568eSEd Maste   int fd = signalfd(-1, &mask, 0);
119*670b568eSEd Maste   EXPECT_OK(fd);
120*670b568eSEd Maste 
121*670b568eSEd Maste   cap_rights_t r_rs;
122*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
123*670b568eSEd Maste   cap_rights_t r_ws;
124*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
125*670b568eSEd Maste   cap_rights_t r_sig;
126*670b568eSEd Maste   cap_rights_init(&r_sig, CAP_FSIGNAL);
127*670b568eSEd Maste   cap_rights_t r_rssig;
128*670b568eSEd Maste   cap_rights_init(&r_rssig, CAP_FSIGNAL, CAP_READ, CAP_SEEK);
129*670b568eSEd Maste   cap_rights_t r_rssig_poll;
130*670b568eSEd Maste   cap_rights_init(&r_rssig_poll, CAP_FSIGNAL, CAP_READ, CAP_SEEK, CAP_EVENT);
131*670b568eSEd Maste 
132*670b568eSEd Maste   // Various capability variants.
133*670b568eSEd Maste   int cap_fd_none = dup(fd);
134*670b568eSEd Maste   EXPECT_OK(cap_fd_none);
135*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_none, &r_ws));
136*670b568eSEd Maste   int cap_fd_read = dup(fd);
137*670b568eSEd Maste   EXPECT_OK(cap_fd_read);
138*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_read, &r_rs));
139*670b568eSEd Maste   int cap_fd_sig = dup(fd);
140*670b568eSEd Maste   EXPECT_OK(cap_fd_sig);
141*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_sig, &r_sig));
142*670b568eSEd Maste   int cap_fd_sig_read = dup(fd);
143*670b568eSEd Maste   EXPECT_OK(cap_fd_sig_read);
144*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_sig_read, &r_rssig));
145*670b568eSEd Maste   int cap_fd_all = dup(fd);
146*670b568eSEd Maste   EXPECT_OK(cap_fd_all);
147*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rssig_poll));
148*670b568eSEd Maste 
149*670b568eSEd Maste   struct signalfd_siginfo fdsi;
150*670b568eSEd Maste 
151*670b568eSEd Maste   // Need CAP_READ to read the signal information
152*670b568eSEd Maste   kill(me, SIGUSR1);
153*670b568eSEd Maste   EXPECT_NOTCAPABLE(read(cap_fd_none, &fdsi, sizeof(struct signalfd_siginfo)));
154*670b568eSEd Maste   EXPECT_NOTCAPABLE(read(cap_fd_sig, &fdsi, sizeof(struct signalfd_siginfo)));
155*670b568eSEd Maste   int len = read(cap_fd_read, &fdsi, sizeof(struct signalfd_siginfo));
156*670b568eSEd Maste   EXPECT_OK(len);
157*670b568eSEd Maste   EXPECT_EQ(sizeof(struct signalfd_siginfo), (size_t)len);
158*670b568eSEd Maste   EXPECT_EQ(SIGUSR1, (int)fdsi.ssi_signo);
159*670b568eSEd Maste 
160*670b568eSEd Maste   // Need CAP_FSIGNAL to modify the signal mask.
161*670b568eSEd Maste   sigemptyset(&mask);
162*670b568eSEd Maste   sigaddset(&mask, SIGUSR1);
163*670b568eSEd Maste   sigaddset(&mask, SIGUSR2);
164*670b568eSEd Maste   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
165*670b568eSEd Maste   EXPECT_NOTCAPABLE(signalfd(cap_fd_none, &mask, 0));
166*670b568eSEd Maste   EXPECT_NOTCAPABLE(signalfd(cap_fd_read, &mask, 0));
167*670b568eSEd Maste   EXPECT_EQ(cap_fd_sig, signalfd(cap_fd_sig, &mask, 0));
168*670b568eSEd Maste 
169*670b568eSEd Maste   // Need CAP_EVENT to get notification of a signal in poll(2).
170*670b568eSEd Maste   kill(me, SIGUSR2);
171*670b568eSEd Maste 
172*670b568eSEd Maste   struct pollfd poll_fd;
173*670b568eSEd Maste   poll_fd.revents = 0;
174*670b568eSEd Maste   poll_fd.events = POLLIN;
175*670b568eSEd Maste   poll_fd.fd = cap_fd_sig_read;
176*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 400));
177*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
178*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
179*670b568eSEd Maste 
180*670b568eSEd Maste   poll_fd.fd = cap_fd_all;
181*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 400));
182*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLIN));
183*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
184*670b568eSEd Maste }
185*670b568eSEd Maste 
TEST(Linux,EventFD)186*670b568eSEd Maste TEST(Linux, EventFD) {
187*670b568eSEd Maste   int fd = eventfd(0, 0);
188*670b568eSEd Maste   EXPECT_OK(fd);
189*670b568eSEd Maste 
190*670b568eSEd Maste   cap_rights_t r_rs;
191*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
192*670b568eSEd Maste   cap_rights_t r_ws;
193*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
194*670b568eSEd Maste   cap_rights_t r_rws;
195*670b568eSEd Maste   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
196*670b568eSEd Maste   cap_rights_t r_rwspoll;
197*670b568eSEd Maste   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
198*670b568eSEd Maste 
199*670b568eSEd Maste   int cap_ro = dup(fd);
200*670b568eSEd Maste   EXPECT_OK(cap_ro);
201*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
202*670b568eSEd Maste   int cap_wo = dup(fd);
203*670b568eSEd Maste   EXPECT_OK(cap_wo);
204*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
205*670b568eSEd Maste   int cap_rw = dup(fd);
206*670b568eSEd Maste   EXPECT_OK(cap_rw);
207*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_rw, &r_rws));
208*670b568eSEd Maste   int cap_all = dup(fd);
209*670b568eSEd Maste   EXPECT_OK(cap_all);
210*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_all, &r_rwspoll));
211*670b568eSEd Maste 
212*670b568eSEd Maste   pid_t child = fork();
213*670b568eSEd Maste   if (child == 0) {
214*670b568eSEd Maste     // Child: write counter to eventfd
215*670b568eSEd Maste     uint64_t u = 42;
216*670b568eSEd Maste     EXPECT_NOTCAPABLE(write(cap_ro, &u, sizeof(u)));
217*670b568eSEd Maste     EXPECT_OK(write(cap_wo, &u, sizeof(u)));
218*670b568eSEd Maste     exit(HasFailure());
219*670b568eSEd Maste   }
220*670b568eSEd Maste 
221*670b568eSEd Maste   sleep(1);  // Allow child to write
222*670b568eSEd Maste 
223*670b568eSEd Maste   struct pollfd poll_fd;
224*670b568eSEd Maste   poll_fd.revents = 0;
225*670b568eSEd Maste   poll_fd.events = POLLIN;
226*670b568eSEd Maste   poll_fd.fd = cap_rw;
227*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 400));
228*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
229*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
230*670b568eSEd Maste 
231*670b568eSEd Maste   poll_fd.fd = cap_all;
232*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 400));
233*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLIN));
234*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
235*670b568eSEd Maste 
236*670b568eSEd Maste   uint64_t u;
237*670b568eSEd Maste   EXPECT_NOTCAPABLE(read(cap_wo, &u, sizeof(u)));
238*670b568eSEd Maste   EXPECT_OK(read(cap_ro, &u, sizeof(u)));
239*670b568eSEd Maste   EXPECT_EQ(42, (int)u);
240*670b568eSEd Maste 
241*670b568eSEd Maste   // Wait for the child.
242*670b568eSEd Maste   int status;
243*670b568eSEd Maste   EXPECT_EQ(child, waitpid(child, &status, 0));
244*670b568eSEd Maste   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
245*670b568eSEd Maste   EXPECT_EQ(0, rc);
246*670b568eSEd Maste 
247*670b568eSEd Maste   close(cap_all);
248*670b568eSEd Maste   close(cap_rw);
249*670b568eSEd Maste   close(cap_wo);
250*670b568eSEd Maste   close(cap_ro);
251*670b568eSEd Maste   close(fd);
252*670b568eSEd Maste }
253*670b568eSEd Maste 
FORK_TEST(Linux,epoll)254*670b568eSEd Maste FORK_TEST(Linux, epoll) {
255*670b568eSEd Maste   int sock_fds[2];
256*670b568eSEd Maste   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds));
257*670b568eSEd Maste   // Queue some data.
258*670b568eSEd Maste   char buffer[4] = {1, 2, 3, 4};
259*670b568eSEd Maste   EXPECT_OK(write(sock_fds[1], buffer, sizeof(buffer)));
260*670b568eSEd Maste 
261*670b568eSEd Maste   EXPECT_OK(cap_enter());  // Enter capability mode.
262*670b568eSEd Maste 
263*670b568eSEd Maste   int epoll_fd = epoll_create(1);
264*670b568eSEd Maste   EXPECT_OK(epoll_fd);
265*670b568eSEd Maste 
266*670b568eSEd Maste   cap_rights_t r_rs;
267*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
268*670b568eSEd Maste   cap_rights_t r_ws;
269*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
270*670b568eSEd Maste   cap_rights_t r_rws;
271*670b568eSEd Maste   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
272*670b568eSEd Maste   cap_rights_t r_rwspoll;
273*670b568eSEd Maste   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
274*670b568eSEd Maste   cap_rights_t r_epoll;
275*670b568eSEd Maste   cap_rights_init(&r_epoll, CAP_EPOLL_CTL);
276*670b568eSEd Maste 
277*670b568eSEd Maste   int cap_epoll_wo = dup(epoll_fd);
278*670b568eSEd Maste   EXPECT_OK(cap_epoll_wo);
279*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_epoll_wo, &r_ws));
280*670b568eSEd Maste   int cap_epoll_ro = dup(epoll_fd);
281*670b568eSEd Maste   EXPECT_OK(cap_epoll_ro);
282*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_epoll_ro, &r_rs));
283*670b568eSEd Maste   int cap_epoll_rw = dup(epoll_fd);
284*670b568eSEd Maste   EXPECT_OK(cap_epoll_rw);
285*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_epoll_rw, &r_rws));
286*670b568eSEd Maste   int cap_epoll_poll = dup(epoll_fd);
287*670b568eSEd Maste   EXPECT_OK(cap_epoll_poll);
288*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_epoll_poll, &r_rwspoll));
289*670b568eSEd Maste   int cap_epoll_ctl = dup(epoll_fd);
290*670b568eSEd Maste   EXPECT_OK(cap_epoll_ctl);
291*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_epoll_ctl, &r_epoll));
292*670b568eSEd Maste 
293*670b568eSEd Maste   // Can only modify the FDs being monitored if the CAP_EPOLL_CTL right is present.
294*670b568eSEd Maste   struct epoll_event eev;
295*670b568eSEd Maste   memset(&eev, 0, sizeof(eev));
296*670b568eSEd Maste   eev.events = EPOLLIN|EPOLLOUT|EPOLLPRI;
297*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_ADD, sock_fds[0], &eev));
298*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_ADD, sock_fds[0], &eev));
299*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_ADD, sock_fds[0], &eev));
300*670b568eSEd Maste   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_ADD, sock_fds[0], &eev));
301*670b568eSEd Maste   eev.events = EPOLLIN|EPOLLOUT;
302*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_MOD, sock_fds[0], &eev));
303*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_MOD, sock_fds[0], &eev));
304*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_MOD, sock_fds[0], &eev));
305*670b568eSEd Maste   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_MOD, sock_fds[0], &eev));
306*670b568eSEd Maste 
307*670b568eSEd Maste   // Running epoll_pwait(2) requires CAP_EVENT.
308*670b568eSEd Maste   eev.events = 0;
309*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_ro, &eev, 1, 100, NULL));
310*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_wo, &eev, 1, 100, NULL));
311*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_rw, &eev, 1, 100, NULL));
312*670b568eSEd Maste   EXPECT_OK(epoll_pwait(cap_epoll_poll, &eev, 1, 100, NULL));
313*670b568eSEd Maste   EXPECT_EQ(EPOLLIN, eev.events & EPOLLIN);
314*670b568eSEd Maste 
315*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_DEL, sock_fds[0], &eev));
316*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_DEL, sock_fds[0], &eev));
317*670b568eSEd Maste   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_DEL, sock_fds[0], &eev));
318*670b568eSEd Maste   EXPECT_OK(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, sock_fds[0], &eev));
319*670b568eSEd Maste 
320*670b568eSEd Maste   close(cap_epoll_ctl);
321*670b568eSEd Maste   close(cap_epoll_poll);
322*670b568eSEd Maste   close(cap_epoll_rw);
323*670b568eSEd Maste   close(cap_epoll_ro);
324*670b568eSEd Maste   close(cap_epoll_wo);
325*670b568eSEd Maste   close(epoll_fd);
326*670b568eSEd Maste   close(sock_fds[1]);
327*670b568eSEd Maste   close(sock_fds[0]);
328*670b568eSEd Maste }
329*670b568eSEd Maste 
// Check that fstatat() is expected to need CAP_FSTAT, first on a regular
// file addressed through AT_EMPTY_PATH and then on a name looked up
// relative to a directory descriptor.
TEST(Linux, fstatat) {
  int fd = open(TmpFile("cap_fstatat"), O_CREAT|O_RDWR, 0644);
  EXPECT_OK(fd);
  unsigned char buffer[] = {1, 2, 3, 4};
  EXPECT_OK(write(fd, buffer, sizeof(buffer)));
  cap_rights_t rights;
  int cap_rf = dup(fd);
  EXPECT_OK(cap_rf);
  EXPECT_OK(cap_rights_limit(cap_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
  int cap_ro = dup(fd);
  EXPECT_OK(cap_ro);
  EXPECT_OK(cap_rights_limit(cap_ro, cap_rights_init(&rights, CAP_READ)));

  // Stat the file itself: only the descriptor lacking CAP_FSTAT is refused.
  struct stat info;
  EXPECT_OK(fstatat(fd, "", &info, AT_EMPTY_PATH));
  EXPECT_NOTCAPABLE(fstatat(cap_ro, "", &info, AT_EMPTY_PATH));
  EXPECT_OK(fstatat(cap_rf, "", &info, AT_EMPTY_PATH));

  close(cap_ro);
  close(cap_rf);
  close(fd);

  // Repeat with the file named relative to the temporary directory.
  int dir = open(tmpdir.c_str(), O_RDONLY);
  EXPECT_OK(dir);
  int dir_rf = dup(dir);
  EXPECT_OK(dir_rf);
  EXPECT_OK(cap_rights_limit(dir_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
  // Duplicate the directory, not fd: fd was closed above, so dup(fd) would
  // only work if open() happened to hand the same number back for dir.
  int dir_ro = dup(dir);
  EXPECT_OK(dir_ro);
  EXPECT_OK(cap_rights_limit(dir_ro, cap_rights_init(&rights, CAP_READ)));

  EXPECT_OK(fstatat(dir, "cap_fstatat", &info, AT_EMPTY_PATH));
  EXPECT_NOTCAPABLE(fstatat(dir_ro, "cap_fstatat", &info, AT_EMPTY_PATH));
  EXPECT_OK(fstatat(dir_rf, "cap_fstatat", &info, AT_EMPTY_PATH));

  close(dir_ro);
  close(dir_rf);
  close(dir);

  unlink(TmpFile("cap_fstatat"));
}
371*670b568eSEd Maste 
372*670b568eSEd Maste // fanotify support may not be available at compile-time
373*670b568eSEd Maste #ifdef __NR_fanotify_init
TEST(Linux,FanotifyIfRoot)374*670b568eSEd Maste TEST(Linux, FanotifyIfRoot) {
375*670b568eSEd Maste   GTEST_SKIP_IF_NOT_ROOT();
376*670b568eSEd Maste   int fa_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDWR);
377*670b568eSEd Maste   EXPECT_OK(fa_fd);
378*670b568eSEd Maste   if (fa_fd < 0) return;  // May not be enabled
379*670b568eSEd Maste 
380*670b568eSEd Maste   cap_rights_t r_rs;
381*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
382*670b568eSEd Maste   cap_rights_t r_ws;
383*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
384*670b568eSEd Maste   cap_rights_t r_rws;
385*670b568eSEd Maste   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
386*670b568eSEd Maste   cap_rights_t r_rwspoll;
387*670b568eSEd Maste   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
388*670b568eSEd Maste   cap_rights_t r_rwsnotify;
389*670b568eSEd Maste   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
390*670b568eSEd Maste   cap_rights_t r_rsl;
391*670b568eSEd Maste   cap_rights_init(&r_rsl, CAP_READ, CAP_SEEK, CAP_LOOKUP);
392*670b568eSEd Maste   cap_rights_t r_rslstat;
393*670b568eSEd Maste   cap_rights_init(&r_rslstat, CAP_READ, CAP_SEEK, CAP_LOOKUP, CAP_FSTAT);
394*670b568eSEd Maste   cap_rights_t r_rsstat;
395*670b568eSEd Maste   cap_rights_init(&r_rsstat, CAP_READ, CAP_SEEK, CAP_FSTAT);
396*670b568eSEd Maste 
397*670b568eSEd Maste   int cap_fd_ro = dup(fa_fd);
398*670b568eSEd Maste   EXPECT_OK(cap_fd_ro);
399*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
400*670b568eSEd Maste   int cap_fd_wo = dup(fa_fd);
401*670b568eSEd Maste   EXPECT_OK(cap_fd_wo);
402*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
403*670b568eSEd Maste   int cap_fd_rw = dup(fa_fd);
404*670b568eSEd Maste   EXPECT_OK(cap_fd_rw);
405*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
406*670b568eSEd Maste   int cap_fd_poll = dup(fa_fd);
407*670b568eSEd Maste   EXPECT_OK(cap_fd_poll);
408*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_poll, &r_rwspoll));
409*670b568eSEd Maste   int cap_fd_not = dup(fa_fd);
410*670b568eSEd Maste   EXPECT_OK(cap_fd_not);
411*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_not, &r_rwsnotify));
412*670b568eSEd Maste 
413*670b568eSEd Maste   int rc = mkdir(TmpFile("cap_notify"), 0755);
414*670b568eSEd Maste   EXPECT_TRUE(rc == 0 || errno == EEXIST);
415*670b568eSEd Maste   int dfd = open(TmpFile("cap_notify"), O_RDONLY);
416*670b568eSEd Maste   EXPECT_OK(dfd);
417*670b568eSEd Maste   int fd = open(TmpFile("cap_notify/file"), O_CREAT|O_RDWR, 0644);
418*670b568eSEd Maste   close(fd);
419*670b568eSEd Maste   int cap_dfd = dup(dfd);
420*670b568eSEd Maste   EXPECT_OK(cap_dfd);
421*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_dfd, &r_rslstat));
422*670b568eSEd Maste   EXPECT_OK(cap_dfd);
423*670b568eSEd Maste   int cap_dfd_rs = dup(dfd);
424*670b568eSEd Maste   EXPECT_OK(cap_dfd_rs);
425*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_dfd_rs, &r_rs));
426*670b568eSEd Maste   EXPECT_OK(cap_dfd_rs);
427*670b568eSEd Maste   int cap_dfd_rsstat = dup(dfd);
428*670b568eSEd Maste   EXPECT_OK(cap_dfd_rsstat);
429*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_dfd_rsstat, &r_rsstat));
430*670b568eSEd Maste   EXPECT_OK(cap_dfd_rsstat);
431*670b568eSEd Maste   int cap_dfd_rsl = dup(dfd);
432*670b568eSEd Maste   EXPECT_OK(cap_dfd_rsl);
433*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_dfd_rsl, &r_rsl));
434*670b568eSEd Maste   EXPECT_OK(cap_dfd_rsl);
435*670b568eSEd Maste 
436*670b568eSEd Maste   // Need CAP_NOTIFY to change what's monitored.
437*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_ro, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
438*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_wo, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
439*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_rw, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
440*670b568eSEd Maste   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
441*670b568eSEd Maste 
442*670b568eSEd Maste   // Need CAP_FSTAT on the thing monitored.
443*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rs, NULL));
444*670b568eSEd Maste   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rsstat, NULL));
445*670b568eSEd Maste 
446*670b568eSEd Maste   // Too add monitoring of a file under a dfd, need CAP_LOOKUP|CAP_FSTAT on the dfd.
447*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsstat, "file"));
448*670b568eSEd Maste   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsl, "file"));
449*670b568eSEd Maste   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd, "file"));
450*670b568eSEd Maste 
451*670b568eSEd Maste   pid_t child = fork();
452*670b568eSEd Maste   if (child == 0) {
453*670b568eSEd Maste     // Child: Perform activity in the directory under notify.
454*670b568eSEd Maste     sleep(1);
455*670b568eSEd Maste     unlink(TmpFile("cap_notify/temp"));
456*670b568eSEd Maste     int fd = open(TmpFile("cap_notify/temp"), O_CREAT|O_RDWR, 0644);
457*670b568eSEd Maste     close(fd);
458*670b568eSEd Maste     exit(0);
459*670b568eSEd Maste   }
460*670b568eSEd Maste 
461*670b568eSEd Maste   // Need CAP_EVENT to poll.
462*670b568eSEd Maste   struct pollfd poll_fd;
463*670b568eSEd Maste   poll_fd.revents = 0;
464*670b568eSEd Maste   poll_fd.events = POLLIN;
465*670b568eSEd Maste   poll_fd.fd = cap_fd_rw;
466*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 1400));
467*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
468*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
469*670b568eSEd Maste 
470*670b568eSEd Maste   poll_fd.fd = cap_fd_not;
471*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 1400));
472*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
473*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
474*670b568eSEd Maste 
475*670b568eSEd Maste   poll_fd.fd = cap_fd_poll;
476*670b568eSEd Maste   EXPECT_OK(poll(&poll_fd, 1, 1400));
477*670b568eSEd Maste   EXPECT_NE(0, (poll_fd.revents & POLLIN));
478*670b568eSEd Maste   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
479*670b568eSEd Maste 
480*670b568eSEd Maste   // Need CAP_READ to read.
481*670b568eSEd Maste   struct fanotify_event_metadata ev;
482*670b568eSEd Maste   memset(&ev, 0, sizeof(ev));
483*670b568eSEd Maste   EXPECT_NOTCAPABLE(read(cap_fd_wo, &ev, sizeof(ev)));
484*670b568eSEd Maste   rc = read(fa_fd, &ev, sizeof(ev));
485*670b568eSEd Maste   EXPECT_OK(rc);
486*670b568eSEd Maste   EXPECT_EQ((int)sizeof(struct fanotify_event_metadata), rc);
487*670b568eSEd Maste   EXPECT_EQ(child, ev.pid);
488*670b568eSEd Maste   EXPECT_NE(0, ev.fd);
489*670b568eSEd Maste 
490*670b568eSEd Maste   // TODO(drysdale): reinstate if/when capsicum-linux propagates rights
491*670b568eSEd Maste   // to fanotify-generated FDs.
492*670b568eSEd Maste #ifdef OMIT
493*670b568eSEd Maste   // fanotify(7) gives us a FD for the changed file.  This should
494*670b568eSEd Maste   // only have rights that are a subset of those for the original
495*670b568eSEd Maste   // monitored directory file descriptor.
496*670b568eSEd Maste   cap_rights_t rights;
497*670b568eSEd Maste   CAP_SET_ALL(&rights);
498*670b568eSEd Maste   EXPECT_OK(cap_rights_get(ev.fd, &rights));
499*670b568eSEd Maste   EXPECT_RIGHTS_IN(&rights, &r_rslstat);
500*670b568eSEd Maste #endif
501*670b568eSEd Maste 
502*670b568eSEd Maste   // Wait for the child.
503*670b568eSEd Maste   int status;
504*670b568eSEd Maste   EXPECT_EQ(child, waitpid(child, &status, 0));
505*670b568eSEd Maste   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
506*670b568eSEd Maste   EXPECT_EQ(0, rc);
507*670b568eSEd Maste 
508*670b568eSEd Maste   close(cap_dfd_rsstat);
509*670b568eSEd Maste   close(cap_dfd_rsl);
510*670b568eSEd Maste   close(cap_dfd_rs);
511*670b568eSEd Maste   close(cap_dfd);
512*670b568eSEd Maste   close(dfd);
513*670b568eSEd Maste   unlink(TmpFile("cap_notify/file"));
514*670b568eSEd Maste   unlink(TmpFile("cap_notify/temp"));
515*670b568eSEd Maste   rmdir(TmpFile("cap_notify"));
516*670b568eSEd Maste   close(cap_fd_not);
517*670b568eSEd Maste   close(cap_fd_poll);
518*670b568eSEd Maste   close(cap_fd_rw);
519*670b568eSEd Maste   close(cap_fd_wo);
520*670b568eSEd Maste   close(cap_fd_ro);
521*670b568eSEd Maste   close(fa_fd);
522*670b568eSEd Maste }
523*670b568eSEd Maste #endif
524*670b568eSEd Maste 
TEST(Linux,inotify)525*670b568eSEd Maste TEST(Linux, inotify) {
526*670b568eSEd Maste   int i_fd = inotify_init();
527*670b568eSEd Maste   EXPECT_OK(i_fd);
528*670b568eSEd Maste 
529*670b568eSEd Maste   cap_rights_t r_rs;
530*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
531*670b568eSEd Maste   cap_rights_t r_ws;
532*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
533*670b568eSEd Maste   cap_rights_t r_rws;
534*670b568eSEd Maste   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
535*670b568eSEd Maste   cap_rights_t r_rwsnotify;
536*670b568eSEd Maste   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
537*670b568eSEd Maste 
538*670b568eSEd Maste   int cap_fd_ro = dup(i_fd);
539*670b568eSEd Maste   EXPECT_OK(cap_fd_ro);
540*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
541*670b568eSEd Maste   int cap_fd_wo = dup(i_fd);
542*670b568eSEd Maste   EXPECT_OK(cap_fd_wo);
543*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
544*670b568eSEd Maste   int cap_fd_rw = dup(i_fd);
545*670b568eSEd Maste   EXPECT_OK(cap_fd_rw);
546*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
547*670b568eSEd Maste   int cap_fd_all = dup(i_fd);
548*670b568eSEd Maste   EXPECT_OK(cap_fd_all);
549*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwsnotify));
550*670b568eSEd Maste 
551*670b568eSEd Maste   int fd = open(TmpFile("cap_inotify"), O_CREAT|O_RDWR, 0644);
552*670b568eSEd Maste   EXPECT_NOTCAPABLE(inotify_add_watch(cap_fd_rw, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY));
553*670b568eSEd Maste   int wd = inotify_add_watch(i_fd, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY);
554*670b568eSEd Maste   EXPECT_OK(wd);
555*670b568eSEd Maste 
556*670b568eSEd Maste   unsigned char buffer[] = {1, 2, 3, 4};
557*670b568eSEd Maste   EXPECT_OK(write(fd, buffer, sizeof(buffer)));
558*670b568eSEd Maste 
559*670b568eSEd Maste   struct inotify_event iev;
560*670b568eSEd Maste   memset(&iev, 0, sizeof(iev));
561*670b568eSEd Maste   EXPECT_NOTCAPABLE(read(cap_fd_wo, &iev, sizeof(iev)));
562*670b568eSEd Maste   int rc = read(cap_fd_ro, &iev, sizeof(iev));
563*670b568eSEd Maste   EXPECT_OK(rc);
564*670b568eSEd Maste   EXPECT_EQ((int)sizeof(iev), rc);
565*670b568eSEd Maste   EXPECT_EQ(wd, iev.wd);
566*670b568eSEd Maste 
567*670b568eSEd Maste   EXPECT_NOTCAPABLE(inotify_rm_watch(cap_fd_wo, wd));
568*670b568eSEd Maste   EXPECT_OK(inotify_rm_watch(cap_fd_all, wd));
569*670b568eSEd Maste 
570*670b568eSEd Maste   close(fd);
571*670b568eSEd Maste   close(cap_fd_all);
572*670b568eSEd Maste   close(cap_fd_rw);
573*670b568eSEd Maste   close(cap_fd_wo);
574*670b568eSEd Maste   close(cap_fd_ro);
575*670b568eSEd Maste   close(i_fd);
576*670b568eSEd Maste   unlink(TmpFile("cap_inotify"));
577*670b568eSEd Maste }
578*670b568eSEd Maste 
// For each mini-me helper binary that can be opened (the .32/.x32/.64 suffixes
// suggest one per ABI), fork a child that enters capability mode and then
// fexecve_()s the binary with "--capmode" as its argument.  Each child is
// expected to exit with status 0.  Skipped if none of the binaries is present.
TEST(Linux, ArchChangeIfAvailable) {
  const char* prog_candidates[] = {"./mini-me.32", "./mini-me.x32", "./mini-me.64"};
  const char* progs[] = {NULL, NULL, NULL};
  char* argv_pass[] = {(char*)"to-come", (char*)"--capmode", NULL};
  char* null_envp[] = {NULL};
  int fds[3];
  int count = 0;

  // Open the candidates before forking; only the ones that open successfully
  // are kept, packed at the front of progs[]/fds[].
  for (int ii = 0; ii < 3; ii++) {
    fds[count] = open(prog_candidates[ii], O_RDONLY);
    if (fds[count] >= 0) {
      progs[count] = prog_candidates[ii];
      count++;
    }
  }
  if (count == 0) {
    GTEST_SKIP() << "no different-architecture programs available";
  }

  for (int ii = 0; ii < count; ii++) {
    // Fork-and-exec a binary of this architecture.
    pid_t child = fork();
    EXPECT_OK(child);
    if (child < 0) {
      // No child exists: waitpid(-1, ...) would wait for an arbitrary child
      // and could leave status unset, so just release the FD and move on.
      close(fds[ii]);
      continue;
    }
    if (child == 0) {
      EXPECT_OK(cap_enter());  // Enter capability mode
      if (verbose) fprintf(stderr, "[%d] call fexecve(%s, %s)\n",
                           getpid_(), progs[ii], argv_pass[1]);
      argv_pass[0] = (char *)progs[ii];
      int rc = fexecve_(fds[ii], argv_pass, null_envp);
      fprintf(stderr, "fexecve(%s) returned %d errno %d\n", progs[ii], rc, errno);
      exit(99);  // Should not reach here.
    }
    int status = 0;
    EXPECT_EQ(child, waitpid(child, &status, 0));
    int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
    EXPECT_EQ(0, rc);
    close(fds[ii]);
  }
}
617*670b568eSEd Maste 
// Checks that setns(2) on a namespace file descriptor needs the CAP_SETNS
// right, that setns(2) is refused once in capability mode, and that
// unshare(2) is still permitted there.  Requires root.
FORK_TEST(Linux, NamespaceIfRoot) {
  GTEST_SKIP_IF_NOT_ROOT();
  pid_t me = getpid_();

  // Create a new UTS namespace.
  EXPECT_OK(unshare(CLONE_NEWUTS));
  // Open an FD to its symlink.
  char buffer[256];
  snprintf(buffer, sizeof(buffer), "/proc/%d/ns/uts", me);
  int ns_fd = open(buffer, O_RDONLY);
  EXPECT_OK(ns_fd);  // Everything below depends on this descriptor.

  cap_rights_t r_rwlstat;
  cap_rights_init(&r_rwlstat, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT);
  cap_rights_t r_rwlstatns;
  cap_rights_init(&r_rwlstatns, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT, CAP_SETNS);

  // Two restricted duplicates: one without and one with CAP_SETNS.
  int cap_fd = dup(ns_fd);
  EXPECT_OK(cap_fd);
  EXPECT_OK(cap_rights_limit(cap_fd, &r_rwlstat));
  int cap_fd_setns = dup(ns_fd);
  EXPECT_OK(cap_fd_setns);
  EXPECT_OK(cap_rights_limit(cap_fd_setns, &r_rwlstatns));
  EXPECT_NOTCAPABLE(setns(cap_fd, CLONE_NEWUTS));
  EXPECT_OK(setns(cap_fd_setns, CLONE_NEWUTS));

  EXPECT_OK(cap_enter());  // Enter capability mode.

  // No setns(2) but unshare(2) is allowed.
  EXPECT_CAPMODE(setns(ns_fd, CLONE_NEWUTS));
  EXPECT_OK(unshare(CLONE_NEWUTS));
}
649*670b568eSEd Maste 
SendFD(int fd,int over)650*670b568eSEd Maste static void SendFD(int fd, int over) {
651*670b568eSEd Maste   struct msghdr mh;
652*670b568eSEd Maste   mh.msg_name = NULL;  // No address needed
653*670b568eSEd Maste   mh.msg_namelen = 0;
654*670b568eSEd Maste   char buffer1[1024];
655*670b568eSEd Maste   struct iovec iov[1];
656*670b568eSEd Maste   iov[0].iov_base = buffer1;
657*670b568eSEd Maste   iov[0].iov_len = sizeof(buffer1);
658*670b568eSEd Maste   mh.msg_iov = iov;
659*670b568eSEd Maste   mh.msg_iovlen = 1;
660*670b568eSEd Maste   char buffer2[1024];
661*670b568eSEd Maste   mh.msg_control = buffer2;
662*670b568eSEd Maste   mh.msg_controllen = CMSG_LEN(sizeof(int));
663*670b568eSEd Maste   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
664*670b568eSEd Maste   cmptr->cmsg_level = SOL_SOCKET;
665*670b568eSEd Maste   cmptr->cmsg_type = SCM_RIGHTS;
666*670b568eSEd Maste   cmptr->cmsg_len = CMSG_LEN(sizeof(int));
667*670b568eSEd Maste   *(int *)CMSG_DATA(cmptr) = fd;
668*670b568eSEd Maste   buffer1[0] = 0;
669*670b568eSEd Maste   iov[0].iov_len = 1;
670*670b568eSEd Maste   int rc = sendmsg(over, &mh, 0);
671*670b568eSEd Maste   EXPECT_OK(rc);
672*670b568eSEd Maste }
673*670b568eSEd Maste 
ReceiveFD(int over)674*670b568eSEd Maste static int ReceiveFD(int over) {
675*670b568eSEd Maste   struct msghdr mh;
676*670b568eSEd Maste   mh.msg_name = NULL;  // No address needed
677*670b568eSEd Maste   mh.msg_namelen = 0;
678*670b568eSEd Maste   char buffer1[1024];
679*670b568eSEd Maste   struct iovec iov[1];
680*670b568eSEd Maste   iov[0].iov_base = buffer1;
681*670b568eSEd Maste   iov[0].iov_len = sizeof(buffer1);
682*670b568eSEd Maste   mh.msg_iov = iov;
683*670b568eSEd Maste   mh.msg_iovlen = 1;
684*670b568eSEd Maste   char buffer2[1024];
685*670b568eSEd Maste   mh.msg_control = buffer2;
686*670b568eSEd Maste   mh.msg_controllen = sizeof(buffer2);
687*670b568eSEd Maste   int rc = recvmsg(over, &mh, 0);
688*670b568eSEd Maste   EXPECT_OK(rc);
689*670b568eSEd Maste   EXPECT_LE(CMSG_LEN(sizeof(int)), mh.msg_controllen);
690*670b568eSEd Maste   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
691*670b568eSEd Maste   int fd = *(int*)CMSG_DATA(cmptr);
692*670b568eSEd Maste   EXPECT_EQ(CMSG_LEN(sizeof(int)), cmptr->cmsg_len);
693*670b568eSEd Maste   cmptr = CMSG_NXTHDR(&mh, cmptr);
694*670b568eSEd Maste   EXPECT_TRUE(cmptr == NULL);
695*670b568eSEd Maste   return fd;
696*670b568eSEd Maste }
697*670b568eSEd Maste 
// State shared between the PID-namespace tests and the functions they clone().
// shared_pd: process descriptor filled in by pdfork() in the parent test and
// used by ChildFunc.
static int shared_pd = -1;
// shared_sock_fds: socketpair() endpoints; the tests use [0] and the cloned
// functions use [1].
static int shared_sock_fds[2];
700*670b568eSEd Maste 
ChildFunc(void * arg)701*670b568eSEd Maste static int ChildFunc(void *arg) {
702*670b568eSEd Maste   // This function is running in a new PID namespace, and so is pid 1.
703*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
704*670b568eSEd Maste   EXPECT_EQ(1, getpid_());
705*670b568eSEd Maste   EXPECT_EQ(0, getppid());
706*670b568eSEd Maste 
707*670b568eSEd Maste   // The shared process descriptor is outside our namespace, so we cannot
708*670b568eSEd Maste   // get its pid.
709*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: shared_pd=%d\n", shared_pd);
710*670b568eSEd Maste   pid_t shared_child = -1;
711*670b568eSEd Maste   EXPECT_OK(pdgetpid(shared_pd, &shared_child));
712*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: corresponding pid=%d\n", shared_child);
713*670b568eSEd Maste   EXPECT_EQ(0, shared_child);
714*670b568eSEd Maste 
715*670b568eSEd Maste   // But we can pdkill() it even so.
716*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: call pdkill(pd=%d)\n", shared_pd);
717*670b568eSEd Maste   EXPECT_OK(pdkill(shared_pd, SIGINT));
718*670b568eSEd Maste 
719*670b568eSEd Maste   int pd;
720*670b568eSEd Maste   pid_t child = pdfork(&pd, 0);
721*670b568eSEd Maste   EXPECT_OK(child);
722*670b568eSEd Maste   if (child == 0) {
723*670b568eSEd Maste     // Child: expect pid 2.
724*670b568eSEd Maste     if (verbose) fprintf(stderr, "      child of ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
725*670b568eSEd Maste     EXPECT_EQ(2, getpid_());
726*670b568eSEd Maste     EXPECT_EQ(1, getppid());
727*670b568eSEd Maste     while (true) {
728*670b568eSEd Maste       if (verbose) fprintf(stderr, "      child of ChildFunc: \"I aten't dead\"\n");
729*670b568eSEd Maste       sleep(1);
730*670b568eSEd Maste     }
731*670b568eSEd Maste     exit(0);
732*670b568eSEd Maste   }
733*670b568eSEd Maste   EXPECT_EQ(2, child);
734*670b568eSEd Maste   EXPECT_PID_ALIVE(child);
735*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
736*670b568eSEd Maste                        pd, child, ProcessState(child));
737*670b568eSEd Maste 
738*670b568eSEd Maste   pid_t pid;
739*670b568eSEd Maste   EXPECT_OK(pdgetpid(pd, &pid));
740*670b568eSEd Maste   EXPECT_EQ(child, pid);
741*670b568eSEd Maste 
742*670b568eSEd Maste   sleep(2);
743*670b568eSEd Maste 
744*670b568eSEd Maste   // Send the process descriptor over UNIX domain socket back to parent.
745*670b568eSEd Maste   SendFD(pd, shared_sock_fds[1]);
746*670b568eSEd Maste 
747*670b568eSEd Maste   // Wait for death of (grand)child, killed by our parent.
748*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: wait on pid=%d\n", child);
749*670b568eSEd Maste   int status;
750*670b568eSEd Maste   EXPECT_EQ(child, wait4(child, &status, __WALL, NULL));
751*670b568eSEd Maste 
752*670b568eSEd Maste   if (verbose) fprintf(stderr, "    ChildFunc: return 0\n");
753*670b568eSEd Maste   return 0;
754*670b568eSEd Maste }
755*670b568eSEd Maste 
// Stack handed to clone() by the PID-namespace tests.  The callers pass
// child_stack + STACK_SIZE (the top of the region), so both ends must be
// suitably aligned for a stack pointer; a plain char array guarantees nothing.
#define STACK_SIZE (1024 * 1024)
static char child_stack[STACK_SIZE] __attribute__((aligned(16)));
758*670b568eSEd Maste 
759*670b568eSEd Maste // TODO(drysdale): fork into a user namespace first so GTEST_SKIP_IF_NOT_ROOT can be removed.
TEST(Linux,PidNamespacePdForkIfRoot)760*670b568eSEd Maste TEST(Linux, PidNamespacePdForkIfRoot) {
761*670b568eSEd Maste   GTEST_SKIP_IF_NOT_ROOT();
762*670b568eSEd Maste   // Pass process descriptors in both directions across a PID namespace boundary.
763*670b568eSEd Maste   // pdfork() off a child before we start, holding its process descriptor in a global
764*670b568eSEd Maste   // variable that's accessible to children.
765*670b568eSEd Maste   pid_t firstborn = pdfork(&shared_pd, 0);
766*670b568eSEd Maste   EXPECT_OK(firstborn);
767*670b568eSEd Maste   if (firstborn == 0) {
768*670b568eSEd Maste     while (true) {
769*670b568eSEd Maste       if (verbose) fprintf(stderr, "  Firstborn: \"I aten't dead\"\n");
770*670b568eSEd Maste       sleep(1);
771*670b568eSEd Maste     }
772*670b568eSEd Maste     exit(0);
773*670b568eSEd Maste   }
774*670b568eSEd Maste   EXPECT_PID_ALIVE(firstborn);
775*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: pre-pdfork()ed pd=%d, pid=%d state='%c'\n",
776*670b568eSEd Maste                        shared_pd, firstborn, ProcessState(firstborn));
777*670b568eSEd Maste   sleep(2);
778*670b568eSEd Maste 
779*670b568eSEd Maste   // Prepare sockets to communicate with child process.
780*670b568eSEd Maste   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
781*670b568eSEd Maste 
782*670b568eSEd Maste   // Clone into a child process with a new pid namespace.
783*670b568eSEd Maste   pid_t child = clone(ChildFunc, child_stack + STACK_SIZE,
784*670b568eSEd Maste                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
785*670b568eSEd Maste   EXPECT_OK(child);
786*670b568eSEd Maste   EXPECT_PID_ALIVE(child);
787*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
788*670b568eSEd Maste 
789*670b568eSEd Maste   // Ensure the child runs.  First thing it does is to kill our firstborn, using shared_pd.
790*670b568eSEd Maste   sleep(1);
791*670b568eSEd Maste   EXPECT_PID_DEAD(firstborn);
792*670b568eSEd Maste 
793*670b568eSEd Maste   // But we can still retrieve firstborn's PID, as it's not been reaped yet.
794*670b568eSEd Maste   pid_t child0;
795*670b568eSEd Maste   EXPECT_OK(pdgetpid(shared_pd, &child0));
796*670b568eSEd Maste   EXPECT_EQ(firstborn, child0);
797*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: check on firstborn: pdgetpid(pd=%d) -> child=%d state='%c'\n",
798*670b568eSEd Maste                        shared_pd, child0, ProcessState(child0));
799*670b568eSEd Maste 
800*670b568eSEd Maste   // Now reap it.
801*670b568eSEd Maste   int status;
802*670b568eSEd Maste   EXPECT_EQ(firstborn, waitpid(firstborn, &status, __WALL));
803*670b568eSEd Maste 
804*670b568eSEd Maste   // Get the process descriptor of the child-of-child via socket transfer.
805*670b568eSEd Maste   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
806*670b568eSEd Maste 
807*670b568eSEd Maste   // Our notion of the pid associated with the grandchild is in the main PID namespace.
808*670b568eSEd Maste   pid_t grandchild;
809*670b568eSEd Maste   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
810*670b568eSEd Maste   EXPECT_NE(2, grandchild);
811*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: pre-pdkill:  pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
812*670b568eSEd Maste                        grandchild_pd, grandchild, ProcessState(grandchild));
813*670b568eSEd Maste   EXPECT_PID_ALIVE(grandchild);
814*670b568eSEd Maste 
815*670b568eSEd Maste   // Kill the grandchild via the process descriptor.
816*670b568eSEd Maste   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
817*670b568eSEd Maste   usleep(10000);
818*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: post-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
819*670b568eSEd Maste                        grandchild_pd, grandchild, ProcessState(grandchild));
820*670b568eSEd Maste   EXPECT_PID_DEAD(grandchild);
821*670b568eSEd Maste 
822*670b568eSEd Maste   sleep(2);
823*670b568eSEd Maste 
824*670b568eSEd Maste   // Wait for the child.
825*670b568eSEd Maste   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
826*670b568eSEd Maste   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
827*670b568eSEd Maste   EXPECT_EQ(0, rc);
828*670b568eSEd Maste 
829*670b568eSEd Maste   close(shared_sock_fds[0]);
830*670b568eSEd Maste   close(shared_sock_fds[1]);
831*670b568eSEd Maste   close(shared_pd);
832*670b568eSEd Maste   close(grandchild_pd);
833*670b568eSEd Maste }
834*670b568eSEd Maste 
NSInit(void * data)835*670b568eSEd Maste int NSInit(void *data) {
836*670b568eSEd Maste   // This function is running in a new PID namespace, and so is pid 1.
837*670b568eSEd Maste   if (verbose) fprintf(stderr, "  NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
838*670b568eSEd Maste   EXPECT_EQ(1, getpid_());
839*670b568eSEd Maste   EXPECT_EQ(0, getppid());
840*670b568eSEd Maste 
841*670b568eSEd Maste   int pd;
842*670b568eSEd Maste   pid_t child = pdfork(&pd, 0);
843*670b568eSEd Maste   EXPECT_OK(child);
844*670b568eSEd Maste   if (child == 0) {
845*670b568eSEd Maste     // Child: loop forever until terminated.
846*670b568eSEd Maste     if (verbose) fprintf(stderr, "    child of NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
847*670b568eSEd Maste     while (true) {
848*670b568eSEd Maste       if (verbose) fprintf(stderr, "    child of NSInit: \"I aten't dead\"\n");
849*670b568eSEd Maste       usleep(100000);
850*670b568eSEd Maste     }
851*670b568eSEd Maste     exit(0);
852*670b568eSEd Maste   }
853*670b568eSEd Maste   EXPECT_EQ(2, child);
854*670b568eSEd Maste   EXPECT_PID_ALIVE(child);
855*670b568eSEd Maste   if (verbose) fprintf(stderr, "  NSInit: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
856*670b568eSEd Maste                        pd, child, ProcessState(child));
857*670b568eSEd Maste   sleep(1);
858*670b568eSEd Maste 
859*670b568eSEd Maste   // Send the process descriptor over UNIX domain socket back to parent.
860*670b568eSEd Maste   SendFD(pd, shared_sock_fds[1]);
861*670b568eSEd Maste   close(pd);
862*670b568eSEd Maste 
863*670b568eSEd Maste   // Wait for a byte back in the other direction.
864*670b568eSEd Maste   int value;
865*670b568eSEd Maste   if (verbose) fprintf(stderr, "  NSInit: block waiting for value\n");
866*670b568eSEd Maste   read(shared_sock_fds[1], &value, sizeof(value));
867*670b568eSEd Maste 
868*670b568eSEd Maste   if (verbose) fprintf(stderr, "  NSInit: return 0\n");
869*670b568eSEd Maste   return 0;
870*670b568eSEd Maste }
871*670b568eSEd Maste 
// Clones NSInit into a new PID namespace, receives the process descriptor of
// NSInit's child, then tells NSInit to exit.  Expects NSInit to become a
// zombie and its child (the grandchild) to disappear once the namespace's
// init is gone, even though this process still holds grandchild_pd.
TEST(Linux, DeadNSInitIfRoot) {
  GTEST_SKIP_IF_NOT_ROOT();

  // Prepare sockets to communicate with child process.
  EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));

  // Clone into a child process with a new pid namespace.
  pid_t child = clone(NSInit, child_stack + STACK_SIZE,
                      CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
  usleep(10000);
  EXPECT_OK(child);
  if (child < 0) {
    // Nothing will ever be sent on the socket, so ReceiveFD() would block
    // forever; clean up and give up.
    close(shared_sock_fds[0]);
    close(shared_sock_fds[1]);
    return;
  }
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));

  // Get the process descriptor of the child-of-child via socket transfer.
  int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
  pid_t grandchild;
  EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
  if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));

  // Send an int to the child to trigger its termination.  Grandchild should also
  // go, as its init process is gone.
  int zero = 0;
  if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
  EXPECT_OK(write(shared_sock_fds[0], &zero, sizeof(zero)));
  EXPECT_PID_ZOMBIE(child);
  EXPECT_PID_GONE(grandchild);

  // Wait for the child.  status starts as a value that does not decode as
  // "exited", so a waitpid() that reports nothing fails the check below
  // instead of reading an indeterminate value.
  int status = -1;
  EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
  int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
  EXPECT_EQ(0, rc);
  EXPECT_PID_GONE(child);

  close(shared_sock_fds[0]);
  close(shared_sock_fds[1]);
  close(grandchild_pd);

  if (verbose) {
    fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
    fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
  }
}
916*670b568eSEd Maste 
TEST(Linux,DeadNSInit2IfRoot)917*670b568eSEd Maste TEST(Linux, DeadNSInit2IfRoot) {
918*670b568eSEd Maste   GTEST_SKIP_IF_NOT_ROOT();
919*670b568eSEd Maste 
920*670b568eSEd Maste   // Prepare sockets to communicate with child process.
921*670b568eSEd Maste   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
922*670b568eSEd Maste 
923*670b568eSEd Maste   // Clone into a child process with a new pid namespace.
924*670b568eSEd Maste   pid_t child = clone(NSInit, child_stack + STACK_SIZE,
925*670b568eSEd Maste                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
926*670b568eSEd Maste   usleep(10000);
927*670b568eSEd Maste   EXPECT_OK(child);
928*670b568eSEd Maste   EXPECT_PID_ALIVE(child);
929*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
930*670b568eSEd Maste 
931*670b568eSEd Maste   // Get the process descriptor of the child-of-child via socket transfer.
932*670b568eSEd Maste   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
933*670b568eSEd Maste   pid_t grandchild;
934*670b568eSEd Maste   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
935*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));
936*670b568eSEd Maste 
937*670b568eSEd Maste   // Kill the grandchild
938*670b568eSEd Maste   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
939*670b568eSEd Maste   usleep(10000);
940*670b568eSEd Maste   EXPECT_PID_ZOMBIE(grandchild);
941*670b568eSEd Maste   // Close the process descriptor, so there are now no procdesc references to grandchild.
942*670b568eSEd Maste   close(grandchild_pd);
943*670b568eSEd Maste 
944*670b568eSEd Maste   // Send an int to the child to trigger its termination.  Grandchild should also
945*670b568eSEd Maste   // go, as its init process is gone.
946*670b568eSEd Maste   int zero = 0;
947*670b568eSEd Maste   if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
948*670b568eSEd Maste   write(shared_sock_fds[0], &zero, sizeof(zero));
949*670b568eSEd Maste   EXPECT_PID_ZOMBIE(child);
950*670b568eSEd Maste   EXPECT_PID_GONE(grandchild);
951*670b568eSEd Maste 
952*670b568eSEd Maste   // Wait for the child.
953*670b568eSEd Maste   int status;
954*670b568eSEd Maste   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
955*670b568eSEd Maste   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
956*670b568eSEd Maste   EXPECT_EQ(0, rc);
957*670b568eSEd Maste 
958*670b568eSEd Maste   close(shared_sock_fds[0]);
959*670b568eSEd Maste   close(shared_sock_fds[1]);
960*670b568eSEd Maste 
961*670b568eSEd Maste   if (verbose) {
962*670b568eSEd Maste     fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
963*670b568eSEd Maste     fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
964*670b568eSEd Maste   }
965*670b568eSEd Maste }
966*670b568eSEd Maste 
967*670b568eSEd Maste #ifdef __x86_64__
FORK_TEST(Linux,CheckHighWord)968*670b568eSEd Maste FORK_TEST(Linux, CheckHighWord) {
969*670b568eSEd Maste   EXPECT_OK(cap_enter());  // Enter capability mode.
970*670b568eSEd Maste 
971*670b568eSEd Maste   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
972*670b568eSEd Maste   EXPECT_OK(rc);
973*670b568eSEd Maste   EXPECT_EQ(1, rc);  // no_new_privs = 1
974*670b568eSEd Maste 
975*670b568eSEd Maste   // Set some of the high 32-bits of argument zero.
976*670b568eSEd Maste   uint64_t big_cmd = PR_GET_NO_NEW_PRIVS | 0x100000000LL;
977*670b568eSEd Maste   EXPECT_CAPMODE(syscall(__NR_prctl, big_cmd, 0, 0, 0, 0));
978*670b568eSEd Maste }
979*670b568eSEd Maste #endif
980*670b568eSEd Maste 
// Exercises the PR_SET/GET_OPENAT_BENEATH prctl pair: the flag can be set and
// cleared freely before cap_enter(); after cap_enter() it is expected to read
// back as 1 and attempts to clear it are expected to fail.
FORK_TEST(Linux, PrctlOpenatBeneath) {
  // no_new_privs: set it, then read it back.
  EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
  const int no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
  EXPECT_OK(no_new_privs);
  EXPECT_EQ(1, no_new_privs);

  int beneath;

  // openat-beneath: switch on...
  EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 1, 0, 0, 0));
  beneath = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
  EXPECT_OK(beneath);
  EXPECT_EQ(1, beneath);

  // ...and off again.
  EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
  beneath = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
  EXPECT_OK(beneath);
  EXPECT_EQ(0, beneath);

  EXPECT_OK(cap_enter());  // Enter capability mode

  // The flag is expected to be on again now, without having been set explicitly.
  beneath = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
  EXPECT_OK(beneath);
  EXPECT_EQ(1, beneath);

  // Clearing it is refused, and the value stays at 1.
  EXPECT_CAPMODE(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
  beneath = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
  EXPECT_OK(beneath);
  EXPECT_EQ(1, beneath);
}
1014*670b568eSEd Maste 
// Installing a seccomp-bpf filter is expected to fail with EACCES while
// no_new_privs is 0, and to succeed once no_new_privs has been set to 1.
// When run as root, CAP_SYS_ADMIN is dropped first.
FORK_TEST(Linux, NoNewPrivs) {
  if (getuid() == 0) {
    // If root, drop CAP_SYS_ADMIN POSIX.1e capability.
    struct __user_cap_header_struct cap_hdr;
    cap_hdr.version = _LINUX_CAPABILITY_VERSION_3;
    cap_hdr.pid = getpid_();
    struct __user_cap_data_struct cap_data[3];
    EXPECT_OK(capget(&cap_hdr, &cap_data[0]));
    const int keep_mask = ~(1 << CAP_SYS_ADMIN);
    cap_data[0].effective &= keep_mask;
    cap_data[0].permitted &= keep_mask;
    cap_data[0].inheritable &= keep_mask;
    EXPECT_OK(capset(&cap_hdr, &cap_data[0]));
  }
  int result = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
  EXPECT_OK(result);
  EXPECT_EQ(0, result);  // no_new_privs == 0

  // A one-instruction program that allows every syscall.
  struct sock_filter allow_all[] = {
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
  };
  struct sock_fprog prog;
  prog.len = (sizeof(allow_all) / sizeof(allow_all[0]));
  prog.filter = allow_all;

  // Can't enter seccomp-bpf mode with no_new_privs == 0
  result = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
  EXPECT_EQ(-1, result);
  EXPECT_EQ(EACCES, errno);

  // Set no_new_privs = 1
  EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
  result = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
  EXPECT_OK(result);
  EXPECT_EQ(1, result);  // no_new_privs = 1

  // Can now turn on seccomp mode
  EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0));
}
1052*670b568eSEd Maste 
/* Macros for BPF generation */

/* Return instruction: fail the syscall with errno |err| (low 16 bits only). */
#define BPF_RETURN_ERRNO(err) \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | ((err) & 0xFFFF))
/* Return instruction: kill on this syscall. */
#define BPF_KILL_PROCESS \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
/* Return instruction: let the syscall through. */
#define BPF_ALLOW \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/* Load the syscall number into the accumulator. */
#define EXAMINE_SYSCALL \
  BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr))
/*
 * Each of the following expands to TWO filter entries: a compare against
 * __NR_<name> that skips the next entry on mismatch, then the action.
 * They are meant to be listed after EXAMINE_SYSCALL in an initializer.
 */
#define ALLOW_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_ALLOW
#define KILL_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_KILL_PROCESS
#define FAIL_SYSCALL(name, err) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_RETURN_ERRNO(err)
1071*670b568eSEd Maste 
// Checks how a seccomp-bpf filter and capability mode interact when both are
// active in the same (forked) process: which of the two decides the errno of
// a failing syscall, and that a BPF "kill" verdict terminates the process
// with SIGSYS even for a syscall Capsicum would allow.
TEST(Linux, CapModeWithBPF) {
  pid_t child = fork();
  EXPECT_OK(child);
  if (child < 0) {
    // No child exists: waitpid(-1, ...) below would wait for an arbitrary
    // child and could leave status unset.
    return;
  }
  if (child == 0) {
    int fd = open(TmpFile("cap_bpf_capmode"), O_CREAT|O_RDWR, 0644);
    EXPECT_OK(fd);
    cap_rights_t rights;
    cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
    EXPECT_OK(cap_rights_limit(fd, &rights));

    // fchmod -> ENOMEM, fstat -> ENOEXEC, close -> allowed, fsync -> kill,
    // everything else -> allowed.
    struct sock_filter filter[] = { EXAMINE_SYSCALL,
                                    FAIL_SYSCALL(fchmod, ENOMEM),
                                    FAIL_SYSCALL(fstat, ENOEXEC),
                                    ALLOW_SYSCALL(close),
                                    KILL_SYSCALL(fsync),
                                    BPF_ALLOW };
    struct sock_fprog bpf;
    bpf.len = (sizeof(filter) / sizeof(filter[0]));
    bpf.filter = filter;
    // Set up seccomp-bpf first.
    EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
    EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));

    EXPECT_OK(cap_enter());  // Enter capability mode.

    // fchmod is allowed by Capsicum, but failed by BPF.
    EXPECT_SYSCALL_FAIL(ENOMEM, fchmod(fd, 0644));
    // open is allowed by BPF, but failed by Capsicum
    EXPECT_SYSCALL_FAIL(ECAPMODE, open(TmpFile("cap_bpf_capmode"), O_RDONLY));
    // fstat is failed by both BPF and Capsicum; tie-break is on errno
    struct stat buf;
    EXPECT_SYSCALL_FAIL(ENOEXEC, fstat(fd, &buf));
    // fsync is allowed by Capsicum, but BPF's SIGSYS generation take precedence
    fsync(fd);  // terminate with unhandled SIGSYS
    exit(0);    // Only reached if the filter failed to kill us.
  }
  int status = 0;
  EXPECT_EQ(child, waitpid(child, &status, 0));
  EXPECT_TRUE(WIFSIGNALED(status));
  EXPECT_EQ(SIGSYS, WTERMSIG(status));
  unlink(TmpFile("cap_bpf_capmode"));
}
1112*670b568eSEd Maste 
TEST(Linux,AIO)1113*670b568eSEd Maste TEST(Linux, AIO) {
1114*670b568eSEd Maste   int fd = open(TmpFile("cap_aio"), O_CREAT|O_RDWR, 0644);
1115*670b568eSEd Maste   EXPECT_OK(fd);
1116*670b568eSEd Maste 
1117*670b568eSEd Maste   cap_rights_t r_rs;
1118*670b568eSEd Maste   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
1119*670b568eSEd Maste   cap_rights_t r_ws;
1120*670b568eSEd Maste   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
1121*670b568eSEd Maste   cap_rights_t r_rwssync;
1122*670b568eSEd Maste   cap_rights_init(&r_rwssync, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1123*670b568eSEd Maste 
1124*670b568eSEd Maste   int cap_ro = dup(fd);
1125*670b568eSEd Maste   EXPECT_OK(cap_ro);
1126*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
1127*670b568eSEd Maste   EXPECT_OK(cap_ro);
1128*670b568eSEd Maste   int cap_wo = dup(fd);
1129*670b568eSEd Maste   EXPECT_OK(cap_wo);
1130*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
1131*670b568eSEd Maste   EXPECT_OK(cap_wo);
1132*670b568eSEd Maste   int cap_all = dup(fd);
1133*670b568eSEd Maste   EXPECT_OK(cap_all);
1134*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(cap_all, &r_rwssync));
1135*670b568eSEd Maste   EXPECT_OK(cap_all);
1136*670b568eSEd Maste 
1137*670b568eSEd Maste   // Linux: io_setup, io_submit, io_getevents, io_cancel, io_destroy
1138*670b568eSEd Maste   aio_context_t ctx = 0;
1139*670b568eSEd Maste   EXPECT_OK(syscall(__NR_io_setup, 10, &ctx));
1140*670b568eSEd Maste 
1141*670b568eSEd Maste   unsigned char buffer[32] = {1, 2, 3, 4};
1142*670b568eSEd Maste   struct iocb req;
1143*670b568eSEd Maste   memset(&req, 0, sizeof(req));
1144*670b568eSEd Maste   req.aio_reqprio = 0;
1145*670b568eSEd Maste   req.aio_fildes = fd;
1146*670b568eSEd Maste   uintptr_t bufaddr = (uintptr_t)buffer;
1147*670b568eSEd Maste   req.aio_buf = (__u64)bufaddr;
1148*670b568eSEd Maste   req.aio_nbytes = 4;
1149*670b568eSEd Maste   req.aio_offset = 0;
1150*670b568eSEd Maste   struct iocb* reqs[1] = {&req};
1151*670b568eSEd Maste 
1152*670b568eSEd Maste   // Write operation
1153*670b568eSEd Maste   req.aio_lio_opcode = IOCB_CMD_PWRITE;
1154*670b568eSEd Maste   req.aio_fildes = cap_ro;
1155*670b568eSEd Maste   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1156*670b568eSEd Maste   req.aio_fildes = cap_wo;
1157*670b568eSEd Maste   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1158*670b568eSEd Maste 
1159*670b568eSEd Maste   // Sync operation
1160*670b568eSEd Maste   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1161*670b568eSEd Maste   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1162*670b568eSEd Maste   req.aio_lio_opcode = IOCB_CMD_FDSYNC;
1163*670b568eSEd Maste   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1164*670b568eSEd Maste   // Even with CAP_FSYNC, turns out fsync/fdsync aren't implemented
1165*670b568eSEd Maste   req.aio_fildes = cap_all;
1166*670b568eSEd Maste   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1167*670b568eSEd Maste   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1168*670b568eSEd Maste   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1169*670b568eSEd Maste 
1170*670b568eSEd Maste   // Read operation
1171*670b568eSEd Maste   req.aio_lio_opcode = IOCB_CMD_PREAD;
1172*670b568eSEd Maste   req.aio_fildes = cap_wo;
1173*670b568eSEd Maste   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1174*670b568eSEd Maste   req.aio_fildes = cap_ro;
1175*670b568eSEd Maste   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1176*670b568eSEd Maste 
1177*670b568eSEd Maste   EXPECT_OK(syscall(__NR_io_destroy, ctx));
1178*670b568eSEd Maste 
1179*670b568eSEd Maste   close(cap_all);
1180*670b568eSEd Maste   close(cap_wo);
1181*670b568eSEd Maste   close(cap_ro);
1182*670b568eSEd Maste   close(fd);
1183*670b568eSEd Maste   unlink(TmpFile("cap_aio"));
1184*670b568eSEd Maste }
1185*670b568eSEd Maste 
1186*670b568eSEd Maste #ifndef KCMP_FILE
1187*670b568eSEd Maste #define KCMP_FILE 0
1188*670b568eSEd Maste #endif
TEST(Linux,KcmpIfAvailable)1189*670b568eSEd Maste TEST(Linux, KcmpIfAvailable) {
1190*670b568eSEd Maste   // This requires CONFIG_CHECKPOINT_RESTORE in kernel config.
1191*670b568eSEd Maste   int fd = open("/etc/passwd", O_RDONLY);
1192*670b568eSEd Maste   EXPECT_OK(fd);
1193*670b568eSEd Maste   pid_t parent = getpid_();
1194*670b568eSEd Maste 
1195*670b568eSEd Maste   errno = 0;
1196*670b568eSEd Maste   int rc = syscall(__NR_kcmp, parent, parent, KCMP_FILE, fd, fd);
1197*670b568eSEd Maste   if (rc == -1 && errno == ENOSYS) {
1198*670b568eSEd Maste     GTEST_SKIP() << "kcmp(2) gives -ENOSYS";
1199*670b568eSEd Maste   }
1200*670b568eSEd Maste 
1201*670b568eSEd Maste   pid_t child = fork();
1202*670b568eSEd Maste   if (child == 0) {
1203*670b568eSEd Maste     // Child: limit rights on FD.
1204*670b568eSEd Maste     child = getpid_();
1205*670b568eSEd Maste     EXPECT_OK(syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1206*670b568eSEd Maste     cap_rights_t rights;
1207*670b568eSEd Maste     cap_rights_init(&rights, CAP_READ, CAP_WRITE);
1208*670b568eSEd Maste     EXPECT_OK(cap_rights_limit(fd, &rights));
1209*670b568eSEd Maste     // A capability wrapping a normal FD is different (from a kcmp(2) perspective)
1210*670b568eSEd Maste     // than the original file.
1211*670b568eSEd Maste     EXPECT_NE(0, syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1212*670b568eSEd Maste     exit(HasFailure());
1213*670b568eSEd Maste   }
1214*670b568eSEd Maste   // Wait for the child.
1215*670b568eSEd Maste   int status;
1216*670b568eSEd Maste   EXPECT_EQ(child, waitpid(child, &status, 0));
1217*670b568eSEd Maste   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
1218*670b568eSEd Maste   EXPECT_EQ(0, rc);
1219*670b568eSEd Maste 
1220*670b568eSEd Maste   close(fd);
1221*670b568eSEd Maste }
1222*670b568eSEd Maste 
// Check that /proc/<pid>/fdinfo/<fd> for a capability reports both the file
// position of the underlying file and the capability's rights.
TEST(Linux, ProcFS) {
  cap_rights_t rights;
  cap_rights_init(&rights, CAP_READ, CAP_SEEK);
  int fd = open("/etc/passwd", O_RDONLY);
  EXPECT_OK(fd);
  lseek(fd, 4, SEEK_SET);
  int cap = dup(fd);
  EXPECT_OK(cap);
  EXPECT_OK(cap_rights_limit(cap, &rights));
  pid_t me = getpid_();

  char buffer[1024];
  snprintf(buffer, sizeof(buffer), "/proc/%d/fdinfo/%d", me, cap);
  int procfd = open(buffer, O_RDONLY);
  EXPECT_OK(procfd) << " failed to open " << buffer;
  if (procfd < 0) {
    // Nothing more can be checked; release what was opened so far.
    close(cap);
    close(fd);
    return;
  }
  int proccap = dup(procfd);
  EXPECT_OK(proccap);
  EXPECT_OK(cap_rights_limit(proccap, &rights));

  // Leave room for, and add, a terminating NUL: read(2) does not write one
  // and strstr() below needs a proper C string.
  ssize_t len = read(proccap, buffer, sizeof(buffer) - 1);
  EXPECT_OK(len);
  buffer[len > 0 ? len : 0] = '\0';
  // The fdinfo should include the file pos of the underlying file
  EXPECT_NE((char*)NULL, strstr(buffer, "pos:\t4"));
  // ...and the rights of the Capsicum capability.
  EXPECT_NE((char*)NULL, strstr(buffer, "rights:\t0x"));

  close(procfd);
  close(proccap);
  close(cap);
  close(fd);
}
1254*670b568eSEd Maste 
// From capability mode, query the CPU-time clocks of other processes (a
// forked child, then PID 1) through hand-built clock IDs.
FORK_TEST(Linux, ProcessClocks) {
  pid_t self = getpid_();
  pid_t child = fork();
  EXPECT_OK(child);
  if (child == 0) {
    child = getpid_();
    usleep(100000);
    exit(0);
  }

  EXPECT_OK(cap_enter());  // Enter capability mode.

  // Nefariously build a clock ID for the child's CPU time.
  // This relies on knowledge of the internal layout of clock IDs.
  // The complement-and-shift is done on an unsigned value: left-shifting a
  // negative signed integer is undefined behaviour before C++20.
  clockid_t child_clock;
  child_clock = static_cast<clockid_t>(((~static_cast<unsigned int>(child)) << 3) | 0x0u);
  struct timespec ts;
  memset(&ts, 0, sizeof(ts));

  // TODO(drysdale): Should not be possible to retrieve info about a
  // different process, as the PID global namespace should be locked
  // down.
  EXPECT_OK(clock_gettime(child_clock, &ts));
  if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(child=%d->0x%08x) is %ld.%09ld \n",
                       self, child, static_cast<unsigned int>(child_clock),
                       (long)ts.tv_sec, (long)ts.tv_nsec);

  // Same construction for PID 1 (init).
  child_clock = static_cast<clockid_t>(((~1u) << 3) | 0x0u);
  memset(&ts, 0, sizeof(ts));
  EXPECT_OK(clock_gettime(child_clock, &ts));
  if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(init=1->0x%08x) is %ld.%09ld \n",
                       self, static_cast<unsigned int>(child_clock),
                       (long)ts.tv_sec, (long)ts.tv_nsec);

  // Orphan the child.
}
1289*670b568eSEd Maste 
// File leases via fcntl(F_SETLEASE/F_GETLEASE): refused on a descriptor
// limited to read/write only, permitted on one that also carries CAP_FLOCK
// and CAP_FSIGNAL.
TEST(Linux, SetLease) {
  int fd_all = open(TmpFile("cap_lease"), O_CREAT|O_RDWR, 0644);
  EXPECT_OK(fd_all);
  int fd_rw = dup(fd_all);
  EXPECT_OK(fd_rw);

  // Descriptor carrying the rights used by the lease operations below.
  cap_rights_t r_all;
  EXPECT_OK(cap_rights_limit(
      fd_all, cap_rights_init(&r_all, CAP_READ, CAP_WRITE, CAP_FLOCK, CAP_FSIGNAL)));

  // Descriptor with plain read/write only.
  cap_rights_t r_rw;
  EXPECT_OK(cap_rights_limit(fd_rw, cap_rights_init(&r_rw, CAP_READ, CAP_WRITE)));

  // Neither setting nor querying a lease is allowed on the read/write one.
  EXPECT_NOTCAPABLE(fcntl(fd_rw, F_SETLEASE, F_WRLCK));
  EXPECT_NOTCAPABLE(fcntl(fd_rw, F_GETLEASE));

  if (!tmpdir_on_tmpfs) {  // tmpfs doesn't support leases
    // Take a write lease and read it back...
    EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_WRLCK));
    EXPECT_EQ(F_WRLCK, fcntl(fd_all, F_GETLEASE));

    // ...then drop it and confirm it is gone.
    EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_UNLCK, 0));
    EXPECT_EQ(F_UNLCK, fcntl(fd_all, F_GETLEASE));
  }

  close(fd_all);
  close(fd_rw);
  unlink(TmpFile("cap_lease"));
}
1318*670b568eSEd Maste 
// Drive the raw cap_rights_limit system call with malformed arguments and
// check the errno reported for each.  The cases are order-dependent: the last
// two progressively corrupt the same rights structure.
TEST(Linux, InvalidRightsSyscall) {
  int fd = open(TmpFile("cap_invalid_rights"), O_RDONLY|O_CREAT, 0644);
  EXPECT_OK(fd);

  cap_rights_t rights;
  cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FCHMOD, CAP_FSTAT);

  // Use the raw syscall throughout.  Baseline: a well-formed call succeeds.
  EXPECT_EQ(0, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));

  // Directly access the syscall, and find all unseemly manner of use for it.

  // Invalid flags.
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 1));
  EXPECT_EQ(EINVAL, errno);

  // Specify an fcntl subright, but no CAP_FCNTL set.
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, CAP_FCNTL_GETFL, 0, NULL, 0));
  EXPECT_EQ(EINVAL, errno);

  // Specify an ioctl subright, but no CAP_IOCTL set.
  unsigned int ioctl_cmd = 1;
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, &ioctl_cmd, 0));
  EXPECT_EQ(EINVAL, errno);

  // N ioctls, but null pointer passed.
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, NULL, 0));
  EXPECT_EQ(EINVAL, errno);

  // Invalid nioctls.
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, -2, NULL, 0));
  EXPECT_EQ(EINVAL, errno);

  // Null primary rights.
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, NULL, 0, 0, NULL, 0));
  EXPECT_EQ(EFAULT, errno);

  // Invalid index bitmask.
  rights.cr_rights[0] |= 3ULL << 57;
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
  EXPECT_EQ(EINVAL, errno);

  // Invalid version (on top of the already-corrupted index bits).
  rights.cr_rights[0] |= 2ULL << 62;
  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
  EXPECT_EQ(EINVAL, errno);

  close(fd);
  unlink(TmpFile("cap_invalid_rights"));
}
1361*670b568eSEd Maste 
1362*670b568eSEd Maste FORK_TEST_ON(Linux, OpenByHandleAtIfRoot, TmpFile("cap_openbyhandle_testfile")) {
1363*670b568eSEd Maste   GTEST_SKIP_IF_NOT_ROOT();
1364*670b568eSEd Maste   int dir = open(tmpdir.c_str(), O_RDONLY);
1365*670b568eSEd Maste   EXPECT_OK(dir);
1366*670b568eSEd Maste   int fd = openat(dir, "cap_openbyhandle_testfile", O_RDWR|O_CREAT, 0644);
1367*670b568eSEd Maste   EXPECT_OK(fd);
1368*670b568eSEd Maste   const char* message = "Saved text";
1369*670b568eSEd Maste   EXPECT_OK(write(fd, message, strlen(message)));
1370*670b568eSEd Maste   close(fd);
1371*670b568eSEd Maste 
1372*670b568eSEd Maste   struct file_handle* fhandle = (struct file_handle*)malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
1373*670b568eSEd Maste   fhandle->handle_bytes = MAX_HANDLE_SZ;
1374*670b568eSEd Maste   int mount_id;
1375*670b568eSEd Maste   EXPECT_OK(name_to_handle_at(dir, "cap_openbyhandle_testfile", fhandle,  &mount_id, 0));
1376*670b568eSEd Maste 
1377*670b568eSEd Maste   fd = open_by_handle_at(dir, fhandle, O_RDONLY);
1378*670b568eSEd Maste   EXPECT_OK(fd);
1379*670b568eSEd Maste   char buffer[200];
1380*670b568eSEd Maste   ssize_t len = read(fd, buffer, 199);
1381*670b568eSEd Maste   EXPECT_OK(len);
1382*670b568eSEd Maste   EXPECT_EQ(std::string(message), std::string(buffer, len));
1383*670b568eSEd Maste   close(fd);
1384*670b568eSEd Maste 
1385*670b568eSEd Maste   // Cannot issue open_by_handle_at after entering capability mode.
1386*670b568eSEd Maste   cap_enter();
1387*670b568eSEd Maste   EXPECT_CAPMODE(open_by_handle_at(dir, fhandle, O_RDONLY));
1388*670b568eSEd Maste 
1389*670b568eSEd Maste   close(dir);
1390*670b568eSEd Maste }
1391*670b568eSEd Maste 
// Invoke getrandom(2) through the raw syscall interface.  When the build
// headers do not define a syscall number for it, fail with errno == ENOSYS
// instead.
int getrandom_(void *buf, size_t buflen, unsigned int flags) {
#ifndef __NR_getrandom
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_getrandom, buf, buflen, flags);
#endif
}
1400*670b568eSEd Maste 
1401*670b568eSEd Maste #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1402*670b568eSEd Maste #include <linux/random.h>  // Requires 3.17 kernel
FORK_TEST(Linux,GetRandom)1403*670b568eSEd Maste FORK_TEST(Linux, GetRandom) {
1404*670b568eSEd Maste   EXPECT_OK(cap_enter());
1405*670b568eSEd Maste   unsigned char buffer[1024];
1406*670b568eSEd Maste   unsigned char buffer2[1024];
1407*670b568eSEd Maste   EXPECT_OK(getrandom_(buffer, sizeof(buffer), GRND_NONBLOCK));
1408*670b568eSEd Maste   EXPECT_OK(getrandom_(buffer2, sizeof(buffer2), GRND_NONBLOCK));
1409*670b568eSEd Maste   EXPECT_NE(0, memcmp(buffer, buffer2, sizeof(buffer)));
1410*670b568eSEd Maste }
1411*670b568eSEd Maste #endif
1412*670b568eSEd Maste 
// Invoke memfd_create(2) through the raw syscall interface.  When the build
// headers do not define a syscall number for it, fail with errno == ENOSYS
// instead.
int memfd_create_(const char *name, unsigned int flags) {
#ifndef __NR_memfd_create
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_memfd_create, name, flags);
#endif
}
1421*670b568eSEd Maste 
1422*670b568eSEd Maste #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1423*670b568eSEd Maste #include <linux/memfd.h>  // Requires 3.17 kernel
TEST(Linux,MemFDDeathTestIfAvailable)1424*670b568eSEd Maste TEST(Linux, MemFDDeathTestIfAvailable) {
1425*670b568eSEd Maste   int memfd = memfd_create_("capsicum-test", MFD_ALLOW_SEALING);
1426*670b568eSEd Maste   if (memfd == -1 && errno == ENOSYS) {
1427*670b568eSEd Maste     GTEST_SKIP() << "memfd_create(2) gives -ENOSYS";
1428*670b568eSEd Maste   }
1429*670b568eSEd Maste   const int LEN = 16;
1430*670b568eSEd Maste   EXPECT_OK(ftruncate(memfd, LEN));
1431*670b568eSEd Maste   int memfd_ro = dup(memfd);
1432*670b568eSEd Maste   int memfd_rw = dup(memfd);
1433*670b568eSEd Maste   EXPECT_OK(memfd_ro);
1434*670b568eSEd Maste   EXPECT_OK(memfd_rw);
1435*670b568eSEd Maste   cap_rights_t rights;
1436*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(memfd_ro, cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT)));
1437*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW, CAP_FCHMOD)));
1438*670b568eSEd Maste 
1439*670b568eSEd Maste   unsigned char *p_ro = (unsigned char *)mmap(NULL, LEN, PROT_READ, MAP_SHARED, memfd_ro, 0);
1440*670b568eSEd Maste   EXPECT_NE((unsigned char *)MAP_FAILED, p_ro);
1441*670b568eSEd Maste   unsigned char *p_rw = (unsigned char *)mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_rw, 0);
1442*670b568eSEd Maste   EXPECT_NE((unsigned char *)MAP_FAILED, p_rw);
1443*670b568eSEd Maste   EXPECT_EQ(MAP_FAILED,
1444*670b568eSEd Maste             mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_ro, 0));
1445*670b568eSEd Maste 
1446*670b568eSEd Maste   *p_rw = 42;
1447*670b568eSEd Maste   EXPECT_EQ(42, *p_ro);
1448*670b568eSEd Maste   EXPECT_DEATH(*p_ro = 42, "");
1449*670b568eSEd Maste 
1450*670b568eSEd Maste #ifndef F_ADD_SEALS
1451*670b568eSEd Maste   // Hack for when libc6 does not yet include the updated linux/fcntl.h from kernel 3.17
1452*670b568eSEd Maste #define _F_LINUX_SPECIFIC_BASE F_SETLEASE
1453*670b568eSEd Maste #define F_ADD_SEALS	(_F_LINUX_SPECIFIC_BASE + 9)
1454*670b568eSEd Maste #define F_GET_SEALS	(_F_LINUX_SPECIFIC_BASE + 10)
1455*670b568eSEd Maste #define F_SEAL_SEAL	0x0001	/* prevent further seals from being set */
1456*670b568eSEd Maste #define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
1457*670b568eSEd Maste #define F_SEAL_GROW	0x0004	/* prevent file from growing */
1458*670b568eSEd Maste #define F_SEAL_WRITE	0x0008	/* prevent writes */
1459*670b568eSEd Maste #endif
1460*670b568eSEd Maste 
1461*670b568eSEd Maste   // Reading the seal information requires CAP_FSTAT.
1462*670b568eSEd Maste   int seals = fcntl(memfd, F_GET_SEALS);
1463*670b568eSEd Maste   EXPECT_OK(seals);
1464*670b568eSEd Maste   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1465*670b568eSEd Maste   int seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1466*670b568eSEd Maste   EXPECT_EQ(seals, seals_ro);
1467*670b568eSEd Maste   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1468*670b568eSEd Maste   int seals_rw = fcntl(memfd_rw, F_GET_SEALS);
1469*670b568eSEd Maste   EXPECT_NOTCAPABLE(seals_rw);
1470*670b568eSEd Maste 
1471*670b568eSEd Maste   // Fail to seal as a writable mapping exists.
1472*670b568eSEd Maste   EXPECT_EQ(-1, fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1473*670b568eSEd Maste   EXPECT_EQ(EBUSY, errno);
1474*670b568eSEd Maste   *p_rw = 42;
1475*670b568eSEd Maste 
1476*670b568eSEd Maste   // Seal the rw version; need to unmap first.
1477*670b568eSEd Maste   munmap(p_rw, LEN);
1478*670b568eSEd Maste   munmap(p_ro, LEN);
1479*670b568eSEd Maste   EXPECT_OK(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1480*670b568eSEd Maste 
1481*670b568eSEd Maste   seals = fcntl(memfd, F_GET_SEALS);
1482*670b568eSEd Maste   EXPECT_OK(seals);
1483*670b568eSEd Maste   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1484*670b568eSEd Maste   seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1485*670b568eSEd Maste   EXPECT_EQ(seals, seals_ro);
1486*670b568eSEd Maste   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1487*670b568eSEd Maste 
1488*670b568eSEd Maste   // Remove the CAP_FCHMOD right, can no longer add seals.
1489*670b568eSEd Maste   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW)));
1490*670b568eSEd Maste   EXPECT_NOTCAPABLE(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1491*670b568eSEd Maste 
1492*670b568eSEd Maste   close(memfd);
1493*670b568eSEd Maste   close(memfd_ro);
1494*670b568eSEd Maste   close(memfd_rw);
1495*670b568eSEd Maste }
1496*670b568eSEd Maste #endif
1497*670b568eSEd Maste 
1498*670b568eSEd Maste #else
// Empty placeholder function; presumably here so that builds which skip the
// Linux-only tests still get a non-empty translation unit.
void noop() {
}
1500*670b568eSEd Maste #endif
1501