xref: /freebsd/contrib/capsicum-test/linux.cc (revision 55141f2c8991b2a6adbf30bb0fe3e6cbc303f06d)
1 // Tests of Linux-specific functionality
2 #ifdef __linux__
3 
4 #include <sys/types.h>
5 #include <sys/stat.h>
6 #include <sys/socket.h>
7 #include <sys/timerfd.h>
8 #include <sys/signalfd.h>
9 #include <sys/eventfd.h>
10 #include <sys/epoll.h>
11 #include <sys/inotify.h>
12 #include <sys/fanotify.h>
13 #include <sys/mman.h>
14 #include <sys/capability.h>  // Requires e.g. libcap-dev package for POSIX.1e capabilities headers
15 #include <linux/aio_abi.h>
16 #include <linux/filter.h>
17 #include <linux/seccomp.h>
18 #include <linux/version.h>
19 #include <poll.h>
20 #include <sched.h>
21 #include <signal.h>
22 #include <fcntl.h>
23 #include <unistd.h>
24 
25 #include <string>
26 
27 #include "capsicum.h"
28 #include "syscalls.h"
29 #include "capsicum-test.h"
30 
31 TEST(Linux, TimerFD) {
32   int fd = timerfd_create(CLOCK_MONOTONIC, 0);
33 
34   cap_rights_t r_ro;
35   cap_rights_init(&r_ro, CAP_READ);
36   cap_rights_t r_wo;
37   cap_rights_init(&r_wo, CAP_WRITE);
38   cap_rights_t r_rw;
39   cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
40   cap_rights_t r_rwpoll;
41   cap_rights_init(&r_rwpoll, CAP_READ, CAP_WRITE, CAP_EVENT);
42 
43   int cap_fd_ro = dup(fd);
44   EXPECT_OK(cap_fd_ro);
45   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_ro));
46   int cap_fd_wo = dup(fd);
47   EXPECT_OK(cap_fd_wo);
48   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_wo));
49   int cap_fd_rw = dup(fd);
50   EXPECT_OK(cap_fd_rw);
51   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rw));
52   int cap_fd_all = dup(fd);
53   EXPECT_OK(cap_fd_all);
54   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwpoll));
55 
56   struct itimerspec old_ispec;
57   struct itimerspec ispec;
58   ispec.it_interval.tv_sec = 0;
59   ispec.it_interval.tv_nsec = 0;
60   ispec.it_value.tv_sec = 0;
61   ispec.it_value.tv_nsec = 100000000;  // 100ms
62   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_ro, 0, &ispec, NULL));
63   EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_wo, 0, &ispec, &old_ispec));
64   EXPECT_OK(timerfd_settime(cap_fd_wo, 0, &ispec, NULL));
65   EXPECT_OK(timerfd_settime(cap_fd_rw, 0, &ispec, NULL));
66   EXPECT_OK(timerfd_settime(cap_fd_all, 0, &ispec, NULL));
67 
68   EXPECT_NOTCAPABLE(timerfd_gettime(cap_fd_wo, &old_ispec));
69   EXPECT_OK(timerfd_gettime(cap_fd_ro, &old_ispec));
70   EXPECT_OK(timerfd_gettime(cap_fd_rw, &old_ispec));
71   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
72 
73   // To be able to poll() for the timer pop, still need CAP_EVENT.
74   struct pollfd poll_fd;
75   for (int ii = 0; ii < 3; ii++) {
76     poll_fd.revents = 0;
77     poll_fd.events = POLLIN;
78     switch (ii) {
79     case 0: poll_fd.fd = cap_fd_ro; break;
80     case 1: poll_fd.fd = cap_fd_wo; break;
81     case 2: poll_fd.fd = cap_fd_rw; break;
82     }
83     // Poll immediately returns with POLLNVAL
84     EXPECT_OK(poll(&poll_fd, 1, 400));
85     EXPECT_EQ(0, (poll_fd.revents & POLLIN));
86     EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
87   }
88 
89   poll_fd.fd = cap_fd_all;
90   EXPECT_OK(poll(&poll_fd, 1, 400));
91   EXPECT_NE(0, (poll_fd.revents & POLLIN));
92   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
93 
94   EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
95   EXPECT_EQ(0, old_ispec.it_value.tv_sec);
96   EXPECT_EQ(0, old_ispec.it_value.tv_nsec);
97   EXPECT_EQ(0, old_ispec.it_interval.tv_sec);
98   EXPECT_EQ(0, old_ispec.it_interval.tv_nsec);
99 
100   close(cap_fd_all);
101   close(cap_fd_rw);
102   close(cap_fd_wo);
103   close(cap_fd_ro);
104   close(fd);
105 }
106 
107 FORK_TEST(Linux, SignalFDIfSingleThreaded) {
108   if (force_mt) {
109     GTEST_SKIP() << "multi-threaded run clashes with signals";
110   }
111   pid_t me = getpid();
112   sigset_t mask;
113   sigemptyset(&mask);
114   sigaddset(&mask, SIGUSR1);
115 
116   // Block signals before registering against a new signal FD.
117   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
118   int fd = signalfd(-1, &mask, 0);
119   EXPECT_OK(fd);
120 
121   cap_rights_t r_rs;
122   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
123   cap_rights_t r_ws;
124   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
125   cap_rights_t r_sig;
126   cap_rights_init(&r_sig, CAP_FSIGNAL);
127   cap_rights_t r_rssig;
128   cap_rights_init(&r_rssig, CAP_FSIGNAL, CAP_READ, CAP_SEEK);
129   cap_rights_t r_rssig_poll;
130   cap_rights_init(&r_rssig_poll, CAP_FSIGNAL, CAP_READ, CAP_SEEK, CAP_EVENT);
131 
132   // Various capability variants.
133   int cap_fd_none = dup(fd);
134   EXPECT_OK(cap_fd_none);
135   EXPECT_OK(cap_rights_limit(cap_fd_none, &r_ws));
136   int cap_fd_read = dup(fd);
137   EXPECT_OK(cap_fd_read);
138   EXPECT_OK(cap_rights_limit(cap_fd_read, &r_rs));
139   int cap_fd_sig = dup(fd);
140   EXPECT_OK(cap_fd_sig);
141   EXPECT_OK(cap_rights_limit(cap_fd_sig, &r_sig));
142   int cap_fd_sig_read = dup(fd);
143   EXPECT_OK(cap_fd_sig_read);
144   EXPECT_OK(cap_rights_limit(cap_fd_sig_read, &r_rssig));
145   int cap_fd_all = dup(fd);
146   EXPECT_OK(cap_fd_all);
147   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rssig_poll));
148 
149   struct signalfd_siginfo fdsi;
150 
151   // Need CAP_READ to read the signal information
152   kill(me, SIGUSR1);
153   EXPECT_NOTCAPABLE(read(cap_fd_none, &fdsi, sizeof(struct signalfd_siginfo)));
154   EXPECT_NOTCAPABLE(read(cap_fd_sig, &fdsi, sizeof(struct signalfd_siginfo)));
155   int len = read(cap_fd_read, &fdsi, sizeof(struct signalfd_siginfo));
156   EXPECT_OK(len);
157   EXPECT_EQ(sizeof(struct signalfd_siginfo), (size_t)len);
158   EXPECT_EQ(SIGUSR1, (int)fdsi.ssi_signo);
159 
160   // Need CAP_FSIGNAL to modify the signal mask.
161   sigemptyset(&mask);
162   sigaddset(&mask, SIGUSR1);
163   sigaddset(&mask, SIGUSR2);
164   EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
165   EXPECT_NOTCAPABLE(signalfd(cap_fd_none, &mask, 0));
166   EXPECT_NOTCAPABLE(signalfd(cap_fd_read, &mask, 0));
167   EXPECT_EQ(cap_fd_sig, signalfd(cap_fd_sig, &mask, 0));
168 
169   // Need CAP_EVENT to get notification of a signal in poll(2).
170   kill(me, SIGUSR2);
171 
172   struct pollfd poll_fd;
173   poll_fd.revents = 0;
174   poll_fd.events = POLLIN;
175   poll_fd.fd = cap_fd_sig_read;
176   EXPECT_OK(poll(&poll_fd, 1, 400));
177   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
178   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
179 
180   poll_fd.fd = cap_fd_all;
181   EXPECT_OK(poll(&poll_fd, 1, 400));
182   EXPECT_NE(0, (poll_fd.revents & POLLIN));
183   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
184 }
185 
186 TEST(Linux, EventFD) {
187   int fd = eventfd(0, 0);
188   EXPECT_OK(fd);
189 
190   cap_rights_t r_rs;
191   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
192   cap_rights_t r_ws;
193   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
194   cap_rights_t r_rws;
195   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
196   cap_rights_t r_rwspoll;
197   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
198 
199   int cap_ro = dup(fd);
200   EXPECT_OK(cap_ro);
201   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
202   int cap_wo = dup(fd);
203   EXPECT_OK(cap_wo);
204   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
205   int cap_rw = dup(fd);
206   EXPECT_OK(cap_rw);
207   EXPECT_OK(cap_rights_limit(cap_rw, &r_rws));
208   int cap_all = dup(fd);
209   EXPECT_OK(cap_all);
210   EXPECT_OK(cap_rights_limit(cap_all, &r_rwspoll));
211 
212   pid_t child = fork();
213   if (child == 0) {
214     // Child: write counter to eventfd
215     uint64_t u = 42;
216     EXPECT_NOTCAPABLE(write(cap_ro, &u, sizeof(u)));
217     EXPECT_OK(write(cap_wo, &u, sizeof(u)));
218     exit(HasFailure());
219   }
220 
221   sleep(1);  // Allow child to write
222 
223   struct pollfd poll_fd;
224   poll_fd.revents = 0;
225   poll_fd.events = POLLIN;
226   poll_fd.fd = cap_rw;
227   EXPECT_OK(poll(&poll_fd, 1, 400));
228   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
229   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
230 
231   poll_fd.fd = cap_all;
232   EXPECT_OK(poll(&poll_fd, 1, 400));
233   EXPECT_NE(0, (poll_fd.revents & POLLIN));
234   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
235 
236   uint64_t u;
237   EXPECT_NOTCAPABLE(read(cap_wo, &u, sizeof(u)));
238   EXPECT_OK(read(cap_ro, &u, sizeof(u)));
239   EXPECT_EQ(42, (int)u);
240 
241   // Wait for the child.
242   int status;
243   EXPECT_EQ(child, waitpid(child, &status, 0));
244   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
245   EXPECT_EQ(0, rc);
246 
247   close(cap_all);
248   close(cap_rw);
249   close(cap_wo);
250   close(cap_ro);
251   close(fd);
252 }
253 
254 FORK_TEST(Linux, epoll) {
255   int sock_fds[2];
256   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds));
257   // Queue some data.
258   char buffer[4] = {1, 2, 3, 4};
259   EXPECT_OK(write(sock_fds[1], buffer, sizeof(buffer)));
260 
261   EXPECT_OK(cap_enter());  // Enter capability mode.
262 
263   int epoll_fd = epoll_create(1);
264   EXPECT_OK(epoll_fd);
265 
266   cap_rights_t r_rs;
267   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
268   cap_rights_t r_ws;
269   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
270   cap_rights_t r_rws;
271   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
272   cap_rights_t r_rwspoll;
273   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
274   cap_rights_t r_epoll;
275   cap_rights_init(&r_epoll, CAP_EPOLL_CTL);
276 
277   int cap_epoll_wo = dup(epoll_fd);
278   EXPECT_OK(cap_epoll_wo);
279   EXPECT_OK(cap_rights_limit(cap_epoll_wo, &r_ws));
280   int cap_epoll_ro = dup(epoll_fd);
281   EXPECT_OK(cap_epoll_ro);
282   EXPECT_OK(cap_rights_limit(cap_epoll_ro, &r_rs));
283   int cap_epoll_rw = dup(epoll_fd);
284   EXPECT_OK(cap_epoll_rw);
285   EXPECT_OK(cap_rights_limit(cap_epoll_rw, &r_rws));
286   int cap_epoll_poll = dup(epoll_fd);
287   EXPECT_OK(cap_epoll_poll);
288   EXPECT_OK(cap_rights_limit(cap_epoll_poll, &r_rwspoll));
289   int cap_epoll_ctl = dup(epoll_fd);
290   EXPECT_OK(cap_epoll_ctl);
291   EXPECT_OK(cap_rights_limit(cap_epoll_ctl, &r_epoll));
292 
293   // Can only modify the FDs being monitored if the CAP_EPOLL_CTL right is present.
294   struct epoll_event eev;
295   memset(&eev, 0, sizeof(eev));
296   eev.events = EPOLLIN|EPOLLOUT|EPOLLPRI;
297   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_ADD, sock_fds[0], &eev));
298   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_ADD, sock_fds[0], &eev));
299   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_ADD, sock_fds[0], &eev));
300   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_ADD, sock_fds[0], &eev));
301   eev.events = EPOLLIN|EPOLLOUT;
302   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_MOD, sock_fds[0], &eev));
303   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_MOD, sock_fds[0], &eev));
304   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_MOD, sock_fds[0], &eev));
305   EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_MOD, sock_fds[0], &eev));
306 
307   // Running epoll_pwait(2) requires CAP_EVENT.
308   eev.events = 0;
309   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_ro, &eev, 1, 100, NULL));
310   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_wo, &eev, 1, 100, NULL));
311   EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_rw, &eev, 1, 100, NULL));
312   EXPECT_OK(epoll_pwait(cap_epoll_poll, &eev, 1, 100, NULL));
313   EXPECT_EQ(EPOLLIN, eev.events & EPOLLIN);
314 
315   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_DEL, sock_fds[0], &eev));
316   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_DEL, sock_fds[0], &eev));
317   EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_DEL, sock_fds[0], &eev));
318   EXPECT_OK(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, sock_fds[0], &eev));
319 
320   close(cap_epoll_ctl);
321   close(cap_epoll_poll);
322   close(cap_epoll_rw);
323   close(cap_epoll_ro);
324   close(cap_epoll_wo);
325   close(epoll_fd);
326   close(sock_fds[1]);
327   close(sock_fds[0]);
328 }
329 
330 TEST(Linux, fstatat) {
331   int fd = open(TmpFile("cap_fstatat"), O_CREAT|O_RDWR, 0644);
332   EXPECT_OK(fd);
333   unsigned char buffer[] = {1, 2, 3, 4};
334   EXPECT_OK(write(fd, buffer, sizeof(buffer)));
335   cap_rights_t rights;
336   int cap_rf = dup(fd);
337   EXPECT_OK(cap_rf);
338   EXPECT_OK(cap_rights_limit(cap_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
339   int cap_ro = dup(fd);
340   EXPECT_OK(cap_ro);
341   EXPECT_OK(cap_rights_limit(cap_ro, cap_rights_init(&rights, CAP_READ)));
342 
343   struct stat info;
344   EXPECT_OK(fstatat(fd, "", &info, AT_EMPTY_PATH));
345   EXPECT_NOTCAPABLE(fstatat(cap_ro, "", &info, AT_EMPTY_PATH));
346   EXPECT_OK(fstatat(cap_rf, "", &info, AT_EMPTY_PATH));
347 
348   close(cap_ro);
349   close(cap_rf);
350   close(fd);
351 
352   int dir = open(tmpdir.c_str(), O_RDONLY);
353   EXPECT_OK(dir);
354   int dir_rf = dup(dir);
355   EXPECT_OK(dir_rf);
356   EXPECT_OK(cap_rights_limit(dir_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
357   int dir_ro = dup(fd);
358   EXPECT_OK(dir_ro);
359   EXPECT_OK(cap_rights_limit(dir_ro, cap_rights_init(&rights, CAP_READ)));
360 
361   EXPECT_OK(fstatat(dir, "cap_fstatat", &info, AT_EMPTY_PATH));
362   EXPECT_NOTCAPABLE(fstatat(dir_ro, "cap_fstatat", &info, AT_EMPTY_PATH));
363   EXPECT_OK(fstatat(dir_rf, "cap_fstatat", &info, AT_EMPTY_PATH));
364 
365   close(dir_ro);
366   close(dir_rf);
367   close(dir);
368 
369   unlink(TmpFile("cap_fstatat"));
370 }
371 
372 // fanotify support may not be available at compile-time
373 #ifdef __NR_fanotify_init
374 TEST(Linux, FanotifyIfRoot) {
375   GTEST_SKIP_IF_NOT_ROOT();
376   int fa_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDWR);
377   EXPECT_OK(fa_fd);
378   if (fa_fd < 0) return;  // May not be enabled
379 
380   cap_rights_t r_rs;
381   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
382   cap_rights_t r_ws;
383   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
384   cap_rights_t r_rws;
385   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
386   cap_rights_t r_rwspoll;
387   cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
388   cap_rights_t r_rwsnotify;
389   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
390   cap_rights_t r_rsl;
391   cap_rights_init(&r_rsl, CAP_READ, CAP_SEEK, CAP_LOOKUP);
392   cap_rights_t r_rslstat;
393   cap_rights_init(&r_rslstat, CAP_READ, CAP_SEEK, CAP_LOOKUP, CAP_FSTAT);
394   cap_rights_t r_rsstat;
395   cap_rights_init(&r_rsstat, CAP_READ, CAP_SEEK, CAP_FSTAT);
396 
397   int cap_fd_ro = dup(fa_fd);
398   EXPECT_OK(cap_fd_ro);
399   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
400   int cap_fd_wo = dup(fa_fd);
401   EXPECT_OK(cap_fd_wo);
402   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
403   int cap_fd_rw = dup(fa_fd);
404   EXPECT_OK(cap_fd_rw);
405   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
406   int cap_fd_poll = dup(fa_fd);
407   EXPECT_OK(cap_fd_poll);
408   EXPECT_OK(cap_rights_limit(cap_fd_poll, &r_rwspoll));
409   int cap_fd_not = dup(fa_fd);
410   EXPECT_OK(cap_fd_not);
411   EXPECT_OK(cap_rights_limit(cap_fd_not, &r_rwsnotify));
412 
413   int rc = mkdir(TmpFile("cap_notify"), 0755);
414   EXPECT_TRUE(rc == 0 || errno == EEXIST);
415   int dfd = open(TmpFile("cap_notify"), O_RDONLY);
416   EXPECT_OK(dfd);
417   int fd = open(TmpFile("cap_notify/file"), O_CREAT|O_RDWR, 0644);
418   close(fd);
419   int cap_dfd = dup(dfd);
420   EXPECT_OK(cap_dfd);
421   EXPECT_OK(cap_rights_limit(cap_dfd, &r_rslstat));
422   EXPECT_OK(cap_dfd);
423   int cap_dfd_rs = dup(dfd);
424   EXPECT_OK(cap_dfd_rs);
425   EXPECT_OK(cap_rights_limit(cap_dfd_rs, &r_rs));
426   EXPECT_OK(cap_dfd_rs);
427   int cap_dfd_rsstat = dup(dfd);
428   EXPECT_OK(cap_dfd_rsstat);
429   EXPECT_OK(cap_rights_limit(cap_dfd_rsstat, &r_rsstat));
430   EXPECT_OK(cap_dfd_rsstat);
431   int cap_dfd_rsl = dup(dfd);
432   EXPECT_OK(cap_dfd_rsl);
433   EXPECT_OK(cap_rights_limit(cap_dfd_rsl, &r_rsl));
434   EXPECT_OK(cap_dfd_rsl);
435 
436   // Need CAP_NOTIFY to change what's monitored.
437   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_ro, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
438   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_wo, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
439   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_rw, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
440   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
441 
442   // Need CAP_FSTAT on the thing monitored.
443   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rs, NULL));
444   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rsstat, NULL));
445 
446   // Too add monitoring of a file under a dfd, need CAP_LOOKUP|CAP_FSTAT on the dfd.
447   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsstat, "file"));
448   EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsl, "file"));
449   EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd, "file"));
450 
451   pid_t child = fork();
452   if (child == 0) {
453     // Child: Perform activity in the directory under notify.
454     sleep(1);
455     unlink(TmpFile("cap_notify/temp"));
456     int fd = open(TmpFile("cap_notify/temp"), O_CREAT|O_RDWR, 0644);
457     close(fd);
458     exit(0);
459   }
460 
461   // Need CAP_EVENT to poll.
462   struct pollfd poll_fd;
463   poll_fd.revents = 0;
464   poll_fd.events = POLLIN;
465   poll_fd.fd = cap_fd_rw;
466   EXPECT_OK(poll(&poll_fd, 1, 1400));
467   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
468   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
469 
470   poll_fd.fd = cap_fd_not;
471   EXPECT_OK(poll(&poll_fd, 1, 1400));
472   EXPECT_EQ(0, (poll_fd.revents & POLLIN));
473   EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
474 
475   poll_fd.fd = cap_fd_poll;
476   EXPECT_OK(poll(&poll_fd, 1, 1400));
477   EXPECT_NE(0, (poll_fd.revents & POLLIN));
478   EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
479 
480   // Need CAP_READ to read.
481   struct fanotify_event_metadata ev;
482   memset(&ev, 0, sizeof(ev));
483   EXPECT_NOTCAPABLE(read(cap_fd_wo, &ev, sizeof(ev)));
484   rc = read(fa_fd, &ev, sizeof(ev));
485   EXPECT_OK(rc);
486   EXPECT_EQ((int)sizeof(struct fanotify_event_metadata), rc);
487   EXPECT_EQ(child, ev.pid);
488   EXPECT_NE(0, ev.fd);
489 
490   // TODO(drysdale): reinstate if/when capsicum-linux propagates rights
491   // to fanotify-generated FDs.
492 #ifdef OMIT
493   // fanotify(7) gives us a FD for the changed file.  This should
494   // only have rights that are a subset of those for the original
495   // monitored directory file descriptor.
496   cap_rights_t rights;
497   CAP_SET_ALL(&rights);
498   EXPECT_OK(cap_rights_get(ev.fd, &rights));
499   EXPECT_RIGHTS_IN(&rights, &r_rslstat);
500 #endif
501 
502   // Wait for the child.
503   int status;
504   EXPECT_EQ(child, waitpid(child, &status, 0));
505   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
506   EXPECT_EQ(0, rc);
507 
508   close(cap_dfd_rsstat);
509   close(cap_dfd_rsl);
510   close(cap_dfd_rs);
511   close(cap_dfd);
512   close(dfd);
513   unlink(TmpFile("cap_notify/file"));
514   unlink(TmpFile("cap_notify/temp"));
515   rmdir(TmpFile("cap_notify"));
516   close(cap_fd_not);
517   close(cap_fd_poll);
518   close(cap_fd_rw);
519   close(cap_fd_wo);
520   close(cap_fd_ro);
521   close(fa_fd);
522 }
523 #endif
524 
525 TEST(Linux, inotify) {
526   int i_fd = inotify_init();
527   EXPECT_OK(i_fd);
528 
529   cap_rights_t r_rs;
530   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
531   cap_rights_t r_ws;
532   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
533   cap_rights_t r_rws;
534   cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
535   cap_rights_t r_rwsnotify;
536   cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
537 
538   int cap_fd_ro = dup(i_fd);
539   EXPECT_OK(cap_fd_ro);
540   EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
541   int cap_fd_wo = dup(i_fd);
542   EXPECT_OK(cap_fd_wo);
543   EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
544   int cap_fd_rw = dup(i_fd);
545   EXPECT_OK(cap_fd_rw);
546   EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
547   int cap_fd_all = dup(i_fd);
548   EXPECT_OK(cap_fd_all);
549   EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwsnotify));
550 
551   int fd = open(TmpFile("cap_inotify"), O_CREAT|O_RDWR, 0644);
552   EXPECT_NOTCAPABLE(inotify_add_watch(cap_fd_rw, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY));
553   int wd = inotify_add_watch(i_fd, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY);
554   EXPECT_OK(wd);
555 
556   unsigned char buffer[] = {1, 2, 3, 4};
557   EXPECT_OK(write(fd, buffer, sizeof(buffer)));
558 
559   struct inotify_event iev;
560   memset(&iev, 0, sizeof(iev));
561   EXPECT_NOTCAPABLE(read(cap_fd_wo, &iev, sizeof(iev)));
562   int rc = read(cap_fd_ro, &iev, sizeof(iev));
563   EXPECT_OK(rc);
564   EXPECT_EQ((int)sizeof(iev), rc);
565   EXPECT_EQ(wd, iev.wd);
566 
567   EXPECT_NOTCAPABLE(inotify_rm_watch(cap_fd_wo, wd));
568   EXPECT_OK(inotify_rm_watch(cap_fd_all, wd));
569 
570   close(fd);
571   close(cap_fd_all);
572   close(cap_fd_rw);
573   close(cap_fd_wo);
574   close(cap_fd_ro);
575   close(i_fd);
576   unlink(TmpFile("cap_inotify"));
577 }
578 
579 TEST(Linux, ArchChangeIfAvailable) {
580   const char* prog_candidates[] = {"./mini-me.32", "./mini-me.x32", "./mini-me.64"};
581   const char* progs[] = {NULL, NULL, NULL};
582   char* argv_pass[] = {(char*)"to-come", (char*)"--capmode", NULL};
583   char* null_envp[] = {NULL};
584   int fds[3];
585   int count = 0;
586 
587   for (int ii = 0; ii < 3; ii++) {
588     fds[count] = open(prog_candidates[ii], O_RDONLY);
589     if (fds[count] >= 0) {
590       progs[count] = prog_candidates[ii];
591       count++;
592     }
593   }
594   if (count == 0) {
595     GTEST_SKIP() << "no different-architecture programs available";
596   }
597 
598   for (int ii = 0; ii < count; ii++) {
599     // Fork-and-exec a binary of this architecture.
600     pid_t child = fork();
601     if (child == 0) {
602       EXPECT_OK(cap_enter());  // Enter capability mode
603       if (verbose) fprintf(stderr, "[%d] call fexecve(%s, %s)\n",
604                            getpid_(), progs[ii], argv_pass[1]);
605       argv_pass[0] = (char *)progs[ii];
606       int rc = fexecve_(fds[ii], argv_pass, null_envp);
607       fprintf(stderr, "fexecve(%s) returned %d errno %d\n", progs[ii], rc, errno);
608       exit(99);  // Should not reach here.
609     }
610     int status;
611     EXPECT_EQ(child, waitpid(child, &status, 0));
612     int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
613     EXPECT_EQ(0, rc);
614     close(fds[ii]);
615   }
616 }
617 
618 FORK_TEST(Linux, NamespaceIfRoot) {
619   GTEST_SKIP_IF_NOT_ROOT();
620   pid_t me = getpid_();
621 
622   // Create a new UTS namespace.
623   EXPECT_OK(unshare(CLONE_NEWUTS));
624   // Open an FD to its symlink.
625   char buffer[256];
626   sprintf(buffer, "/proc/%d/ns/uts", me);
627   int ns_fd = open(buffer, O_RDONLY);
628 
629   cap_rights_t r_rwlstat;
630   cap_rights_init(&r_rwlstat, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT);
631   cap_rights_t r_rwlstatns;
632   cap_rights_init(&r_rwlstatns, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT, CAP_SETNS);
633 
634   int cap_fd = dup(ns_fd);
635   EXPECT_OK(cap_fd);
636   EXPECT_OK(cap_rights_limit(cap_fd, &r_rwlstat));
637   int cap_fd_setns = dup(ns_fd);
638   EXPECT_OK(cap_fd_setns);
639   EXPECT_OK(cap_rights_limit(cap_fd_setns, &r_rwlstatns));
640   EXPECT_NOTCAPABLE(setns(cap_fd, CLONE_NEWUTS));
641   EXPECT_OK(setns(cap_fd_setns, CLONE_NEWUTS));
642 
643   EXPECT_OK(cap_enter());  // Enter capability mode.
644 
645   // No setns(2) but unshare(2) is allowed.
646   EXPECT_CAPMODE(setns(ns_fd, CLONE_NEWUTS));
647   EXPECT_OK(unshare(CLONE_NEWUTS));
648 }
649 
650 static void SendFD(int fd, int over) {
651   struct msghdr mh;
652   mh.msg_name = NULL;  // No address needed
653   mh.msg_namelen = 0;
654   char buffer1[1024];
655   struct iovec iov[1];
656   iov[0].iov_base = buffer1;
657   iov[0].iov_len = sizeof(buffer1);
658   mh.msg_iov = iov;
659   mh.msg_iovlen = 1;
660   char buffer2[1024];
661   mh.msg_control = buffer2;
662   mh.msg_controllen = CMSG_LEN(sizeof(int));
663   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
664   cmptr->cmsg_level = SOL_SOCKET;
665   cmptr->cmsg_type = SCM_RIGHTS;
666   cmptr->cmsg_len = CMSG_LEN(sizeof(int));
667   *(int *)CMSG_DATA(cmptr) = fd;
668   buffer1[0] = 0;
669   iov[0].iov_len = 1;
670   int rc = sendmsg(over, &mh, 0);
671   EXPECT_OK(rc);
672 }
673 
674 static int ReceiveFD(int over) {
675   struct msghdr mh;
676   mh.msg_name = NULL;  // No address needed
677   mh.msg_namelen = 0;
678   char buffer1[1024];
679   struct iovec iov[1];
680   iov[0].iov_base = buffer1;
681   iov[0].iov_len = sizeof(buffer1);
682   mh.msg_iov = iov;
683   mh.msg_iovlen = 1;
684   char buffer2[1024];
685   mh.msg_control = buffer2;
686   mh.msg_controllen = sizeof(buffer2);
687   int rc = recvmsg(over, &mh, 0);
688   EXPECT_OK(rc);
689   EXPECT_LE(CMSG_LEN(sizeof(int)), mh.msg_controllen);
690   struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
691   int fd = *(int*)CMSG_DATA(cmptr);
692   EXPECT_EQ(CMSG_LEN(sizeof(int)), cmptr->cmsg_len);
693   cmptr = CMSG_NXTHDR(&mh, cmptr);
694   EXPECT_TRUE(cmptr == NULL);
695   return fd;
696 }
697 
// State shared between the namespace tests and the functions they clone():
// a process descriptor filled in by pdfork(), and a socketpair() used to
// pass descriptors and data between the two sides.
static int shared_pd = -1;
static int shared_sock_fds[2] = {0, 0};
700 
701 static int ChildFunc(void *arg) {
702   // This function is running in a new PID namespace, and so is pid 1.
703   if (verbose) fprintf(stderr, "    ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
704   EXPECT_EQ(1, getpid_());
705   EXPECT_EQ(0, getppid());
706 
707   // The shared process descriptor is outside our namespace, so we cannot
708   // get its pid.
709   if (verbose) fprintf(stderr, "    ChildFunc: shared_pd=%d\n", shared_pd);
710   pid_t shared_child = -1;
711   EXPECT_OK(pdgetpid(shared_pd, &shared_child));
712   if (verbose) fprintf(stderr, "    ChildFunc: corresponding pid=%d\n", shared_child);
713   EXPECT_EQ(0, shared_child);
714 
715   // But we can pdkill() it even so.
716   if (verbose) fprintf(stderr, "    ChildFunc: call pdkill(pd=%d)\n", shared_pd);
717   EXPECT_OK(pdkill(shared_pd, SIGINT));
718 
719   int pd;
720   pid_t child = pdfork(&pd, 0);
721   EXPECT_OK(child);
722   if (child == 0) {
723     // Child: expect pid 2.
724     if (verbose) fprintf(stderr, "      child of ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
725     EXPECT_EQ(2, getpid_());
726     EXPECT_EQ(1, getppid());
727     while (true) {
728       if (verbose) fprintf(stderr, "      child of ChildFunc: \"I aten't dead\"\n");
729       sleep(1);
730     }
731     exit(0);
732   }
733   EXPECT_EQ(2, child);
734   EXPECT_PID_ALIVE(child);
735   if (verbose) fprintf(stderr, "    ChildFunc: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
736                        pd, child, ProcessState(child));
737 
738   pid_t pid;
739   EXPECT_OK(pdgetpid(pd, &pid));
740   EXPECT_EQ(child, pid);
741 
742   sleep(2);
743 
744   // Send the process descriptor over UNIX domain socket back to parent.
745   SendFD(pd, shared_sock_fds[1]);
746 
747   // Wait for death of (grand)child, killed by our parent.
748   if (verbose) fprintf(stderr, "    ChildFunc: wait on pid=%d\n", child);
749   int status;
750   EXPECT_EQ(child, wait4(child, &status, __WALL, NULL));
751 
752   if (verbose) fprintf(stderr, "    ChildFunc: return 0\n");
753   return 0;
754 }
755 
// Stack handed to clone()d children; callers pass child_stack + STACK_SIZE,
// i.e. the address one past the end of the buffer.
#define STACK_SIZE (1 << 20)  // 1 MiB
static char child_stack[STACK_SIZE];
758 
759 // TODO(drysdale): fork into a user namespace first so GTEST_SKIP_IF_NOT_ROOT can be removed.
760 TEST(Linux, PidNamespacePdForkIfRoot) {
761   GTEST_SKIP_IF_NOT_ROOT();
762   // Pass process descriptors in both directions across a PID namespace boundary.
763   // pdfork() off a child before we start, holding its process descriptor in a global
764   // variable that's accessible to children.
765   pid_t firstborn = pdfork(&shared_pd, 0);
766   EXPECT_OK(firstborn);
767   if (firstborn == 0) {
768     while (true) {
769       if (verbose) fprintf(stderr, "  Firstborn: \"I aten't dead\"\n");
770       sleep(1);
771     }
772     exit(0);
773   }
774   EXPECT_PID_ALIVE(firstborn);
775   if (verbose) fprintf(stderr, "Parent: pre-pdfork()ed pd=%d, pid=%d state='%c'\n",
776                        shared_pd, firstborn, ProcessState(firstborn));
777   sleep(2);
778 
779   // Prepare sockets to communicate with child process.
780   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
781 
782   // Clone into a child process with a new pid namespace.
783   pid_t child = clone(ChildFunc, child_stack + STACK_SIZE,
784                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
785   EXPECT_OK(child);
786   EXPECT_PID_ALIVE(child);
787   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
788 
789   // Ensure the child runs.  First thing it does is to kill our firstborn, using shared_pd.
790   sleep(1);
791   EXPECT_PID_DEAD(firstborn);
792 
793   // But we can still retrieve firstborn's PID, as it's not been reaped yet.
794   pid_t child0;
795   EXPECT_OK(pdgetpid(shared_pd, &child0));
796   EXPECT_EQ(firstborn, child0);
797   if (verbose) fprintf(stderr, "Parent: check on firstborn: pdgetpid(pd=%d) -> child=%d state='%c'\n",
798                        shared_pd, child0, ProcessState(child0));
799 
800   // Now reap it.
801   int status;
802   EXPECT_EQ(firstborn, waitpid(firstborn, &status, __WALL));
803 
804   // Get the process descriptor of the child-of-child via socket transfer.
805   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
806 
807   // Our notion of the pid associated with the grandchild is in the main PID namespace.
808   pid_t grandchild;
809   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
810   EXPECT_NE(2, grandchild);
811   if (verbose) fprintf(stderr, "Parent: pre-pdkill:  pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
812                        grandchild_pd, grandchild, ProcessState(grandchild));
813   EXPECT_PID_ALIVE(grandchild);
814 
815   // Kill the grandchild via the process descriptor.
816   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
817   usleep(10000);
818   if (verbose) fprintf(stderr, "Parent: post-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
819                        grandchild_pd, grandchild, ProcessState(grandchild));
820   EXPECT_PID_DEAD(grandchild);
821 
822   sleep(2);
823 
824   // Wait for the child.
825   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
826   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
827   EXPECT_EQ(0, rc);
828 
829   close(shared_sock_fds[0]);
830   close(shared_sock_fds[1]);
831   close(shared_pd);
832   close(grandchild_pd);
833 }
834 
835 int NSInit(void *data) {
836   // This function is running in a new PID namespace, and so is pid 1.
837   if (verbose) fprintf(stderr, "  NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
838   EXPECT_EQ(1, getpid_());
839   EXPECT_EQ(0, getppid());
840 
841   int pd;
842   pid_t child = pdfork(&pd, 0);
843   EXPECT_OK(child);
844   if (child == 0) {
845     // Child: loop forever until terminated.
846     if (verbose) fprintf(stderr, "    child of NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
847     while (true) {
848       if (verbose) fprintf(stderr, "    child of NSInit: \"I aten't dead\"\n");
849       usleep(100000);
850     }
851     exit(0);
852   }
853   EXPECT_EQ(2, child);
854   EXPECT_PID_ALIVE(child);
855   if (verbose) fprintf(stderr, "  NSInit: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
856                        pd, child, ProcessState(child));
857   sleep(1);
858 
859   // Send the process descriptor over UNIX domain socket back to parent.
860   SendFD(pd, shared_sock_fds[1]);
861   close(pd);
862 
863   // Wait for a byte back in the other direction.
864   int value;
865   if (verbose) fprintf(stderr, "  NSInit: block waiting for value\n");
866   read(shared_sock_fds[1], &value, sizeof(value));
867 
868   if (verbose) fprintf(stderr, "  NSInit: return 0\n");
869   return 0;
870 }
871 
872 TEST(Linux, DeadNSInitIfRoot) {
873   GTEST_SKIP_IF_NOT_ROOT();
874 
875   // Prepare sockets to communicate with child process.
876   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
877 
878   // Clone into a child process with a new pid namespace.
879   pid_t child = clone(NSInit, child_stack + STACK_SIZE,
880                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
881   usleep(10000);
882   EXPECT_OK(child);
883   EXPECT_PID_ALIVE(child);
884   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
885 
886   // Get the process descriptor of the child-of-child via socket transfer.
887   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
888   pid_t grandchild;
889   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
890   if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));
891 
892   // Send an int to the child to trigger its termination.  Grandchild should also
893   // go, as its init process is gone.
894   int zero = 0;
895   if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
896   write(shared_sock_fds[0], &zero, sizeof(zero));
897   EXPECT_PID_ZOMBIE(child);
898   EXPECT_PID_GONE(grandchild);
899 
900   // Wait for the child.
901   int status;
902   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
903   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
904   EXPECT_EQ(0, rc);
905   EXPECT_PID_GONE(child);
906 
907   close(shared_sock_fds[0]);
908   close(shared_sock_fds[1]);
909   close(grandchild_pd);
910 
911   if (verbose) {
912     fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
913     fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
914   }
915 }
916 
917 TEST(Linux, DeadNSInit2IfRoot) {
918   GTEST_SKIP_IF_NOT_ROOT();
919 
920   // Prepare sockets to communicate with child process.
921   EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));
922 
923   // Clone into a child process with a new pid namespace.
924   pid_t child = clone(NSInit, child_stack + STACK_SIZE,
925                       CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
926   usleep(10000);
927   EXPECT_OK(child);
928   EXPECT_PID_ALIVE(child);
929   if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));
930 
931   // Get the process descriptor of the child-of-child via socket transfer.
932   int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
933   pid_t grandchild;
934   EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
935   if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));
936 
937   // Kill the grandchild
938   EXPECT_OK(pdkill(grandchild_pd, SIGINT));
939   usleep(10000);
940   EXPECT_PID_ZOMBIE(grandchild);
941   // Close the process descriptor, so there are now no procdesc references to grandchild.
942   close(grandchild_pd);
943 
944   // Send an int to the child to trigger its termination.  Grandchild should also
945   // go, as its init process is gone.
946   int zero = 0;
947   if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
948   write(shared_sock_fds[0], &zero, sizeof(zero));
949   EXPECT_PID_ZOMBIE(child);
950   EXPECT_PID_GONE(grandchild);
951 
952   // Wait for the child.
953   int status;
954   EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
955   int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
956   EXPECT_EQ(0, rc);
957 
958   close(shared_sock_fds[0]);
959   close(shared_sock_fds[1]);
960 
961   if (verbose) {
962     fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
963     fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
964   }
965 }
966 
967 #ifdef __x86_64__
968 FORK_TEST(Linux, CheckHighWord) {
969   EXPECT_OK(cap_enter());  // Enter capability mode.
970 
971   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
972   EXPECT_OK(rc);
973   EXPECT_EQ(1, rc);  // no_new_privs = 1
974 
975   // Set some of the high 32-bits of argument zero.
976   uint64_t big_cmd = PR_GET_NO_NEW_PRIVS | 0x100000000LL;
977   EXPECT_CAPMODE(syscall(__NR_prctl, big_cmd, 0, 0, 0, 0));
978 }
979 #endif
980 
981 FORK_TEST(Linux, PrctlOpenatBeneath) {
982   // Set no_new_privs = 1
983   EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
984   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
985   EXPECT_OK(rc);
986   EXPECT_EQ(1, rc);  // no_new_privs = 1
987 
988   // Set openat-beneath mode
989   EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 1, 0, 0, 0));
990   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
991   EXPECT_OK(rc);
992   EXPECT_EQ(1, rc);  // openat_beneath = 1
993 
994   // Clear openat-beneath mode
995   EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
996   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
997   EXPECT_OK(rc);
998   EXPECT_EQ(0, rc);  // openat_beneath = 0
999 
1000   EXPECT_OK(cap_enter());  // Enter capability mode
1001 
1002   // Expect to be in openat_beneath mode
1003   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1004   EXPECT_OK(rc);
1005   EXPECT_EQ(1, rc);  // openat_beneath = 1
1006 
1007   // Expect this to be immutable.
1008   EXPECT_CAPMODE(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
1009   rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1010   EXPECT_OK(rc);
1011   EXPECT_EQ(1, rc);  // openat_beneath = 1
1012 
1013 }
1014 
1015 FORK_TEST(Linux, NoNewPrivs) {
1016   if (getuid() == 0) {
1017     // If root, drop CAP_SYS_ADMIN POSIX.1e capability.
1018     struct __user_cap_header_struct hdr;
1019     hdr.version = _LINUX_CAPABILITY_VERSION_3;
1020     hdr.pid = getpid_();
1021     struct __user_cap_data_struct data[3];
1022     EXPECT_OK(capget(&hdr, &data[0]));
1023     data[0].effective &= ~(1 << CAP_SYS_ADMIN);
1024     data[0].permitted &= ~(1 << CAP_SYS_ADMIN);
1025     data[0].inheritable &= ~(1 << CAP_SYS_ADMIN);
1026     EXPECT_OK(capset(&hdr, &data[0]));
1027   }
1028   int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1029   EXPECT_OK(rc);
1030   EXPECT_EQ(0, rc);  // no_new_privs == 0
1031 
1032   // Can't enter seccomp-bpf mode with no_new_privs == 0
1033   struct sock_filter filter[] = {
1034     BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
1035   };
1036   struct sock_fprog bpf;
1037   bpf.len = (sizeof(filter) / sizeof(filter[0]));
1038   bpf.filter = filter;
1039   rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0);
1040   EXPECT_EQ(-1, rc);
1041   EXPECT_EQ(EACCES, errno);
1042 
1043   // Set no_new_privs = 1
1044   EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1045   rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1046   EXPECT_OK(rc);
1047   EXPECT_EQ(1, rc);  // no_new_privs = 1
1048 
1049   // Can now turn on seccomp mode
1050   EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1051 }
1052 
/*
 * Macros for BPF generation.
 *
 * Each expands to one or more struct sock_filter initializers.  The
 * *_SYSCALL(name) macros expand to TWO instructions: a comparison of the
 * accumulator against __NR_<name> that falls through to the next instruction
 * on a match and skips it otherwise, followed by the action to take.  They
 * rely on the syscall number having been loaded first via EXAMINE_SYSCALL.
 */
/* Return -1 with errno set to the low 16 bits of |err|. */
#define BPF_RETURN_ERRNO(err) \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | ((err) & 0xFFFF))
#define BPF_KILL_PROCESS \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
#define BPF_ALLOW \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/* Load the syscall number from struct seccomp_data into the accumulator. */
#define EXAMINE_SYSCALL \
  BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr))
#define ALLOW_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_ALLOW
#define KILL_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_KILL_PROCESS
#define FAIL_SYSCALL(name, err) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_RETURN_ERRNO(err)
1071 
1072 TEST(Linux, CapModeWithBPF) {
1073   pid_t child = fork();
1074   EXPECT_OK(child);
1075   if (child == 0) {
1076     int fd = open(TmpFile("cap_bpf_capmode"), O_CREAT|O_RDWR, 0644);
1077     cap_rights_t rights;
1078     cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1079     EXPECT_OK(cap_rights_limit(fd, &rights));
1080 
1081     struct sock_filter filter[] = { EXAMINE_SYSCALL,
1082                                     FAIL_SYSCALL(fchmod, ENOMEM),
1083                                     FAIL_SYSCALL(fstat, ENOEXEC),
1084                                     ALLOW_SYSCALL(close),
1085                                     KILL_SYSCALL(fsync),
1086                                     BPF_ALLOW };
1087     struct sock_fprog bpf = {.len = (sizeof(filter) / sizeof(filter[0])),
1088                              .filter = filter};
1089     // Set up seccomp-bpf first.
1090     EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1091     EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1092 
1093     EXPECT_OK(cap_enter());  // Enter capability mode.
1094 
1095     // fchmod is allowed by Capsicum, but failed by BPF.
1096     EXPECT_SYSCALL_FAIL(ENOMEM, fchmod(fd, 0644));
1097     // open is allowed by BPF, but failed by Capsicum
1098     EXPECT_SYSCALL_FAIL(ECAPMODE, open(TmpFile("cap_bpf_capmode"), O_RDONLY));
1099     // fstat is failed by both BPF and Capsicum; tie-break is on errno
1100     struct stat buf;
1101     EXPECT_SYSCALL_FAIL(ENOEXEC, fstat(fd, &buf));
1102     // fsync is allowed by Capsicum, but BPF's SIGSYS generation take precedence
1103     fsync(fd);  // terminate with unhandled SIGSYS
1104     exit(0);
1105   }
1106   int status;
1107   EXPECT_EQ(child, waitpid(child, &status, 0));
1108   EXPECT_TRUE(WIFSIGNALED(status));
1109   EXPECT_EQ(SIGSYS, WTERMSIG(status));
1110   unlink(TmpFile("cap_bpf_capmode"));
1111 }
1112 
1113 TEST(Linux, AIO) {
1114   int fd = open(TmpFile("cap_aio"), O_CREAT|O_RDWR, 0644);
1115   EXPECT_OK(fd);
1116 
1117   cap_rights_t r_rs;
1118   cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
1119   cap_rights_t r_ws;
1120   cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
1121   cap_rights_t r_rwssync;
1122   cap_rights_init(&r_rwssync, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1123 
1124   int cap_ro = dup(fd);
1125   EXPECT_OK(cap_ro);
1126   EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
1127   EXPECT_OK(cap_ro);
1128   int cap_wo = dup(fd);
1129   EXPECT_OK(cap_wo);
1130   EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
1131   EXPECT_OK(cap_wo);
1132   int cap_all = dup(fd);
1133   EXPECT_OK(cap_all);
1134   EXPECT_OK(cap_rights_limit(cap_all, &r_rwssync));
1135   EXPECT_OK(cap_all);
1136 
1137   // Linux: io_setup, io_submit, io_getevents, io_cancel, io_destroy
1138   aio_context_t ctx = 0;
1139   EXPECT_OK(syscall(__NR_io_setup, 10, &ctx));
1140 
1141   unsigned char buffer[32] = {1, 2, 3, 4};
1142   struct iocb req;
1143   memset(&req, 0, sizeof(req));
1144   req.aio_reqprio = 0;
1145   req.aio_fildes = fd;
1146   uintptr_t bufaddr = (uintptr_t)buffer;
1147   req.aio_buf = (__u64)bufaddr;
1148   req.aio_nbytes = 4;
1149   req.aio_offset = 0;
1150   struct iocb* reqs[1] = {&req};
1151 
1152   // Write operation
1153   req.aio_lio_opcode = IOCB_CMD_PWRITE;
1154   req.aio_fildes = cap_ro;
1155   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1156   req.aio_fildes = cap_wo;
1157   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1158 
1159   // Sync operation
1160   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1161   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1162   req.aio_lio_opcode = IOCB_CMD_FDSYNC;
1163   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1164   // Even with CAP_FSYNC, turns out fsync/fdsync aren't implemented
1165   req.aio_fildes = cap_all;
1166   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1167   req.aio_lio_opcode = IOCB_CMD_FSYNC;
1168   EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1169 
1170   // Read operation
1171   req.aio_lio_opcode = IOCB_CMD_PREAD;
1172   req.aio_fildes = cap_wo;
1173   EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1174   req.aio_fildes = cap_ro;
1175   EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1176 
1177   EXPECT_OK(syscall(__NR_io_destroy, ctx));
1178 
1179   close(cap_all);
1180   close(cap_wo);
1181   close(cap_ro);
1182   close(fd);
1183   unlink(TmpFile("cap_aio"));
1184 }
1185 
1186 #ifndef KCMP_FILE
1187 #define KCMP_FILE 0
1188 #endif
1189 TEST(Linux, KcmpIfAvailable) {
1190   // This requires CONFIG_CHECKPOINT_RESTORE in kernel config.
1191   int fd = open("/etc/passwd", O_RDONLY);
1192   EXPECT_OK(fd);
1193   pid_t parent = getpid_();
1194 
1195   errno = 0;
1196   int rc = syscall(__NR_kcmp, parent, parent, KCMP_FILE, fd, fd);
1197   if (rc == -1 && errno == ENOSYS) {
1198     GTEST_SKIP() << "kcmp(2) gives -ENOSYS";
1199   }
1200 
1201   pid_t child = fork();
1202   if (child == 0) {
1203     // Child: limit rights on FD.
1204     child = getpid_();
1205     EXPECT_OK(syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1206     cap_rights_t rights;
1207     cap_rights_init(&rights, CAP_READ, CAP_WRITE);
1208     EXPECT_OK(cap_rights_limit(fd, &rights));
1209     // A capability wrapping a normal FD is different (from a kcmp(2) perspective)
1210     // than the original file.
1211     EXPECT_NE(0, syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1212     exit(HasFailure());
1213   }
1214   // Wait for the child.
1215   int status;
1216   EXPECT_EQ(child, waitpid(child, &status, 0));
1217   rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
1218   EXPECT_EQ(0, rc);
1219 
1220   close(fd);
1221 }
1222 
1223 TEST(Linux, ProcFS) {
1224   cap_rights_t rights;
1225   cap_rights_init(&rights, CAP_READ, CAP_SEEK);
1226   int fd = open("/etc/passwd", O_RDONLY);
1227   EXPECT_OK(fd);
1228   lseek(fd, 4, SEEK_SET);
1229   int cap = dup(fd);
1230   EXPECT_OK(cap);
1231   EXPECT_OK(cap_rights_limit(cap, &rights));
1232   pid_t me = getpid_();
1233 
1234   char buffer[1024];
1235   sprintf(buffer, "/proc/%d/fdinfo/%d", me, cap);
1236   int procfd = open(buffer, O_RDONLY);
1237   EXPECT_OK(procfd) << " failed to open " << buffer;
1238   if (procfd < 0) return;
1239   int proccap = dup(procfd);
1240   EXPECT_OK(proccap);
1241   EXPECT_OK(cap_rights_limit(proccap, &rights));
1242 
1243   EXPECT_OK(read(proccap, buffer, sizeof(buffer)));
1244   // The fdinfo should include the file pos of the underlying file
1245   EXPECT_NE((char*)NULL, strstr(buffer, "pos:\t4"));
1246   // ...and the rights of the Capsicum capability.
1247   EXPECT_NE((char*)NULL, strstr(buffer, "rights:\t0x"));
1248 
1249   close(procfd);
1250   close(proccap);
1251   close(cap);
1252   close(fd);
1253 }
1254 
1255 FORK_TEST(Linux, ProcessClocks) {
1256   pid_t self = getpid_();
1257   pid_t child = fork();
1258   EXPECT_OK(child);
1259   if (child == 0) {
1260     child = getpid_();
1261     usleep(100000);
1262     exit(0);
1263   }
1264 
1265   EXPECT_OK(cap_enter());  // Enter capability mode.
1266 
1267   // Nefariously build a clock ID for the child's CPU time.
1268   // This relies on knowledge of the internal layout of clock IDs.
1269   clockid_t child_clock;
1270   child_clock = ((~child) << 3) | 0x0;
1271   struct timespec ts;
1272   memset(&ts, 0, sizeof(ts));
1273 
1274   // TODO(drysdale): Should not be possible to retrieve info about a
1275   // different process, as the PID global namespace should be locked
1276   // down.
1277   EXPECT_OK(clock_gettime(child_clock, &ts));
1278   if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(child=%d->0x%08x) is %ld.%09ld \n",
1279                        self, child, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1280 
1281   child_clock = ((~1) << 3) | 0x0;
1282   memset(&ts, 0, sizeof(ts));
1283   EXPECT_OK(clock_gettime(child_clock, &ts));
1284   if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(init=1->0x%08x) is %ld.%09ld \n",
1285                        self, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1286 
1287   // Orphan the child.
1288 }
1289 
1290 TEST(Linux, SetLease) {
1291   int fd_all = open(TmpFile("cap_lease"), O_CREAT|O_RDWR, 0644);
1292   EXPECT_OK(fd_all);
1293   int fd_rw = dup(fd_all);
1294   EXPECT_OK(fd_rw);
1295 
1296   cap_rights_t r_all;
1297   cap_rights_init(&r_all, CAP_READ, CAP_WRITE, CAP_FLOCK, CAP_FSIGNAL);
1298   EXPECT_OK(cap_rights_limit(fd_all, &r_all));
1299 
1300   cap_rights_t r_rw;
1301   cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
1302   EXPECT_OK(cap_rights_limit(fd_rw, &r_rw));
1303 
1304   EXPECT_NOTCAPABLE(fcntl(fd_rw, F_SETLEASE, F_WRLCK));
1305   EXPECT_NOTCAPABLE(fcntl(fd_rw, F_GETLEASE));
1306 
1307   if (!tmpdir_on_tmpfs) {  // tmpfs doesn't support leases
1308     EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_WRLCK));
1309     EXPECT_EQ(F_WRLCK, fcntl(fd_all, F_GETLEASE));
1310 
1311     EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_UNLCK, 0));
1312     EXPECT_EQ(F_UNLCK, fcntl(fd_all, F_GETLEASE));
1313   }
1314   close(fd_all);
1315   close(fd_rw);
1316   unlink(TmpFile("cap_lease"));
1317 }
1318 
1319 TEST(Linux, InvalidRightsSyscall) {
1320   int fd = open(TmpFile("cap_invalid_rights"), O_RDONLY|O_CREAT, 0644);
1321   EXPECT_OK(fd);
1322 
1323   cap_rights_t rights;
1324   cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FCHMOD, CAP_FSTAT);
1325 
1326   // Use the raw syscall throughout.
1327   EXPECT_EQ(0, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1328 
1329   // Directly access the syscall, and find all unseemly manner of use for it.
1330   //  - Invalid flags
1331   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 1));
1332   EXPECT_EQ(EINVAL, errno);
1333   //  - Specify an fcntl subright, but no CAP_FCNTL set
1334   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, CAP_FCNTL_GETFL, 0, NULL, 0));
1335   EXPECT_EQ(EINVAL, errno);
1336   //  - Specify an ioctl subright, but no CAP_IOCTL set
1337   unsigned int ioctl1 = 1;
1338   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, &ioctl1, 0));
1339   EXPECT_EQ(EINVAL, errno);
1340   //  - N ioctls, but null pointer passed
1341   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, NULL, 0));
1342   EXPECT_EQ(EINVAL, errno);
1343   //  - Invalid nioctls
1344   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, -2, NULL, 0));
1345   EXPECT_EQ(EINVAL, errno);
1346   //  - Null primary rights
1347   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, NULL, 0, 0, NULL, 0));
1348   EXPECT_EQ(EFAULT, errno);
1349   //  - Invalid index bitmask
1350   rights.cr_rights[0] |= 3ULL << 57;
1351   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1352   EXPECT_EQ(EINVAL, errno);
1353   //  - Invalid version
1354   rights.cr_rights[0] |= 2ULL << 62;
1355   EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1356   EXPECT_EQ(EINVAL, errno);
1357 
1358   close(fd);
1359   unlink(TmpFile("cap_invalid_rights"));
1360 }
1361 
1362 FORK_TEST_ON(Linux, OpenByHandleAtIfRoot, TmpFile("cap_openbyhandle_testfile")) {
1363   GTEST_SKIP_IF_NOT_ROOT();
1364   int dir = open(tmpdir.c_str(), O_RDONLY);
1365   EXPECT_OK(dir);
1366   int fd = openat(dir, "cap_openbyhandle_testfile", O_RDWR|O_CREAT, 0644);
1367   EXPECT_OK(fd);
1368   const char* message = "Saved text";
1369   EXPECT_OK(write(fd, message, strlen(message)));
1370   close(fd);
1371 
1372   struct file_handle* fhandle = (struct file_handle*)malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
1373   fhandle->handle_bytes = MAX_HANDLE_SZ;
1374   int mount_id;
1375   EXPECT_OK(name_to_handle_at(dir, "cap_openbyhandle_testfile", fhandle,  &mount_id, 0));
1376 
1377   fd = open_by_handle_at(dir, fhandle, O_RDONLY);
1378   EXPECT_OK(fd);
1379   char buffer[200];
1380   ssize_t len = read(fd, buffer, 199);
1381   EXPECT_OK(len);
1382   EXPECT_EQ(std::string(message), std::string(buffer, len));
1383   close(fd);
1384 
1385   // Cannot issue open_by_handle_at after entering capability mode.
1386   cap_enter();
1387   EXPECT_CAPMODE(open_by_handle_at(dir, fhandle, O_RDONLY));
1388 
1389   close(dir);
1390 }
1391 
// Thin wrapper around the getrandom system call.  When the build headers do
// not define __NR_getrandom it fails with errno set to ENOSYS instead.
int getrandom_(void *buf, size_t buflen, unsigned int flags) {
#ifndef __NR_getrandom
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_getrandom, buf, buflen, flags);
#endif
}
1400 
1401 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1402 #include <linux/random.h>  // Requires 3.17 kernel
1403 FORK_TEST(Linux, GetRandom) {
1404   EXPECT_OK(cap_enter());
1405   unsigned char buffer[1024];
1406   unsigned char buffer2[1024];
1407   EXPECT_OK(getrandom_(buffer, sizeof(buffer), GRND_NONBLOCK));
1408   EXPECT_OK(getrandom_(buffer2, sizeof(buffer2), GRND_NONBLOCK));
1409   EXPECT_NE(0, memcmp(buffer, buffer2, sizeof(buffer)));
1410 }
1411 #endif
1412 
// Thin wrapper around the memfd_create system call.  When the build headers
// do not define __NR_memfd_create it fails with errno set to ENOSYS instead.
int memfd_create_(const char *name, unsigned int flags) {
#ifndef __NR_memfd_create
  errno = ENOSYS;
  return -1;
#else
  return syscall(__NR_memfd_create, name, flags);
#endif
}
1421 
1422 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1423 #include <linux/memfd.h>  // Requires 3.17 kernel
1424 TEST(Linux, MemFDDeathTestIfAvailable) {
1425   int memfd = memfd_create_("capsicum-test", MFD_ALLOW_SEALING);
1426   if (memfd == -1 && errno == ENOSYS) {
1427     GTEST_SKIP() << "memfd_create(2) gives -ENOSYS";
1428   }
1429   const int LEN = 16;
1430   EXPECT_OK(ftruncate(memfd, LEN));
1431   int memfd_ro = dup(memfd);
1432   int memfd_rw = dup(memfd);
1433   EXPECT_OK(memfd_ro);
1434   EXPECT_OK(memfd_rw);
1435   cap_rights_t rights;
1436   EXPECT_OK(cap_rights_limit(memfd_ro, cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT)));
1437   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW, CAP_FCHMOD)));
1438 
1439   unsigned char *p_ro = (unsigned char *)mmap(NULL, LEN, PROT_READ, MAP_SHARED, memfd_ro, 0);
1440   EXPECT_NE((unsigned char *)MAP_FAILED, p_ro);
1441   unsigned char *p_rw = (unsigned char *)mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_rw, 0);
1442   EXPECT_NE((unsigned char *)MAP_FAILED, p_rw);
1443   EXPECT_EQ(MAP_FAILED,
1444             mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_ro, 0));
1445 
1446   *p_rw = 42;
1447   EXPECT_EQ(42, *p_ro);
1448   EXPECT_DEATH(*p_ro = 42, "");
1449 
1450 #ifndef F_ADD_SEALS
1451   // Hack for when libc6 does not yet include the updated linux/fcntl.h from kernel 3.17
1452 #define _F_LINUX_SPECIFIC_BASE F_SETLEASE
1453 #define F_ADD_SEALS	(_F_LINUX_SPECIFIC_BASE + 9)
1454 #define F_GET_SEALS	(_F_LINUX_SPECIFIC_BASE + 10)
1455 #define F_SEAL_SEAL	0x0001	/* prevent further seals from being set */
1456 #define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
1457 #define F_SEAL_GROW	0x0004	/* prevent file from growing */
1458 #define F_SEAL_WRITE	0x0008	/* prevent writes */
1459 #endif
1460 
1461   // Reading the seal information requires CAP_FSTAT.
1462   int seals = fcntl(memfd, F_GET_SEALS);
1463   EXPECT_OK(seals);
1464   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1465   int seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1466   EXPECT_EQ(seals, seals_ro);
1467   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1468   int seals_rw = fcntl(memfd_rw, F_GET_SEALS);
1469   EXPECT_NOTCAPABLE(seals_rw);
1470 
1471   // Fail to seal as a writable mapping exists.
1472   EXPECT_EQ(-1, fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1473   EXPECT_EQ(EBUSY, errno);
1474   *p_rw = 42;
1475 
1476   // Seal the rw version; need to unmap first.
1477   munmap(p_rw, LEN);
1478   munmap(p_ro, LEN);
1479   EXPECT_OK(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1480 
1481   seals = fcntl(memfd, F_GET_SEALS);
1482   EXPECT_OK(seals);
1483   if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
1484   seals_ro = fcntl(memfd_ro, F_GET_SEALS);
1485   EXPECT_EQ(seals, seals_ro);
1486   if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
1487 
1488   // Remove the CAP_FCHMOD right, can no longer add seals.
1489   EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW)));
1490   EXPECT_NOTCAPABLE(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
1491 
1492   close(memfd);
1493   close(memfd_ro);
1494   close(memfd_rw);
1495 }
1496 #endif
1497 
1498 #else
// Only compiled when __linux__ is not defined; presumably exists so that the
// file still defines something on other platforms.
void noop() {
}
1500 #endif
1501