1 // Tests of Linux-specific functionality 2 #ifdef __linux__ 3 4 #include <sys/types.h> 5 #include <sys/stat.h> 6 #include <sys/socket.h> 7 #include <sys/timerfd.h> 8 #include <sys/signalfd.h> 9 #include <sys/eventfd.h> 10 #include <sys/epoll.h> 11 #include <sys/inotify.h> 12 #include <sys/fanotify.h> 13 #include <sys/mman.h> 14 #include <sys/capability.h> // Requires e.g. libcap-dev package for POSIX.1e capabilities headers 15 #include <linux/aio_abi.h> 16 #include <linux/filter.h> 17 #include <linux/seccomp.h> 18 #include <linux/version.h> 19 #include <poll.h> 20 #include <sched.h> 21 #include <signal.h> 22 #include <fcntl.h> 23 #include <unistd.h> 24 25 #include <string> 26 27 #include "capsicum.h" 28 #include "syscalls.h" 29 #include "capsicum-test.h" 30 31 TEST(Linux, TimerFD) { 32 int fd = timerfd_create(CLOCK_MONOTONIC, 0); 33 34 cap_rights_t r_ro; 35 cap_rights_init(&r_ro, CAP_READ); 36 cap_rights_t r_wo; 37 cap_rights_init(&r_wo, CAP_WRITE); 38 cap_rights_t r_rw; 39 cap_rights_init(&r_rw, CAP_READ, CAP_WRITE); 40 cap_rights_t r_rwpoll; 41 cap_rights_init(&r_rwpoll, CAP_READ, CAP_WRITE, CAP_EVENT); 42 43 int cap_fd_ro = dup(fd); 44 EXPECT_OK(cap_fd_ro); 45 EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_ro)); 46 int cap_fd_wo = dup(fd); 47 EXPECT_OK(cap_fd_wo); 48 EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_wo)); 49 int cap_fd_rw = dup(fd); 50 EXPECT_OK(cap_fd_rw); 51 EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rw)); 52 int cap_fd_all = dup(fd); 53 EXPECT_OK(cap_fd_all); 54 EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwpoll)); 55 56 struct itimerspec old_ispec; 57 struct itimerspec ispec; 58 ispec.it_interval.tv_sec = 0; 59 ispec.it_interval.tv_nsec = 0; 60 ispec.it_value.tv_sec = 0; 61 ispec.it_value.tv_nsec = 100000000; // 100ms 62 EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_ro, 0, &ispec, NULL)); 63 EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_wo, 0, &ispec, &old_ispec)); 64 EXPECT_OK(timerfd_settime(cap_fd_wo, 0, &ispec, NULL)); 65 EXPECT_OK(timerfd_settime(cap_fd_rw, 0, &ispec, NULL)); 66 EXPECT_OK(timerfd_settime(cap_fd_all, 0, &ispec, NULL)); 67 68 EXPECT_NOTCAPABLE(timerfd_gettime(cap_fd_wo, &old_ispec)); 69 EXPECT_OK(timerfd_gettime(cap_fd_ro, &old_ispec)); 70 EXPECT_OK(timerfd_gettime(cap_fd_rw, &old_ispec)); 71 EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec)); 72 73 // To be able to poll() for the timer pop, still need CAP_EVENT. 74 struct pollfd poll_fd; 75 for (int ii = 0; ii < 3; ii++) { 76 poll_fd.revents = 0; 77 poll_fd.events = POLLIN; 78 switch (ii) { 79 case 0: poll_fd.fd = cap_fd_ro; break; 80 case 1: poll_fd.fd = cap_fd_wo; break; 81 case 2: poll_fd.fd = cap_fd_rw; break; 82 } 83 // Poll immediately returns with POLLNVAL 84 EXPECT_OK(poll(&poll_fd, 1, 400)); 85 EXPECT_EQ(0, (poll_fd.revents & POLLIN)); 86 EXPECT_NE(0, (poll_fd.revents & POLLNVAL)); 87 } 88 89 poll_fd.fd = cap_fd_all; 90 EXPECT_OK(poll(&poll_fd, 1, 400)); 91 EXPECT_NE(0, (poll_fd.revents & POLLIN)); 92 EXPECT_EQ(0, (poll_fd.revents & POLLNVAL)); 93 94 EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec)); 95 EXPECT_EQ(0, old_ispec.it_value.tv_sec); 96 EXPECT_EQ(0, old_ispec.it_value.tv_nsec); 97 EXPECT_EQ(0, old_ispec.it_interval.tv_sec); 98 EXPECT_EQ(0, old_ispec.it_interval.tv_nsec); 99 100 close(cap_fd_all); 101 close(cap_fd_rw); 102 close(cap_fd_wo); 103 close(cap_fd_ro); 104 close(fd); 105 } 106 107 FORK_TEST(Linux, SignalFD) { 108 if (force_mt) { 109 TEST_SKIPPED("multi-threaded run clashes with signals"); 110 return; 111 } 112 pid_t me = getpid(); 113 sigset_t mask; 114 sigemptyset(&mask); 115 sigaddset(&mask, SIGUSR1); 116 117 // Block signals before registering against a new signal FD. 118 EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL)); 119 int fd = signalfd(-1, &mask, 0); 120 EXPECT_OK(fd); 121 122 cap_rights_t r_rs; 123 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 124 cap_rights_t r_ws; 125 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 126 cap_rights_t r_sig; 127 cap_rights_init(&r_sig, CAP_FSIGNAL); 128 cap_rights_t r_rssig; 129 cap_rights_init(&r_rssig, CAP_FSIGNAL, CAP_READ, CAP_SEEK); 130 cap_rights_t r_rssig_poll; 131 cap_rights_init(&r_rssig_poll, CAP_FSIGNAL, CAP_READ, CAP_SEEK, CAP_EVENT); 132 133 // Various capability variants. 134 int cap_fd_none = dup(fd); 135 EXPECT_OK(cap_fd_none); 136 EXPECT_OK(cap_rights_limit(cap_fd_none, &r_ws)); 137 int cap_fd_read = dup(fd); 138 EXPECT_OK(cap_fd_read); 139 EXPECT_OK(cap_rights_limit(cap_fd_read, &r_rs)); 140 int cap_fd_sig = dup(fd); 141 EXPECT_OK(cap_fd_sig); 142 EXPECT_OK(cap_rights_limit(cap_fd_sig, &r_sig)); 143 int cap_fd_sig_read = dup(fd); 144 EXPECT_OK(cap_fd_sig_read); 145 EXPECT_OK(cap_rights_limit(cap_fd_sig_read, &r_rssig)); 146 int cap_fd_all = dup(fd); 147 EXPECT_OK(cap_fd_all); 148 EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rssig_poll)); 149 150 struct signalfd_siginfo fdsi; 151 152 // Need CAP_READ to read the signal information 153 kill(me, SIGUSR1); 154 EXPECT_NOTCAPABLE(read(cap_fd_none, &fdsi, sizeof(struct signalfd_siginfo))); 155 EXPECT_NOTCAPABLE(read(cap_fd_sig, &fdsi, sizeof(struct signalfd_siginfo))); 156 int len = read(cap_fd_read, &fdsi, sizeof(struct signalfd_siginfo)); 157 EXPECT_OK(len); 158 EXPECT_EQ(sizeof(struct signalfd_siginfo), (size_t)len); 159 EXPECT_EQ(SIGUSR1, (int)fdsi.ssi_signo); 160 161 // Need CAP_FSIGNAL to modify the signal mask. 162 sigemptyset(&mask); 163 sigaddset(&mask, SIGUSR1); 164 sigaddset(&mask, SIGUSR2); 165 EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL)); 166 EXPECT_NOTCAPABLE(signalfd(cap_fd_none, &mask, 0)); 167 EXPECT_NOTCAPABLE(signalfd(cap_fd_read, &mask, 0)); 168 EXPECT_EQ(cap_fd_sig, signalfd(cap_fd_sig, &mask, 0)); 169 170 // Need CAP_EVENT to get notification of a signal in poll(2). 171 kill(me, SIGUSR2); 172 173 struct pollfd poll_fd; 174 poll_fd.revents = 0; 175 poll_fd.events = POLLIN; 176 poll_fd.fd = cap_fd_sig_read; 177 EXPECT_OK(poll(&poll_fd, 1, 400)); 178 EXPECT_EQ(0, (poll_fd.revents & POLLIN)); 179 EXPECT_NE(0, (poll_fd.revents & POLLNVAL)); 180 181 poll_fd.fd = cap_fd_all; 182 EXPECT_OK(poll(&poll_fd, 1, 400)); 183 EXPECT_NE(0, (poll_fd.revents & POLLIN)); 184 EXPECT_EQ(0, (poll_fd.revents & POLLNVAL)); 185 } 186 187 TEST(Linux, EventFD) { 188 int fd = eventfd(0, 0); 189 EXPECT_OK(fd); 190 191 cap_rights_t r_rs; 192 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 193 cap_rights_t r_ws; 194 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 195 cap_rights_t r_rws; 196 cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK); 197 cap_rights_t r_rwspoll; 198 cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT); 199 200 int cap_ro = dup(fd); 201 EXPECT_OK(cap_ro); 202 EXPECT_OK(cap_rights_limit(cap_ro, &r_rs)); 203 int cap_wo = dup(fd); 204 EXPECT_OK(cap_wo); 205 EXPECT_OK(cap_rights_limit(cap_wo, &r_ws)); 206 int cap_rw = dup(fd); 207 EXPECT_OK(cap_rw); 208 EXPECT_OK(cap_rights_limit(cap_rw, &r_rws)); 209 int cap_all = dup(fd); 210 EXPECT_OK(cap_all); 211 EXPECT_OK(cap_rights_limit(cap_all, &r_rwspoll)); 212 213 pid_t child = fork(); 214 if (child == 0) { 215 // Child: write counter to eventfd 216 uint64_t u = 42; 217 EXPECT_NOTCAPABLE(write(cap_ro, &u, sizeof(u))); 218 EXPECT_OK(write(cap_wo, &u, sizeof(u))); 219 exit(HasFailure()); 220 } 221 222 sleep(1); // Allow child to write 223 224 struct pollfd poll_fd; 225 poll_fd.revents = 0; 226 poll_fd.events = POLLIN; 227 poll_fd.fd = cap_rw; 228 EXPECT_OK(poll(&poll_fd, 1, 400)); 229 EXPECT_EQ(0, (poll_fd.revents & POLLIN)); 230 EXPECT_NE(0, (poll_fd.revents & POLLNVAL)); 231 232 poll_fd.fd = cap_all; 233 EXPECT_OK(poll(&poll_fd, 1, 400)); 234 EXPECT_NE(0, (poll_fd.revents & POLLIN)); 235 EXPECT_EQ(0, (poll_fd.revents & POLLNVAL)); 236 237 uint64_t u; 238 EXPECT_NOTCAPABLE(read(cap_wo, &u, sizeof(u))); 239 EXPECT_OK(read(cap_ro, &u, sizeof(u))); 240 EXPECT_EQ(42, (int)u); 241 242 // Wait for the child. 243 int status; 244 EXPECT_EQ(child, waitpid(child, &status, 0)); 245 int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 246 EXPECT_EQ(0, rc); 247 248 close(cap_all); 249 close(cap_rw); 250 close(cap_wo); 251 close(cap_ro); 252 close(fd); 253 } 254 255 FORK_TEST(Linux, epoll) { 256 int sock_fds[2]; 257 EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds)); 258 // Queue some data. 259 char buffer[4] = {1, 2, 3, 4}; 260 EXPECT_OK(write(sock_fds[1], buffer, sizeof(buffer))); 261 262 EXPECT_OK(cap_enter()); // Enter capability mode. 263 264 int epoll_fd = epoll_create(1); 265 EXPECT_OK(epoll_fd); 266 267 cap_rights_t r_rs; 268 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 269 cap_rights_t r_ws; 270 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 271 cap_rights_t r_rws; 272 cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK); 273 cap_rights_t r_rwspoll; 274 cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT); 275 cap_rights_t r_epoll; 276 cap_rights_init(&r_epoll, CAP_EPOLL_CTL); 277 278 int cap_epoll_wo = dup(epoll_fd); 279 EXPECT_OK(cap_epoll_wo); 280 EXPECT_OK(cap_rights_limit(cap_epoll_wo, &r_ws)); 281 int cap_epoll_ro = dup(epoll_fd); 282 EXPECT_OK(cap_epoll_ro); 283 EXPECT_OK(cap_rights_limit(cap_epoll_ro, &r_rs)); 284 int cap_epoll_rw = dup(epoll_fd); 285 EXPECT_OK(cap_epoll_rw); 286 EXPECT_OK(cap_rights_limit(cap_epoll_rw, &r_rws)); 287 int cap_epoll_poll = dup(epoll_fd); 288 EXPECT_OK(cap_epoll_poll); 289 EXPECT_OK(cap_rights_limit(cap_epoll_poll, &r_rwspoll)); 290 int cap_epoll_ctl = dup(epoll_fd); 291 EXPECT_OK(cap_epoll_ctl); 292 EXPECT_OK(cap_rights_limit(cap_epoll_ctl, &r_epoll)); 293 294 // Can only modify the FDs being monitored if the CAP_EPOLL_CTL right is present. 295 struct epoll_event eev; 296 memset(&eev, 0, sizeof(eev)); 297 eev.events = EPOLLIN|EPOLLOUT|EPOLLPRI; 298 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_ADD, sock_fds[0], &eev)); 299 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_ADD, sock_fds[0], &eev)); 300 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_ADD, sock_fds[0], &eev)); 301 EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_ADD, sock_fds[0], &eev)); 302 eev.events = EPOLLIN|EPOLLOUT; 303 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_MOD, sock_fds[0], &eev)); 304 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_MOD, sock_fds[0], &eev)); 305 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_MOD, sock_fds[0], &eev)); 306 EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_MOD, sock_fds[0], &eev)); 307 308 // Running epoll_pwait(2) requires CAP_EVENT. 309 eev.events = 0; 310 EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_ro, &eev, 1, 100, NULL)); 311 EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_wo, &eev, 1, 100, NULL)); 312 EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_rw, &eev, 1, 100, NULL)); 313 EXPECT_OK(epoll_pwait(cap_epoll_poll, &eev, 1, 100, NULL)); 314 EXPECT_EQ(EPOLLIN, eev.events & EPOLLIN); 315 316 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_DEL, sock_fds[0], &eev)); 317 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_DEL, sock_fds[0], &eev)); 318 EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_DEL, sock_fds[0], &eev)); 319 EXPECT_OK(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, sock_fds[0], &eev)); 320 321 close(cap_epoll_ctl); 322 close(cap_epoll_poll); 323 close(cap_epoll_rw); 324 close(cap_epoll_ro); 325 close(cap_epoll_wo); 326 close(epoll_fd); 327 close(sock_fds[1]); 328 close(sock_fds[0]); 329 } 330 331 TEST(Linux, fstatat) { 332 int fd = open(TmpFile("cap_fstatat"), O_CREAT|O_RDWR, 0644); 333 EXPECT_OK(fd); 334 unsigned char buffer[] = {1, 2, 3, 4}; 335 EXPECT_OK(write(fd, buffer, sizeof(buffer))); 336 cap_rights_t rights; 337 int cap_rf = dup(fd); 338 EXPECT_OK(cap_rf); 339 EXPECT_OK(cap_rights_limit(cap_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT))); 340 int cap_ro = dup(fd); 341 EXPECT_OK(cap_ro); 342 EXPECT_OK(cap_rights_limit(cap_ro, cap_rights_init(&rights, CAP_READ))); 343 344 struct stat info; 345 EXPECT_OK(fstatat(fd, "", &info, AT_EMPTY_PATH)); 346 EXPECT_NOTCAPABLE(fstatat(cap_ro, "", &info, AT_EMPTY_PATH)); 347 EXPECT_OK(fstatat(cap_rf, "", &info, AT_EMPTY_PATH)); 348 349 close(cap_ro); 350 close(cap_rf); 351 close(fd); 352 353 int dir = open(tmpdir.c_str(), O_RDONLY); 354 EXPECT_OK(dir); 355 int dir_rf = dup(dir); 356 EXPECT_OK(dir_rf); 357 EXPECT_OK(cap_rights_limit(dir_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT))); 358 int dir_ro = dup(fd); 359 EXPECT_OK(dir_ro); 360 EXPECT_OK(cap_rights_limit(dir_ro, cap_rights_init(&rights, CAP_READ))); 361 362 EXPECT_OK(fstatat(dir, "cap_fstatat", &info, AT_EMPTY_PATH)); 363 EXPECT_NOTCAPABLE(fstatat(dir_ro, "cap_fstatat", &info, AT_EMPTY_PATH)); 364 EXPECT_OK(fstatat(dir_rf, "cap_fstatat", &info, AT_EMPTY_PATH)); 365 366 close(dir_ro); 367 close(dir_rf); 368 close(dir); 369 370 unlink(TmpFile("cap_fstatat")); 371 } 372 373 // fanotify support may not be available at compile-time 374 #ifdef __NR_fanotify_init 375 TEST(Linux, fanotify) { 376 REQUIRE_ROOT(); 377 int fa_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDWR); 378 EXPECT_OK(fa_fd); 379 if (fa_fd < 0) return; // May not be enabled 380 381 cap_rights_t r_rs; 382 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 383 cap_rights_t r_ws; 384 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 385 cap_rights_t r_rws; 386 cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK); 387 cap_rights_t r_rwspoll; 388 cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT); 389 cap_rights_t r_rwsnotify; 390 cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY); 391 cap_rights_t r_rsl; 392 cap_rights_init(&r_rsl, CAP_READ, CAP_SEEK, CAP_LOOKUP); 393 cap_rights_t r_rslstat; 394 cap_rights_init(&r_rslstat, CAP_READ, CAP_SEEK, CAP_LOOKUP, CAP_FSTAT); 395 cap_rights_t r_rsstat; 396 cap_rights_init(&r_rsstat, CAP_READ, CAP_SEEK, CAP_FSTAT); 397 398 int cap_fd_ro = dup(fa_fd); 399 EXPECT_OK(cap_fd_ro); 400 EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs)); 401 int cap_fd_wo = dup(fa_fd); 402 EXPECT_OK(cap_fd_wo); 403 EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws)); 404 int cap_fd_rw = dup(fa_fd); 405 EXPECT_OK(cap_fd_rw); 406 EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws)); 407 int cap_fd_poll = dup(fa_fd); 408 EXPECT_OK(cap_fd_poll); 409 EXPECT_OK(cap_rights_limit(cap_fd_poll, &r_rwspoll)); 410 int cap_fd_not = dup(fa_fd); 411 EXPECT_OK(cap_fd_not); 412 EXPECT_OK(cap_rights_limit(cap_fd_not, &r_rwsnotify)); 413 414 int rc = mkdir(TmpFile("cap_notify"), 0755); 415 EXPECT_TRUE(rc == 0 || errno == EEXIST); 416 int dfd = open(TmpFile("cap_notify"), O_RDONLY); 417 EXPECT_OK(dfd); 418 int fd = open(TmpFile("cap_notify/file"), O_CREAT|O_RDWR, 0644); 419 close(fd); 420 int cap_dfd = dup(dfd); 421 EXPECT_OK(cap_dfd); 422 EXPECT_OK(cap_rights_limit(cap_dfd, &r_rslstat)); 423 EXPECT_OK(cap_dfd); 424 int cap_dfd_rs = dup(dfd); 425 EXPECT_OK(cap_dfd_rs); 426 EXPECT_OK(cap_rights_limit(cap_dfd_rs, &r_rs)); 427 EXPECT_OK(cap_dfd_rs); 428 int cap_dfd_rsstat = dup(dfd); 429 EXPECT_OK(cap_dfd_rsstat); 430 EXPECT_OK(cap_rights_limit(cap_dfd_rsstat, &r_rsstat)); 431 EXPECT_OK(cap_dfd_rsstat); 432 int cap_dfd_rsl = dup(dfd); 433 EXPECT_OK(cap_dfd_rsl); 434 EXPECT_OK(cap_rights_limit(cap_dfd_rsl, &r_rsl)); 435 EXPECT_OK(cap_dfd_rsl); 436 437 // Need CAP_NOTIFY to change what's monitored. 438 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_ro, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL)); 439 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_wo, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL)); 440 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_rw, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL)); 441 EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL)); 442 443 // Need CAP_FSTAT on the thing monitored. 444 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rs, NULL)); 445 EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rsstat, NULL)); 446 447 // Too add monitoring of a file under a dfd, need CAP_LOOKUP|CAP_FSTAT on the dfd. 448 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsstat, "file")); 449 EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsl, "file")); 450 EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd, "file")); 451 452 pid_t child = fork(); 453 if (child == 0) { 454 // Child: Perform activity in the directory under notify. 455 sleep(1); 456 unlink(TmpFile("cap_notify/temp")); 457 int fd = open(TmpFile("cap_notify/temp"), O_CREAT|O_RDWR, 0644); 458 close(fd); 459 exit(0); 460 } 461 462 // Need CAP_EVENT to poll. 463 struct pollfd poll_fd; 464 poll_fd.revents = 0; 465 poll_fd.events = POLLIN; 466 poll_fd.fd = cap_fd_rw; 467 EXPECT_OK(poll(&poll_fd, 1, 1400)); 468 EXPECT_EQ(0, (poll_fd.revents & POLLIN)); 469 EXPECT_NE(0, (poll_fd.revents & POLLNVAL)); 470 471 poll_fd.fd = cap_fd_not; 472 EXPECT_OK(poll(&poll_fd, 1, 1400)); 473 EXPECT_EQ(0, (poll_fd.revents & POLLIN)); 474 EXPECT_NE(0, (poll_fd.revents & POLLNVAL)); 475 476 poll_fd.fd = cap_fd_poll; 477 EXPECT_OK(poll(&poll_fd, 1, 1400)); 478 EXPECT_NE(0, (poll_fd.revents & POLLIN)); 479 EXPECT_EQ(0, (poll_fd.revents & POLLNVAL)); 480 481 // Need CAP_READ to read. 482 struct fanotify_event_metadata ev; 483 memset(&ev, 0, sizeof(ev)); 484 EXPECT_NOTCAPABLE(read(cap_fd_wo, &ev, sizeof(ev))); 485 rc = read(fa_fd, &ev, sizeof(ev)); 486 EXPECT_OK(rc); 487 EXPECT_EQ((int)sizeof(struct fanotify_event_metadata), rc); 488 EXPECT_EQ(child, ev.pid); 489 EXPECT_NE(0, ev.fd); 490 491 // TODO(drysdale): reinstate if/when capsicum-linux propagates rights 492 // to fanotify-generated FDs. 493 #ifdef OMIT 494 // fanotify(7) gives us a FD for the changed file. This should 495 // only have rights that are a subset of those for the original 496 // monitored directory file descriptor. 497 cap_rights_t rights; 498 CAP_SET_ALL(&rights); 499 EXPECT_OK(cap_rights_get(ev.fd, &rights)); 500 EXPECT_RIGHTS_IN(&rights, &r_rslstat); 501 #endif 502 503 // Wait for the child. 504 int status; 505 EXPECT_EQ(child, waitpid(child, &status, 0)); 506 rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 507 EXPECT_EQ(0, rc); 508 509 close(cap_dfd_rsstat); 510 close(cap_dfd_rsl); 511 close(cap_dfd_rs); 512 close(cap_dfd); 513 close(dfd); 514 unlink(TmpFile("cap_notify/file")); 515 unlink(TmpFile("cap_notify/temp")); 516 rmdir(TmpFile("cap_notify")); 517 close(cap_fd_not); 518 close(cap_fd_poll); 519 close(cap_fd_rw); 520 close(cap_fd_wo); 521 close(cap_fd_ro); 522 close(fa_fd); 523 } 524 #endif 525 526 TEST(Linux, inotify) { 527 int i_fd = inotify_init(); 528 EXPECT_OK(i_fd); 529 530 cap_rights_t r_rs; 531 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 532 cap_rights_t r_ws; 533 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 534 cap_rights_t r_rws; 535 cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK); 536 cap_rights_t r_rwsnotify; 537 cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY); 538 539 int cap_fd_ro = dup(i_fd); 540 EXPECT_OK(cap_fd_ro); 541 EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs)); 542 int cap_fd_wo = dup(i_fd); 543 EXPECT_OK(cap_fd_wo); 544 EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws)); 545 int cap_fd_rw = dup(i_fd); 546 EXPECT_OK(cap_fd_rw); 547 EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws)); 548 int cap_fd_all = dup(i_fd); 549 EXPECT_OK(cap_fd_all); 550 EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwsnotify)); 551 552 int fd = open(TmpFile("cap_inotify"), O_CREAT|O_RDWR, 0644); 553 EXPECT_NOTCAPABLE(inotify_add_watch(cap_fd_rw, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY)); 554 int wd = inotify_add_watch(i_fd, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY); 555 EXPECT_OK(wd); 556 557 unsigned char buffer[] = {1, 2, 3, 4}; 558 EXPECT_OK(write(fd, buffer, sizeof(buffer))); 559 560 struct inotify_event iev; 561 memset(&iev, 0, sizeof(iev)); 562 EXPECT_NOTCAPABLE(read(cap_fd_wo, &iev, sizeof(iev))); 563 int rc = read(cap_fd_ro, &iev, sizeof(iev)); 564 EXPECT_OK(rc); 565 EXPECT_EQ((int)sizeof(iev), rc); 566 EXPECT_EQ(wd, iev.wd); 567 568 EXPECT_NOTCAPABLE(inotify_rm_watch(cap_fd_wo, wd)); 569 EXPECT_OK(inotify_rm_watch(cap_fd_all, wd)); 570 571 close(fd); 572 close(cap_fd_all); 573 close(cap_fd_rw); 574 close(cap_fd_wo); 575 close(cap_fd_ro); 576 close(i_fd); 577 unlink(TmpFile("cap_inotify")); 578 } 579 580 TEST(Linux, ArchChange) { 581 const char* prog_candidates[] = {"./mini-me.32", "./mini-me.x32", "./mini-me.64"}; 582 const char* progs[] = {NULL, NULL, NULL}; 583 char* argv_pass[] = {(char*)"to-come", (char*)"--capmode", NULL}; 584 char* null_envp[] = {NULL}; 585 int fds[3]; 586 int count = 0; 587 588 for (int ii = 0; ii < 3; ii++) { 589 fds[count] = open(prog_candidates[ii], O_RDONLY); 590 if (fds[count] >= 0) { 591 progs[count] = prog_candidates[ii]; 592 count++; 593 } 594 } 595 if (count == 0) { 596 TEST_SKIPPED("no different-architecture programs available"); 597 return; 598 } 599 600 for (int ii = 0; ii < count; ii++) { 601 // Fork-and-exec a binary of this architecture. 602 pid_t child = fork(); 603 if (child == 0) { 604 EXPECT_OK(cap_enter()); // Enter capability mode 605 if (verbose) fprintf(stderr, "[%d] call fexecve(%s, %s)\n", 606 getpid_(), progs[ii], argv_pass[1]); 607 argv_pass[0] = (char *)progs[ii]; 608 int rc = fexecve_(fds[ii], argv_pass, null_envp); 609 fprintf(stderr, "fexecve(%s) returned %d errno %d\n", progs[ii], rc, errno); 610 exit(99); // Should not reach here. 611 } 612 int status; 613 EXPECT_EQ(child, waitpid(child, &status, 0)); 614 int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 615 EXPECT_EQ(0, rc); 616 close(fds[ii]); 617 } 618 } 619 620 FORK_TEST(Linux, Namespace) { 621 REQUIRE_ROOT(); 622 pid_t me = getpid_(); 623 624 // Create a new UTS namespace. 625 EXPECT_OK(unshare(CLONE_NEWUTS)); 626 // Open an FD to its symlink. 627 char buffer[256]; 628 sprintf(buffer, "/proc/%d/ns/uts", me); 629 int ns_fd = open(buffer, O_RDONLY); 630 631 cap_rights_t r_rwlstat; 632 cap_rights_init(&r_rwlstat, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT); 633 cap_rights_t r_rwlstatns; 634 cap_rights_init(&r_rwlstatns, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT, CAP_SETNS); 635 636 int cap_fd = dup(ns_fd); 637 EXPECT_OK(cap_fd); 638 EXPECT_OK(cap_rights_limit(cap_fd, &r_rwlstat)); 639 int cap_fd_setns = dup(ns_fd); 640 EXPECT_OK(cap_fd_setns); 641 EXPECT_OK(cap_rights_limit(cap_fd_setns, &r_rwlstatns)); 642 EXPECT_NOTCAPABLE(setns(cap_fd, CLONE_NEWUTS)); 643 EXPECT_OK(setns(cap_fd_setns, CLONE_NEWUTS)); 644 645 EXPECT_OK(cap_enter()); // Enter capability mode. 646 647 // No setns(2) but unshare(2) is allowed. 648 EXPECT_CAPMODE(setns(ns_fd, CLONE_NEWUTS)); 649 EXPECT_OK(unshare(CLONE_NEWUTS)); 650 } 651 652 static void SendFD(int fd, int over) { 653 struct msghdr mh; 654 mh.msg_name = NULL; // No address needed 655 mh.msg_namelen = 0; 656 char buffer1[1024]; 657 struct iovec iov[1]; 658 iov[0].iov_base = buffer1; 659 iov[0].iov_len = sizeof(buffer1); 660 mh.msg_iov = iov; 661 mh.msg_iovlen = 1; 662 char buffer2[1024]; 663 mh.msg_control = buffer2; 664 mh.msg_controllen = CMSG_LEN(sizeof(int)); 665 struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh); 666 cmptr->cmsg_level = SOL_SOCKET; 667 cmptr->cmsg_type = SCM_RIGHTS; 668 cmptr->cmsg_len = CMSG_LEN(sizeof(int)); 669 *(int *)CMSG_DATA(cmptr) = fd; 670 buffer1[0] = 0; 671 iov[0].iov_len = 1; 672 int rc = sendmsg(over, &mh, 0); 673 EXPECT_OK(rc); 674 } 675 676 static int ReceiveFD(int over) { 677 struct msghdr mh; 678 mh.msg_name = NULL; // No address needed 679 mh.msg_namelen = 0; 680 char buffer1[1024]; 681 struct iovec iov[1]; 682 iov[0].iov_base = buffer1; 683 iov[0].iov_len = sizeof(buffer1); 684 mh.msg_iov = iov; 685 mh.msg_iovlen = 1; 686 char buffer2[1024]; 687 mh.msg_control = buffer2; 688 mh.msg_controllen = sizeof(buffer2); 689 int rc = recvmsg(over, &mh, 0); 690 EXPECT_OK(rc); 691 EXPECT_LE(CMSG_LEN(sizeof(int)), mh.msg_controllen); 692 struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh); 693 int fd = *(int*)CMSG_DATA(cmptr); 694 EXPECT_EQ(CMSG_LEN(sizeof(int)), cmptr->cmsg_len); 695 cmptr = CMSG_NXTHDR(&mh, cmptr); 696 EXPECT_TRUE(cmptr == NULL); 697 return fd; 698 } 699 700 static int shared_pd = -1; 701 static int shared_sock_fds[2]; 702 703 static int ChildFunc(void *arg) { 704 // This function is running in a new PID namespace, and so is pid 1. 705 if (verbose) fprintf(stderr, " ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid()); 706 EXPECT_EQ(1, getpid_()); 707 EXPECT_EQ(0, getppid()); 708 709 // The shared process descriptor is outside our namespace, so we cannot 710 // get its pid. 711 if (verbose) fprintf(stderr, " ChildFunc: shared_pd=%d\n", shared_pd); 712 pid_t shared_child = -1; 713 EXPECT_OK(pdgetpid(shared_pd, &shared_child)); 714 if (verbose) fprintf(stderr, " ChildFunc: corresponding pid=%d\n", shared_child); 715 EXPECT_EQ(0, shared_child); 716 717 // But we can pdkill() it even so. 718 if (verbose) fprintf(stderr, " ChildFunc: call pdkill(pd=%d)\n", shared_pd); 719 EXPECT_OK(pdkill(shared_pd, SIGINT)); 720 721 int pd; 722 pid_t child = pdfork(&pd, 0); 723 EXPECT_OK(child); 724 if (child == 0) { 725 // Child: expect pid 2. 726 if (verbose) fprintf(stderr, " child of ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid()); 727 EXPECT_EQ(2, getpid_()); 728 EXPECT_EQ(1, getppid()); 729 while (true) { 730 if (verbose) fprintf(stderr, " child of ChildFunc: \"I aten't dead\"\n"); 731 sleep(1); 732 } 733 exit(0); 734 } 735 EXPECT_EQ(2, child); 736 EXPECT_PID_ALIVE(child); 737 if (verbose) fprintf(stderr, " ChildFunc: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n", 738 pd, child, ProcessState(child)); 739 740 pid_t pid; 741 EXPECT_OK(pdgetpid(pd, &pid)); 742 EXPECT_EQ(child, pid); 743 744 sleep(2); 745 746 // Send the process descriptor over UNIX domain socket back to parent. 747 SendFD(pd, shared_sock_fds[1]); 748 749 // Wait for death of (grand)child, killed by our parent. 750 if (verbose) fprintf(stderr, " ChildFunc: wait on pid=%d\n", child); 751 int status; 752 EXPECT_EQ(child, wait4(child, &status, __WALL, NULL)); 753 754 if (verbose) fprintf(stderr, " ChildFunc: return 0\n"); 755 return 0; 756 } 757 758 #define STACK_SIZE (1024 * 1024) 759 static char child_stack[STACK_SIZE]; 760 761 // TODO(drysdale): fork into a user namespace first so REQUIRE_ROOT can be removed. 762 TEST(Linux, PidNamespacePdFork) { 763 REQUIRE_ROOT(); 764 // Pass process descriptors in both directions across a PID namespace boundary. 765 // pdfork() off a child before we start, holding its process descriptor in a global 766 // variable that's accessible to children. 767 pid_t firstborn = pdfork(&shared_pd, 0); 768 EXPECT_OK(firstborn); 769 if (firstborn == 0) { 770 while (true) { 771 if (verbose) fprintf(stderr, " Firstborn: \"I aten't dead\"\n"); 772 sleep(1); 773 } 774 exit(0); 775 } 776 EXPECT_PID_ALIVE(firstborn); 777 if (verbose) fprintf(stderr, "Parent: pre-pdfork()ed pd=%d, pid=%d state='%c'\n", 778 shared_pd, firstborn, ProcessState(firstborn)); 779 sleep(2); 780 781 // Prepare sockets to communicate with child process. 782 EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds)); 783 784 // Clone into a child process with a new pid namespace. 785 pid_t child = clone(ChildFunc, child_stack + STACK_SIZE, 786 CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL); 787 EXPECT_OK(child); 788 EXPECT_PID_ALIVE(child); 789 if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child)); 790 791 // Ensure the child runs. First thing it does is to kill our firstborn, using shared_pd. 792 sleep(1); 793 EXPECT_PID_DEAD(firstborn); 794 795 // But we can still retrieve firstborn's PID, as it's not been reaped yet. 796 pid_t child0; 797 EXPECT_OK(pdgetpid(shared_pd, &child0)); 798 EXPECT_EQ(firstborn, child0); 799 if (verbose) fprintf(stderr, "Parent: check on firstborn: pdgetpid(pd=%d) -> child=%d state='%c'\n", 800 shared_pd, child0, ProcessState(child0)); 801 802 // Now reap it. 803 int status; 804 EXPECT_EQ(firstborn, waitpid(firstborn, &status, __WALL)); 805 806 // Get the process descriptor of the child-of-child via socket transfer. 807 int grandchild_pd = ReceiveFD(shared_sock_fds[0]); 808 809 // Our notion of the pid associated with the grandchild is in the main PID namespace. 810 pid_t grandchild; 811 EXPECT_OK(pdgetpid(grandchild_pd, &grandchild)); 812 EXPECT_NE(2, grandchild); 813 if (verbose) fprintf(stderr, "Parent: pre-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n", 814 grandchild_pd, grandchild, ProcessState(grandchild)); 815 EXPECT_PID_ALIVE(grandchild); 816 817 // Kill the grandchild via the process descriptor. 818 EXPECT_OK(pdkill(grandchild_pd, SIGINT)); 819 usleep(10000); 820 if (verbose) fprintf(stderr, "Parent: post-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n", 821 grandchild_pd, grandchild, ProcessState(grandchild)); 822 EXPECT_PID_DEAD(grandchild); 823 824 sleep(2); 825 826 // Wait for the child. 827 EXPECT_EQ(child, waitpid(child, &status, WNOHANG)); 828 int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 829 EXPECT_EQ(0, rc); 830 831 close(shared_sock_fds[0]); 832 close(shared_sock_fds[1]); 833 close(shared_pd); 834 close(grandchild_pd); 835 } 836 837 int NSInit(void *data) { 838 // This function is running in a new PID namespace, and so is pid 1. 839 if (verbose) fprintf(stderr, " NSInit: pid=%d, ppid=%d\n", getpid_(), getppid()); 840 EXPECT_EQ(1, getpid_()); 841 EXPECT_EQ(0, getppid()); 842 843 int pd; 844 pid_t child = pdfork(&pd, 0); 845 EXPECT_OK(child); 846 if (child == 0) { 847 // Child: loop forever until terminated. 848 if (verbose) fprintf(stderr, " child of NSInit: pid=%d, ppid=%d\n", getpid_(), getppid()); 849 while (true) { 850 if (verbose) fprintf(stderr, " child of NSInit: \"I aten't dead\"\n"); 851 usleep(100000); 852 } 853 exit(0); 854 } 855 EXPECT_EQ(2, child); 856 EXPECT_PID_ALIVE(child); 857 if (verbose) fprintf(stderr, " NSInit: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n", 858 pd, child, ProcessState(child)); 859 sleep(1); 860 861 // Send the process descriptor over UNIX domain socket back to parent. 862 SendFD(pd, shared_sock_fds[1]); 863 close(pd); 864 865 // Wait for a byte back in the other direction. 866 int value; 867 if (verbose) fprintf(stderr, " NSInit: block waiting for value\n"); 868 read(shared_sock_fds[1], &value, sizeof(value)); 869 870 if (verbose) fprintf(stderr, " NSInit: return 0\n"); 871 return 0; 872 } 873 874 TEST(Linux, DeadNSInit) { 875 REQUIRE_ROOT(); 876 877 // Prepare sockets to communicate with child process. 878 EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds)); 879 880 // Clone into a child process with a new pid namespace. 881 pid_t child = clone(NSInit, child_stack + STACK_SIZE, 882 CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL); 883 usleep(10000); 884 EXPECT_OK(child); 885 EXPECT_PID_ALIVE(child); 886 if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child)); 887 888 // Get the process descriptor of the child-of-child via socket transfer. 889 int grandchild_pd = ReceiveFD(shared_sock_fds[0]); 890 pid_t grandchild; 891 EXPECT_OK(pdgetpid(grandchild_pd, &grandchild)); 892 if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild)); 893 894 // Send an int to the child to trigger its termination. Grandchild should also 895 // go, as its init process is gone. 896 int zero = 0; 897 if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n"); 898 write(shared_sock_fds[0], &zero, sizeof(zero)); 899 EXPECT_PID_ZOMBIE(child); 900 EXPECT_PID_GONE(grandchild); 901 902 // Wait for the child. 903 int status; 904 EXPECT_EQ(child, waitpid(child, &status, WNOHANG)); 905 int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 906 EXPECT_EQ(0, rc); 907 EXPECT_PID_GONE(child); 908 909 close(shared_sock_fds[0]); 910 close(shared_sock_fds[1]); 911 close(grandchild_pd); 912 913 if (verbose) { 914 fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child)); 915 fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild)); 916 } 917 } 918 919 TEST(Linux, DeadNSInit2) { 920 REQUIRE_ROOT(); 921 922 // Prepare sockets to communicate with child process. 923 EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds)); 924 925 // Clone into a child process with a new pid namespace. 926 pid_t child = clone(NSInit, child_stack + STACK_SIZE, 927 CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL); 928 usleep(10000); 929 EXPECT_OK(child); 930 EXPECT_PID_ALIVE(child); 931 if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child)); 932 933 // Get the process descriptor of the child-of-child via socket transfer. 934 int grandchild_pd = ReceiveFD(shared_sock_fds[0]); 935 pid_t grandchild; 936 EXPECT_OK(pdgetpid(grandchild_pd, &grandchild)); 937 if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild)); 938 939 // Kill the grandchild 940 EXPECT_OK(pdkill(grandchild_pd, SIGINT)); 941 usleep(10000); 942 EXPECT_PID_ZOMBIE(grandchild); 943 // Close the process descriptor, so there are now no procdesc references to grandchild. 944 close(grandchild_pd); 945 946 // Send an int to the child to trigger its termination. Grandchild should also 947 // go, as its init process is gone. 948 int zero = 0; 949 if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n"); 950 write(shared_sock_fds[0], &zero, sizeof(zero)); 951 EXPECT_PID_ZOMBIE(child); 952 EXPECT_PID_GONE(grandchild); 953 954 // Wait for the child. 955 int status; 956 EXPECT_EQ(child, waitpid(child, &status, WNOHANG)); 957 int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 958 EXPECT_EQ(0, rc); 959 960 close(shared_sock_fds[0]); 961 close(shared_sock_fds[1]); 962 963 if (verbose) { 964 fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child)); 965 fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild)); 966 } 967 } 968 969 #ifdef __x86_64__ 970 FORK_TEST(Linux, CheckHighWord) { 971 EXPECT_OK(cap_enter()); // Enter capability mode. 972 973 int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); 974 EXPECT_OK(rc); 975 EXPECT_EQ(1, rc); // no_new_privs = 1 976 977 // Set some of the high 32-bits of argument zero. 978 uint64_t big_cmd = PR_GET_NO_NEW_PRIVS | 0x100000000LL; 979 EXPECT_CAPMODE(syscall(__NR_prctl, big_cmd, 0, 0, 0, 0)); 980 } 981 #endif 982 983 FORK_TEST(Linux, PrctlOpenatBeneath) { 984 // Set no_new_privs = 1 985 EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); 986 int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); 987 EXPECT_OK(rc); 988 EXPECT_EQ(1, rc); // no_new_privs = 1 989 990 // Set openat-beneath mode 991 EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 1, 0, 0, 0)); 992 rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0); 993 EXPECT_OK(rc); 994 EXPECT_EQ(1, rc); // openat_beneath = 1 995 996 // Clear openat-beneath mode 997 EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0)); 998 rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0); 999 EXPECT_OK(rc); 1000 EXPECT_EQ(0, rc); // openat_beneath = 0 1001 1002 EXPECT_OK(cap_enter()); // Enter capability mode 1003 1004 // Expect to be in openat_beneath mode 1005 rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0); 1006 EXPECT_OK(rc); 1007 EXPECT_EQ(1, rc); // openat_beneath = 1 1008 1009 // Expect this to be immutable. 1010 EXPECT_CAPMODE(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0)); 1011 rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0); 1012 EXPECT_OK(rc); 1013 EXPECT_EQ(1, rc); // openat_beneath = 1 1014 1015 } 1016 1017 FORK_TEST(Linux, NoNewPrivs) { 1018 if (getuid() == 0) { 1019 // If root, drop CAP_SYS_ADMIN POSIX.1e capability. 1020 struct __user_cap_header_struct hdr; 1021 hdr.version = _LINUX_CAPABILITY_VERSION_3; 1022 hdr.pid = getpid_(); 1023 struct __user_cap_data_struct data[3]; 1024 EXPECT_OK(capget(&hdr, &data[0])); 1025 data[0].effective &= ~(1 << CAP_SYS_ADMIN); 1026 data[0].permitted &= ~(1 << CAP_SYS_ADMIN); 1027 data[0].inheritable &= ~(1 << CAP_SYS_ADMIN); 1028 EXPECT_OK(capset(&hdr, &data[0])); 1029 } 1030 int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); 1031 EXPECT_OK(rc); 1032 EXPECT_EQ(0, rc); // no_new_privs == 0 1033 1034 // Can't enter seccomp-bpf mode with no_new_privs == 0 1035 struct sock_filter filter[] = { 1036 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) 1037 }; 1038 struct sock_fprog bpf; 1039 bpf.len = (sizeof(filter) / sizeof(filter[0])); 1040 bpf.filter = filter; 1041 rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0); 1042 EXPECT_EQ(-1, rc); 1043 EXPECT_EQ(EACCES, errno); 1044 1045 // Set no_new_privs = 1 1046 EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); 1047 rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); 1048 EXPECT_OK(rc); 1049 EXPECT_EQ(1, rc); // no_new_privs = 1 1050 1051 // Can now turn on seccomp mode 1052 EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0)); 1053 } 1054 1055 /* Macros for BPF generation */ 1056 #define BPF_RETURN_ERRNO(err) \ 1057 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | (err & 0xFFFF)) 1058 #define BPF_KILL_PROCESS \ 1059 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) 1060 #define BPF_ALLOW \ 1061 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) 1062 #define EXAMINE_SYSCALL \ 1063 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)) 1064 #define ALLOW_SYSCALL(name) \ 1065 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \ 1066 BPF_ALLOW 1067 #define KILL_SYSCALL(name) \ 1068 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \ 1069 BPF_KILL_PROCESS 1070 #define FAIL_SYSCALL(name, err) \ 1071 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \ 1072 BPF_RETURN_ERRNO(err) 1073 1074 TEST(Linux, CapModeWithBPF) { 1075 pid_t child = fork(); 1076 EXPECT_OK(child); 1077 if (child == 0) { 1078 int fd = open(TmpFile("cap_bpf_capmode"), O_CREAT|O_RDWR, 0644); 1079 cap_rights_t rights; 1080 cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC); 1081 EXPECT_OK(cap_rights_limit(fd, &rights)); 1082 1083 struct sock_filter filter[] = { EXAMINE_SYSCALL, 1084 FAIL_SYSCALL(fchmod, ENOMEM), 1085 FAIL_SYSCALL(fstat, ENOEXEC), 1086 ALLOW_SYSCALL(close), 1087 KILL_SYSCALL(fsync), 1088 BPF_ALLOW }; 1089 struct sock_fprog bpf = {.len = (sizeof(filter) / sizeof(filter[0])), 1090 .filter = filter}; 1091 // Set up seccomp-bpf first. 1092 EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); 1093 EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0)); 1094 1095 EXPECT_OK(cap_enter()); // Enter capability mode. 1096 1097 // fchmod is allowed by Capsicum, but failed by BPF. 1098 EXPECT_SYSCALL_FAIL(ENOMEM, fchmod(fd, 0644)); 1099 // open is allowed by BPF, but failed by Capsicum 1100 EXPECT_SYSCALL_FAIL(ECAPMODE, open(TmpFile("cap_bpf_capmode"), O_RDONLY)); 1101 // fstat is failed by both BPF and Capsicum; tie-break is on errno 1102 struct stat buf; 1103 EXPECT_SYSCALL_FAIL(ENOEXEC, fstat(fd, &buf)); 1104 // fsync is allowed by Capsicum, but BPF's SIGSYS generation take precedence 1105 fsync(fd); // terminate with unhandled SIGSYS 1106 exit(0); 1107 } 1108 int status; 1109 EXPECT_EQ(child, waitpid(child, &status, 0)); 1110 EXPECT_TRUE(WIFSIGNALED(status)); 1111 EXPECT_EQ(SIGSYS, WTERMSIG(status)); 1112 unlink(TmpFile("cap_bpf_capmode")); 1113 } 1114 1115 TEST(Linux, AIO) { 1116 int fd = open(TmpFile("cap_aio"), O_CREAT|O_RDWR, 0644); 1117 EXPECT_OK(fd); 1118 1119 cap_rights_t r_rs; 1120 cap_rights_init(&r_rs, CAP_READ, CAP_SEEK); 1121 cap_rights_t r_ws; 1122 cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK); 1123 cap_rights_t r_rwssync; 1124 cap_rights_init(&r_rwssync, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC); 1125 1126 int cap_ro = dup(fd); 1127 EXPECT_OK(cap_ro); 1128 EXPECT_OK(cap_rights_limit(cap_ro, &r_rs)); 1129 EXPECT_OK(cap_ro); 1130 int cap_wo = dup(fd); 1131 EXPECT_OK(cap_wo); 1132 EXPECT_OK(cap_rights_limit(cap_wo, &r_ws)); 1133 EXPECT_OK(cap_wo); 1134 int cap_all = dup(fd); 1135 EXPECT_OK(cap_all); 1136 EXPECT_OK(cap_rights_limit(cap_all, &r_rwssync)); 1137 EXPECT_OK(cap_all); 1138 1139 // Linux: io_setup, io_submit, io_getevents, io_cancel, io_destroy 1140 aio_context_t ctx = 0; 1141 EXPECT_OK(syscall(__NR_io_setup, 10, &ctx)); 1142 1143 unsigned char buffer[32] = {1, 2, 3, 4}; 1144 struct iocb req; 1145 memset(&req, 0, sizeof(req)); 1146 req.aio_reqprio = 0; 1147 req.aio_fildes = fd; 1148 uintptr_t bufaddr = (uintptr_t)buffer; 1149 req.aio_buf = (__u64)bufaddr; 1150 req.aio_nbytes = 4; 1151 req.aio_offset = 0; 1152 struct iocb* reqs[1] = {&req}; 1153 1154 // Write operation 1155 req.aio_lio_opcode = IOCB_CMD_PWRITE; 1156 req.aio_fildes = cap_ro; 1157 EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1158 req.aio_fildes = cap_wo; 1159 EXPECT_OK(syscall(__NR_io_submit, ctx, 1, reqs)); 1160 1161 // Sync operation 1162 req.aio_lio_opcode = IOCB_CMD_FSYNC; 1163 EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1164 req.aio_lio_opcode = IOCB_CMD_FDSYNC; 1165 EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1166 // Even with CAP_FSYNC, turns out fsync/fdsync aren't implemented 1167 req.aio_fildes = cap_all; 1168 EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1169 req.aio_lio_opcode = IOCB_CMD_FSYNC; 1170 EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1171 1172 // Read operation 1173 req.aio_lio_opcode = IOCB_CMD_PREAD; 1174 req.aio_fildes = cap_wo; 1175 EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs)); 1176 req.aio_fildes = cap_ro; 1177 EXPECT_OK(syscall(__NR_io_submit, ctx, 1, reqs)); 1178 1179 EXPECT_OK(syscall(__NR_io_destroy, ctx)); 1180 1181 close(cap_all); 1182 close(cap_wo); 1183 close(cap_ro); 1184 close(fd); 1185 unlink(TmpFile("cap_aio")); 1186 } 1187 1188 #ifndef KCMP_FILE 1189 #define KCMP_FILE 0 1190 #endif 1191 TEST(Linux, Kcmp) { 1192 // This requires CONFIG_CHECKPOINT_RESTORE in kernel config. 1193 int fd = open("/etc/passwd", O_RDONLY); 1194 EXPECT_OK(fd); 1195 pid_t parent = getpid_(); 1196 1197 errno = 0; 1198 int rc = syscall(__NR_kcmp, parent, parent, KCMP_FILE, fd, fd); 1199 if (rc == -1 && errno == ENOSYS) { 1200 TEST_SKIPPED("kcmp(2) gives -ENOSYS"); 1201 return; 1202 } 1203 1204 pid_t child = fork(); 1205 if (child == 0) { 1206 // Child: limit rights on FD. 1207 child = getpid_(); 1208 EXPECT_OK(syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd)); 1209 cap_rights_t rights; 1210 cap_rights_init(&rights, CAP_READ, CAP_WRITE); 1211 EXPECT_OK(cap_rights_limit(fd, &rights)); 1212 // A capability wrapping a normal FD is different (from a kcmp(2) perspective) 1213 // than the original file. 1214 EXPECT_NE(0, syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd)); 1215 exit(HasFailure()); 1216 } 1217 // Wait for the child. 1218 int status; 1219 EXPECT_EQ(child, waitpid(child, &status, 0)); 1220 rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1; 1221 EXPECT_EQ(0, rc); 1222 1223 close(fd); 1224 } 1225 1226 TEST(Linux, ProcFS) { 1227 cap_rights_t rights; 1228 cap_rights_init(&rights, CAP_READ, CAP_SEEK); 1229 int fd = open("/etc/passwd", O_RDONLY); 1230 EXPECT_OK(fd); 1231 lseek(fd, 4, SEEK_SET); 1232 int cap = dup(fd); 1233 EXPECT_OK(cap); 1234 EXPECT_OK(cap_rights_limit(cap, &rights)); 1235 pid_t me = getpid_(); 1236 1237 char buffer[1024]; 1238 sprintf(buffer, "/proc/%d/fdinfo/%d", me, cap); 1239 int procfd = open(buffer, O_RDONLY); 1240 EXPECT_OK(procfd) << " failed to open " << buffer; 1241 if (procfd < 0) return; 1242 int proccap = dup(procfd); 1243 EXPECT_OK(proccap); 1244 EXPECT_OK(cap_rights_limit(proccap, &rights)); 1245 1246 EXPECT_OK(read(proccap, buffer, sizeof(buffer))); 1247 // The fdinfo should include the file pos of the underlying file 1248 EXPECT_NE((char*)NULL, strstr(buffer, "pos:\t4")); 1249 // ...and the rights of the Capsicum capability. 1250 EXPECT_NE((char*)NULL, strstr(buffer, "rights:\t0x")); 1251 1252 close(procfd); 1253 close(proccap); 1254 close(cap); 1255 close(fd); 1256 } 1257 1258 FORK_TEST(Linux, ProcessClocks) { 1259 pid_t self = getpid_(); 1260 pid_t child = fork(); 1261 EXPECT_OK(child); 1262 if (child == 0) { 1263 child = getpid_(); 1264 usleep(100000); 1265 exit(0); 1266 } 1267 1268 EXPECT_OK(cap_enter()); // Enter capability mode. 1269 1270 // Nefariously build a clock ID for the child's CPU time. 1271 // This relies on knowledge of the internal layout of clock IDs. 1272 clockid_t child_clock; 1273 child_clock = ((~child) << 3) | 0x0; 1274 struct timespec ts; 1275 memset(&ts, 0, sizeof(ts)); 1276 1277 // TODO(drysdale): Should not be possible to retrieve info about a 1278 // different process, as the PID global namespace should be locked 1279 // down. 1280 EXPECT_OK(clock_gettime(child_clock, &ts)); 1281 if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(child=%d->0x%08x) is %ld.%09ld \n", 1282 self, child, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec); 1283 1284 child_clock = ((~1) << 3) | 0x0; 1285 memset(&ts, 0, sizeof(ts)); 1286 EXPECT_OK(clock_gettime(child_clock, &ts)); 1287 if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(init=1->0x%08x) is %ld.%09ld \n", 1288 self, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec); 1289 1290 // Orphan the child. 1291 } 1292 1293 TEST(Linux, SetLease) { 1294 int fd_all = open(TmpFile("cap_lease"), O_CREAT|O_RDWR, 0644); 1295 EXPECT_OK(fd_all); 1296 int fd_rw = dup(fd_all); 1297 EXPECT_OK(fd_rw); 1298 1299 cap_rights_t r_all; 1300 cap_rights_init(&r_all, CAP_READ, CAP_WRITE, CAP_FLOCK, CAP_FSIGNAL); 1301 EXPECT_OK(cap_rights_limit(fd_all, &r_all)); 1302 1303 cap_rights_t r_rw; 1304 cap_rights_init(&r_rw, CAP_READ, CAP_WRITE); 1305 EXPECT_OK(cap_rights_limit(fd_rw, &r_rw)); 1306 1307 EXPECT_NOTCAPABLE(fcntl(fd_rw, F_SETLEASE, F_WRLCK)); 1308 EXPECT_NOTCAPABLE(fcntl(fd_rw, F_GETLEASE)); 1309 1310 if (!tmpdir_on_tmpfs) { // tmpfs doesn't support leases 1311 EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_WRLCK)); 1312 EXPECT_EQ(F_WRLCK, fcntl(fd_all, F_GETLEASE)); 1313 1314 EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_UNLCK, 0)); 1315 EXPECT_EQ(F_UNLCK, fcntl(fd_all, F_GETLEASE)); 1316 } 1317 close(fd_all); 1318 close(fd_rw); 1319 unlink(TmpFile("cap_lease")); 1320 } 1321 1322 TEST(Linux, InvalidRightsSyscall) { 1323 int fd = open(TmpFile("cap_invalid_rights"), O_RDONLY|O_CREAT, 0644); 1324 EXPECT_OK(fd); 1325 1326 cap_rights_t rights; 1327 cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FCHMOD, CAP_FSTAT); 1328 1329 // Use the raw syscall throughout. 1330 EXPECT_EQ(0, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0)); 1331 1332 // Directly access the syscall, and find all unseemly manner of use for it. 1333 // - Invalid flags 1334 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 1)); 1335 EXPECT_EQ(EINVAL, errno); 1336 // - Specify an fcntl subright, but no CAP_FCNTL set 1337 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, CAP_FCNTL_GETFL, 0, NULL, 0)); 1338 EXPECT_EQ(EINVAL, errno); 1339 // - Specify an ioctl subright, but no CAP_IOCTL set 1340 unsigned int ioctl1 = 1; 1341 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, &ioctl1, 0)); 1342 EXPECT_EQ(EINVAL, errno); 1343 // - N ioctls, but null pointer passed 1344 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, NULL, 0)); 1345 EXPECT_EQ(EINVAL, errno); 1346 // - Invalid nioctls 1347 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, -2, NULL, 0)); 1348 EXPECT_EQ(EINVAL, errno); 1349 // - Null primary rights 1350 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, NULL, 0, 0, NULL, 0)); 1351 EXPECT_EQ(EFAULT, errno); 1352 // - Invalid index bitmask 1353 rights.cr_rights[0] |= 3ULL << 57; 1354 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0)); 1355 EXPECT_EQ(EINVAL, errno); 1356 // - Invalid version 1357 rights.cr_rights[0] |= 2ULL << 62; 1358 EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0)); 1359 EXPECT_EQ(EINVAL, errno); 1360 1361 close(fd); 1362 unlink(TmpFile("cap_invalid_rights")); 1363 } 1364 1365 FORK_TEST_ON(Linux, OpenByHandleAt, TmpFile("cap_openbyhandle_testfile")) { 1366 REQUIRE_ROOT(); 1367 int dir = open(tmpdir.c_str(), O_RDONLY); 1368 EXPECT_OK(dir); 1369 int fd = openat(dir, "cap_openbyhandle_testfile", O_RDWR|O_CREAT, 0644); 1370 EXPECT_OK(fd); 1371 const char* message = "Saved text"; 1372 EXPECT_OK(write(fd, message, strlen(message))); 1373 close(fd); 1374 1375 struct file_handle* fhandle = (struct file_handle*)malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); 1376 fhandle->handle_bytes = MAX_HANDLE_SZ; 1377 int mount_id; 1378 EXPECT_OK(name_to_handle_at(dir, "cap_openbyhandle_testfile", fhandle, &mount_id, 0)); 1379 1380 fd = open_by_handle_at(dir, fhandle, O_RDONLY); 1381 EXPECT_OK(fd); 1382 char buffer[200]; 1383 EXPECT_OK(read(fd, buffer, 199)); 1384 EXPECT_EQ(std::string(message), std::string(buffer)); 1385 close(fd); 1386 1387 // Cannot issue open_by_handle_at after entering capability mode. 1388 cap_enter(); 1389 EXPECT_CAPMODE(open_by_handle_at(dir, fhandle, O_RDONLY)); 1390 1391 close(dir); 1392 } 1393 1394 int getrandom_(void *buf, size_t buflen, unsigned int flags) { 1395 #ifdef __NR_getrandom 1396 return syscall(__NR_getrandom, buf, buflen, flags); 1397 #else 1398 errno = ENOSYS; 1399 return -1; 1400 #endif 1401 } 1402 1403 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) 1404 #include <linux/random.h> // Requires 3.17 kernel 1405 FORK_TEST(Linux, GetRandom) { 1406 EXPECT_OK(cap_enter()); 1407 unsigned char buffer[1024]; 1408 unsigned char buffer2[1024]; 1409 EXPECT_OK(getrandom_(buffer, sizeof(buffer), GRND_NONBLOCK)); 1410 EXPECT_OK(getrandom_(buffer2, sizeof(buffer2), GRND_NONBLOCK)); 1411 EXPECT_NE(0, memcmp(buffer, buffer2, sizeof(buffer))); 1412 } 1413 #endif 1414 1415 int memfd_create_(const char *name, unsigned int flags) { 1416 #ifdef __NR_memfd_create 1417 return syscall(__NR_memfd_create, name, flags); 1418 #else 1419 errno = ENOSYS; 1420 return -1; 1421 #endif 1422 } 1423 1424 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) 1425 #include <linux/memfd.h> // Requires 3.17 kernel 1426 TEST(Linux, MemFDDeathTest) { 1427 int memfd = memfd_create_("capsicum-test", MFD_ALLOW_SEALING); 1428 if (memfd == -1 && errno == ENOSYS) { 1429 TEST_SKIPPED("memfd_create(2) gives -ENOSYS"); 1430 return; 1431 } 1432 const int LEN = 16; 1433 EXPECT_OK(ftruncate(memfd, LEN)); 1434 int memfd_ro = dup(memfd); 1435 int memfd_rw = dup(memfd); 1436 EXPECT_OK(memfd_ro); 1437 EXPECT_OK(memfd_rw); 1438 cap_rights_t rights; 1439 EXPECT_OK(cap_rights_limit(memfd_ro, cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT))); 1440 EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW, CAP_FCHMOD))); 1441 1442 unsigned char *p_ro = (unsigned char *)mmap(NULL, LEN, PROT_READ, MAP_SHARED, memfd_ro, 0); 1443 EXPECT_NE((unsigned char *)MAP_FAILED, p_ro); 1444 unsigned char *p_rw = (unsigned char *)mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_rw, 0); 1445 EXPECT_NE((unsigned char *)MAP_FAILED, p_rw); 1446 EXPECT_EQ(MAP_FAILED, 1447 mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_ro, 0)); 1448 1449 *p_rw = 42; 1450 EXPECT_EQ(42, *p_ro); 1451 EXPECT_DEATH(*p_ro = 42, ""); 1452 1453 #ifndef F_ADD_SEALS 1454 // Hack for when libc6 does not yet include the updated linux/fcntl.h from kernel 3.17 1455 #define _F_LINUX_SPECIFIC_BASE F_SETLEASE 1456 #define F_ADD_SEALS (_F_LINUX_SPECIFIC_BASE + 9) 1457 #define F_GET_SEALS (_F_LINUX_SPECIFIC_BASE + 10) 1458 #define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ 1459 #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ 1460 #define F_SEAL_GROW 0x0004 /* prevent file from growing */ 1461 #define F_SEAL_WRITE 0x0008 /* prevent writes */ 1462 #endif 1463 1464 // Reading the seal information requires CAP_FSTAT. 1465 int seals = fcntl(memfd, F_GET_SEALS); 1466 EXPECT_OK(seals); 1467 if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals); 1468 int seals_ro = fcntl(memfd_ro, F_GET_SEALS); 1469 EXPECT_EQ(seals, seals_ro); 1470 if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro); 1471 int seals_rw = fcntl(memfd_rw, F_GET_SEALS); 1472 EXPECT_NOTCAPABLE(seals_rw); 1473 1474 // Fail to seal as a writable mapping exists. 1475 EXPECT_EQ(-1, fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE)); 1476 EXPECT_EQ(EBUSY, errno); 1477 *p_rw = 42; 1478 1479 // Seal the rw version; need to unmap first. 1480 munmap(p_rw, LEN); 1481 munmap(p_ro, LEN); 1482 EXPECT_OK(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE)); 1483 1484 seals = fcntl(memfd, F_GET_SEALS); 1485 EXPECT_OK(seals); 1486 if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals); 1487 seals_ro = fcntl(memfd_ro, F_GET_SEALS); 1488 EXPECT_EQ(seals, seals_ro); 1489 if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro); 1490 1491 // Remove the CAP_FCHMOD right, can no longer add seals. 1492 EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW))); 1493 EXPECT_NOTCAPABLE(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE)); 1494 1495 close(memfd); 1496 close(memfd_ro); 1497 close(memfd_rw); 1498 } 1499 #endif 1500 1501 #else 1502 void noop() {} 1503 #endif 1504