1 /* SPDX-License-Identifier: GPL-2.0 */ 2 3 #define _GNU_SOURCE 4 5 #include <errno.h> 6 #include <fcntl.h> 7 #include <linux/limits.h> 8 #include <poll.h> 9 #include <signal.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <sys/inotify.h> 14 #include <sys/stat.h> 15 #include <sys/types.h> 16 #include <sys/wait.h> 17 #include <unistd.h> 18 19 #include "cgroup_util.h" 20 #include "../clone3/clone3_selftests.h" 21 22 /* Returns read len on success, or -errno on failure. */ 23 static ssize_t read_text(const char *path, char *buf, size_t max_len) 24 { 25 ssize_t len; 26 int fd; 27 28 fd = open(path, O_RDONLY); 29 if (fd < 0) 30 return -errno; 31 32 len = read(fd, buf, max_len - 1); 33 34 if (len >= 0) 35 buf[len] = 0; 36 37 close(fd); 38 return len < 0 ? -errno : len; 39 } 40 41 /* Returns written len on success, or -errno on failure. */ 42 static ssize_t write_text(const char *path, char *buf, ssize_t len) 43 { 44 int fd; 45 46 fd = open(path, O_WRONLY | O_APPEND); 47 if (fd < 0) 48 return -errno; 49 50 len = write(fd, buf, len); 51 close(fd); 52 return len < 0 ? -errno : len; 53 } 54 55 char *cg_name(const char *root, const char *name) 56 { 57 size_t len = strlen(root) + strlen(name) + 2; 58 char *ret = malloc(len); 59 60 snprintf(ret, len, "%s/%s", root, name); 61 62 return ret; 63 } 64 65 char *cg_name_indexed(const char *root, const char *name, int index) 66 { 67 size_t len = strlen(root) + strlen(name) + 10; 68 char *ret = malloc(len); 69 70 snprintf(ret, len, "%s/%s_%d", root, name, index); 71 72 return ret; 73 } 74 75 char *cg_control(const char *cgroup, const char *control) 76 { 77 size_t len = strlen(cgroup) + strlen(control) + 2; 78 char *ret = malloc(len); 79 80 snprintf(ret, len, "%s/%s", cgroup, control); 81 82 return ret; 83 } 84 85 /* Returns 0 on success, or -errno on failure. */ 86 int cg_read(const char *cgroup, const char *control, char *buf, size_t len) 87 { 88 char path[PATH_MAX]; 89 ssize_t ret; 90 91 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 92 93 ret = read_text(path, buf, len); 94 return ret >= 0 ? 0 : ret; 95 } 96 97 int cg_read_strcmp(const char *cgroup, const char *control, 98 const char *expected) 99 { 100 size_t size; 101 char *buf; 102 int ret; 103 104 /* Handle the case of comparing against empty string */ 105 if (!expected) 106 return -1; 107 else 108 size = strlen(expected) + 1; 109 110 buf = malloc(size); 111 if (!buf) 112 return -1; 113 114 if (cg_read(cgroup, control, buf, size)) { 115 free(buf); 116 return -1; 117 } 118 119 ret = strcmp(expected, buf); 120 free(buf); 121 return ret; 122 } 123 124 int cg_read_strstr(const char *cgroup, const char *control, const char *needle) 125 { 126 char buf[PAGE_SIZE]; 127 128 if (cg_read(cgroup, control, buf, sizeof(buf))) 129 return -1; 130 131 return strstr(buf, needle) ? 0 : -1; 132 } 133 134 long cg_read_long(const char *cgroup, const char *control) 135 { 136 char buf[128]; 137 138 if (cg_read(cgroup, control, buf, sizeof(buf))) 139 return -1; 140 141 return atol(buf); 142 } 143 144 long cg_read_long_fd(int fd) 145 { 146 char buf[128]; 147 148 if (pread(fd, buf, sizeof(buf), 0) <= 0) 149 return -1; 150 151 return atol(buf); 152 } 153 154 long cg_read_key_long(const char *cgroup, const char *control, const char *key) 155 { 156 char buf[PAGE_SIZE]; 157 char *ptr; 158 159 if (cg_read(cgroup, control, buf, sizeof(buf))) 160 return -1; 161 162 ptr = strstr(buf, key); 163 if (!ptr) 164 return -1; 165 166 return atol(ptr + strlen(key)); 167 } 168 169 long cg_read_lc(const char *cgroup, const char *control) 170 { 171 char buf[PAGE_SIZE]; 172 const char delim[] = "\n"; 173 char *line; 174 long cnt = 0; 175 176 if (cg_read(cgroup, control, buf, sizeof(buf))) 177 return -1; 178 179 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 180 cnt++; 181 182 return cnt; 183 } 184 185 /* Returns 0 on success, or -errno on failure. */ 186 int cg_write(const char *cgroup, const char *control, char *buf) 187 { 188 char path[PATH_MAX]; 189 ssize_t len = strlen(buf), ret; 190 191 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 192 ret = write_text(path, buf, len); 193 return ret == len ? 0 : ret; 194 } 195 196 /* 197 * Returns fd on success, or -1 on failure. 198 * (fd should be closed with close() as usual) 199 */ 200 int cg_open(const char *cgroup, const char *control, int flags) 201 { 202 char path[PATH_MAX]; 203 204 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 205 return open(path, flags); 206 } 207 208 int cg_write_numeric(const char *cgroup, const char *control, long value) 209 { 210 char buf[64]; 211 int ret; 212 213 ret = sprintf(buf, "%lu", value); 214 if (ret < 0) 215 return ret; 216 217 return cg_write(cgroup, control, buf); 218 } 219 220 int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) 221 { 222 char buf[10 * PAGE_SIZE]; 223 char *fs, *mount, *type, *options; 224 const char delim[] = "\n\t "; 225 226 if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) 227 return -1; 228 229 /* 230 * Example: 231 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0 232 */ 233 for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { 234 mount = strtok(NULL, delim); 235 type = strtok(NULL, delim); 236 options = strtok(NULL, delim); 237 strtok(NULL, delim); 238 strtok(NULL, delim); 239 240 if (strcmp(type, "cgroup2") == 0) { 241 strncpy(root, mount, len); 242 if (nsdelegate) 243 *nsdelegate = !!strstr(options, "nsdelegate"); 244 return 0; 245 } 246 } 247 248 return -1; 249 } 250 251 int cg_create(const char *cgroup) 252 { 253 return mkdir(cgroup, 0755); 254 } 255 256 int cg_wait_for_proc_count(const char *cgroup, int count) 257 { 258 char buf[10 * PAGE_SIZE] = {0}; 259 int attempts; 260 char *ptr; 261 262 for (attempts = 10; attempts >= 0; attempts--) { 263 int nr = 0; 264 265 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 266 break; 267 268 for (ptr = buf; *ptr; ptr++) 269 if (*ptr == '\n') 270 nr++; 271 272 if (nr >= count) 273 return 0; 274 275 usleep(100000); 276 } 277 278 return -1; 279 } 280 281 int cg_killall(const char *cgroup) 282 { 283 char buf[PAGE_SIZE]; 284 char *ptr = buf; 285 286 /* If cgroup.kill exists use it. */ 287 if (!cg_write(cgroup, "cgroup.kill", "1")) 288 return 0; 289 290 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 291 return -1; 292 293 while (ptr < buf + sizeof(buf)) { 294 int pid = strtol(ptr, &ptr, 10); 295 296 if (pid == 0) 297 break; 298 if (*ptr) 299 ptr++; 300 else 301 break; 302 if (kill(pid, SIGKILL)) 303 return -1; 304 } 305 306 return 0; 307 } 308 309 int cg_destroy(const char *cgroup) 310 { 311 int ret; 312 313 if (!cgroup) 314 return 0; 315 retry: 316 ret = rmdir(cgroup); 317 if (ret && errno == EBUSY) { 318 cg_killall(cgroup); 319 usleep(100); 320 goto retry; 321 } 322 323 if (ret && errno == ENOENT) 324 ret = 0; 325 326 return ret; 327 } 328 329 int cg_enter(const char *cgroup, int pid) 330 { 331 char pidbuf[64]; 332 333 snprintf(pidbuf, sizeof(pidbuf), "%d", pid); 334 return cg_write(cgroup, "cgroup.procs", pidbuf); 335 } 336 337 int cg_enter_current(const char *cgroup) 338 { 339 return cg_write(cgroup, "cgroup.procs", "0"); 340 } 341 342 int cg_enter_current_thread(const char *cgroup) 343 { 344 return cg_write(cgroup, "cgroup.threads", "0"); 345 } 346 347 int cg_run(const char *cgroup, 348 int (*fn)(const char *cgroup, void *arg), 349 void *arg) 350 { 351 int pid, retcode; 352 353 pid = fork(); 354 if (pid < 0) { 355 return pid; 356 } else if (pid == 0) { 357 char buf[64]; 358 359 snprintf(buf, sizeof(buf), "%d", getpid()); 360 if (cg_write(cgroup, "cgroup.procs", buf)) 361 exit(EXIT_FAILURE); 362 exit(fn(cgroup, arg)); 363 } else { 364 waitpid(pid, &retcode, 0); 365 if (WIFEXITED(retcode)) 366 return WEXITSTATUS(retcode); 367 else 368 return -1; 369 } 370 } 371 372 pid_t clone_into_cgroup(int cgroup_fd) 373 { 374 #ifdef CLONE_ARGS_SIZE_VER2 375 pid_t pid; 376 377 struct __clone_args args = { 378 .flags = CLONE_INTO_CGROUP, 379 .exit_signal = SIGCHLD, 380 .cgroup = cgroup_fd, 381 }; 382 383 pid = sys_clone3(&args, sizeof(struct __clone_args)); 384 /* 385 * Verify that this is a genuine test failure: 386 * ENOSYS -> clone3() not available 387 * E2BIG -> CLONE_INTO_CGROUP not available 388 */ 389 if (pid < 0 && (errno == ENOSYS || errno == E2BIG)) 390 goto pretend_enosys; 391 392 return pid; 393 394 pretend_enosys: 395 #endif 396 errno = ENOSYS; 397 return -ENOSYS; 398 } 399 400 int clone_reap(pid_t pid, int options) 401 { 402 int ret; 403 siginfo_t info = { 404 .si_signo = 0, 405 }; 406 407 again: 408 ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD); 409 if (ret < 0) { 410 if (errno == EINTR) 411 goto again; 412 return -1; 413 } 414 415 if (options & WEXITED) { 416 if (WIFEXITED(info.si_status)) 417 return WEXITSTATUS(info.si_status); 418 } 419 420 if (options & WSTOPPED) { 421 if (WIFSTOPPED(info.si_status)) 422 return WSTOPSIG(info.si_status); 423 } 424 425 if (options & WCONTINUED) { 426 if (WIFCONTINUED(info.si_status)) 427 return 0; 428 } 429 430 return -1; 431 } 432 433 int dirfd_open_opath(const char *dir) 434 { 435 return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 436 } 437 438 #define close_prot_errno(fd) \ 439 if (fd >= 0) { \ 440 int _e_ = errno; \ 441 close(fd); \ 442 errno = _e_; \ 443 } 444 445 static int clone_into_cgroup_run_nowait(const char *cgroup, 446 int (*fn)(const char *cgroup, void *arg), 447 void *arg) 448 { 449 int cgroup_fd; 450 pid_t pid; 451 452 cgroup_fd = dirfd_open_opath(cgroup); 453 if (cgroup_fd < 0) 454 return -1; 455 456 pid = clone_into_cgroup(cgroup_fd); 457 close_prot_errno(cgroup_fd); 458 if (pid == 0) 459 exit(fn(cgroup, arg)); 460 461 return pid; 462 } 463 464 int cg_run_nowait(const char *cgroup, 465 int (*fn)(const char *cgroup, void *arg), 466 void *arg) 467 { 468 int pid; 469 470 pid = clone_into_cgroup_run_nowait(cgroup, fn, arg); 471 if (pid > 0) 472 return pid; 473 474 /* Genuine test failure. */ 475 if (pid < 0 && errno != ENOSYS) 476 return -1; 477 478 pid = fork(); 479 if (pid == 0) { 480 char buf[64]; 481 482 snprintf(buf, sizeof(buf), "%d", getpid()); 483 if (cg_write(cgroup, "cgroup.procs", buf)) 484 exit(EXIT_FAILURE); 485 exit(fn(cgroup, arg)); 486 } 487 488 return pid; 489 } 490 491 int get_temp_fd(void) 492 { 493 return open(".", O_TMPFILE | O_RDWR | O_EXCL); 494 } 495 496 int alloc_pagecache(int fd, size_t size) 497 { 498 char buf[PAGE_SIZE]; 499 struct stat st; 500 int i; 501 502 if (fstat(fd, &st)) 503 goto cleanup; 504 505 size += st.st_size; 506 507 if (ftruncate(fd, size)) 508 goto cleanup; 509 510 for (i = 0; i < size; i += sizeof(buf)) 511 read(fd, buf, sizeof(buf)); 512 513 return 0; 514 515 cleanup: 516 return -1; 517 } 518 519 int alloc_anon(const char *cgroup, void *arg) 520 { 521 size_t size = (unsigned long)arg; 522 char *buf, *ptr; 523 524 buf = malloc(size); 525 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 526 *ptr = 0; 527 528 free(buf); 529 return 0; 530 } 531 532 int is_swap_enabled(void) 533 { 534 char buf[PAGE_SIZE]; 535 const char delim[] = "\n"; 536 int cnt = 0; 537 char *line; 538 539 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) 540 return -1; 541 542 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 543 cnt++; 544 545 return cnt > 1; 546 } 547 548 int set_oom_adj_score(int pid, int score) 549 { 550 char path[PATH_MAX]; 551 int fd, len; 552 553 sprintf(path, "/proc/%d/oom_score_adj", pid); 554 555 fd = open(path, O_WRONLY | O_APPEND); 556 if (fd < 0) 557 return fd; 558 559 len = dprintf(fd, "%d", score); 560 if (len < 0) { 561 close(fd); 562 return len; 563 } 564 565 close(fd); 566 return 0; 567 } 568 569 int proc_mount_contains(const char *option) 570 { 571 char buf[4 * PAGE_SIZE]; 572 ssize_t read; 573 574 read = read_text("/proc/mounts", buf, sizeof(buf)); 575 if (read < 0) 576 return read; 577 578 return strstr(buf, option) != NULL; 579 } 580 581 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) 582 { 583 char path[PATH_MAX]; 584 ssize_t ret; 585 586 if (!pid) 587 snprintf(path, sizeof(path), "/proc/%s/%s", 588 thread ? "thread-self" : "self", item); 589 else 590 snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); 591 592 ret = read_text(path, buf, size); 593 return ret < 0 ? -1 : ret; 594 } 595 596 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) 597 { 598 char buf[PAGE_SIZE]; 599 600 if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) 601 return -1; 602 603 return strstr(buf, needle) ? 0 : -1; 604 } 605 606 int clone_into_cgroup_run_wait(const char *cgroup) 607 { 608 int cgroup_fd; 609 pid_t pid; 610 611 cgroup_fd = dirfd_open_opath(cgroup); 612 if (cgroup_fd < 0) 613 return -1; 614 615 pid = clone_into_cgroup(cgroup_fd); 616 close_prot_errno(cgroup_fd); 617 if (pid < 0) 618 return -1; 619 620 if (pid == 0) 621 exit(EXIT_SUCCESS); 622 623 /* 624 * We don't care whether this fails. We only care whether the initial 625 * clone succeeded. 626 */ 627 (void)clone_reap(pid, WEXITED); 628 return 0; 629 } 630 631 static int __prepare_for_wait(const char *cgroup, const char *filename) 632 { 633 int fd, ret = -1; 634 635 fd = inotify_init1(0); 636 if (fd == -1) 637 return fd; 638 639 ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY); 640 if (ret == -1) { 641 close(fd); 642 fd = -1; 643 } 644 645 return fd; 646 } 647 648 int cg_prepare_for_wait(const char *cgroup) 649 { 650 return __prepare_for_wait(cgroup, "cgroup.events"); 651 } 652 653 int memcg_prepare_for_wait(const char *cgroup) 654 { 655 return __prepare_for_wait(cgroup, "memory.events"); 656 } 657 658 int cg_wait_for(int fd) 659 { 660 int ret = -1; 661 struct pollfd fds = { 662 .fd = fd, 663 .events = POLLIN, 664 }; 665 666 while (true) { 667 ret = poll(&fds, 1, 10000); 668 669 if (ret == -1) { 670 if (errno == EINTR) 671 continue; 672 673 break; 674 } 675 676 if (ret > 0 && fds.revents & POLLIN) { 677 ret = 0; 678 break; 679 } 680 } 681 682 return ret; 683 } 684