1 /* SPDX-License-Identifier: GPL-2.0 */ 2 3 #define _GNU_SOURCE 4 5 #include <errno.h> 6 #include <fcntl.h> 7 #include <linux/limits.h> 8 #include <poll.h> 9 #include <signal.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <sys/inotify.h> 14 #include <sys/stat.h> 15 #include <sys/types.h> 16 #include <sys/wait.h> 17 #include <unistd.h> 18 19 #include "cgroup_util.h" 20 #include "../clone3/clone3_selftests.h" 21 22 /* Returns read len on success, or -errno on failure. */ 23 static ssize_t read_text(const char *path, char *buf, size_t max_len) 24 { 25 ssize_t len; 26 int fd; 27 28 fd = open(path, O_RDONLY); 29 if (fd < 0) 30 return -errno; 31 32 len = read(fd, buf, max_len - 1); 33 34 if (len >= 0) 35 buf[len] = 0; 36 37 close(fd); 38 return len < 0 ? -errno : len; 39 } 40 41 /* Returns written len on success, or -errno on failure. */ 42 static ssize_t write_text(const char *path, char *buf, ssize_t len) 43 { 44 int fd; 45 46 fd = open(path, O_WRONLY | O_APPEND); 47 if (fd < 0) 48 return -errno; 49 50 len = write(fd, buf, len); 51 close(fd); 52 return len < 0 ? -errno : len; 53 } 54 55 char *cg_name(const char *root, const char *name) 56 { 57 size_t len = strlen(root) + strlen(name) + 2; 58 char *ret = malloc(len); 59 60 snprintf(ret, len, "%s/%s", root, name); 61 62 return ret; 63 } 64 65 char *cg_name_indexed(const char *root, const char *name, int index) 66 { 67 size_t len = strlen(root) + strlen(name) + 10; 68 char *ret = malloc(len); 69 70 snprintf(ret, len, "%s/%s_%d", root, name, index); 71 72 return ret; 73 } 74 75 char *cg_control(const char *cgroup, const char *control) 76 { 77 size_t len = strlen(cgroup) + strlen(control) + 2; 78 char *ret = malloc(len); 79 80 snprintf(ret, len, "%s/%s", cgroup, control); 81 82 return ret; 83 } 84 85 /* Returns 0 on success, or -errno on failure. */ 86 int cg_read(const char *cgroup, const char *control, char *buf, size_t len) 87 { 88 char path[PATH_MAX]; 89 ssize_t ret; 90 91 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 92 93 ret = read_text(path, buf, len); 94 return ret >= 0 ? 0 : ret; 95 } 96 97 int cg_read_strcmp(const char *cgroup, const char *control, 98 const char *expected) 99 { 100 size_t size; 101 char *buf; 102 int ret; 103 104 /* Handle the case of comparing against empty string */ 105 if (!expected) 106 return -1; 107 else 108 size = strlen(expected) + 1; 109 110 buf = malloc(size); 111 if (!buf) 112 return -1; 113 114 if (cg_read(cgroup, control, buf, size)) { 115 free(buf); 116 return -1; 117 } 118 119 ret = strcmp(expected, buf); 120 free(buf); 121 return ret; 122 } 123 124 int cg_read_strstr(const char *cgroup, const char *control, const char *needle) 125 { 126 char buf[PAGE_SIZE]; 127 128 if (cg_read(cgroup, control, buf, sizeof(buf))) 129 return -1; 130 131 return strstr(buf, needle) ? 0 : -1; 132 } 133 134 long cg_read_long(const char *cgroup, const char *control) 135 { 136 char buf[128]; 137 138 if (cg_read(cgroup, control, buf, sizeof(buf))) 139 return -1; 140 141 return atol(buf); 142 } 143 144 long cg_read_key_long(const char *cgroup, const char *control, const char *key) 145 { 146 char buf[PAGE_SIZE]; 147 char *ptr; 148 149 if (cg_read(cgroup, control, buf, sizeof(buf))) 150 return -1; 151 152 ptr = strstr(buf, key); 153 if (!ptr) 154 return -1; 155 156 return atol(ptr + strlen(key)); 157 } 158 159 long cg_read_lc(const char *cgroup, const char *control) 160 { 161 char buf[PAGE_SIZE]; 162 const char delim[] = "\n"; 163 char *line; 164 long cnt = 0; 165 166 if (cg_read(cgroup, control, buf, sizeof(buf))) 167 return -1; 168 169 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 170 cnt++; 171 172 return cnt; 173 } 174 175 /* Returns 0 on success, or -errno on failure. */ 176 int cg_write(const char *cgroup, const char *control, char *buf) 177 { 178 char path[PATH_MAX]; 179 ssize_t len = strlen(buf), ret; 180 181 snprintf(path, sizeof(path), "%s/%s", cgroup, control); 182 ret = write_text(path, buf, len); 183 return ret == len ? 0 : ret; 184 } 185 186 int cg_write_numeric(const char *cgroup, const char *control, long value) 187 { 188 char buf[64]; 189 int ret; 190 191 ret = sprintf(buf, "%lu", value); 192 if (ret < 0) 193 return ret; 194 195 return cg_write(cgroup, control, buf); 196 } 197 198 int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) 199 { 200 char buf[10 * PAGE_SIZE]; 201 char *fs, *mount, *type, *options; 202 const char delim[] = "\n\t "; 203 204 if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0) 205 return -1; 206 207 /* 208 * Example: 209 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0 210 */ 211 for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) { 212 mount = strtok(NULL, delim); 213 type = strtok(NULL, delim); 214 options = strtok(NULL, delim); 215 strtok(NULL, delim); 216 strtok(NULL, delim); 217 218 if (strcmp(type, "cgroup2") == 0) { 219 strncpy(root, mount, len); 220 if (nsdelegate) 221 *nsdelegate = !!strstr(options, "nsdelegate"); 222 return 0; 223 } 224 } 225 226 return -1; 227 } 228 229 int cg_create(const char *cgroup) 230 { 231 return mkdir(cgroup, 0755); 232 } 233 234 int cg_wait_for_proc_count(const char *cgroup, int count) 235 { 236 char buf[10 * PAGE_SIZE] = {0}; 237 int attempts; 238 char *ptr; 239 240 for (attempts = 10; attempts >= 0; attempts--) { 241 int nr = 0; 242 243 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 244 break; 245 246 for (ptr = buf; *ptr; ptr++) 247 if (*ptr == '\n') 248 nr++; 249 250 if (nr >= count) 251 return 0; 252 253 usleep(100000); 254 } 255 256 return -1; 257 } 258 259 int cg_killall(const char *cgroup) 260 { 261 char buf[PAGE_SIZE]; 262 char *ptr = buf; 263 264 /* If cgroup.kill exists use it. */ 265 if (!cg_write(cgroup, "cgroup.kill", "1")) 266 return 0; 267 268 if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) 269 return -1; 270 271 while (ptr < buf + sizeof(buf)) { 272 int pid = strtol(ptr, &ptr, 10); 273 274 if (pid == 0) 275 break; 276 if (*ptr) 277 ptr++; 278 else 279 break; 280 if (kill(pid, SIGKILL)) 281 return -1; 282 } 283 284 return 0; 285 } 286 287 int cg_destroy(const char *cgroup) 288 { 289 int ret; 290 291 if (!cgroup) 292 return 0; 293 retry: 294 ret = rmdir(cgroup); 295 if (ret && errno == EBUSY) { 296 cg_killall(cgroup); 297 usleep(100); 298 goto retry; 299 } 300 301 if (ret && errno == ENOENT) 302 ret = 0; 303 304 return ret; 305 } 306 307 int cg_enter(const char *cgroup, int pid) 308 { 309 char pidbuf[64]; 310 311 snprintf(pidbuf, sizeof(pidbuf), "%d", pid); 312 return cg_write(cgroup, "cgroup.procs", pidbuf); 313 } 314 315 int cg_enter_current(const char *cgroup) 316 { 317 return cg_write(cgroup, "cgroup.procs", "0"); 318 } 319 320 int cg_enter_current_thread(const char *cgroup) 321 { 322 return cg_write(cgroup, "cgroup.threads", "0"); 323 } 324 325 int cg_run(const char *cgroup, 326 int (*fn)(const char *cgroup, void *arg), 327 void *arg) 328 { 329 int pid, retcode; 330 331 pid = fork(); 332 if (pid < 0) { 333 return pid; 334 } else if (pid == 0) { 335 char buf[64]; 336 337 snprintf(buf, sizeof(buf), "%d", getpid()); 338 if (cg_write(cgroup, "cgroup.procs", buf)) 339 exit(EXIT_FAILURE); 340 exit(fn(cgroup, arg)); 341 } else { 342 waitpid(pid, &retcode, 0); 343 if (WIFEXITED(retcode)) 344 return WEXITSTATUS(retcode); 345 else 346 return -1; 347 } 348 } 349 350 pid_t clone_into_cgroup(int cgroup_fd) 351 { 352 #ifdef CLONE_ARGS_SIZE_VER2 353 pid_t pid; 354 355 struct __clone_args args = { 356 .flags = CLONE_INTO_CGROUP, 357 .exit_signal = SIGCHLD, 358 .cgroup = cgroup_fd, 359 }; 360 361 pid = sys_clone3(&args, sizeof(struct __clone_args)); 362 /* 363 * Verify that this is a genuine test failure: 364 * ENOSYS -> clone3() not available 365 * E2BIG -> CLONE_INTO_CGROUP not available 366 */ 367 if (pid < 0 && (errno == ENOSYS || errno == E2BIG)) 368 goto pretend_enosys; 369 370 return pid; 371 372 pretend_enosys: 373 #endif 374 errno = ENOSYS; 375 return -ENOSYS; 376 } 377 378 int clone_reap(pid_t pid, int options) 379 { 380 int ret; 381 siginfo_t info = { 382 .si_signo = 0, 383 }; 384 385 again: 386 ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD); 387 if (ret < 0) { 388 if (errno == EINTR) 389 goto again; 390 return -1; 391 } 392 393 if (options & WEXITED) { 394 if (WIFEXITED(info.si_status)) 395 return WEXITSTATUS(info.si_status); 396 } 397 398 if (options & WSTOPPED) { 399 if (WIFSTOPPED(info.si_status)) 400 return WSTOPSIG(info.si_status); 401 } 402 403 if (options & WCONTINUED) { 404 if (WIFCONTINUED(info.si_status)) 405 return 0; 406 } 407 408 return -1; 409 } 410 411 int dirfd_open_opath(const char *dir) 412 { 413 return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 414 } 415 416 #define close_prot_errno(fd) \ 417 if (fd >= 0) { \ 418 int _e_ = errno; \ 419 close(fd); \ 420 errno = _e_; \ 421 } 422 423 static int clone_into_cgroup_run_nowait(const char *cgroup, 424 int (*fn)(const char *cgroup, void *arg), 425 void *arg) 426 { 427 int cgroup_fd; 428 pid_t pid; 429 430 cgroup_fd = dirfd_open_opath(cgroup); 431 if (cgroup_fd < 0) 432 return -1; 433 434 pid = clone_into_cgroup(cgroup_fd); 435 close_prot_errno(cgroup_fd); 436 if (pid == 0) 437 exit(fn(cgroup, arg)); 438 439 return pid; 440 } 441 442 int cg_run_nowait(const char *cgroup, 443 int (*fn)(const char *cgroup, void *arg), 444 void *arg) 445 { 446 int pid; 447 448 pid = clone_into_cgroup_run_nowait(cgroup, fn, arg); 449 if (pid > 0) 450 return pid; 451 452 /* Genuine test failure. */ 453 if (pid < 0 && errno != ENOSYS) 454 return -1; 455 456 pid = fork(); 457 if (pid == 0) { 458 char buf[64]; 459 460 snprintf(buf, sizeof(buf), "%d", getpid()); 461 if (cg_write(cgroup, "cgroup.procs", buf)) 462 exit(EXIT_FAILURE); 463 exit(fn(cgroup, arg)); 464 } 465 466 return pid; 467 } 468 469 int get_temp_fd(void) 470 { 471 return open(".", O_TMPFILE | O_RDWR | O_EXCL); 472 } 473 474 int alloc_pagecache(int fd, size_t size) 475 { 476 char buf[PAGE_SIZE]; 477 struct stat st; 478 int i; 479 480 if (fstat(fd, &st)) 481 goto cleanup; 482 483 size += st.st_size; 484 485 if (ftruncate(fd, size)) 486 goto cleanup; 487 488 for (i = 0; i < size; i += sizeof(buf)) 489 read(fd, buf, sizeof(buf)); 490 491 return 0; 492 493 cleanup: 494 return -1; 495 } 496 497 int alloc_anon(const char *cgroup, void *arg) 498 { 499 size_t size = (unsigned long)arg; 500 char *buf, *ptr; 501 502 buf = malloc(size); 503 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) 504 *ptr = 0; 505 506 free(buf); 507 return 0; 508 } 509 510 int is_swap_enabled(void) 511 { 512 char buf[PAGE_SIZE]; 513 const char delim[] = "\n"; 514 int cnt = 0; 515 char *line; 516 517 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) 518 return -1; 519 520 for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) 521 cnt++; 522 523 return cnt > 1; 524 } 525 526 int set_oom_adj_score(int pid, int score) 527 { 528 char path[PATH_MAX]; 529 int fd, len; 530 531 sprintf(path, "/proc/%d/oom_score_adj", pid); 532 533 fd = open(path, O_WRONLY | O_APPEND); 534 if (fd < 0) 535 return fd; 536 537 len = dprintf(fd, "%d", score); 538 if (len < 0) { 539 close(fd); 540 return len; 541 } 542 543 close(fd); 544 return 0; 545 } 546 547 int proc_mount_contains(const char *option) 548 { 549 char buf[4 * PAGE_SIZE]; 550 ssize_t read; 551 552 read = read_text("/proc/mounts", buf, sizeof(buf)); 553 if (read < 0) 554 return read; 555 556 return strstr(buf, option) != NULL; 557 } 558 559 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) 560 { 561 char path[PATH_MAX]; 562 ssize_t ret; 563 564 if (!pid) 565 snprintf(path, sizeof(path), "/proc/%s/%s", 566 thread ? "thread-self" : "self", item); 567 else 568 snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); 569 570 ret = read_text(path, buf, size); 571 return ret < 0 ? -1 : ret; 572 } 573 574 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) 575 { 576 char buf[PAGE_SIZE]; 577 578 if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) 579 return -1; 580 581 return strstr(buf, needle) ? 0 : -1; 582 } 583 584 int clone_into_cgroup_run_wait(const char *cgroup) 585 { 586 int cgroup_fd; 587 pid_t pid; 588 589 cgroup_fd = dirfd_open_opath(cgroup); 590 if (cgroup_fd < 0) 591 return -1; 592 593 pid = clone_into_cgroup(cgroup_fd); 594 close_prot_errno(cgroup_fd); 595 if (pid < 0) 596 return -1; 597 598 if (pid == 0) 599 exit(EXIT_SUCCESS); 600 601 /* 602 * We don't care whether this fails. We only care whether the initial 603 * clone succeeded. 604 */ 605 (void)clone_reap(pid, WEXITED); 606 return 0; 607 } 608 609 static int __prepare_for_wait(const char *cgroup, const char *filename) 610 { 611 int fd, ret = -1; 612 613 fd = inotify_init1(0); 614 if (fd == -1) 615 return fd; 616 617 ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY); 618 if (ret == -1) { 619 close(fd); 620 fd = -1; 621 } 622 623 return fd; 624 } 625 626 int cg_prepare_for_wait(const char *cgroup) 627 { 628 return __prepare_for_wait(cgroup, "cgroup.events"); 629 } 630 631 int memcg_prepare_for_wait(const char *cgroup) 632 { 633 return __prepare_for_wait(cgroup, "memory.events"); 634 } 635 636 int cg_wait_for(int fd) 637 { 638 int ret = -1; 639 struct pollfd fds = { 640 .fd = fd, 641 .events = POLLIN, 642 }; 643 644 while (true) { 645 ret = poll(&fds, 1, 10000); 646 647 if (ret == -1) { 648 if (errno == EINTR) 649 continue; 650 651 break; 652 } 653 654 if (ret > 0 && fds.revents & POLLIN) { 655 ret = 0; 656 break; 657 } 658 } 659 660 return ret; 661 } 662