/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE
/* NOTE(review): the header names of the bare #include directives below are missing here - restore them. */
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "../kselftest.h"
#include "cgroup_util.h"

/* Set in main() from the cgroup mount options. */
static bool has_localevents;
static bool has_recursiveprot;

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 *
 * Returns KSFT_PASS when the child of the "+memory" parent lists "memory"
 * in cgroup.controllers and the child of the plain parent does not,
 * KSFT_FAIL otherwise.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* Here a zero return (string found) is the failure case. */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/* The labels unwind in reverse order of creation. */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

/*
 * Allocate 50M with malloc(), touch one byte per PAGE_SIZE step, then check
 * that memory.current is at least 50M and close to it, and that the "anon"
 * entry of memory.stat is close to memory.current.
 *
 * Returns 0 when all checks hold, -1 otherwise. @arg is unused.
 */
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	/* Write to every page so that the memory is actually populated. */
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * Create 50M of pagecache through a temporary file, then check that
 * memory.current is at least 50M and that the "file" entry of memory.stat
 * is close to memory.current.
 *
 * Returns 0 when all checks hold, -1 otherwise. @arg is unused.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache
 * and checks memory.current, memory.peak, and some memory.stat values.
 *
 * Returns KSFT_SKIP when memory.peak cannot be opened (ENOENT) or is not
 * writable, KSFT_PASS when every check holds, KSFT_FAIL otherwise.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the
	 * free-path. We need at least three to be closed in a different order
	 * than writes occurred to test the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place. (by
	 * checking the writable bit on the file's st_mode)
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets, but make it clear */
	static const char reset_string[] = "reset\n";

	/* sizeof() counts the terminating NUL, so it is written as well. */
	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our FD-local reset above */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* The flag is set before close() so cleanup never closes this fd twice. */
	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Read through the reset fd: the value must be well below 50M. */
	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* Descriptors that were never opened are still -1 at this point. */
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Create 50M of pagecache on the fd passed in @arg, then sleep in one-second
 * steps until getppid() changes. Returns -1 if the allocation fails.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

/*
 * Allocate and touch the number of bytes passed in @arg, then sleep in
 * one-second steps until getppid() changes. Returns -1 if malloc() fails.
 */
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer
 * If we exceed a timeout, fail.
 *
 * Polls cgroup.procs up to 10 times, 100ms apart. Returns 0 once it reads
 * empty, -1 on timeout.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

/* Defined further down; test_memcg_protection() needs it earlier. */
static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for origin of the numbers, see model in memcg_protection.m.)
* * After that it tries to allocate more than there is * unprotected memory in A available, and checks that: * a) memory.min protects pagecache even in this case, * b) memory.low allows reclaiming page cache with low events. * * Then we try to reclaim from A/B/C using memory.reclaim until its * usage reaches 10M. * This makes sure that: * (a) We ignore the protection of the reclaim target memcg. * (b) The previously calculated emin value (~29M) should be dismissed. */ static int test_memcg_protection(const char *root, bool min) { int ret = KSFT_FAIL, rc; char *parent[3] = {NULL}; char *children[4] = {NULL}; const char *attribute = min ? "memory.min" : "memory.low"; long c[4]; long current; int i, attempts; int fd; fd = get_temp_fd(); if (fd < 0) goto cleanup; parent[0] = cg_name(root, "memcg_test_0"); if (!parent[0]) goto cleanup; parent[1] = cg_name(parent[0], "memcg_test_1"); if (!parent[1]) goto cleanup; parent[2] = cg_name(parent[0], "memcg_test_2"); if (!parent[2]) goto cleanup; if (cg_create(parent[0])) goto cleanup; if (cg_read_long(parent[0], attribute)) { /* No memory.min on older kernels is fine */ if (min) ret = KSFT_SKIP; goto cleanup; } if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) goto cleanup; if (cg_write(parent[0], "memory.max", "200M")) goto cleanup; if (cg_write(parent[0], "memory.swap.max", "0")) goto cleanup; if (cg_create(parent[1])) goto cleanup; if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) goto cleanup; if (cg_create(parent[2])) goto cleanup; for (i = 0; i < ARRAY_SIZE(children); i++) { children[i] = cg_name_indexed(parent[1], "child_memcg", i); if (!children[i]) goto cleanup; if (cg_create(children[i])) goto cleanup; if (i > 2) continue; cg_run_nowait(children[i], alloc_pagecache_50M_noexit, (void *)(long)fd); } if (cg_write(parent[1], attribute, "50M")) goto cleanup; if (cg_write(children[0], attribute, "75M")) goto cleanup; if (cg_write(children[1], attribute, "25M")) goto cleanup; if 
(cg_write(children[2], attribute, "0")) goto cleanup; if (cg_write(children[3], attribute, "500M")) goto cleanup; attempts = 0; while (!values_close(cg_read_long(parent[1], "memory.current"), MB(150), 3)) { if (attempts++ > 5) break; sleep(1); } if (cg_run(parent[2], alloc_anon, (void *)MB(148))) goto cleanup; if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) goto cleanup; for (i = 0; i < ARRAY_SIZE(children); i++) c[i] = cg_read_long(children[i], "memory.current"); if (!values_close(c[0], MB(29), 10)) goto cleanup; if (!values_close(c[1], MB(21), 10)) goto cleanup; if (c[3] != 0) goto cleanup; rc = cg_run(parent[2], alloc_anon, (void *)MB(170)); if (min && !rc) goto cleanup; else if (!min && rc) { fprintf(stderr, "memory.low prevents from allocating anon memory\n"); goto cleanup; } current = min ? MB(50) : MB(30); if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3)) goto cleanup; if (!reclaim_until(children[0], MB(10))) goto cleanup; if (min) { ret = KSFT_PASS; goto cleanup; } for (i = 0; i < ARRAY_SIZE(children); i++) { int no_low_events_index = 1; long low, oom; oom = cg_read_key_long(children[i], "memory.events", "oom "); low = cg_read_key_long(children[i], "memory.events", "low "); if (oom) goto cleanup; if (i <= no_low_events_index && low <= 0) goto cleanup; if (i > no_low_events_index && low) goto cleanup; } ret = KSFT_PASS; cleanup: for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { if (!children[i]) continue; cg_destroy(children[i]); free(children[i]); } for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { if (!parent[i]) continue; cg_destroy(parent[i]); free(parent[i]); } close(fd); return ret; } static int test_memcg_min(const char *root) { return test_memcg_protection(root, true); } static int test_memcg_low(const char *root) { return test_memcg_protection(root, false); } static int alloc_pagecache_max_30M(const char *cgroup, void *arg) { size_t size = MB(50); int ret = -1; long current, high, max; int fd; high = 
cg_read_long(cgroup, "memory.high"); max = cg_read_long(cgroup, "memory.max"); if (high != MB(30) && max != MB(30)) return -1; fd = get_temp_fd(); if (fd < 0) return -1; if (alloc_pagecache(fd, size)) goto cleanup; current = cg_read_long(cgroup, "memory.current"); if (!values_close(current, MB(30), 5)) goto cleanup; ret = 0; cleanup: close(fd); return ret; } /* * This test checks that memory.high limits the amount of * memory which can be consumed by either anonymous memory * or pagecache. */ static int test_memcg_high(const char *root) { int ret = KSFT_FAIL; char *memcg; long high; memcg = cg_name(root, "memcg_test"); if (!memcg) goto cleanup; if (cg_create(memcg)) goto cleanup; if (cg_read_strcmp(memcg, "memory.high", "max\n")) goto cleanup; if (cg_write(memcg, "memory.swap.max", "0")) goto cleanup; if (cg_write(memcg, "memory.high", "30M")) goto cleanup; if (cg_run(memcg, alloc_anon, (void *)MB(31))) goto cleanup; if (!cg_run(memcg, alloc_pagecache_50M_check, NULL)) goto cleanup; if (cg_run(memcg, alloc_pagecache_max_30M, NULL)) goto cleanup; high = cg_read_key_long(memcg, "memory.events", "high "); if (high <= 0) goto cleanup; ret = KSFT_PASS; cleanup: cg_destroy(memcg); free(memcg); return ret; } static int alloc_anon_mlock(const char *cgroup, void *arg) { size_t size = (size_t)arg; void *buf; buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); if (buf == MAP_FAILED) return -1; mlock(buf, size); munmap(buf, size); return 0; } /* * This test checks that memory.high is able to throttle big single shot * allocation i.e. large allocation within one kernel entry. 
*/
static int test_memcg_high_sync(const char *root)
{
	long high_before, max_before, high_after, max_after;
	int wait_fd = -1;
	int ret = KSFT_FAIL;
	char *memcg = cg_name(root, "memcg_test");

	if (!memcg || cg_create(memcg))
		goto cleanup;

	/* Snapshot the event counters before applying any limit. */
	high_before = cg_read_key_long(memcg, "memory.events", "high ");
	max_before = cg_read_key_long(memcg, "memory.events", "max ");
	if (high_before < 0 || max_before < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0") ||
	    cg_write(memcg, "memory.high", "30M") ||
	    cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	wait_fd = memcg_prepare_for_wait(memcg);
	if (wait_fd < 0)
		goto cleanup;

	/* One 200M mmap+mlock from a background child. */
	if (cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200)) < 0)
		goto cleanup;

	cg_wait_for(wait_fd);

	high_after = cg_read_key_long(memcg, "memory.events", "high ");
	max_after = cg_read_key_long(memcg, "memory.events", "max ");
	if (high_after < 0 || max_after < 0)
		goto cleanup;

	/* Pass only if "high" moved while "max" stayed the same. */
	if (high_after != high_before && max_after == max_before)
		ret = KSFT_PASS;

cleanup:
	if (wait_fd >= 0)
		close(wait_fd);
	cg_destroy(memcg);
	free(memcg);
	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
*/ static int test_memcg_max(const char *root) { int ret = KSFT_FAIL; char *memcg; long current, max; memcg = cg_name(root, "memcg_test"); if (!memcg) goto cleanup; if (cg_create(memcg)) goto cleanup; if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; if (cg_write(memcg, "memory.swap.max", "0")) goto cleanup; if (cg_write(memcg, "memory.max", "30M")) goto cleanup; /* Should be killed by OOM killer */ if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; if (cg_run(memcg, alloc_pagecache_max_30M, NULL)) goto cleanup; current = cg_read_long(memcg, "memory.current"); if (current > MB(30) || !current) goto cleanup; max = cg_read_key_long(memcg, "memory.events", "max "); if (max <= 0) goto cleanup; ret = KSFT_PASS; cleanup: cg_destroy(memcg); free(memcg); return ret; } /* * Reclaim from @memcg until usage reaches @goal by writing to * memory.reclaim. * * This function will return false if the usage is already below the * goal. * * This function assumes that writing to memory.reclaim is the only * source of change in memory.current (no concurrent allocations or * reclaim). * * This function makes sure memory.reclaim is sane. It will return * false if memory.reclaim's error codes do not make sense, even if * the usage goal was satisfied. */ static bool reclaim_until(const char *memcg, long goal) { char buf[64]; int retries, err; long current, to_reclaim; bool reclaimed = false; for (retries = 5; retries > 0; retries--) { current = cg_read_long(memcg, "memory.current"); if (current < goal || values_close(current, goal, 3)) break; /* Did memory.reclaim return 0 incorrectly? */ else if (reclaimed) return false; to_reclaim = current - goal; snprintf(buf, sizeof(buf), "%ld", to_reclaim); err = cg_write(memcg, "memory.reclaim", buf); if (!err) reclaimed = true; else if (err != -EAGAIN) return false; } return reclaimed; } /* * This test checks that memory.reclaim reclaims the given * amount of memory (from both anon and file, if possible). 
*/ static int test_memcg_reclaim(const char *root) { int ret = KSFT_FAIL; int fd = -1; int retries; char *memcg; long current, expected_usage; memcg = cg_name(root, "memcg_test"); if (!memcg) goto cleanup; if (cg_create(memcg)) goto cleanup; current = cg_read_long(memcg, "memory.current"); if (current != 0) goto cleanup; fd = get_temp_fd(); if (fd < 0) goto cleanup; cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd); /* * If swap is enabled, try to reclaim from both anon and file, else try * to reclaim from file only. */ if (is_swap_enabled()) { cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50)); expected_usage = MB(100); } else expected_usage = MB(50); /* * Wait until current usage reaches the expected usage (or we run out of * retries). */ retries = 5; while (!values_close(cg_read_long(memcg, "memory.current"), expected_usage, 10)) { if (retries--) { sleep(1); continue; } else { fprintf(stderr, "failed to allocate %ld for memcg reclaim test\n", expected_usage); goto cleanup; } } /* * Reclaim until current reaches 30M, this makes sure we hit both anon * and file if swap is enabled. 
*/ if (!reclaim_until(memcg, MB(30))) goto cleanup; ret = KSFT_PASS; cleanup: cg_destroy(memcg); free(memcg); close(fd); return ret; } static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) { long mem_max = (long)arg; size_t size = MB(50); char *buf, *ptr; long mem_current, swap_current; int ret = -1; buf = malloc(size); if (buf == NULL) { fprintf(stderr, "malloc() failed\n"); return -1; } for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) *ptr = 0; mem_current = cg_read_long(cgroup, "memory.current"); if (!mem_current || !values_close(mem_current, mem_max, 3)) goto cleanup; swap_current = cg_read_long(cgroup, "memory.swap.current"); if (!swap_current || !values_close(mem_current + swap_current, size, 3)) goto cleanup; ret = 0; cleanup: free(buf); return ret; } /* * This test checks that memory.swap.max limits the amount of * anonymous memory which can be swapped out. Additionally, it verifies that * memory.swap.peak reflects the high watermark and can be reset. */ static int test_memcg_swap_max_peak(const char *root) { int ret = KSFT_FAIL; char *memcg; long max, peak; struct stat ss; int swap_peak_fd = -1, mem_peak_fd = -1; /* any non-empty string resets */ static const char reset_string[] = "foobarbaz"; if (!is_swap_enabled()) return KSFT_SKIP; memcg = cg_name(root, "memcg_test"); if (!memcg) goto cleanup; if (cg_create(memcg)) goto cleanup; if (cg_read_long(memcg, "memory.swap.current")) { ret = KSFT_SKIP; goto cleanup; } swap_peak_fd = cg_open(memcg, "memory.swap.peak", O_RDWR | O_APPEND | O_CLOEXEC); if (swap_peak_fd == -1) { if (errno == ENOENT) ret = KSFT_SKIP; goto cleanup; } /* * Before we try to use memory.swap.peak's fd, try to figure out * whether this kernel supports writing to that file in the first * place. 
(by checking the writable bit on the file's st_mode) */ if (fstat(swap_peak_fd, &ss)) goto cleanup; if ((ss.st_mode & S_IWUSR) == 0) { ret = KSFT_SKIP; goto cleanup; } mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC); if (mem_peak_fd == -1) goto cleanup; if (cg_read_long(memcg, "memory.swap.peak")) goto cleanup; if (cg_read_long_fd(swap_peak_fd)) goto cleanup; /* switch the swap and mem fds into local-peak tracking mode*/ int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); if (peak_reset != sizeof(reset_string)) goto cleanup; if (cg_read_long_fd(swap_peak_fd)) goto cleanup; if (cg_read_long(memcg, "memory.peak")) goto cleanup; if (cg_read_long_fd(mem_peak_fd)) goto cleanup; peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); if (peak_reset != sizeof(reset_string)) goto cleanup; if (cg_read_long_fd(mem_peak_fd)) goto cleanup; if (cg_read_strcmp(memcg, "memory.max", "max\n")) goto cleanup; if (cg_read_strcmp(memcg, "memory.swap.max", "max\n")) goto cleanup; if (cg_write(memcg, "memory.swap.max", "30M")) goto cleanup; if (cg_write(memcg, "memory.max", "30M")) goto cleanup; /* Should be killed by OOM killer */ if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; if (cg_read_key_long(memcg, "memory.events", "oom ") != 1) goto cleanup; if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1) goto cleanup; peak = cg_read_long(memcg, "memory.peak"); if (peak < MB(29)) goto cleanup; peak = cg_read_long(memcg, "memory.swap.peak"); if (peak < MB(29)) goto cleanup; peak = cg_read_long_fd(mem_peak_fd); if (peak < MB(29)) goto cleanup; peak = cg_read_long_fd(swap_peak_fd); if (peak < MB(29)) goto cleanup; /* * open, reset and close the peak swap on another FD to make sure * multiple extant fds don't corrupt the linked-list */ peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string); if (peak_reset) goto cleanup; peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string); if 
(peak_reset) goto cleanup; /* actually reset on the fds */ peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string)); if (peak_reset != sizeof(reset_string)) goto cleanup; peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string)); if (peak_reset != sizeof(reset_string)) goto cleanup; peak = cg_read_long_fd(swap_peak_fd); if (peak > MB(10)) goto cleanup; /* * The cgroup is now empty, but there may be a page or two associated * with the open FD accounted to it. */ peak = cg_read_long_fd(mem_peak_fd); if (peak > MB(1)) goto cleanup; if (cg_read_long(memcg, "memory.peak") < MB(29)) goto cleanup; if (cg_read_long(memcg, "memory.swap.peak") < MB(29)) goto cleanup; if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30))) goto cleanup; max = cg_read_key_long(memcg, "memory.events", "max "); if (max <= 0) goto cleanup; peak = cg_read_long(memcg, "memory.peak"); if (peak < MB(29)) goto cleanup; peak = cg_read_long(memcg, "memory.swap.peak"); if (peak < MB(29)) goto cleanup; peak = cg_read_long_fd(mem_peak_fd); if (peak < MB(29)) goto cleanup; peak = cg_read_long_fd(swap_peak_fd); if (peak < MB(19)) goto cleanup; ret = KSFT_PASS; cleanup: if (mem_peak_fd != -1 && close(mem_peak_fd)) ret = KSFT_FAIL; if (swap_peak_fd != -1 && close(swap_peak_fd)) ret = KSFT_FAIL; cg_destroy(memcg); free(memcg); return ret; } /* * This test disables swapping and tries to allocate anonymous memory * up to OOM. Then it checks for oom and oom_kill events in * memory.events. 
*/
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	/* The 100M allocation must not succeed under the 30M limit. */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/* Arguments handed to tcp_server() through cg_run_nowait(). */
struct tcp_server_args {
	unsigned short port;	/* TCP port to bind, host byte order */
	int ctl[2];		/* pipe: [0] is read by the test, [1] written by the server */
};

/*
 * Bind an IPv6 TCP socket to the requested port, report the outcome on the
 * control pipe (errno when bind() fails, 0 once listening), accept a single
 * client and write 1M buffers to it until write() fails.
 *
 * Returns 0 if the final write() failed with ECONNRESET, -1 otherwise.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/* Lets the parent tell EADDRINUSE from other failures. */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		/* The payload content is irrelevant and left uninitialized. */
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

/*
 * Connect to "localhost":@port, read from the socket up to 16 times and after
 * each read compare the growth of memory.current (relative to its value
 * before connecting) with the "sock" entry of memory.stat.
 *
 * Returns KSFT_PASS as soon as the two values are close, KSFT_FAIL if they
 * never are or a read fails, the getaddrinfo() error code if name resolution
 * fails, or a negative value if socket()/connect() fails.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * The port is unsigned: "%hd" would print anything above 32767 as a
	 * negative number, which does not even fit into servport.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with check whether
 * memory.current and memory.stat.sock are similar.
*/
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Start the server; pick another random port on EADDRINUSE. */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0) {
			/* No server was started: release both pipe ends. */
			close(args.ctl[0]);
			close(args.ctl[1]);
			goto cleanup;
		}

		close(args.ctl[1]);
		/* The server reports 0 once listening, or errno from bind(). */
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err)) {
			close(args.ctl[0]);
			goto cleanup;
		}
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	/* Every attempt hit a port that was already in use. */
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/* With both ends gone, the "sock" counter must read 0. */
	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_events
 * were propagated to the parent level.
*/
static int test_memcg_oom_group_leaf_events(const char *root)
{
	char *top = cg_name(root, "memcg_test_0");
	char *leaf = cg_name(root, "memcg_test_0/memcg_test_1");
	int ret = KSFT_FAIL;
	long top_kills;
	bool propagation_ok;

	if (!top || !leaf)
		goto cleanup;

	if (cg_create(top) || cg_create(leaf))
		goto cleanup;

	if (cg_write(top, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	/* 50M limit, no swap and group OOM kill, all set on the leaf. */
	if (cg_write(leaf, "memory.max", "50M") ||
	    cg_write(leaf, "memory.swap.max", "0") ||
	    cg_write(leaf, "memory.oom.group", "1"))
		goto cleanup;

	/* Background processes: one in the parent, two in the leaf. */
	cg_run_nowait(top, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(leaf, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(leaf, alloc_anon_noexit, (void *) MB(1));

	/* The 100M allocation in the leaf is expected to fail. */
	if (cg_run(leaf, alloc_anon, (void *)MB(100)) == 0)
		goto cleanup;

	if (cg_test_proc_killed(leaf) != 0)
		goto cleanup;

	if (cg_read_key_long(leaf, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	top_kills = cg_read_key_long(top, "memory.events", "oom_kill ");

	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	propagation_ok = has_localevents ? (top_kills == 0) : (top_kills > 0);
	if (propagation_ok)
		ret = KSFT_PASS;

cleanup:
	if (leaf)
		cg_destroy(leaf);
	if (top)
		cg_destroy(top);
	free(leaf);
	free(top);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
*/
static int test_memcg_oom_group_parent_events(const char *root)
{
	char *top = cg_name(root, "memcg_test_0");
	char *leaf = cg_name(root, "memcg_test_0/memcg_test_1");
	int ret = KSFT_FAIL;

	if (!top || !leaf)
		goto cleanup;

	if (cg_create(top) || cg_create(leaf))
		goto cleanup;

	/* 80M limit, no swap and group OOM kill, all set on the parent. */
	if (cg_write(top, "memory.max", "80M") ||
	    cg_write(top, "memory.swap.max", "0") ||
	    cg_write(top, "memory.oom.group", "1"))
		goto cleanup;

	/* Background processes: one in the parent, two in the leaf. */
	cg_run_nowait(top, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(leaf, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(leaf, alloc_anon_noexit, (void *) MB(1));

	/* The 100M allocation in the leaf is expected to fail. */
	if (cg_run(leaf, alloc_anon, (void *)MB(100)) == 0)
		goto cleanup;

	/* Both levels must end up without processes, leaf checked first. */
	if (!cg_test_proc_killed(leaf) && !cg_test_proc_killed(top))
		ret = KSFT_PASS;

cleanup:
	if (leaf)
		cg_destroy(leaf);
	if (top)
		cg_destroy(top);
	free(leaf);
	free(top);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set.
Then it checks that all * processes were killed except those set with OOM_SCORE_ADJ_MIN */ static int test_memcg_oom_group_score_events(const char *root) { int ret = KSFT_FAIL; char *memcg; int safe_pid; memcg = cg_name(root, "memcg_test_0"); if (!memcg) goto cleanup; if (cg_create(memcg)) goto cleanup; if (cg_write(memcg, "memory.max", "50M")) goto cleanup; if (cg_write(memcg, "memory.swap.max", "0")) goto cleanup; if (cg_write(memcg, "memory.oom.group", "1")) goto cleanup; safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN)) goto cleanup; cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); if (!cg_run(memcg, alloc_anon, (void *)MB(100))) goto cleanup; if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) goto cleanup; if (kill(safe_pid, SIGKILL)) goto cleanup; ret = KSFT_PASS; cleanup: if (memcg) cg_destroy(memcg); free(memcg); return ret; } #define T(x) { x, #x } struct memcg_test { int (*fn)(const char *root); const char *name; } tests[] = { T(test_memcg_subtree_control), T(test_memcg_current_peak), T(test_memcg_min), T(test_memcg_low), T(test_memcg_high), T(test_memcg_high_sync), T(test_memcg_max), T(test_memcg_reclaim), T(test_memcg_oom_events), T(test_memcg_swap_max_peak), T(test_memcg_sock), T(test_memcg_oom_group_leaf_events), T(test_memcg_oom_group_parent_events), T(test_memcg_oom_group_score_events), }; #undef T int main(int argc, char **argv) { char root[PATH_MAX]; int i, proc_status, ret = EXIT_SUCCESS; if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); /* * Check that memory controller is available: * memory is listed in cgroup.controllers */ if (cg_read_strstr(root, "cgroup.controllers", "memory")) ksft_exit_skip("memory controller isn't available\n"); if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) if (cg_write(root, "cgroup.subtree_control", "+memory")) ksft_exit_skip("Failed to set memory controller\n"); 
proc_status = proc_mount_contains("memory_recursiveprot"); if (proc_status < 0) ksft_exit_skip("Failed to query cgroup mount option\n"); has_recursiveprot = proc_status; proc_status = proc_mount_contains("memory_localevents"); if (proc_status < 0) ksft_exit_skip("Failed to query cgroup mount option\n"); has_localevents = proc_status; for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: ksft_test_result_pass("%s\n", tests[i].name); break; case KSFT_SKIP: ksft_test_result_skip("%s\n", tests[i].name); break; default: ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } return ret; }