1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3
4 #include <linux/limits.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/stat.h>
10 #include <sys/types.h>
11 #include <unistd.h>
12 #include <sys/wait.h>
13 #include <errno.h>
14 #include <sys/sysinfo.h>
15 #include <pthread.h>
16
17 #include "kselftest.h"
18 #include "cgroup_util.h"
19
20
21 /*
22 * Memory cgroup charging is performed using percpu batches 64 pages
23 * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
24 * the maximum discrepancy between charge and vmstat entries is number
25 * of cpus multiplied by 64 pages.
26 */
27 #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
28
29 #define KMEM_DEAD_WAIT_RETRIES 80
30
/*
 * Populate the dentry cache with negative dentries by stat()ing
 * (unsigned long)arg non-existent paths with long, unique names.
 * Always returns 0: the lookups are expected to fail; only their
 * side effect on the dcache matters.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	unsigned long nr = (unsigned long)arg;
	unsigned long idx;
	char path[128];
	struct stat st;

	for (idx = 0; idx < nr; idx++) {
		snprintf(path, sizeof(path),
			 "/something-non-existent-with-a-long-name-%64lu-%d",
			 idx, getpid());
		/* Deliberately ignore the (expected) failure. */
		stat(path, &st);
	}

	return 0;
}
46
47 /*
48 * This test allocates 100000 of negative dentries with long names.
49 * Then it checks that "slab" in memory.stat is larger than 1M.
50 * Then it sets memory.high to 1M and checks that at least 1/2
51 * of slab memory has been reclaimed.
52 */
test_kmem_basic(const char * root)53 static int test_kmem_basic(const char *root)
54 {
55 int ret = KSFT_FAIL;
56 char *cg = NULL;
57 long slab0, slab1, current;
58
59 cg = cg_name(root, "kmem_basic_test");
60 if (!cg)
61 goto cleanup;
62
63 if (cg_create(cg))
64 goto cleanup;
65
66 if (cg_run(cg, alloc_dcache, (void *)100000))
67 goto cleanup;
68
69 slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
70 if (slab0 < (1 << 20))
71 goto cleanup;
72
73 cg_write(cg, "memory.high", "1M");
74
75 /* wait for RCU freeing */
76 sleep(1);
77
78 slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
79 if (slab1 < 0)
80 goto cleanup;
81
82 current = cg_read_long(cg, "memory.current");
83 if (current < 0)
84 goto cleanup;
85
86 if (slab1 < slab0 / 2 && current < slab0 / 2)
87 ret = KSFT_PASS;
88 cleanup:
89 cg_destroy(cg);
90 free(cg);
91
92 return ret;
93 }
94
alloc_kmem_fn(void * arg)95 static void *alloc_kmem_fn(void *arg)
96 {
97 alloc_dcache(NULL, (void *)100);
98 return NULL;
99 }
100
alloc_kmem_smp(const char * cgroup,void * arg)101 static int alloc_kmem_smp(const char *cgroup, void *arg)
102 {
103 int nr_threads = 2 * get_nprocs();
104 pthread_t *tinfo;
105 unsigned long i;
106 int ret = -1;
107
108 tinfo = calloc(nr_threads, sizeof(pthread_t));
109 if (tinfo == NULL)
110 return -1;
111
112 for (i = 0; i < nr_threads; i++) {
113 if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
114 (void *)i)) {
115 free(tinfo);
116 return -1;
117 }
118 }
119
120 for (i = 0; i < nr_threads; i++) {
121 ret = pthread_join(tinfo[i], NULL);
122 if (ret)
123 break;
124 }
125
126 free(tinfo);
127 return ret;
128 }
129
/*
 * Run @fn(@arg) inside @times freshly-created child cgroups of @parent,
 * destroying each child before moving on to the next.
 * Returns 0 on success, -1 on the first failure.
 *
 * Fix: @arg is now forwarded to cg_run(). Previously NULL was passed
 * unconditionally, so a caller supplying e.g. (void *)100 to drive
 * alloc_dcache() silently ran zero iterations in every child.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		if (cg_create(child)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		if (cg_run(child, fn, arg)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		cg_destroy(child);
		free(child);
	}

	return 0;
}
160
161 /*
162 * The test creates and destroys a large number of cgroups. In each cgroup it
163 * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
164 * threads. Then it checks the sanity of numbers on the parent level:
165 * the total size of the cgroups should be roughly equal to
166 * anon + file + kernel + sock.
167 */
test_kmem_memcg_deletion(const char * root)168 static int test_kmem_memcg_deletion(const char *root)
169 {
170 long current, anon, file, kernel, sock, sum;
171 int ret = KSFT_FAIL;
172 char *parent;
173
174 parent = cg_name(root, "kmem_memcg_deletion_test");
175 if (!parent)
176 goto cleanup;
177
178 if (cg_create(parent))
179 goto cleanup;
180
181 if (cg_write(parent, "cgroup.subtree_control", "+memory"))
182 goto cleanup;
183
184 if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
185 goto cleanup;
186
187 current = cg_read_long(parent, "memory.current");
188 anon = cg_read_key_long(parent, "memory.stat", "anon ");
189 file = cg_read_key_long(parent, "memory.stat", "file ");
190 kernel = cg_read_key_long(parent, "memory.stat", "kernel ");
191 sock = cg_read_key_long(parent, "memory.stat", "sock ");
192 if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0)
193 goto cleanup;
194
195 sum = anon + file + kernel + sock;
196 if (labs(sum - current) < MAX_VMSTAT_ERROR) {
197 ret = KSFT_PASS;
198 } else {
199 printf("memory.current = %ld\n", current);
200 printf("anon + file + kernel + sock = %ld\n", sum);
201 printf("anon = %ld\n", anon);
202 printf("file = %ld\n", file);
203 printf("kernel = %ld\n", kernel);
204 printf("sock = %ld\n", sock);
205 }
206
207 cleanup:
208 cg_destroy(parent);
209 free(parent);
210
211 return ret;
212 }
213
214 /*
215 * The test reads the entire /proc/kpagecgroup. If the operation went
216 * successfully (and the kernel didn't panic), the test is treated as passed.
217 */
test_kmem_proc_kpagecgroup(const char * root)218 static int test_kmem_proc_kpagecgroup(const char *root)
219 {
220 unsigned long buf[128];
221 int ret = KSFT_FAIL;
222 ssize_t len;
223 int fd;
224
225 fd = open("/proc/kpagecgroup", O_RDONLY);
226 if (fd < 0)
227 return ret;
228
229 do {
230 len = read(fd, buf, sizeof(buf));
231 } while (len > 0);
232
233 if (len == 0)
234 ret = KSFT_PASS;
235
236 close(fd);
237 return ret;
238 }
239
/*
 * pthread entry point: park the thread for a long time so its kernel
 * stack remains allocated while the parent samples memory.stat.
 */
static void *pthread_wait_fn(void *arg)
{
	(void)arg;

	sleep(100);
	return NULL;
}
245
spawn_1000_threads(const char * cgroup,void * arg)246 static int spawn_1000_threads(const char *cgroup, void *arg)
247 {
248 int nr_threads = 1000;
249 pthread_t *tinfo;
250 unsigned long i;
251 long stack;
252 int ret = -1;
253
254 tinfo = calloc(nr_threads, sizeof(pthread_t));
255 if (tinfo == NULL)
256 return -1;
257
258 for (i = 0; i < nr_threads; i++) {
259 if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
260 (void *)i)) {
261 free(tinfo);
262 return(-1);
263 }
264 }
265
266 stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
267 if (stack >= 4096 * 1000)
268 ret = 0;
269
270 free(tinfo);
271 return ret;
272 }
273
274 /*
275 * The test spawns a process, which spawns 1000 threads. Then it checks
276 * that memory.stat's kernel_stack is at least 1000 pages large.
277 */
test_kmem_kernel_stacks(const char * root)278 static int test_kmem_kernel_stacks(const char *root)
279 {
280 int ret = KSFT_FAIL;
281 char *cg = NULL;
282
283 cg = cg_name(root, "kmem_kernel_stacks_test");
284 if (!cg)
285 goto cleanup;
286
287 if (cg_create(cg))
288 goto cleanup;
289
290 if (cg_run(cg, spawn_1000_threads, NULL))
291 goto cleanup;
292
293 ret = KSFT_PASS;
294 cleanup:
295 cg_destroy(cg);
296 free(cg);
297
298 return ret;
299 }
300
301 /*
302 * This test sequentionally creates 30 child cgroups, allocates some
303 * kernel memory in each of them, and deletes them. Then it checks
304 * that the number of dying cgroups on the parent level is 0.
305 */
test_kmem_dead_cgroups(const char * root)306 static int test_kmem_dead_cgroups(const char *root)
307 {
308 int ret = KSFT_FAIL;
309 char *parent;
310 long dead = -1;
311
312 parent = cg_name(root, "kmem_dead_cgroups_test");
313 if (!parent)
314 goto cleanup;
315
316 if (cg_create(parent))
317 goto cleanup;
318
319 if (cg_write(parent, "cgroup.subtree_control", "+memory"))
320 goto cleanup;
321
322 if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
323 goto cleanup;
324
325 /*
326 * Allow up to ~8s for reclaim of dying descendants to complete.
327 * This is a generous upper bound derived from stress testing, not
328 * from a specific kernel constant, and can be adjusted if reclaim
329 * behavior changes in the future.
330 */
331 dead = cg_read_key_long_poll(parent, "cgroup.stat",
332 "nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES,
333 DEFAULT_WAIT_INTERVAL_US);
334 if (dead)
335 goto cleanup;
336
337 ret = KSFT_PASS;
338
339 cleanup:
340 cg_destroy(parent);
341 free(parent);
342
343 return ret;
344 }
345
346 /*
347 * This test creates a sub-tree with 1000 memory cgroups.
348 * Then it checks that the memory.current on the parent level
349 * is greater than 0 and approximates matches the percpu value
350 * from memory.stat.
351 */
test_percpu_basic(const char * root)352 static int test_percpu_basic(const char *root)
353 {
354 int ret = KSFT_FAIL;
355 char *parent, *child;
356 long current, percpu;
357 int i;
358
359 parent = cg_name(root, "percpu_basic_test");
360 if (!parent)
361 goto cleanup;
362
363 if (cg_create(parent))
364 goto cleanup;
365
366 if (cg_write(parent, "cgroup.subtree_control", "+memory"))
367 goto cleanup;
368
369 for (i = 0; i < 1000; i++) {
370 child = cg_name_indexed(parent, "child", i);
371 if (!child) {
372 ret = -1;
373 goto cleanup_children;
374 }
375
376 if (cg_create(child)) {
377 free(child);
378 goto cleanup_children;
379 }
380
381 free(child);
382 }
383
384 current = cg_read_long(parent, "memory.current");
385 percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
386
387 if (current > 0 && percpu > 0 && labs(current - percpu) <
388 MAX_VMSTAT_ERROR)
389 ret = KSFT_PASS;
390 else
391 printf("memory.current %ld\npercpu %ld\n",
392 current, percpu);
393
394 cleanup_children:
395 for (i = 0; i < 1000; i++) {
396 child = cg_name_indexed(parent, "child", i);
397 cg_destroy(child);
398 free(child);
399 }
400
401 cleanup:
402 cg_destroy(parent);
403 free(parent);
404
405 return ret;
406 }
407
/* T() pairs a test function with its stringified name for reporting. */
#define T(x) { x, #x }
struct kmem_test {
	int (*fn)(const char *root);	/* test body; returns a KSFT_* code */
	const char *name;		/* printable test name */
} tests[] = {
	T(test_kmem_basic),
	T(test_kmem_memcg_deletion),
	T(test_kmem_proc_kpagecgroup),
	T(test_kmem_kernel_stacks),
	T(test_kmem_dead_cgroups),
	T(test_percpu_basic),
};
#undef T
421
main(int argc,char ** argv)422 int main(int argc, char **argv)
423 {
424 char root[PATH_MAX];
425 int i;
426
427 ksft_print_header();
428 ksft_set_plan(ARRAY_SIZE(tests));
429 if (cg_find_unified_root(root, sizeof(root), NULL))
430 ksft_exit_skip("cgroup v2 isn't mounted\n");
431
432 /*
433 * Check that memory controller is available:
434 * memory is listed in cgroup.controllers
435 */
436 if (cg_read_strstr(root, "cgroup.controllers", "memory"))
437 ksft_exit_skip("memory controller isn't available\n");
438
439 if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
440 if (cg_write(root, "cgroup.subtree_control", "+memory"))
441 ksft_exit_skip("Failed to set memory controller\n");
442
443 for (i = 0; i < ARRAY_SIZE(tests); i++) {
444 switch (tests[i].fn(root)) {
445 case KSFT_PASS:
446 ksft_test_result_pass("%s\n", tests[i].name);
447 break;
448 case KSFT_SKIP:
449 ksft_test_result_skip("%s\n", tests[i].name);
450 break;
451 default:
452 ksft_test_result_fail("%s\n", tests[i].name);
453 break;
454 }
455 }
456
457 ksft_finished();
458 }
459