// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE

#include <linux/limits.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>
#include <errno.h>
#include <sys/sysinfo.h>
#include <pthread.h>

#include "kselftest.h"
#include "cgroup_util.h"

/*
 * Memory cgroup charging is performed using per-cpu batches of 64 pages
 * (see MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So the maximum
 * discrepancy between the charge and the vmstat entries is the number of
 * CPUs multiplied by 64 pages. The macro below assumes a 4KB page size.
 */
#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())

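/*
 * Number of polling attempts when waiting for dying cgroups to go away;
 * together with the default poll interval this allows roughly 8 seconds
 * (see the comment in test_kmem_dead_cgroups()).
 */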
#define KMEM_DEAD_WAIT_RETRIES        80

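/*
 * Allocate negative dentries by stat()-ing (unsigned long)arg non-existent
 * paths with long names; the resulting dentry cache (slab) memory is charged
 * to the cgroup the caller is running in.
 */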
static int alloc_dcache(const char *cgroup, void *arg)
{
	unsigned long i;
	struct stat st;
	char buf[128];

	for (i = 0; i < (unsigned long)arg; i++) {
		snprintf(buf, sizeof(buf),
			"/something-non-existent-with-a-long-name-%64lu-%d",
			 i, getpid());
		stat(buf, &st);
	}

	return 0;
}

/*
 * This test allocates 100000 negative dentries with long names.
 * Then it checks that "slab" in memory.stat is larger than 1M.
 * Then it sets memory.high to 1M and checks that at least 1/2
 * of slab memory has been reclaimed.
 */
static int test_kmem_basic(const char *root)
{
	int ret = KSFT_FAIL;
	char *cg = NULL;
	long slab0, slab1, current;

	cg = cg_name(root, "kmem_basic_test");
	if (!cg)
		goto cleanup;

	if (cg_create(cg))
		goto cleanup;

	if (cg_run(cg, alloc_dcache, (void *)100000))
		goto cleanup;

	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
	if (slab0 < (1 << 20))
		goto cleanup;

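	/*
	 * Lowering memory.high well below the current usage triggers
	 * reclaim, which should shrink the negative dentry caches.
	 */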
	cg_write(cg, "memory.high", "1M");

	/* wait for RCU freeing */
	sleep(1);

	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
	if (slab1 < 0)
		goto cleanup;

	current = cg_read_long(cg, "memory.current");
	if (current < 0)
		goto cleanup;

	if (slab1 < slab0 / 2 && current < slab0 / 2)
		ret = KSFT_PASS;
cleanup:
	cg_destroy(cg);
	free(cg);

	return ret;
}

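/* Thread worker for alloc_kmem_smp(): allocate a batch of negative dentries. */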
static void *alloc_kmem_fn(void *arg)
{
	alloc_dcache(NULL, (void *)100);
	return NULL;
}

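/*
 * Allocate slab memory from 2 * NR_CPUS threads, so that charges and vmstat
 * updates are likely spread across many CPUs' per-cpu batches.
 */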
static int alloc_kmem_smp(const char *cgroup, void *arg)
{
	int nr_threads = 2 * get_nprocs();
	pthread_t *tinfo;
	unsigned long i;
	int ret = -1;

	tinfo = calloc(nr_threads, sizeof(pthread_t));
	if (tinfo == NULL)
		return -1;

	for (i = 0; i < nr_threads; i++) {
		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
				   (void *)i)) {
			free(tinfo);
			return -1;
		}
	}

	for (i = 0; i < nr_threads; i++) {
		ret = pthread_join(tinfo[i], NULL);
		if (ret)
			break;
	}

	free(tinfo);
	return ret;
}

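/*
 * Create @times child cgroups under @parent, run @fn with @arg in each of
 * them and destroy them again. The callers use this to produce dying
 * (offline) cgroups whose memory hasn't been reclaimed yet.
 */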
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		if (cg_create(child)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		if (cg_run(child, fn, arg)) {
			cg_destroy(child);
			free(child);
			return -1;
		}

		cg_destroy(child);
		free(child);
	}

	return 0;
}

/*
 * The test creates and destroys a large number of cgroups. In each cgroup it
 * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
 * threads. Then it checks the sanity of numbers on the parent level:
 * the total size of the cgroups should be roughly equal to
 * anon + file + kernel + sock.
 */
static int test_kmem_memcg_deletion(const char *root)
{
	long current, anon, file, kernel, sock, sum;
	int ret = KSFT_FAIL;
	char *parent;

	parent = cg_name(root, "kmem_memcg_deletion_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
		goto cleanup;

	current = cg_read_long(parent, "memory.current");
	anon = cg_read_key_long(parent, "memory.stat", "anon ");
	file = cg_read_key_long(parent, "memory.stat", "file ");
	kernel = cg_read_key_long(parent, "memory.stat", "kernel ");
	sock = cg_read_key_long(parent, "memory.stat", "sock ");
	if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0)
		goto cleanup;

	sum = anon + file + kernel + sock;
	if (labs(sum - current) < MAX_VMSTAT_ERROR) {
		ret = KSFT_PASS;
	} else {
		printf("memory.current = %ld\n", current);
		printf("anon + file + kernel + sock = %ld\n", sum);
		printf("anon = %ld\n", anon);
		printf("file = %ld\n", file);
		printf("kernel = %ld\n", kernel);
		printf("sock = %ld\n", sock);
	}

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

/*
 * The test reads the entire /proc/kpagecgroup. If the read completes
 * successfully (and the kernel didn't panic), the test is treated as passed.
 */
static int test_kmem_proc_kpagecgroup(const char *root)
{
	unsigned long buf[128];
	int ret = KSFT_FAIL;
	ssize_t len;
	int fd;

	fd = open("/proc/kpagecgroup", O_RDONLY);
	if (fd < 0)
		return ret;

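	/*
	 * /proc/kpagecgroup contains one 64-bit memory cgroup inode number
	 * per page; read it in fixed-size chunks until EOF.
	 */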
	do {
		len = read(fd, buf, sizeof(buf));
	} while (len > 0);

	if (len == 0)
		ret = KSFT_PASS;

	close(fd);
	return ret;
}

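/*
 * Park the thread so that its kernel stack stays allocated while
 * spawn_1000_threads() samples memory.stat::kernel_stack.
 */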
static void *pthread_wait_fn(void *arg)
{
	sleep(100);
	return NULL;
}

static int spawn_1000_threads(const char *cgroup, void *arg)
{
	int nr_threads = 1000;
	pthread_t *tinfo;
	unsigned long i;
	long stack;
	int ret = -1;

	tinfo = calloc(nr_threads, sizeof(pthread_t));
	if (tinfo == NULL)
		return -1;

	for (i = 0; i < nr_threads; i++) {
		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
				   (void *)i)) {
			free(tinfo);
			return -1;
		}
	}

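	/* Each thread should account at least one page of kernel stack. */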
	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
	if (stack >= 4096 * 1000)
		ret = 0;

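	/*
	 * The threads are intentionally not joined; they are torn down when
	 * the forked test process exits after this function returns.
	 */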
	free(tinfo);
	return ret;
}

/*
 * The test spawns a process, which spawns 1000 threads. Then it checks
 * that memory.stat's kernel_stack is at least 1000 pages large.
 */
static int test_kmem_kernel_stacks(const char *root)
{
	int ret = KSFT_FAIL;
	char *cg = NULL;

	cg = cg_name(root, "kmem_kernel_stacks_test");
	if (!cg)
		goto cleanup;

	if (cg_create(cg))
		goto cleanup;

	if (cg_run(cg, spawn_1000_threads, NULL))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(cg);
	free(cg);

	return ret;
}

/*
 * This test sequentially creates 30 child cgroups, allocates some
 * kernel memory in each of them, and deletes them. Then it checks
 * that the number of dying cgroups on the parent level is 0.
 */
static int test_kmem_dead_cgroups(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent;
	long dead = -1;

	parent = cg_name(root, "kmem_dead_cgroups_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
		goto cleanup;

	/*
	 * Allow up to ~8s for reclaim of dying descendants to complete.
	 * This is a generous upper bound derived from stress testing, not
	 * from a specific kernel constant, and can be adjusted if reclaim
	 * behavior changes in the future.
	 */
	dead = cg_read_key_long_poll(parent, "cgroup.stat",
					"nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES,
					DEFAULT_WAIT_INTERVAL_US);
	if (dead)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

/*
 * This test creates a sub-tree with 1000 memory cgroups.
 * Then it checks that memory.current on the parent level
 * is greater than 0 and approximately matches the percpu value
 * from memory.stat.
 */
static int test_percpu_basic(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long current, percpu;
	int i;

	parent = cg_name(root, "percpu_basic_test");
	if (!parent)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	for (i = 0; i < 1000; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			goto cleanup_children;

		if (cg_create(child))
			goto cleanup_children;

		free(child);
	}

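	/*
	 * With 1000 empty children, the parent's memory footprint is expected
	 * to be dominated by per-cgroup percpu allocations, so memory.current
	 * should roughly match the "percpu" counter.
	 */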
	current = cg_read_long(parent, "memory.current");
	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");

	if (current > 0 && percpu > 0 && labs(current - percpu) <
	    MAX_VMSTAT_ERROR)
		ret = KSFT_PASS;
	else
		printf("memory.current %ld\npercpu %ld\n",
		       current, percpu);

cleanup_children:
	for (i = 0; i < 1000; i++) {
		child = cg_name_indexed(parent, "child", i);
		cg_destroy(child);
		free(child);
	}

cleanup:
	cg_destroy(parent);
	free(parent);

	return ret;
}

#define T(x) { x, #x }
struct kmem_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_kmem_basic),
	T(test_kmem_memcg_deletion),
	T(test_kmem_proc_kpagecgroup),
	T(test_kmem_kernel_stacks),
	T(test_kmem_dead_cgroups),
	T(test_percpu_basic),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i;

	ksft_print_header();
	ksft_set_plan(ARRAY_SIZE(tests));
	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that the memory controller is available:
	 * "memory" must be listed in cgroup.controllers.
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

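	/*
	 * If the memory controller isn't enabled for the root's children yet,
	 * try to enable it; skip the tests if that fails.
	 */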
	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	ksft_finished();
}
455