xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision 906fd46a65383cd639e5eec72a047efc33045d86)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 static bool has_localevents;
25 static bool has_recursiveprot;
26 
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 *
 * Expectations:
 * - with "+memory" in the parent's cgroup.subtree_control, the child
 *   lists "memory" in its cgroup.controllers;
 * - without it, the child does not.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	/* memory must have been delegated to the child */
	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	/* cgroup.controllers must exist and be readable... */
	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* ...but must not list "memory" (a strstr miss is non-zero) */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/*
	 * Cleanup labels deliberately fall through: each one tears down
	 * what the stages before it created.
	 */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
92 
93 static int alloc_anon_50M_check(const char *cgroup, void *arg)
94 {
95 	size_t size = MB(50);
96 	char *buf, *ptr;
97 	long anon, current;
98 	int ret = -1;
99 
100 	buf = malloc(size);
101 	if (buf == NULL) {
102 		fprintf(stderr, "malloc() failed\n");
103 		return -1;
104 	}
105 
106 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
107 		*ptr = 0;
108 
109 	current = cg_read_long(cgroup, "memory.current");
110 	if (current < size)
111 		goto cleanup;
112 
113 	if (!values_close(size, current, 3))
114 		goto cleanup;
115 
116 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
117 	if (anon < 0)
118 		goto cleanup;
119 
120 	if (!values_close(anon, current, 3))
121 		goto cleanup;
122 
123 	ret = 0;
124 cleanup:
125 	free(buf);
126 	return ret;
127 }
128 
/*
 * Populate 50M of pagecache via a temporary file inside @cgroup (run
 * via cg_run()), then verify that memory.current and the "file"
 * counter in memory.stat both reflect it.
 * Returns 0 on success, -1 on failure.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Compare as signed: without the cast a cg_read_long() error (-1)
	 * would convert to a huge unsigned value and pass this check.
	 */
	if (current < (long)size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
160 
161 /*
162  * This test create a memory cgroup, allocates
163  * some anonymous memory and some pagecache
164  * and check memory.current and some memory.stat values.
165  */
166 static int test_memcg_current(const char *root)
167 {
168 	int ret = KSFT_FAIL;
169 	long current;
170 	char *memcg;
171 
172 	memcg = cg_name(root, "memcg_test");
173 	if (!memcg)
174 		goto cleanup;
175 
176 	if (cg_create(memcg))
177 		goto cleanup;
178 
179 	current = cg_read_long(memcg, "memory.current");
180 	if (current != 0)
181 		goto cleanup;
182 
183 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
184 		goto cleanup;
185 
186 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
187 		goto cleanup;
188 
189 	ret = KSFT_PASS;
190 
191 cleanup:
192 	cg_destroy(memcg);
193 	free(memcg);
194 
195 	return ret;
196 }
197 
/*
 * Populate 50M of pagecache through @arg (a temp file descriptor) and
 * then linger, keeping the charge alive until the process that
 * spawned us goes away.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int spawner = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	/* Hold the memory until the spawning process exits */
	while (getppid() == spawner)
		sleep(1);

	return 0;
}
211 
212 static int alloc_anon_noexit(const char *cgroup, void *arg)
213 {
214 	int ppid = getppid();
215 	size_t size = (unsigned long)arg;
216 	char *buf, *ptr;
217 
218 	buf = malloc(size);
219 	if (buf == NULL) {
220 		fprintf(stderr, "malloc() failed\n");
221 		return -1;
222 	}
223 
224 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
225 		*ptr = 0;
226 
227 	while (getppid() == ppid)
228 		sleep(1);
229 
230 	free(buf);
231 	return 0;
232 }
233 
/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * Polls cgroup.procs up to 10 times, 100ms apart; fails if the cgroup
 * is still populated after that.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempts = 10;

	while (attempts--) {
		/* An empty cgroup.procs means every process is gone */
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
250 
251 static bool reclaim_until(const char *memcg, long goal);
252 
253 /*
254  * First, this test creates the following hierarchy:
255  * A       memory.min = 0,    memory.max = 200M
256  * A/B     memory.min = 50M
257  * A/B/C   memory.min = 75M,  memory.current = 50M
258  * A/B/D   memory.min = 25M,  memory.current = 50M
259  * A/B/E   memory.min = 0,    memory.current = 50M
260  * A/B/F   memory.min = 500M, memory.current = 0
261  *
262  * (or memory.low if we test soft protection)
263  *
264  * Usages are pagecache and the test keeps a running
265  * process in every leaf cgroup.
266  * Then it creates A/G and creates a significant
267  * memory pressure in A.
268  *
269  * Then it checks actual memory usages and expects that:
270  * A/B    memory.current ~= 50M
271  * A/B/C  memory.current ~= 29M
272  * A/B/D  memory.current ~= 21M
273  * A/B/E  memory.current ~= 0
274  * A/B/F  memory.current  = 0
275  * (for origin of the numbers, see model in memcg_protection.m.)
276  *
277  * After that it tries to allocate more than there is
278  * unprotected memory in A available, and checks that:
279  * a) memory.min protects pagecache even in this case,
280  * b) memory.low allows reclaiming page cache with low events.
281  *
282  * Then we try to reclaim from A/B/C using memory.reclaim until its
283  * usage reaches 10M.
284  * This makes sure that:
285  * (a) We ignore the protection of the reclaim target memcg.
286  * (b) The previously calculated emin value (~29M) should be dismissed.
287  */
/* See the block comment above for the hierarchy layout, the protection
 * configuration and the origin of the expected usage numbers. */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	/* One body covers both hard (memory.min) and soft (memory.low) protection */
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	/* parent[2] is A/G: the sibling used to generate memory pressure */
	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	/* Disable swap so reclaim can only target the pagecache */
	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* The last child (F) stays empty: protected but unused */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* Wait (up to ~6s) for the three 50M pagecache loads to settle */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create significant memory pressure in A by filling most of memory.max */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	/* F never allocated: its huge protection must not "attract" memory */
	if (c[3] != 0)
		goto cleanup;

	/* Overcommit: only memory.min may make this allocation fail */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	/* memory.reclaim must ignore the reclaim target's own protection */
	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Soft protection: low events are expected only in the first two
	 * children (C and D); no child may have seen an oom event.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* Destroy children before their parents; free(NULL) is a no-op */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
453 
/* Exercise hard protection (memory.min). */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
458 
/* Exercise soft protection (memory.low). */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
463 
/*
 * Try to fill 50M of pagecache in a cgroup whose memory.high or
 * memory.max is 30M, and verify that the usage is capped around 30M.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	long usage, high, max;
	int ret = -1;
	int tmp_fd;

	/* Precondition: one of the two limits must be exactly 30M */
	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, size))
		goto cleanup;

	usage = cg_read_long(cgroup, "memory.current");
	if (!values_close(usage, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(tmp_fd);
	return ret;
}
494 
495 /*
496  * This test checks that memory.high limits the amount of
497  * memory which can be consumed by either anonymous memory
498  * or pagecache.
499  */
500 static int test_memcg_high(const char *root)
501 {
502 	int ret = KSFT_FAIL;
503 	char *memcg;
504 	long high;
505 
506 	memcg = cg_name(root, "memcg_test");
507 	if (!memcg)
508 		goto cleanup;
509 
510 	if (cg_create(memcg))
511 		goto cleanup;
512 
513 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
514 		goto cleanup;
515 
516 	if (cg_write(memcg, "memory.swap.max", "0"))
517 		goto cleanup;
518 
519 	if (cg_write(memcg, "memory.high", "30M"))
520 		goto cleanup;
521 
522 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
523 		goto cleanup;
524 
525 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
526 		goto cleanup;
527 
528 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
529 		goto cleanup;
530 
531 	high = cg_read_key_long(memcg, "memory.events", "high ");
532 	if (high <= 0)
533 		goto cleanup;
534 
535 	ret = KSFT_PASS;
536 
537 cleanup:
538 	cg_destroy(memcg);
539 	free(memcg);
540 
541 	return ret;
542 }
543 
544 static int alloc_anon_mlock(const char *cgroup, void *arg)
545 {
546 	size_t size = (size_t)arg;
547 	void *buf;
548 
549 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
550 		   0, 0);
551 	if (buf == MAP_FAILED)
552 		return -1;
553 
554 	mlock(buf, size);
555 	munmap(buf, size);
556 	return 0;
557 }
558 
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Snapshot the event counters before the allocation */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* Arm the memory.events notification before starting the allocator */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	/* 200M via one mmap+mlock: a single large kernel-entry allocation */
	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/* The high limit must have fired; the max limit must not have */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
619 
/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* memory.max must default to "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* Pagecache, in contrast, can be reclaimed to stay under the limit */
	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	/* The "max" event counter must have been bumped */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
670 
/*
 * Repeatedly write to @memcg's memory.reclaim until its usage drops
 * to @goal (within 3%), with at most 5 attempts.
 *
 * Returns false if the usage already started below the goal.
 *
 * Assumes that writing to memory.reclaim is the only source of change
 * in memory.current (no concurrent allocations or reclaim).
 *
 * Also sanity-checks memory.reclaim itself: returns false if its
 * error codes do not make sense, even if the usage goal was reached.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char amount[64];
	int attempt, err;
	long usage;
	bool reclaimed = false;

	for (attempt = 0; attempt < 5; attempt++) {
		usage = cg_read_long(memcg, "memory.current");

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/*
		 * A successful write should have brought us to the goal:
		 * being here again means memory.reclaim returned 0 bogusly.
		 */
		if (reclaimed)
			return false;

		snprintf(amount, sizeof(amount), "%ld", usage - goal);
		err = cg_write(memcg, "memory.reclaim", amount);
		if (err && err != -EAGAIN)
			return false;
		if (!err)
			reclaimed = true;
	}
	return reclaimed;
}
712 
/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* The new cgroup must start out empty */
	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	/* Background process keeps 50M of pagecache charged */
	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}
785 
/*
 * Allocate and touch 50M of anonymous memory and verify that the
 * charge is split between memory (close to @arg, the memory.max value)
 * and swap, with memory + swap adding up to the full allocation.
 */
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	/* Touch one byte per page so the memory is actually charged */
	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	/* Resident part must sit right at the memory limit */
	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	/* The remainder must have gone to swap */
	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}
817 
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Skip if the fresh cgroup already has something swapped out */
	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	/* Both limits must default to "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/* 50M fits once up to 30M of it is allowed to go to swap */
	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	/* The "max" event counter must have been bumped */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
880 
881 /*
882  * This test disables swapping and tries to allocate anonymous memory
883  * up to OOM. Then it checks for oom and oom_kill events in
884  * memory.events.
885  */
886 static int test_memcg_oom_events(const char *root)
887 {
888 	int ret = KSFT_FAIL;
889 	char *memcg;
890 
891 	memcg = cg_name(root, "memcg_test");
892 	if (!memcg)
893 		goto cleanup;
894 
895 	if (cg_create(memcg))
896 		goto cleanup;
897 
898 	if (cg_write(memcg, "memory.max", "30M"))
899 		goto cleanup;
900 
901 	if (cg_write(memcg, "memory.swap.max", "0"))
902 		goto cleanup;
903 
904 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
905 		goto cleanup;
906 
907 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
908 		goto cleanup;
909 
910 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
911 		goto cleanup;
912 
913 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
914 		goto cleanup;
915 
916 	ret = KSFT_PASS;
917 
918 cleanup:
919 	cg_destroy(memcg);
920 	free(memcg);
921 
922 	return ret;
923 }
924 
/* Arguments handed to the forked tcp_server() process. */
struct tcp_server_args {
	unsigned short port;	/* TCP port the server should bind to */
	int ctl[2];		/* control pipe: server writes its bind status
				 * to ctl[1], the parent reads it from ctl[0] */
};
929 
/*
 * Server half of the socket-accounting test: bind an IPv6 TCP socket
 * to the requested port, report the bind status over the control pipe,
 * then stream data to the first accepted client until it disconnects.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* Keep only the write end of the control pipe */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	/* On bind failure, hand errno (e.g. EADDRINUSE) to the parent */
	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* Report success (0) so the parent can start the client */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	/*
	 * Stream 1M chunks (the buffer contents are irrelevant) until the
	 * client drops the connection; ECONNRESET is the expected exit.
	 */
	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
986 
/*
 * Client half of the socket-accounting test: connect to the test
 * server on @port and interleave 1M reads with checks that
 * memory.current (minus whatever was charged before the connection)
 * stays close to the "sock " counter in memory.stat.
 *
 * Returns KSFT_PASS if the counters converge within 16 reads,
 * KSFT_FAIL or a negative/-resolver error code otherwise.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	/* Remember the pre-connection charge so it can be excluded below */
	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * Use %hu, not %hd: port is unsigned and can exceed SHRT_MAX, in
	 * which case %hd would print a negative number that getaddrinfo()
	 * cannot resolve (and that would not even fit in servport[]).
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1037 
/*
 * This test checks socket memory accounting.
 * The test forks a TCP server listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with check whether
 * memory.current and memory.stat.sock are similar.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Retry with fresh random ports until the server manages to bind */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		/* The server reports its bind status over the pipe */
		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		/* Port collision: reap the failed server and try again */
		waitpid(pid, NULL, 0);
	}

	/* Every attempt hit a busy port: skip rather than fail */
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/* All socket memory must be uncharged once the connection is gone */
	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1110 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	/* Make the OOM killer take down the whole leaf cgroup at once */
	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	/* Populate the parent and the leaf; only the leaf should be killed */
	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	/* The 100M allocation must trigger OOM, i.e. cg_run() must fail */
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
1183 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	/* Group OOM on the parent: a kill must take out the whole subtree */
	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	/* The 100M allocation must trigger OOM, i.e. cg_run() must fail */
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* Both levels must end up with no surviving processes */
	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
1239 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	/* This process is shielded from the OOM killer... */
	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	/* ...while these are fair game */
	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* Exactly 3 oom_kill events are expected; the ADJ_MIN task must
	 * not be among the victims */
	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	/* The shielded process must still be alive: reap it ourselves */
	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}
1291 
/* T() pairs a test entry point with its printable name */
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);	/* test entry point; returns KSFT_* */
	const char *name;		/* function name, used for reporting */
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T
1313 
/*
 * Locate the unified cgroup v2 hierarchy, verify the memory controller
 * is usable, record relevant mount options, then run every test in
 * tests[], reporting each result through the kselftest harness.
 */
int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	/* Enable the memory controller for children if it isn't already */
	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	/* Record mount options that alter event accounting semantics */
	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	/* Run every test; any single failure fails the whole binary */
	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}
1360