xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision e77a8005748547fb1f10645097f13ccdd804d7e5)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 static bool has_localevents;
25 static bool has_recursiveprot;
26 
27 /*
28  * This test creates two nested cgroups with and without enabling
29  * the memory controller.
30  */
31 static int test_memcg_subtree_control(const char *root)
32 {
33 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
34 	int ret = KSFT_FAIL;
35 	char buf[PAGE_SIZE];
36 
37 	/* Create two nested cgroups with the memory controller enabled */
38 	parent = cg_name(root, "memcg_test_0");
39 	child = cg_name(root, "memcg_test_0/memcg_test_1");
40 	if (!parent || !child)
41 		goto cleanup_free;
42 
43 	if (cg_create(parent))
44 		goto cleanup_free;
45 
46 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
47 		goto cleanup_parent;
48 
49 	if (cg_create(child))
50 		goto cleanup_parent;
51 
52 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
53 		goto cleanup_child;
54 
55 	/* Create two nested cgroups without enabling memory controller */
56 	parent2 = cg_name(root, "memcg_test_1");
57 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
58 	if (!parent2 || !child2)
59 		goto cleanup_free2;
60 
61 	if (cg_create(parent2))
62 		goto cleanup_free2;
63 
64 	if (cg_create(child2))
65 		goto cleanup_parent2;
66 
67 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
68 		goto cleanup_all;
69 
70 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
71 		goto cleanup_all;
72 
73 	ret = KSFT_PASS;
74 
75 cleanup_all:
76 	cg_destroy(child2);
77 cleanup_parent2:
78 	cg_destroy(parent2);
79 cleanup_free2:
80 	free(parent2);
81 	free(child2);
82 cleanup_child:
83 	cg_destroy(child);
84 cleanup_parent:
85 	cg_destroy(parent);
86 cleanup_free:
87 	free(parent);
88 	free(child);
89 
90 	return ret;
91 }
92 
93 static int alloc_anon_50M_check(const char *cgroup, void *arg)
94 {
95 	size_t size = MB(50);
96 	char *buf, *ptr;
97 	long anon, current;
98 	int ret = -1;
99 
100 	buf = malloc(size);
101 	if (buf == NULL) {
102 		fprintf(stderr, "malloc() failed\n");
103 		return -1;
104 	}
105 
106 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
107 		*ptr = 0;
108 
109 	current = cg_read_long(cgroup, "memory.current");
110 	if (current < size)
111 		goto cleanup;
112 
113 	if (!values_close(size, current, 3))
114 		goto cleanup;
115 
116 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
117 	if (anon < 0)
118 		goto cleanup;
119 
120 	if (!values_close(anon, current, 3))
121 		goto cleanup;
122 
123 	ret = 0;
124 cleanup:
125 	free(buf);
126 	return ret;
127 }
128 
/*
 * Populate 50M of page cache through a temporary file, then check that
 * @cgroup's memory.current covers it and that the "file" entry of
 * memory.stat is close to memory.current.
 *
 * @cgroup: cgroup whose counters are inspected.
 * @arg: unused.
 *
 * Returns 0 when the counters match, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Reject a negative (failed) read explicitly: comparing a negative
	 * long against the size_t below would convert it to a huge unsigned
	 * value and let it pass the "at least 50M" check.
	 */
	if (current < 0 || (size_t)current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
160 
161 /*
162  * This test create a memory cgroup, allocates
163  * some anonymous memory and some pagecache
164  * and checks memory.current, memory.peak, and some memory.stat values.
165  */
166 static int test_memcg_current_peak(const char *root)
167 {
168 	int ret = KSFT_FAIL;
169 	long current, peak, peak_reset;
170 	char *memcg;
171 	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
172 	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
173 	struct stat ss;
174 
175 	memcg = cg_name(root, "memcg_test");
176 	if (!memcg)
177 		goto cleanup;
178 
179 	if (cg_create(memcg))
180 		goto cleanup;
181 
182 	current = cg_read_long(memcg, "memory.current");
183 	if (current != 0)
184 		goto cleanup;
185 
186 	peak = cg_read_long(memcg, "memory.peak");
187 	if (peak != 0)
188 		goto cleanup;
189 
190 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
191 		goto cleanup;
192 
193 	peak = cg_read_long(memcg, "memory.peak");
194 	if (peak < MB(50))
195 		goto cleanup;
196 
197 	/*
198 	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
199 	 * We need at least three to be closed in a different order than writes occurred to test
200 	 * the linked-list handling.
201 	 */
202 	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
203 
204 	if (peak_fd == -1) {
205 		if (errno == ENOENT)
206 			ret = KSFT_SKIP;
207 		goto cleanup;
208 	}
209 
210 	/*
211 	 * Before we try to use memory.peak's fd, try to figure out whether
212 	 * this kernel supports writing to that file in the first place. (by
213 	 * checking the writable bit on the file's st_mode)
214 	 */
215 	if (fstat(peak_fd, &ss))
216 		goto cleanup;
217 
218 	if ((ss.st_mode & S_IWUSR) == 0) {
219 		ret = KSFT_SKIP;
220 		goto cleanup;
221 	}
222 
223 	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
224 
225 	if (peak_fd2 == -1)
226 		goto cleanup;
227 
228 	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
229 
230 	if (peak_fd3 == -1)
231 		goto cleanup;
232 
233 	/* any non-empty string resets, but make it clear */
234 	static const char reset_string[] = "reset\n";
235 
236 	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
237 	if (peak_reset != sizeof(reset_string))
238 		goto cleanup;
239 
240 	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
241 	if (peak_reset != sizeof(reset_string))
242 		goto cleanup;
243 
244 	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
245 	if (peak_reset != sizeof(reset_string))
246 		goto cleanup;
247 
248 	/* Make sure a completely independent read isn't affected by our  FD-local reset above*/
249 	peak = cg_read_long(memcg, "memory.peak");
250 	if (peak < MB(50))
251 		goto cleanup;
252 
253 	fd2_closed = true;
254 	if (close(peak_fd2))
255 		goto cleanup;
256 
257 	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
258 
259 	if (peak_fd4 == -1)
260 		goto cleanup;
261 
262 	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
263 	if (peak_reset != sizeof(reset_string))
264 		goto cleanup;
265 
266 	peak = cg_read_long_fd(peak_fd);
267 	if (peak > MB(30) || peak < 0)
268 		goto cleanup;
269 
270 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
271 		goto cleanup;
272 
273 	peak = cg_read_long(memcg, "memory.peak");
274 	if (peak < MB(50))
275 		goto cleanup;
276 
277 	/* Make sure everything is back to normal */
278 	peak = cg_read_long_fd(peak_fd);
279 	if (peak < MB(50))
280 		goto cleanup;
281 
282 	peak = cg_read_long_fd(peak_fd4);
283 	if (peak < MB(50))
284 		goto cleanup;
285 
286 	fd3_closed = true;
287 	if (close(peak_fd3))
288 		goto cleanup;
289 
290 	fd4_closed = true;
291 	if (close(peak_fd4))
292 		goto cleanup;
293 
294 	ret = KSFT_PASS;
295 
296 cleanup:
297 	close(peak_fd);
298 	if (!fd2_closed)
299 		close(peak_fd2);
300 	if (!fd3_closed)
301 		close(peak_fd3);
302 	if (!fd4_closed)
303 		close(peak_fd4);
304 	cg_destroy(memcg);
305 	free(memcg);
306 
307 	return ret;
308 }
309 
/*
 * Populate 50M of page cache through the file descriptor passed in @arg,
 * then stay alive for as long as getppid() keeps returning the parent
 * PID observed at entry.
 *
 * @cgroup: unused.
 * @arg: file descriptor, passed as an integer cast to a pointer.
 *
 * Returns -1 if alloc_pagecache() fails, 0 once the parent PID changed.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int parent_pid = getppid();
	int cache_fd = (long)arg;

	if (alloc_pagecache(cache_fd, MB(50)) != 0)
		return -1;

	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
323 
324 static int alloc_anon_noexit(const char *cgroup, void *arg)
325 {
326 	int ppid = getppid();
327 	size_t size = (unsigned long)arg;
328 	char *buf, *ptr;
329 
330 	buf = malloc(size);
331 	if (buf == NULL) {
332 		fprintf(stderr, "malloc() failed\n");
333 		return -1;
334 	}
335 
336 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
337 		*ptr = 0;
338 
339 	while (getppid() == ppid)
340 		sleep(1);
341 
342 	free(buf);
343 	return 0;
344 }
345 
/*
 * Wait until processes are killed asynchronously by the OOM killer.
 *
 * cgroup.procs of @cgroup is polled up to 10 times, 100ms apart.
 *
 * Returns 0 as soon as cgroup.procs compares equal to the empty string,
 * -1 if that never happens within the allowed number of polls.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int polls_left = 10;

	while (polls_left > 0) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
		polls_left--;
	}

	return -1;
}
362 
363 static bool reclaim_until(const char *memcg, long goal);
364 
365 /*
366  * First, this test creates the following hierarchy:
367  * A       memory.min = 0,    memory.max = 200M
368  * A/B     memory.min = 50M
369  * A/B/C   memory.min = 75M,  memory.current = 50M
370  * A/B/D   memory.min = 25M,  memory.current = 50M
371  * A/B/E   memory.min = 0,    memory.current = 50M
372  * A/B/F   memory.min = 500M, memory.current = 0
373  *
374  * (or memory.low if we test soft protection)
375  *
376  * Usages are pagecache and the test keeps a running
377  * process in every leaf cgroup.
378  * Then it creates A/G and creates a significant
379  * memory pressure in A.
380  *
381  * Then it checks actual memory usages and expects that:
382  * A/B    memory.current ~= 50M
383  * A/B/C  memory.current ~= 29M
384  * A/B/D  memory.current ~= 21M
385  * A/B/E  memory.current ~= 0
386  * A/B/F  memory.current  = 0
387  * (for origin of the numbers, see model in memcg_protection.m.)
388  *
389  * After that it tries to allocate more than there is
390  * unprotected memory in A available, and checks that:
391  * a) memory.min protects pagecache even in this case,
392  * b) memory.low allows reclaiming page cache with low events.
393  *
394  * Then we try to reclaim from A/B/C using memory.reclaim until its
395  * usage reaches 10M.
396  * This makes sure that:
397  * (a) We ignore the protection of the reclaim target memcg.
398  * (b) The previously calculated emin value (~29M) should be dismissed.
399  */
400 static int test_memcg_protection(const char *root, bool min)
401 {
402 	int ret = KSFT_FAIL, rc;
403 	char *parent[3] = {NULL};
404 	char *children[4] = {NULL};
405 	const char *attribute = min ? "memory.min" : "memory.low";
406 	long c[4];
407 	long current;
408 	int i, attempts;
409 	int fd;
410 
411 	fd = get_temp_fd();
412 	if (fd < 0)
413 		goto cleanup;
414 
415 	parent[0] = cg_name(root, "memcg_test_0");
416 	if (!parent[0])
417 		goto cleanup;
418 
419 	parent[1] = cg_name(parent[0], "memcg_test_1");
420 	if (!parent[1])
421 		goto cleanup;
422 
423 	parent[2] = cg_name(parent[0], "memcg_test_2");
424 	if (!parent[2])
425 		goto cleanup;
426 
427 	if (cg_create(parent[0]))
428 		goto cleanup;
429 
430 	if (cg_read_long(parent[0], attribute)) {
431 		/* No memory.min on older kernels is fine */
432 		if (min)
433 			ret = KSFT_SKIP;
434 		goto cleanup;
435 	}
436 
437 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
438 		goto cleanup;
439 
440 	if (cg_write(parent[0], "memory.max", "200M"))
441 		goto cleanup;
442 
443 	if (cg_write(parent[0], "memory.swap.max", "0"))
444 		goto cleanup;
445 
446 	if (cg_create(parent[1]))
447 		goto cleanup;
448 
449 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
450 		goto cleanup;
451 
452 	if (cg_create(parent[2]))
453 		goto cleanup;
454 
455 	for (i = 0; i < ARRAY_SIZE(children); i++) {
456 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
457 		if (!children[i])
458 			goto cleanup;
459 
460 		if (cg_create(children[i]))
461 			goto cleanup;
462 
463 		if (i > 2)
464 			continue;
465 
466 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
467 			      (void *)(long)fd);
468 	}
469 
470 	if (cg_write(parent[1],   attribute, "50M"))
471 		goto cleanup;
472 	if (cg_write(children[0], attribute, "75M"))
473 		goto cleanup;
474 	if (cg_write(children[1], attribute, "25M"))
475 		goto cleanup;
476 	if (cg_write(children[2], attribute, "0"))
477 		goto cleanup;
478 	if (cg_write(children[3], attribute, "500M"))
479 		goto cleanup;
480 
481 	attempts = 0;
482 	while (!values_close(cg_read_long(parent[1], "memory.current"),
483 			     MB(150), 3)) {
484 		if (attempts++ > 5)
485 			break;
486 		sleep(1);
487 	}
488 
489 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
490 		goto cleanup;
491 
492 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
493 		goto cleanup;
494 
495 	for (i = 0; i < ARRAY_SIZE(children); i++)
496 		c[i] = cg_read_long(children[i], "memory.current");
497 
498 	if (!values_close(c[0], MB(29), 10))
499 		goto cleanup;
500 
501 	if (!values_close(c[1], MB(21), 10))
502 		goto cleanup;
503 
504 	if (c[3] != 0)
505 		goto cleanup;
506 
507 	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
508 	if (min && !rc)
509 		goto cleanup;
510 	else if (!min && rc) {
511 		fprintf(stderr,
512 			"memory.low prevents from allocating anon memory\n");
513 		goto cleanup;
514 	}
515 
516 	current = min ? MB(50) : MB(30);
517 	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
518 		goto cleanup;
519 
520 	if (!reclaim_until(children[0], MB(10)))
521 		goto cleanup;
522 
523 	if (min) {
524 		ret = KSFT_PASS;
525 		goto cleanup;
526 	}
527 
528 	for (i = 0; i < ARRAY_SIZE(children); i++) {
529 		int no_low_events_index = 1;
530 		long low, oom;
531 
532 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
533 		low = cg_read_key_long(children[i], "memory.events", "low ");
534 
535 		if (oom)
536 			goto cleanup;
537 		if (i <= no_low_events_index && low <= 0)
538 			goto cleanup;
539 		if (i > no_low_events_index && low)
540 			goto cleanup;
541 
542 	}
543 
544 	ret = KSFT_PASS;
545 
546 cleanup:
547 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
548 		if (!children[i])
549 			continue;
550 
551 		cg_destroy(children[i]);
552 		free(children[i]);
553 	}
554 
555 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
556 		if (!parent[i])
557 			continue;
558 
559 		cg_destroy(parent[i]);
560 		free(parent[i]);
561 	}
562 	close(fd);
563 	return ret;
564 }
565 
/* Run test_memcg_protection() against the memory.min attribute. */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
570 
/* Run test_memcg_protection() against the memory.low attribute. */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
575 
/*
 * Try to populate 50M of page cache in a cgroup whose memory.high or
 * memory.max is set to 30M, and check that memory.current ends up close
 * to 30M.
 *
 * @cgroup: cgroup whose limits and usage are read.
 * @arg: unused.
 *
 * Returns 0 when usage is close to 30M; -1 when neither limit reads 30M,
 * when the temporary file or the pagecache allocation fails, or when the
 * usage is off.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	long high_limit, max_limit, usage;
	int result = -1;
	int tmp_fd;

	high_limit = cg_read_long(cgroup, "memory.high");
	max_limit = cg_read_long(cgroup, "memory.max");
	if (!(high_limit == MB(30) || max_limit == MB(30)))
		return -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, MB(50)) == 0) {
		usage = cg_read_long(cgroup, "memory.current");
		if (values_close(usage, MB(30), 5))
			result = 0;
	}

	close(tmp_fd);
	return result;
}
606 
607 /*
608  * This test checks that memory.high limits the amount of
609  * memory which can be consumed by either anonymous memory
610  * or pagecache.
611  */
612 static int test_memcg_high(const char *root)
613 {
614 	int ret = KSFT_FAIL;
615 	char *memcg;
616 	long high;
617 
618 	memcg = cg_name(root, "memcg_test");
619 	if (!memcg)
620 		goto cleanup;
621 
622 	if (cg_create(memcg))
623 		goto cleanup;
624 
625 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
626 		goto cleanup;
627 
628 	if (cg_write(memcg, "memory.swap.max", "0"))
629 		goto cleanup;
630 
631 	if (cg_write(memcg, "memory.high", "30M"))
632 		goto cleanup;
633 
634 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
635 		goto cleanup;
636 
637 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
638 		goto cleanup;
639 
640 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
641 		goto cleanup;
642 
643 	high = cg_read_key_long(memcg, "memory.events", "high ");
644 	if (high <= 0)
645 		goto cleanup;
646 
647 	ret = KSFT_PASS;
648 
649 cleanup:
650 	cg_destroy(memcg);
651 	free(memcg);
652 
653 	return ret;
654 }
655 
/*
 * Map @arg bytes of private anonymous memory, try to mlock() the whole
 * range in one call, then unmap it again.
 *
 * @cgroup: unused.
 * @arg: mapping size in bytes, passed as an integer cast to a pointer.
 *
 * Returns -1 if mmap() fails and 0 otherwise. The result of mlock() is
 * deliberately not checked.
 */
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/*
	 * Anonymous mappings have no backing file: pass -1 as the fd, which
	 * is what portable code is expected to do for MAP_ANON mappings.
	 */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}
670 
671 /*
672  * This test checks that memory.high is able to throttle big single shot
673  * allocation i.e. large allocation within one kernel entry.
674  */
675 static int test_memcg_high_sync(const char *root)
676 {
677 	int ret = KSFT_FAIL, pid, fd = -1;
678 	char *memcg;
679 	long pre_high, pre_max;
680 	long post_high, post_max;
681 
682 	memcg = cg_name(root, "memcg_test");
683 	if (!memcg)
684 		goto cleanup;
685 
686 	if (cg_create(memcg))
687 		goto cleanup;
688 
689 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
690 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
691 	if (pre_high < 0 || pre_max < 0)
692 		goto cleanup;
693 
694 	if (cg_write(memcg, "memory.swap.max", "0"))
695 		goto cleanup;
696 
697 	if (cg_write(memcg, "memory.high", "30M"))
698 		goto cleanup;
699 
700 	if (cg_write(memcg, "memory.max", "140M"))
701 		goto cleanup;
702 
703 	fd = memcg_prepare_for_wait(memcg);
704 	if (fd < 0)
705 		goto cleanup;
706 
707 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
708 	if (pid < 0)
709 		goto cleanup;
710 
711 	cg_wait_for(fd);
712 
713 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
714 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
715 	if (post_high < 0 || post_max < 0)
716 		goto cleanup;
717 
718 	if (pre_high == post_high || pre_max != post_max)
719 		goto cleanup;
720 
721 	ret = KSFT_PASS;
722 
723 cleanup:
724 	if (fd >= 0)
725 		close(fd);
726 	cg_destroy(memcg);
727 	free(memcg);
728 
729 	return ret;
730 }
731 
732 /*
733  * This test checks that memory.max limits the amount of
734  * memory which can be consumed by either anonymous memory
735  * or pagecache.
736  */
737 static int test_memcg_max(const char *root)
738 {
739 	int ret = KSFT_FAIL;
740 	char *memcg;
741 	long current, max;
742 
743 	memcg = cg_name(root, "memcg_test");
744 	if (!memcg)
745 		goto cleanup;
746 
747 	if (cg_create(memcg))
748 		goto cleanup;
749 
750 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
751 		goto cleanup;
752 
753 	if (cg_write(memcg, "memory.swap.max", "0"))
754 		goto cleanup;
755 
756 	if (cg_write(memcg, "memory.max", "30M"))
757 		goto cleanup;
758 
759 	/* Should be killed by OOM killer */
760 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
761 		goto cleanup;
762 
763 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
764 		goto cleanup;
765 
766 	current = cg_read_long(memcg, "memory.current");
767 	if (current > MB(30) || !current)
768 		goto cleanup;
769 
770 	max = cg_read_key_long(memcg, "memory.events", "max ");
771 	if (max <= 0)
772 		goto cleanup;
773 
774 	ret = KSFT_PASS;
775 
776 cleanup:
777 	cg_destroy(memcg);
778 	free(memcg);
779 
780 	return ret;
781 }
782 
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim. At most five passes are made.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool wrote_ok = false;
	int passes_left = 5;
	char amount[64];

	while (passes_left > 0) {
		long usage = cg_read_long(memcg, "memory.current");
		int rc;

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/*
		 * An earlier write reported success, yet usage is still
		 * above the goal: memory.reclaim returned 0 incorrectly.
		 */
		if (wrote_ok)
			return false;

		snprintf(amount, sizeof(amount), "%ld", usage - goal);
		rc = cg_write(memcg, "memory.reclaim", amount);
		if (rc == 0)
			wrote_ok = true;
		else if (rc != -EAGAIN)
			return false;	/* only -EAGAIN is a tolerated failure */

		passes_left--;
	}

	return wrote_ok;
}
824 
825 /*
826  * This test checks that memory.reclaim reclaims the given
827  * amount of memory (from both anon and file, if possible).
828  */
829 static int test_memcg_reclaim(const char *root)
830 {
831 	int ret = KSFT_FAIL;
832 	int fd = -1;
833 	int retries;
834 	char *memcg;
835 	long current, expected_usage;
836 
837 	memcg = cg_name(root, "memcg_test");
838 	if (!memcg)
839 		goto cleanup;
840 
841 	if (cg_create(memcg))
842 		goto cleanup;
843 
844 	current = cg_read_long(memcg, "memory.current");
845 	if (current != 0)
846 		goto cleanup;
847 
848 	fd = get_temp_fd();
849 	if (fd < 0)
850 		goto cleanup;
851 
852 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
853 
854 	/*
855 	 * If swap is enabled, try to reclaim from both anon and file, else try
856 	 * to reclaim from file only.
857 	 */
858 	if (is_swap_enabled()) {
859 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
860 		expected_usage = MB(100);
861 	} else
862 		expected_usage = MB(50);
863 
864 	/*
865 	 * Wait until current usage reaches the expected usage (or we run out of
866 	 * retries).
867 	 */
868 	retries = 5;
869 	while (!values_close(cg_read_long(memcg, "memory.current"),
870 			    expected_usage, 10)) {
871 		if (retries--) {
872 			sleep(1);
873 			continue;
874 		} else {
875 			fprintf(stderr,
876 				"failed to allocate %ld for memcg reclaim test\n",
877 				expected_usage);
878 			goto cleanup;
879 		}
880 	}
881 
882 	/*
883 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
884 	 * and file if swap is enabled.
885 	 */
886 	if (!reclaim_until(memcg, MB(30)))
887 		goto cleanup;
888 
889 	ret = KSFT_PASS;
890 cleanup:
891 	cg_destroy(memcg);
892 	free(memcg);
893 	close(fd);
894 
895 	return ret;
896 }
897 
898 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
899 {
900 	long mem_max = (long)arg;
901 	size_t size = MB(50);
902 	char *buf, *ptr;
903 	long mem_current, swap_current;
904 	int ret = -1;
905 
906 	buf = malloc(size);
907 	if (buf == NULL) {
908 		fprintf(stderr, "malloc() failed\n");
909 		return -1;
910 	}
911 
912 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
913 		*ptr = 0;
914 
915 	mem_current = cg_read_long(cgroup, "memory.current");
916 	if (!mem_current || !values_close(mem_current, mem_max, 3))
917 		goto cleanup;
918 
919 	swap_current = cg_read_long(cgroup, "memory.swap.current");
920 	if (!swap_current ||
921 	    !values_close(mem_current + swap_current, size, 3))
922 		goto cleanup;
923 
924 	ret = 0;
925 cleanup:
926 	free(buf);
927 	return ret;
928 }
929 
930 /*
931  * This test checks that memory.swap.max limits the amount of
932  * anonymous memory which can be swapped out. Additionally, it verifies that
933  * memory.swap.peak reflects the high watermark and can be reset.
934  */
935 static int test_memcg_swap_max_peak(const char *root)
936 {
937 	int ret = KSFT_FAIL;
938 	char *memcg;
939 	long max, peak;
940 	struct stat ss;
941 	int swap_peak_fd = -1, mem_peak_fd = -1;
942 
943 	/* any non-empty string resets */
944 	static const char reset_string[] = "foobarbaz";
945 
946 	if (!is_swap_enabled())
947 		return KSFT_SKIP;
948 
949 	memcg = cg_name(root, "memcg_test");
950 	if (!memcg)
951 		goto cleanup;
952 
953 	if (cg_create(memcg))
954 		goto cleanup;
955 
956 	if (cg_read_long(memcg, "memory.swap.current")) {
957 		ret = KSFT_SKIP;
958 		goto cleanup;
959 	}
960 
961 	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
962 			       O_RDWR | O_APPEND | O_CLOEXEC);
963 
964 	if (swap_peak_fd == -1) {
965 		if (errno == ENOENT)
966 			ret = KSFT_SKIP;
967 		goto cleanup;
968 	}
969 
970 	/*
971 	 * Before we try to use memory.swap.peak's fd, try to figure out
972 	 * whether this kernel supports writing to that file in the first
973 	 * place. (by checking the writable bit on the file's st_mode)
974 	 */
975 	if (fstat(swap_peak_fd, &ss))
976 		goto cleanup;
977 
978 	if ((ss.st_mode & S_IWUSR) == 0) {
979 		ret = KSFT_SKIP;
980 		goto cleanup;
981 	}
982 
983 	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
984 
985 	if (mem_peak_fd == -1)
986 		goto cleanup;
987 
988 	if (cg_read_long(memcg, "memory.swap.peak"))
989 		goto cleanup;
990 
991 	if (cg_read_long_fd(swap_peak_fd))
992 		goto cleanup;
993 
994 	/* switch the swap and mem fds into local-peak tracking mode*/
995 	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
996 
997 	if (peak_reset != sizeof(reset_string))
998 		goto cleanup;
999 
1000 	if (cg_read_long_fd(swap_peak_fd))
1001 		goto cleanup;
1002 
1003 	if (cg_read_long(memcg, "memory.peak"))
1004 		goto cleanup;
1005 
1006 	if (cg_read_long_fd(mem_peak_fd))
1007 		goto cleanup;
1008 
1009 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1010 	if (peak_reset != sizeof(reset_string))
1011 		goto cleanup;
1012 
1013 	if (cg_read_long_fd(mem_peak_fd))
1014 		goto cleanup;
1015 
1016 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
1017 		goto cleanup;
1018 
1019 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
1020 		goto cleanup;
1021 
1022 	if (cg_write(memcg, "memory.swap.max", "30M"))
1023 		goto cleanup;
1024 
1025 	if (cg_write(memcg, "memory.max", "30M"))
1026 		goto cleanup;
1027 
1028 	/* Should be killed by OOM killer */
1029 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1030 		goto cleanup;
1031 
1032 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1033 		goto cleanup;
1034 
1035 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1036 		goto cleanup;
1037 
1038 	peak = cg_read_long(memcg, "memory.peak");
1039 	if (peak < MB(29))
1040 		goto cleanup;
1041 
1042 	peak = cg_read_long(memcg, "memory.swap.peak");
1043 	if (peak < MB(29))
1044 		goto cleanup;
1045 
1046 	peak = cg_read_long_fd(mem_peak_fd);
1047 	if (peak < MB(29))
1048 		goto cleanup;
1049 
1050 	peak = cg_read_long_fd(swap_peak_fd);
1051 	if (peak < MB(29))
1052 		goto cleanup;
1053 
1054 	/*
1055 	 * open, reset and close the peak swap on another FD to make sure
1056 	 * multiple extant fds don't corrupt the linked-list
1057 	 */
1058 	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
1059 	if (peak_reset)
1060 		goto cleanup;
1061 
1062 	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
1063 	if (peak_reset)
1064 		goto cleanup;
1065 
1066 	/* actually reset on the fds */
1067 	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1068 	if (peak_reset != sizeof(reset_string))
1069 		goto cleanup;
1070 
1071 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1072 	if (peak_reset != sizeof(reset_string))
1073 		goto cleanup;
1074 
1075 	peak = cg_read_long_fd(swap_peak_fd);
1076 	if (peak > MB(10))
1077 		goto cleanup;
1078 
1079 	/*
1080 	 * The cgroup is now empty, but there may be a page or two associated
1081 	 * with the open FD accounted to it.
1082 	 */
1083 	peak = cg_read_long_fd(mem_peak_fd);
1084 	if (peak > MB(1))
1085 		goto cleanup;
1086 
1087 	if (cg_read_long(memcg, "memory.peak") < MB(29))
1088 		goto cleanup;
1089 
1090 	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
1091 		goto cleanup;
1092 
1093 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
1094 		goto cleanup;
1095 
1096 	max = cg_read_key_long(memcg, "memory.events", "max ");
1097 	if (max <= 0)
1098 		goto cleanup;
1099 
1100 	peak = cg_read_long(memcg, "memory.peak");
1101 	if (peak < MB(29))
1102 		goto cleanup;
1103 
1104 	peak = cg_read_long(memcg, "memory.swap.peak");
1105 	if (peak < MB(29))
1106 		goto cleanup;
1107 
1108 	peak = cg_read_long_fd(mem_peak_fd);
1109 	if (peak < MB(29))
1110 		goto cleanup;
1111 
1112 	peak = cg_read_long_fd(swap_peak_fd);
1113 	if (peak < MB(19))
1114 		goto cleanup;
1115 
1116 	ret = KSFT_PASS;
1117 
1118 cleanup:
1119 	if (mem_peak_fd != -1 && close(mem_peak_fd))
1120 		ret = KSFT_FAIL;
1121 	if (swap_peak_fd != -1 && close(swap_peak_fd))
1122 		ret = KSFT_FAIL;
1123 	cg_destroy(memcg);
1124 	free(memcg);
1125 
1126 	return ret;
1127 }
1128 
1129 /*
1130  * This test disables swapping and tries to allocate anonymous memory
1131  * up to OOM. Then it checks for oom and oom_kill events in
1132  * memory.events.
1133  */
1134 static int test_memcg_oom_events(const char *root)
1135 {
1136 	int ret = KSFT_FAIL;
1137 	char *memcg;
1138 
1139 	memcg = cg_name(root, "memcg_test");
1140 	if (!memcg)
1141 		goto cleanup;
1142 
1143 	if (cg_create(memcg))
1144 		goto cleanup;
1145 
1146 	if (cg_write(memcg, "memory.max", "30M"))
1147 		goto cleanup;
1148 
1149 	if (cg_write(memcg, "memory.swap.max", "0"))
1150 		goto cleanup;
1151 
1152 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1153 		goto cleanup;
1154 
1155 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1156 		goto cleanup;
1157 
1158 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1159 		goto cleanup;
1160 
1161 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1162 		goto cleanup;
1163 
1164 	ret = KSFT_PASS;
1165 
1166 cleanup:
1167 	cg_destroy(memcg);
1168 	free(memcg);
1169 
1170 	return ret;
1171 }
1172 
/* Arguments handed to tcp_server() through its void * parameter. */
struct tcp_server_args {
	/* TCP port to bind to, in host byte order (tcp_server() applies htons()). */
	unsigned short port;
	/*
	 * Pipe used for the startup handshake: tcp_server() closes ctl[0]
	 * and writes its status to ctl[1].
	 */
	int ctl[2];
};
1177 
1178 static int tcp_server(const char *cgroup, void *arg)
1179 {
1180 	struct tcp_server_args *srv_args = arg;
1181 	struct sockaddr_in6 saddr = { 0 };
1182 	socklen_t slen = sizeof(saddr);
1183 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1184 
1185 	close(srv_args->ctl[0]);
1186 	ctl_fd = srv_args->ctl[1];
1187 
1188 	saddr.sin6_family = AF_INET6;
1189 	saddr.sin6_addr = in6addr_any;
1190 	saddr.sin6_port = htons(srv_args->port);
1191 
1192 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1193 	if (sk < 0)
1194 		return ret;
1195 
1196 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1197 		goto cleanup;
1198 
1199 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1200 		write(ctl_fd, &errno, sizeof(errno));
1201 		goto cleanup;
1202 	}
1203 
1204 	if (listen(sk, 1))
1205 		goto cleanup;
1206 
1207 	ret = 0;
1208 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1209 		ret = -1;
1210 		goto cleanup;
1211 	}
1212 
1213 	client_sk = accept(sk, NULL, NULL);
1214 	if (client_sk < 0)
1215 		goto cleanup;
1216 
1217 	ret = -1;
1218 	for (;;) {
1219 		uint8_t buf[0x100000];
1220 
1221 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1222 			if (errno == ECONNRESET)
1223 				ret = 0;
1224 			break;
1225 		}
1226 	}
1227 
1228 	close(client_sk);
1229 
1230 cleanup:
1231 	close(sk);
1232 	return ret;
1233 }
1234 
/*
 * TCP client side of the socket accounting test.
 *
 * Connects to localhost:@port and, for up to 16 reads from the socket,
 * compares the growth of @cgroup's memory.current since entry with the
 * "sock" entry of memory.stat.
 *
 * @cgroup: cgroup whose counters are inspected.
 * @port: TCP port of the server, in host byte order.
 *
 * Returns KSFT_PASS once the two values are within 10% of each other,
 * KSFT_FAIL when the socket cannot be created, a read or counter lookup
 * fails or the retries run out, the getaddrinfo() error code when name
 * resolution fails, and connect()'s negative result when connecting
 * fails.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];	/* up to five digits plus the terminating NUL */
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * The port is unsigned: printing it with a signed conversion would
	 * turn ports above 32767 into a negative, truncated string.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0) {
		/* ret is still 0 from getaddrinfo(); don't report a pass. */
		ret = KSFT_FAIL;
		goto free_ainfo;
	}

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1285 
1286 /*
1287  * This test checks socket memory accounting.
1288  * The test forks a TCP server listens on a random port between 1000
1289  * and 61000. Once it gets a client connection, it starts writing to
1290  * its socket.
1291  * The TCP client interleaves reads from the socket with check whether
1292  * memory.current and memory.stat.sock are similar.
1293  */
1294 static int test_memcg_sock(const char *root)
1295 {
1296 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1297 	unsigned short port;
1298 	char *memcg;
1299 
1300 	memcg = cg_name(root, "memcg_test");
1301 	if (!memcg)
1302 		goto cleanup;
1303 
1304 	if (cg_create(memcg))
1305 		goto cleanup;
1306 
1307 	while (bind_retries--) {
1308 		struct tcp_server_args args;
1309 
1310 		if (pipe(args.ctl))
1311 			goto cleanup;
1312 
1313 		port = args.port = 1000 + rand() % 60000;
1314 
1315 		pid = cg_run_nowait(memcg, tcp_server, &args);
1316 		if (pid < 0)
1317 			goto cleanup;
1318 
1319 		close(args.ctl[1]);
1320 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1321 			goto cleanup;
1322 		close(args.ctl[0]);
1323 
1324 		if (!err)
1325 			break;
1326 		if (err != EADDRINUSE)
1327 			goto cleanup;
1328 
1329 		waitpid(pid, NULL, 0);
1330 	}
1331 
1332 	if (err == EADDRINUSE) {
1333 		ret = KSFT_SKIP;
1334 		goto cleanup;
1335 	}
1336 
1337 	if (tcp_client(memcg, port) != KSFT_PASS)
1338 		goto cleanup;
1339 
1340 	waitpid(pid, &err, 0);
1341 	if (WEXITSTATUS(err))
1342 		goto cleanup;
1343 
1344 	if (cg_read_long(memcg, "memory.current") < 0)
1345 		goto cleanup;
1346 
1347 	if (cg_read_key_long(memcg, "memory.stat", "sock "))
1348 		goto cleanup;
1349 
1350 	ret = KSFT_PASS;
1351 
1352 cleanup:
1353 	cg_destroy(memcg);
1354 	free(memcg);
1355 
1356 	return ret;
1357 }
1358 
1359 /*
1360  * This test disables swapping and tries to allocate anonymous memory
1361  * up to OOM with memory.group.oom set. Then it checks that all
1362  * processes in the leaf were killed. It also checks that oom_events
1363  * were propagated to the parent level.
1364  */
1365 static int test_memcg_oom_group_leaf_events(const char *root)
1366 {
1367 	int ret = KSFT_FAIL;
1368 	char *parent, *child;
1369 	long parent_oom_events;
1370 
1371 	parent = cg_name(root, "memcg_test_0");
1372 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1373 
1374 	if (!parent || !child)
1375 		goto cleanup;
1376 
1377 	if (cg_create(parent))
1378 		goto cleanup;
1379 
1380 	if (cg_create(child))
1381 		goto cleanup;
1382 
1383 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1384 		goto cleanup;
1385 
1386 	if (cg_write(child, "memory.max", "50M"))
1387 		goto cleanup;
1388 
1389 	if (cg_write(child, "memory.swap.max", "0"))
1390 		goto cleanup;
1391 
1392 	if (cg_write(child, "memory.oom.group", "1"))
1393 		goto cleanup;
1394 
1395 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1396 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1397 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1398 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1399 		goto cleanup;
1400 
1401 	if (cg_test_proc_killed(child))
1402 		goto cleanup;
1403 
1404 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1405 		goto cleanup;
1406 
1407 	parent_oom_events = cg_read_key_long(
1408 			parent, "memory.events", "oom_kill ");
1409 	/*
1410 	 * If memory_localevents is not enabled (the default), the parent should
1411 	 * count OOM events in its children groups. Otherwise, it should not
1412 	 * have observed any events.
1413 	 */
1414 	if (has_localevents && parent_oom_events != 0)
1415 		goto cleanup;
1416 	else if (!has_localevents && parent_oom_events <= 0)
1417 		goto cleanup;
1418 
1419 	ret = KSFT_PASS;
1420 
1421 cleanup:
1422 	if (child)
1423 		cg_destroy(child);
1424 	if (parent)
1425 		cg_destroy(parent);
1426 	free(child);
1427 	free(parent);
1428 
1429 	return ret;
1430 }
1431 
1432 /*
1433  * This test disables swapping and tries to allocate anonymous memory
1434  * up to OOM with memory.group.oom set. Then it checks that all
1435  * processes in the parent and leaf were killed.
1436  */
1437 static int test_memcg_oom_group_parent_events(const char *root)
1438 {
1439 	int ret = KSFT_FAIL;
1440 	char *parent, *child;
1441 
1442 	parent = cg_name(root, "memcg_test_0");
1443 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1444 
1445 	if (!parent || !child)
1446 		goto cleanup;
1447 
1448 	if (cg_create(parent))
1449 		goto cleanup;
1450 
1451 	if (cg_create(child))
1452 		goto cleanup;
1453 
1454 	if (cg_write(parent, "memory.max", "80M"))
1455 		goto cleanup;
1456 
1457 	if (cg_write(parent, "memory.swap.max", "0"))
1458 		goto cleanup;
1459 
1460 	if (cg_write(parent, "memory.oom.group", "1"))
1461 		goto cleanup;
1462 
1463 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1464 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1465 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1466 
1467 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1468 		goto cleanup;
1469 
1470 	if (cg_test_proc_killed(child))
1471 		goto cleanup;
1472 	if (cg_test_proc_killed(parent))
1473 		goto cleanup;
1474 
1475 	ret = KSFT_PASS;
1476 
1477 cleanup:
1478 	if (child)
1479 		cg_destroy(child);
1480 	if (parent)
1481 		cg_destroy(parent);
1482 	free(child);
1483 	free(parent);
1484 
1485 	return ret;
1486 }
1487 
1488 /*
1489  * This test disables swapping and tries to allocate anonymous memory
1490  * up to OOM with memory.group.oom set. Then it checks that all
1491  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1492  */
1493 static int test_memcg_oom_group_score_events(const char *root)
1494 {
1495 	int ret = KSFT_FAIL;
1496 	char *memcg;
1497 	int safe_pid;
1498 
1499 	memcg = cg_name(root, "memcg_test_0");
1500 
1501 	if (!memcg)
1502 		goto cleanup;
1503 
1504 	if (cg_create(memcg))
1505 		goto cleanup;
1506 
1507 	if (cg_write(memcg, "memory.max", "50M"))
1508 		goto cleanup;
1509 
1510 	if (cg_write(memcg, "memory.swap.max", "0"))
1511 		goto cleanup;
1512 
1513 	if (cg_write(memcg, "memory.oom.group", "1"))
1514 		goto cleanup;
1515 
1516 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1517 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1518 		goto cleanup;
1519 
1520 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1521 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1522 		goto cleanup;
1523 
1524 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1525 		goto cleanup;
1526 
1527 	if (kill(safe_pid, SIGKILL))
1528 		goto cleanup;
1529 
1530 	ret = KSFT_PASS;
1531 
1532 cleanup:
1533 	if (memcg)
1534 		cg_destroy(memcg);
1535 	free(memcg);
1536 
1537 	return ret;
1538 }
1539 
1540 #define T(x) { x, #x }
1541 struct memcg_test {
1542 	int (*fn)(const char *root);
1543 	const char *name;
1544 } tests[] = {
1545 	T(test_memcg_subtree_control),
1546 	T(test_memcg_current_peak),
1547 	T(test_memcg_min),
1548 	T(test_memcg_low),
1549 	T(test_memcg_high),
1550 	T(test_memcg_high_sync),
1551 	T(test_memcg_max),
1552 	T(test_memcg_reclaim),
1553 	T(test_memcg_oom_events),
1554 	T(test_memcg_swap_max_peak),
1555 	T(test_memcg_sock),
1556 	T(test_memcg_oom_group_leaf_events),
1557 	T(test_memcg_oom_group_parent_events),
1558 	T(test_memcg_oom_group_score_events),
1559 };
1560 #undef T
1561 
1562 int main(int argc, char **argv)
1563 {
1564 	char root[PATH_MAX];
1565 	int i, proc_status, ret = EXIT_SUCCESS;
1566 
1567 	if (cg_find_unified_root(root, sizeof(root), NULL))
1568 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1569 
1570 	/*
1571 	 * Check that memory controller is available:
1572 	 * memory is listed in cgroup.controllers
1573 	 */
1574 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1575 		ksft_exit_skip("memory controller isn't available\n");
1576 
1577 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1578 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1579 			ksft_exit_skip("Failed to set memory controller\n");
1580 
1581 	proc_status = proc_mount_contains("memory_recursiveprot");
1582 	if (proc_status < 0)
1583 		ksft_exit_skip("Failed to query cgroup mount option\n");
1584 	has_recursiveprot = proc_status;
1585 
1586 	proc_status = proc_mount_contains("memory_localevents");
1587 	if (proc_status < 0)
1588 		ksft_exit_skip("Failed to query cgroup mount option\n");
1589 	has_localevents = proc_status;
1590 
1591 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1592 		switch (tests[i].fn(root)) {
1593 		case KSFT_PASS:
1594 			ksft_test_result_pass("%s\n", tests[i].name);
1595 			break;
1596 		case KSFT_SKIP:
1597 			ksft_test_result_skip("%s\n", tests[i].name);
1598 			break;
1599 		default:
1600 			ret = EXIT_FAILURE;
1601 			ksft_test_result_fail("%s\n", tests[i].name);
1602 			break;
1603 		}
1604 	}
1605 
1606 	return ret;
1607 }
1608