1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/socket.h>
14 #include <sys/wait.h>
15 #include <arpa/inet.h>
16 #include <netinet/in.h>
17 #include <netdb.h>
18 #include <errno.h>
19 #include <sys/mman.h>
20 
21 #include "../kselftest.h"
22 #include "cgroup_util.h"
23 
24 static bool has_localevents;
25 static bool has_recursiveprot;
26 
27 /*
28  * This test creates two pairs of nested cgroups, with and without
29  * enabling the memory controller.
30  */
31 static int test_memcg_subtree_control(const char *root)
32 {
33 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
34 	int ret = KSFT_FAIL;
35 	char buf[PAGE_SIZE];
36 
37 	/* Create two nested cgroups with the memory controller enabled */
38 	parent = cg_name(root, "memcg_test_0");
39 	child = cg_name(root, "memcg_test_0/memcg_test_1");
40 	if (!parent || !child)
41 		goto cleanup_free;
42 
43 	if (cg_create(parent))
44 		goto cleanup_free;
45 
46 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
47 		goto cleanup_parent;
48 
49 	if (cg_create(child))
50 		goto cleanup_parent;
51 
52 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
53 		goto cleanup_child;
54 
55 	/* Create two nested cgroups without enabling the memory controller */
56 	parent2 = cg_name(root, "memcg_test_1");
57 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
58 	if (!parent2 || !child2)
59 		goto cleanup_free2;
60 
61 	if (cg_create(parent2))
62 		goto cleanup_free2;
63 
64 	if (cg_create(child2))
65 		goto cleanup_parent2;
66 
67 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
68 		goto cleanup_all;
69 
70 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
71 		goto cleanup_all;
72 
73 	ret = KSFT_PASS;
74 
75 cleanup_all:
76 	cg_destroy(child2);
77 cleanup_parent2:
78 	cg_destroy(parent2);
79 cleanup_free2:
80 	free(parent2);
81 	free(child2);
82 cleanup_child:
83 	cg_destroy(child);
84 cleanup_parent:
85 	cg_destroy(parent);
86 cleanup_free:
87 	free(parent);
88 	free(child);
89 
90 	return ret;
91 }
92 
93 static int alloc_anon_50M_check(const char *cgroup, void *arg)
94 {
95 	size_t size = MB(50);
96 	char *buf, *ptr;
97 	long anon, current;
98 	int ret = -1;
99 
100 	buf = malloc(size);
101 	if (buf == NULL) {
102 		fprintf(stderr, "malloc() failed\n");
103 		return -1;
104 	}
105 
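	/* Touch every page so the anonymous memory is actually faulted in and charged */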
106 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
107 		*ptr = 0;
108 
109 	current = cg_read_long(cgroup, "memory.current");
110 	if (current < size)
111 		goto cleanup;
112 
113 	if (!values_close(size, current, 3))
114 		goto cleanup;
115 
116 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
117 	if (anon < 0)
118 		goto cleanup;
119 
120 	if (!values_close(anon, current, 3))
121 		goto cleanup;
122 
123 	ret = 0;
124 cleanup:
125 	free(buf);
126 	return ret;
127 }
128 
129 static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
130 {
131 	size_t size = MB(50);
132 	int ret = -1;
133 	long current, file;
134 	int fd;
135 
136 	fd = get_temp_fd();
137 	if (fd < 0)
138 		return -1;
139 
140 	if (alloc_pagecache(fd, size))
141 		goto cleanup;
142 
143 	current = cg_read_long(cgroup, "memory.current");
144 	if (current < size)
145 		goto cleanup;
146 
147 	file = cg_read_key_long(cgroup, "memory.stat", "file ");
148 	if (file < 0)
149 		goto cleanup;
150 
151 	if (!values_close(file, current, 10))
152 		goto cleanup;
153 
154 	ret = 0;
155 
156 cleanup:
157 	close(fd);
158 	return ret;
159 }
160 
161 /*
162  * This test creates a memory cgroup, allocates
163  * some anonymous memory and some pagecache,
164  * and checks memory.current, memory.peak, and some memory.stat values.
165  */
166 static int test_memcg_current_peak(const char *root)
167 {
168 	int ret = KSFT_FAIL;
169 	long current, peak, peak_reset;
170 	char *memcg;
171 	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
172 	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
173 	struct stat ss;
174 
175 	memcg = cg_name(root, "memcg_test");
176 	if (!memcg)
177 		goto cleanup;
178 
179 	if (cg_create(memcg))
180 		goto cleanup;
181 
182 	current = cg_read_long(memcg, "memory.current");
183 	if (current != 0)
184 		goto cleanup;
185 
186 	peak = cg_read_long(memcg, "memory.peak");
187 	if (peak != 0)
188 		goto cleanup;
189 
190 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
191 		goto cleanup;
192 
193 	peak = cg_read_long(memcg, "memory.peak");
194 	if (peak < MB(50))
195 		goto cleanup;
196 
197 	/*
198 	 * We'll open a few FDs for the same memory.peak file to exercise the free path.
199 	 * We need at least three FDs, closed in a different order than the writes occurred,
200 	 * to test the linked-list handling.
201 	 */
202 	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
203 
204 	if (peak_fd == -1) {
205 		if (errno == ENOENT)
206 			ret = KSFT_SKIP;
207 		goto cleanup;
208 	}
209 
210 	/*
211 	 * Before we try to use memory.peak's fd, try to figure out whether
212 	 * this kernel supports writing to that file in the first place (by
213 	 * checking the writable bit on the file's st_mode).
214 	 */
215 	if (fstat(peak_fd, &ss))
216 		goto cleanup;
217 
218 	if ((ss.st_mode & S_IWUSR) == 0) {
219 		ret = KSFT_SKIP;
220 		goto cleanup;
221 	}
222 
223 	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
224 
225 	if (peak_fd2 == -1)
226 		goto cleanup;
227 
228 	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
229 
230 	if (peak_fd3 == -1)
231 		goto cleanup;
232 
233 	/* any non-empty string resets, but make it clear */
234 	static const char reset_string[] = "reset\n";
235 
236 	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
237 	if (peak_reset != sizeof(reset_string))
238 		goto cleanup;
239 
240 	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
241 	if (peak_reset != sizeof(reset_string))
242 		goto cleanup;
243 
244 	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
245 	if (peak_reset != sizeof(reset_string))
246 		goto cleanup;
247 
248 	/* Make sure a completely independent read isn't affected by our FD-local reset above */
249 	peak = cg_read_long(memcg, "memory.peak");
250 	if (peak < MB(50))
251 		goto cleanup;
252 
253 	fd2_closed = true;
254 	if (close(peak_fd2))
255 		goto cleanup;
256 
257 	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
258 
259 	if (peak_fd4 == -1)
260 		goto cleanup;
261 
262 	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
263 	if (peak_reset != sizeof(reset_string))
264 		goto cleanup;
265 
266 	peak = cg_read_long_fd(peak_fd);
267 	if (peak > MB(30) || peak < 0)
268 		goto cleanup;
269 
270 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
271 		goto cleanup;
272 
273 	peak = cg_read_long(memcg, "memory.peak");
274 	if (peak < MB(50))
275 		goto cleanup;
276 
277 	/* Make sure everything is back to normal */
278 	peak = cg_read_long_fd(peak_fd);
279 	if (peak < MB(50))
280 		goto cleanup;
281 
282 	peak = cg_read_long_fd(peak_fd4);
283 	if (peak < MB(50))
284 		goto cleanup;
285 
286 	fd3_closed = true;
287 	if (close(peak_fd3))
288 		goto cleanup;
289 
290 	fd4_closed = true;
291 	if (close(peak_fd4))
292 		goto cleanup;
293 
294 	ret = KSFT_PASS;
295 
296 cleanup:
297 	close(peak_fd);
298 	if (!fd2_closed)
299 		close(peak_fd2);
300 	if (!fd3_closed)
301 		close(peak_fd3);
302 	if (!fd4_closed)
303 		close(peak_fd4);
304 	cg_destroy(memcg);
305 	free(memcg);
306 
307 	return ret;
308 }
309 
310 static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
311 {
312 	int fd = (long)arg;
313 	int ppid = getppid();
314 
315 	if (alloc_pagecache(fd, MB(50)))
316 		return -1;
317 
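	/* Stay alive in the cgroup until the parent test process exits */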
318 	while (getppid() == ppid)
319 		sleep(1);
320 
321 	return 0;
322 }
323 
324 static int alloc_anon_noexit(const char *cgroup, void *arg)
325 {
326 	int ppid = getppid();
327 	size_t size = (unsigned long)arg;
328 	char *buf, *ptr;
329 
330 	buf = malloc(size);
331 	if (buf == NULL) {
332 		fprintf(stderr, "malloc() failed\n");
333 		return -1;
334 	}
335 
336 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
337 		*ptr = 0;
338 
339 	while (getppid() == ppid)
340 		sleep(1);
341 
342 	free(buf);
343 	return 0;
344 }
345 
346 /*
347  * Wait until processes are killed asynchronously by the OOM killer.
348  * If we exceed a timeout, fail.
349  */
350 static int cg_test_proc_killed(const char *cgroup)
351 {
352 	int limit;
353 
354 	for (limit = 10; limit > 0; limit--) {
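	/* Poll cgroup.procs for up to one second (10 x 100ms) */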
355 		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
356 			return 0;
357 
358 		usleep(100000);
359 	}
360 	return -1;
361 }
362 
363 static bool reclaim_until(const char *memcg, long goal);
364 
365 /*
366  * First, this test creates the following hierarchy:
367  * A       memory.min = 0,    memory.max = 200M
368  * A/B     memory.min = 50M
369  * A/B/C   memory.min = 75M,  memory.current = 50M
370  * A/B/D   memory.min = 25M,  memory.current = 50M
371  * A/B/E   memory.min = 0,    memory.current = 50M
372  * A/B/F   memory.min = 500M, memory.current = 0
373  *
374  * (or memory.low if we test soft protection)
375  *
376  * Usages are pagecache and the test keeps a running
377  * process in every leaf cgroup.
378  * Then it creates A/G and applies significant
379  * memory pressure in A.
380  *
381  * Then it checks actual memory usages and expects that:
382  * A/B    memory.current ~= 50M
383  * A/B/C  memory.current ~= 29M [memory.events:low > 0]
384  * A/B/D  memory.current ~= 21M [memory.events:low > 0]
385  * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
386  *				 undefined otherwise]
387  * A/B/F  memory.current  = 0   [memory.events:low == 0]
388  * (for origin of the numbers, see model in memcg_protection.m.)
389  *
390  * After that it tries to allocate more than the amount of
391  * unprotected memory available in A, and checks that:
392  * a) memory.min protects pagecache even in this case,
393  * b) memory.low allows reclaiming page cache with low events.
394  *
395  * Then we try to reclaim from A/B/C using memory.reclaim until its
396  * usage reaches 10M.
397  * This makes sure that:
398  * (a) We ignore the protection of the reclaim target memcg.
399  * (b) The previously calculated emin value (~29M) should be dismissed.
400  */
401 static int test_memcg_protection(const char *root, bool min)
402 {
403 	int ret = KSFT_FAIL, rc;
404 	char *parent[3] = {NULL};
405 	char *children[4] = {NULL};
406 	const char *attribute = min ? "memory.min" : "memory.low";
407 	long c[4];
408 	long current;
409 	int i, attempts;
410 	int fd;
411 
412 	fd = get_temp_fd();
413 	if (fd < 0)
414 		goto cleanup;
415 
416 	parent[0] = cg_name(root, "memcg_test_0");
417 	if (!parent[0])
418 		goto cleanup;
419 
420 	parent[1] = cg_name(parent[0], "memcg_test_1");
421 	if (!parent[1])
422 		goto cleanup;
423 
424 	parent[2] = cg_name(parent[0], "memcg_test_2");
425 	if (!parent[2])
426 		goto cleanup;
427 
428 	if (cg_create(parent[0]))
429 		goto cleanup;
430 
431 	if (cg_read_long(parent[0], attribute)) {
432 		/* No memory.min on older kernels is fine */
433 		if (min)
434 			ret = KSFT_SKIP;
435 		goto cleanup;
436 	}
437 
438 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
439 		goto cleanup;
440 
441 	if (cg_write(parent[0], "memory.max", "200M"))
442 		goto cleanup;
443 
444 	if (cg_write(parent[0], "memory.swap.max", "0"))
445 		goto cleanup;
446 
447 	if (cg_create(parent[1]))
448 		goto cleanup;
449 
450 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
451 		goto cleanup;
452 
453 	if (cg_create(parent[2]))
454 		goto cleanup;
455 
456 	for (i = 0; i < ARRAY_SIZE(children); i++) {
457 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
458 		if (!children[i])
459 			goto cleanup;
460 
461 		if (cg_create(children[i]))
462 			goto cleanup;
463 
464 		if (i > 2)
465 			continue;
466 
467 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
468 			      (void *)(long)fd);
469 	}
470 
471 	if (cg_write(parent[1],   attribute, "50M"))
472 		goto cleanup;
473 	if (cg_write(children[0], attribute, "75M"))
474 		goto cleanup;
475 	if (cg_write(children[1], attribute, "25M"))
476 		goto cleanup;
477 	if (cg_write(children[2], attribute, "0"))
478 		goto cleanup;
479 	if (cg_write(children[3], attribute, "500M"))
480 		goto cleanup;
481 
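	/* Wait for the three 50M pagecache allocations to be charged to A/B */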
482 	attempts = 0;
483 	while (!values_close(cg_read_long(parent[1], "memory.current"),
484 			     MB(150), 3)) {
485 		if (attempts++ > 5)
486 			break;
487 		sleep(1);
488 	}
489 
490 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
491 		goto cleanup;
492 
493 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
494 		goto cleanup;
495 
496 	for (i = 0; i < ARRAY_SIZE(children); i++)
497 		c[i] = cg_read_long(children[i], "memory.current");
498 
499 	if (!values_close(c[0], MB(29), 15))
500 		goto cleanup;
501 
502 	if (!values_close(c[1], MB(21), 20))
503 		goto cleanup;
504 
505 	if (c[3] != 0)
506 		goto cleanup;
507 
508 	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
509 	if (min && !rc)
510 		goto cleanup;
511 	else if (!min && rc) {
512 		fprintf(stderr,
513 			"memory.low prevents allocating anon memory\n");
514 		goto cleanup;
515 	}
516 
517 	current = min ? MB(50) : MB(30);
518 	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
519 		goto cleanup;
520 
521 	if (!reclaim_until(children[0], MB(10)))
522 		goto cleanup;
523 
524 	if (min) {
525 		ret = KSFT_PASS;
526 		goto cleanup;
527 	}
528 
529 	/*
530 	 * Child 2 has memory.low=0, but some low protection may still be
531 	 * distributed down from its parent with memory.low=50M if the cgroup2
532 	 * memory_recursiveprot mount option is enabled. Ignore the low
533 	 * event count in this case.
534 	 */
535 	for (i = 0; i < ARRAY_SIZE(children); i++) {
536 		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
537 		int no_low_events_index = 1;
538 		long low, oom;
539 
540 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
541 		low = cg_read_key_long(children[i], "memory.events", "low ");
542 
543 		if (oom)
544 			goto cleanup;
545 		if (i == ignore_low_events_index)
546 			continue;
547 		if (i <= no_low_events_index && low <= 0)
548 			goto cleanup;
549 		if (i > no_low_events_index && low)
550 			goto cleanup;
551 
552 	}
553 
554 	ret = KSFT_PASS;
555 
556 cleanup:
557 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
558 		if (!children[i])
559 			continue;
560 
561 		cg_destroy(children[i]);
562 		free(children[i]);
563 	}
564 
565 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
566 		if (!parent[i])
567 			continue;
568 
569 		cg_destroy(parent[i]);
570 		free(parent[i]);
571 	}
572 	close(fd);
573 	return ret;
574 }
575 
576 static int test_memcg_min(const char *root)
577 {
578 	return test_memcg_protection(root, true);
579 }
580 
581 static int test_memcg_low(const char *root)
582 {
583 	return test_memcg_protection(root, false);
584 }
585 
586 static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
587 {
588 	size_t size = MB(50);
589 	int ret = -1;
590 	long current, high, max;
591 	int fd;
592 
593 	high = cg_read_long(cgroup, "memory.high");
594 	max = cg_read_long(cgroup, "memory.max");
595 	if (high != MB(30) && max != MB(30))
596 		return -1;
597 
598 	fd = get_temp_fd();
599 	if (fd < 0)
600 		return -1;
601 
602 	if (alloc_pagecache(fd, size))
603 		goto cleanup;
604 
605 	current = cg_read_long(cgroup, "memory.current");
606 	if (!values_close(current, MB(30), 5))
607 		goto cleanup;
608 
609 	ret = 0;
610 
611 cleanup:
612 	close(fd);
613 	return ret;
614 
615 }
616 
617 /*
618  * This test checks that memory.high limits the amount of
619  * memory which can be consumed by either anonymous memory
620  * or pagecache.
621  */
622 static int test_memcg_high(const char *root)
623 {
624 	int ret = KSFT_FAIL;
625 	char *memcg;
626 	long high;
627 
628 	memcg = cg_name(root, "memcg_test");
629 	if (!memcg)
630 		goto cleanup;
631 
632 	if (cg_create(memcg))
633 		goto cleanup;
634 
635 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
636 		goto cleanup;
637 
638 	if (cg_write(memcg, "memory.swap.max", "0"))
639 		goto cleanup;
640 
641 	if (cg_write(memcg, "memory.high", "30M"))
642 		goto cleanup;
643 
644 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
645 		goto cleanup;
646 
647 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
648 		goto cleanup;
649 
650 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
651 		goto cleanup;
652 
653 	high = cg_read_key_long(memcg, "memory.events", "high ");
654 	if (high <= 0)
655 		goto cleanup;
656 
657 	ret = KSFT_PASS;
658 
659 cleanup:
660 	cg_destroy(memcg);
661 	free(memcg);
662 
663 	return ret;
664 }
665 
666 static int alloc_anon_mlock(const char *cgroup, void *arg)
667 {
668 	size_t size = (size_t)arg;
669 	void *buf;
670 
671 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
672 		   -1, 0);
673 	if (buf == MAP_FAILED)
674 		return -1;
675 
676 	mlock(buf, size);
677 	munmap(buf, size);
678 	return 0;
679 }
680 
681 /*
682  * This test checks that memory.high is able to throttle a big single-shot
683  * allocation, i.e. a large allocation within one kernel entry.
684  */
685 static int test_memcg_high_sync(const char *root)
686 {
687 	int ret = KSFT_FAIL, pid, fd = -1;
688 	char *memcg;
689 	long pre_high, pre_max;
690 	long post_high, post_max;
691 
692 	memcg = cg_name(root, "memcg_test");
693 	if (!memcg)
694 		goto cleanup;
695 
696 	if (cg_create(memcg))
697 		goto cleanup;
698 
699 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
700 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
701 	if (pre_high < 0 || pre_max < 0)
702 		goto cleanup;
703 
704 	if (cg_write(memcg, "memory.swap.max", "0"))
705 		goto cleanup;
706 
707 	if (cg_write(memcg, "memory.high", "30M"))
708 		goto cleanup;
709 
710 	if (cg_write(memcg, "memory.max", "140M"))
711 		goto cleanup;
712 
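	/* Watch memory.events so we can wait for the big allocation to be throttled */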
713 	fd = memcg_prepare_for_wait(memcg);
714 	if (fd < 0)
715 		goto cleanup;
716 
717 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
718 	if (pid < 0)
719 		goto cleanup;
720 
721 	cg_wait_for(fd);
722 
723 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
724 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
725 	if (post_high < 0 || post_max < 0)
726 		goto cleanup;
727 
728 	if (pre_high == post_high || pre_max != post_max)
729 		goto cleanup;
730 
731 	ret = KSFT_PASS;
732 
733 cleanup:
734 	if (fd >= 0)
735 		close(fd);
736 	cg_destroy(memcg);
737 	free(memcg);
738 
739 	return ret;
740 }
741 
742 /*
743  * This test checks that memory.max limits the amount of
744  * memory which can be consumed by either anonymous memory
745  * or pagecache.
746  */
747 static int test_memcg_max(const char *root)
748 {
749 	int ret = KSFT_FAIL;
750 	char *memcg;
751 	long current, max;
752 
753 	memcg = cg_name(root, "memcg_test");
754 	if (!memcg)
755 		goto cleanup;
756 
757 	if (cg_create(memcg))
758 		goto cleanup;
759 
760 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
761 		goto cleanup;
762 
763 	if (cg_write(memcg, "memory.swap.max", "0"))
764 		goto cleanup;
765 
766 	if (cg_write(memcg, "memory.max", "30M"))
767 		goto cleanup;
768 
769 	/* Should be killed by OOM killer */
770 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
771 		goto cleanup;
772 
773 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
774 		goto cleanup;
775 
776 	current = cg_read_long(memcg, "memory.current");
777 	if (current > MB(30) || !current)
778 		goto cleanup;
779 
780 	max = cg_read_key_long(memcg, "memory.events", "max ");
781 	if (max <= 0)
782 		goto cleanup;
783 
784 	ret = KSFT_PASS;
785 
786 cleanup:
787 	cg_destroy(memcg);
788 	free(memcg);
789 
790 	return ret;
791 }
792 
793 /*
794  * Reclaim from @memcg until usage reaches @goal by writing to
795  * memory.reclaim.
796  *
797  * This function will return false if the usage is already below the
798  * goal.
799  *
800  * This function assumes that writing to memory.reclaim is the only
801  * source of change in memory.current (no concurrent allocations or
802  * reclaim).
803  *
804  * This function makes sure memory.reclaim is sane. It will return
805  * false if memory.reclaim's error codes do not make sense, even if
806  * the usage goal was satisfied.
807  */
808 static bool reclaim_until(const char *memcg, long goal)
809 {
810 	char buf[64];
811 	int retries, err;
812 	long current, to_reclaim;
813 	bool reclaimed = false;
814 
815 	for (retries = 5; retries > 0; retries--) {
816 		current = cg_read_long(memcg, "memory.current");
817 
818 		if (current < goal || values_close(current, goal, 3))
819 			break;
820 		/* Did memory.reclaim return 0 incorrectly? */
821 		else if (reclaimed)
822 			return false;
823 
824 		to_reclaim = current - goal;
825 		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
826 		err = cg_write(memcg, "memory.reclaim", buf);
827 		if (!err)
828 			reclaimed = true;
829 		else if (err != -EAGAIN)
830 			return false;
831 	}
832 	return reclaimed;
833 }
834 
835 /*
836  * This test checks that memory.reclaim reclaims the given
837  * amount of memory (from both anon and file, if possible).
838  */
839 static int test_memcg_reclaim(const char *root)
840 {
841 	int ret = KSFT_FAIL;
842 	int fd = -1;
843 	int retries;
844 	char *memcg;
845 	long current, expected_usage;
846 
847 	memcg = cg_name(root, "memcg_test");
848 	if (!memcg)
849 		goto cleanup;
850 
851 	if (cg_create(memcg))
852 		goto cleanup;
853 
854 	current = cg_read_long(memcg, "memory.current");
855 	if (current != 0)
856 		goto cleanup;
857 
858 	fd = get_temp_fd();
859 	if (fd < 0)
860 		goto cleanup;
861 
862 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
863 
864 	/*
865 	 * If swap is enabled, try to reclaim from both anon and file, else try
866 	 * to reclaim from file only.
867 	 */
868 	if (is_swap_enabled()) {
869 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
870 		expected_usage = MB(100);
871 	} else
872 		expected_usage = MB(50);
873 
874 	/*
875 	 * Wait until current usage reaches the expected usage (or we run out of
876 	 * retries).
877 	 */
878 	retries = 5;
879 	while (!values_close(cg_read_long(memcg, "memory.current"),
880 			    expected_usage, 10)) {
881 		if (retries--) {
882 			sleep(1);
883 			continue;
884 		} else {
885 			fprintf(stderr,
886 				"failed to allocate %ld for memcg reclaim test\n",
887 				expected_usage);
888 			goto cleanup;
889 		}
890 	}
891 
892 	/*
893 	 * Reclaim until current reaches 30M; this makes sure we hit both anon
894 	 * and file if swap is enabled.
895 	 */
896 	if (!reclaim_until(memcg, MB(30)))
897 		goto cleanup;
898 
899 	ret = KSFT_PASS;
900 cleanup:
901 	cg_destroy(memcg);
902 	free(memcg);
903 	close(fd);
904 
905 	return ret;
906 }
907 
908 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
909 {
910 	long mem_max = (long)arg;
911 	size_t size = MB(50);
912 	char *buf, *ptr;
913 	long mem_current, swap_current;
914 	int ret = -1;
915 
916 	buf = malloc(size);
917 	if (buf == NULL) {
918 		fprintf(stderr, "malloc() failed\n");
919 		return -1;
920 	}
921 
922 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
923 		*ptr = 0;
924 
925 	mem_current = cg_read_long(cgroup, "memory.current");
926 	if (!mem_current || !values_close(mem_current, mem_max, 3))
927 		goto cleanup;
928 
929 	swap_current = cg_read_long(cgroup, "memory.swap.current");
930 	if (!swap_current ||
931 	    !values_close(mem_current + swap_current, size, 3))
932 		goto cleanup;
933 
934 	ret = 0;
935 cleanup:
936 	free(buf);
937 	return ret;
938 }
939 
940 /*
941  * This test checks that memory.swap.max limits the amount of
942  * anonymous memory which can be swapped out. Additionally, it verifies that
943  * memory.swap.peak reflects the high watermark and can be reset.
944  */
945 static int test_memcg_swap_max_peak(const char *root)
946 {
947 	int ret = KSFT_FAIL;
948 	char *memcg;
949 	long max, peak;
950 	struct stat ss;
951 	int swap_peak_fd = -1, mem_peak_fd = -1;
952 
953 	/* any non-empty string resets */
954 	static const char reset_string[] = "foobarbaz";
955 
956 	if (!is_swap_enabled())
957 		return KSFT_SKIP;
958 
959 	memcg = cg_name(root, "memcg_test");
960 	if (!memcg)
961 		goto cleanup;
962 
963 	if (cg_create(memcg))
964 		goto cleanup;
965 
966 	if (cg_read_long(memcg, "memory.swap.current")) {
967 		ret = KSFT_SKIP;
968 		goto cleanup;
969 	}
970 
971 	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
972 			       O_RDWR | O_APPEND | O_CLOEXEC);
973 
974 	if (swap_peak_fd == -1) {
975 		if (errno == ENOENT)
976 			ret = KSFT_SKIP;
977 		goto cleanup;
978 	}
979 
980 	/*
981 	 * Before we try to use memory.swap.peak's fd, try to figure out
982 	 * whether this kernel supports writing to that file in the first
983 	 * place (by checking the writable bit on the file's st_mode).
984 	 */
985 	if (fstat(swap_peak_fd, &ss))
986 		goto cleanup;
987 
988 	if ((ss.st_mode & S_IWUSR) == 0) {
989 		ret = KSFT_SKIP;
990 		goto cleanup;
991 	}
992 
993 	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
994 
995 	if (mem_peak_fd == -1)
996 		goto cleanup;
997 
998 	if (cg_read_long(memcg, "memory.swap.peak"))
999 		goto cleanup;
1000 
1001 	if (cg_read_long_fd(swap_peak_fd))
1002 		goto cleanup;
1003 
1004 	/* switch the swap and mem fds into local-peak tracking mode */
1005 	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1006 
1007 	if (peak_reset != sizeof(reset_string))
1008 		goto cleanup;
1009 
1010 	if (cg_read_long_fd(swap_peak_fd))
1011 		goto cleanup;
1012 
1013 	if (cg_read_long(memcg, "memory.peak"))
1014 		goto cleanup;
1015 
1016 	if (cg_read_long_fd(mem_peak_fd))
1017 		goto cleanup;
1018 
1019 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1020 	if (peak_reset != sizeof(reset_string))
1021 		goto cleanup;
1022 
1023 	if (cg_read_long_fd(mem_peak_fd))
1024 		goto cleanup;
1025 
1026 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
1027 		goto cleanup;
1028 
1029 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
1030 		goto cleanup;
1031 
1032 	if (cg_write(memcg, "memory.swap.max", "30M"))
1033 		goto cleanup;
1034 
1035 	if (cg_write(memcg, "memory.max", "30M"))
1036 		goto cleanup;
1037 
1038 	/* Should be killed by OOM killer */
1039 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1040 		goto cleanup;
1041 
1042 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1043 		goto cleanup;
1044 
1045 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1046 		goto cleanup;
1047 
1048 	peak = cg_read_long(memcg, "memory.peak");
1049 	if (peak < MB(29))
1050 		goto cleanup;
1051 
1052 	peak = cg_read_long(memcg, "memory.swap.peak");
1053 	if (peak < MB(29))
1054 		goto cleanup;
1055 
1056 	peak = cg_read_long_fd(mem_peak_fd);
1057 	if (peak < MB(29))
1058 		goto cleanup;
1059 
1060 	peak = cg_read_long_fd(swap_peak_fd);
1061 	if (peak < MB(29))
1062 		goto cleanup;
1063 
1064 	/*
1065 	 * open, reset and close the swap peak on another FD to make sure
1066 	 * multiple extant fds don't corrupt the linked-list.
1067 	 */
1068 	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
1069 	if (peak_reset)
1070 		goto cleanup;
1071 
1072 	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
1073 	if (peak_reset)
1074 		goto cleanup;
1075 
1076 	/* actually reset on the fds */
1077 	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1078 	if (peak_reset != sizeof(reset_string))
1079 		goto cleanup;
1080 
1081 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1082 	if (peak_reset != sizeof(reset_string))
1083 		goto cleanup;
1084 
1085 	peak = cg_read_long_fd(swap_peak_fd);
1086 	if (peak > MB(10))
1087 		goto cleanup;
1088 
1089 	/*
1090 	 * The cgroup is now empty, but there may be a page or two associated
1091 	 * with the open FD accounted to it.
1092 	 */
1093 	peak = cg_read_long_fd(mem_peak_fd);
1094 	if (peak > MB(1))
1095 		goto cleanup;
1096 
1097 	if (cg_read_long(memcg, "memory.peak") < MB(29))
1098 		goto cleanup;
1099 
1100 	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
1101 		goto cleanup;
1102 
1103 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
1104 		goto cleanup;
1105 
1106 	max = cg_read_key_long(memcg, "memory.events", "max ");
1107 	if (max <= 0)
1108 		goto cleanup;
1109 
1110 	peak = cg_read_long(memcg, "memory.peak");
1111 	if (peak < MB(29))
1112 		goto cleanup;
1113 
1114 	peak = cg_read_long(memcg, "memory.swap.peak");
1115 	if (peak < MB(29))
1116 		goto cleanup;
1117 
1118 	peak = cg_read_long_fd(mem_peak_fd);
1119 	if (peak < MB(29))
1120 		goto cleanup;
1121 
1122 	peak = cg_read_long_fd(swap_peak_fd);
1123 	if (peak < MB(19))
1124 		goto cleanup;
1125 
1126 	ret = KSFT_PASS;
1127 
1128 cleanup:
1129 	if (mem_peak_fd != -1 && close(mem_peak_fd))
1130 		ret = KSFT_FAIL;
1131 	if (swap_peak_fd != -1 && close(swap_peak_fd))
1132 		ret = KSFT_FAIL;
1133 	cg_destroy(memcg);
1134 	free(memcg);
1135 
1136 	return ret;
1137 }
1138 
1139 /*
1140  * This test disables swapping and tries to allocate anonymous memory
1141  * up to OOM. Then it checks for oom and oom_kill events in
1142  * memory.events.
1143  */
1144 static int test_memcg_oom_events(const char *root)
1145 {
1146 	int ret = KSFT_FAIL;
1147 	char *memcg;
1148 
1149 	memcg = cg_name(root, "memcg_test");
1150 	if (!memcg)
1151 		goto cleanup;
1152 
1153 	if (cg_create(memcg))
1154 		goto cleanup;
1155 
1156 	if (cg_write(memcg, "memory.max", "30M"))
1157 		goto cleanup;
1158 
1159 	if (cg_write(memcg, "memory.swap.max", "0"))
1160 		goto cleanup;
1161 
1162 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1163 		goto cleanup;
1164 
1165 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1166 		goto cleanup;
1167 
1168 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1169 		goto cleanup;
1170 
1171 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1172 		goto cleanup;
1173 
1174 	ret = KSFT_PASS;
1175 
1176 cleanup:
1177 	cg_destroy(memcg);
1178 	free(memcg);
1179 
1180 	return ret;
1181 }
1182 
1183 struct tcp_server_args {
1184 	unsigned short port;
1185 	int ctl[2];
1186 };
1187 
1188 static int tcp_server(const char *cgroup, void *arg)
1189 {
1190 	struct tcp_server_args *srv_args = arg;
1191 	struct sockaddr_in6 saddr = { 0 };
1192 	socklen_t slen = sizeof(saddr);
1193 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1194 
1195 	close(srv_args->ctl[0]);
1196 	ctl_fd = srv_args->ctl[1];
1197 
1198 	saddr.sin6_family = AF_INET6;
1199 	saddr.sin6_addr = in6addr_any;
1200 	saddr.sin6_port = htons(srv_args->port);
1201 
1202 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1203 	if (sk < 0)
1204 		return ret;
1205 
1206 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1207 		goto cleanup;
1208 
1209 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1210 		write(ctl_fd, &errno, sizeof(errno));
1211 		goto cleanup;
1212 	}
1213 
1214 	if (listen(sk, 1))
1215 		goto cleanup;
1216 
1217 	ret = 0;
1218 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1219 		ret = -1;
1220 		goto cleanup;
1221 	}
1222 
1223 	client_sk = accept(sk, NULL, NULL);
1224 	if (client_sk < 0)
1225 		goto cleanup;
1226 
1227 	ret = -1;
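	/*
	 * Stream data to the client until it disconnects; ECONNRESET from
	 * write() means the client closed its end, which counts as success.
	 */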
1228 	for (;;) {
1229 		uint8_t buf[0x100000];
1230 
1231 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1232 			if (errno == ECONNRESET)
1233 				ret = 0;
1234 			break;
1235 		}
1236 	}
1237 
1238 	close(client_sk);
1239 
1240 cleanup:
1241 	close(sk);
1242 	return ret;
1243 }
1244 
1245 static int tcp_client(const char *cgroup, unsigned short port)
1246 {
1247 	const char server[] = "localhost";
1248 	struct addrinfo *ai;
1249 	char servport[6];
1250 	int retries = 0x10; /* nice round number */
1251 	int sk, ret;
1252 	long allocated;
1253 
1254 	allocated = cg_read_long(cgroup, "memory.current");
1255 	snprintf(servport, sizeof(servport), "%hu", port);
1256 	ret = getaddrinfo(server, servport, NULL, &ai);
1257 	if (ret)
1258 		return ret;
1259 
1260 	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
1261 	if (sk < 0)
1262 		goto free_ainfo;
1263 
1264 	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
1265 	if (ret < 0)
1266 		goto close_sk;
1267 
1268 	ret = KSFT_FAIL;
1269 	while (retries--) {
1270 		uint8_t buf[0x100000];
1271 		long current, sock;
1272 
1273 		if (read(sk, buf, sizeof(buf)) <= 0)
1274 			goto close_sk;
1275 
1276 		current = cg_read_long(cgroup, "memory.current");
1277 		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");
1278 
1279 		if (current < 0 || sock < 0)
1280 			goto close_sk;
1281 
1282 		/* exclude the memory not related to socket connection */
1283 		if (values_close(current - allocated, sock, 10)) {
1284 			ret = KSFT_PASS;
1285 			break;
1286 		}
1287 	}
1288 
1289 close_sk:
1290 	close(sk);
1291 free_ainfo:
1292 	freeaddrinfo(ai);
1293 	return ret;
1294 }
1295 
1296 /*
1297  * This test checks socket memory accounting.
1298  * The test forks a TCP server that listens on a random port between 1000
1299  * and 61000. Once it gets a client connection, it starts writing to
1300  * its socket.
1301  * The TCP client interleaves reads from the socket with checks that
1302  * memory.current and memory.stat.sock are similar.
1303  */
1304 static int test_memcg_sock(const char *root)
1305 {
1306 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1307 	unsigned short port;
1308 	char *memcg;
1309 
1310 	memcg = cg_name(root, "memcg_test");
1311 	if (!memcg)
1312 		goto cleanup;
1313 
1314 	if (cg_create(memcg))
1315 		goto cleanup;
1316 
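	/*
	 * Fork the TCP server into the test cgroup and wait on the control
	 * pipe: it reports 0 once it is listening, or the bind() errno, in
	 * which case we retry with another random port.
	 */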
1317 	while (bind_retries--) {
1318 		struct tcp_server_args args;
1319 
1320 		if (pipe(args.ctl))
1321 			goto cleanup;
1322 
1323 		port = args.port = 1000 + rand() % 60000;
1324 
1325 		pid = cg_run_nowait(memcg, tcp_server, &args);
1326 		if (pid < 0)
1327 			goto cleanup;
1328 
1329 		close(args.ctl[1]);
1330 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1331 			goto cleanup;
1332 		close(args.ctl[0]);
1333 
1334 		if (!err)
1335 			break;
1336 		if (err != EADDRINUSE)
1337 			goto cleanup;
1338 
1339 		waitpid(pid, NULL, 0);
1340 	}
1341 
1342 	if (err == EADDRINUSE) {
1343 		ret = KSFT_SKIP;
1344 		goto cleanup;
1345 	}
1346 
1347 	if (tcp_client(memcg, port) != KSFT_PASS)
1348 		goto cleanup;
1349 
1350 	waitpid(pid, &err, 0);
1351 	if (WEXITSTATUS(err))
1352 		goto cleanup;
1353 
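	/* With the connection gone, no socket memory should remain charged */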
1354 	if (cg_read_long(memcg, "memory.current") < 0)
1355 		goto cleanup;
1356 
1357 	if (cg_read_key_long(memcg, "memory.stat", "sock "))
1358 		goto cleanup;
1359 
1360 	ret = KSFT_PASS;
1361 
1362 cleanup:
1363 	cg_destroy(memcg);
1364 	free(memcg);
1365 
1366 	return ret;
1367 }
1368 
1369 /*
1370  * This test disables swapping and tries to allocate anonymous memory
1371  * up to OOM with memory.oom.group set. Then it checks that all
1372  * processes in the leaf were killed. It also checks that oom_events
1373  * were propagated to the parent level.
1374  */
1375 static int test_memcg_oom_group_leaf_events(const char *root)
1376 {
1377 	int ret = KSFT_FAIL;
1378 	char *parent, *child;
1379 	long parent_oom_events;
1380 
1381 	parent = cg_name(root, "memcg_test_0");
1382 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1383 
1384 	if (!parent || !child)
1385 		goto cleanup;
1386 
1387 	if (cg_create(parent))
1388 		goto cleanup;
1389 
1390 	if (cg_create(child))
1391 		goto cleanup;
1392 
1393 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1394 		goto cleanup;
1395 
1396 	if (cg_write(child, "memory.max", "50M"))
1397 		goto cleanup;
1398 
1399 	if (cg_write(child, "memory.swap.max", "0"))
1400 		goto cleanup;
1401 
1402 	if (cg_write(child, "memory.oom.group", "1"))
1403 		goto cleanup;
1404 
1405 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1406 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1407 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1408 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1409 		goto cleanup;
1410 
1411 	if (cg_test_proc_killed(child))
1412 		goto cleanup;
1413 
1414 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1415 		goto cleanup;
1416 
1417 	parent_oom_events = cg_read_key_long(
1418 			parent, "memory.events", "oom_kill ");
1419 	/*
1420 	 * If memory_localevents is not enabled (the default), the parent should
1421 	 * count OOM events in its child cgroups. Otherwise, it should not
1422 	 * have observed any events.
1423 	 */
1424 	if (has_localevents && parent_oom_events != 0)
1425 		goto cleanup;
1426 	else if (!has_localevents && parent_oom_events <= 0)
1427 		goto cleanup;
1428 
1429 	ret = KSFT_PASS;
1430 
1431 cleanup:
1432 	if (child)
1433 		cg_destroy(child);
1434 	if (parent)
1435 		cg_destroy(parent);
1436 	free(child);
1437 	free(parent);
1438 
1439 	return ret;
1440 }
1441 
1442 /*
1443  * This test disables swapping and tries to allocate anonymous memory
1444  * up to OOM with memory.oom.group set. Then it checks that all
1445  * processes in the parent and leaf were killed.
1446  */
1447 static int test_memcg_oom_group_parent_events(const char *root)
1448 {
1449 	int ret = KSFT_FAIL;
1450 	char *parent, *child;
1451 
1452 	parent = cg_name(root, "memcg_test_0");
1453 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1454 
1455 	if (!parent || !child)
1456 		goto cleanup;
1457 
1458 	if (cg_create(parent))
1459 		goto cleanup;
1460 
1461 	if (cg_create(child))
1462 		goto cleanup;
1463 
1464 	if (cg_write(parent, "memory.max", "80M"))
1465 		goto cleanup;
1466 
1467 	if (cg_write(parent, "memory.swap.max", "0"))
1468 		goto cleanup;
1469 
1470 	if (cg_write(parent, "memory.oom.group", "1"))
1471 		goto cleanup;
1472 
1473 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1474 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1475 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1476 
1477 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1478 		goto cleanup;
1479 
1480 	if (cg_test_proc_killed(child))
1481 		goto cleanup;
1482 	if (cg_test_proc_killed(parent))
1483 		goto cleanup;
1484 
1485 	ret = KSFT_PASS;
1486 
1487 cleanup:
1488 	if (child)
1489 		cg_destroy(child);
1490 	if (parent)
1491 		cg_destroy(parent);
1492 	free(child);
1493 	free(parent);
1494 
1495 	return ret;
1496 }
1497 
1498 /*
1499  * This test disables swapping and tries to allocate anonymous memory
1500  * up to OOM with memory.oom.group set. Then it checks that all
1501  * processes were killed except those set with OOM_SCORE_ADJ_MIN.
1502  */
1503 static int test_memcg_oom_group_score_events(const char *root)
1504 {
1505 	int ret = KSFT_FAIL;
1506 	char *memcg;
1507 	int safe_pid;
1508 
1509 	memcg = cg_name(root, "memcg_test_0");
1510 
1511 	if (!memcg)
1512 		goto cleanup;
1513 
1514 	if (cg_create(memcg))
1515 		goto cleanup;
1516 
1517 	if (cg_write(memcg, "memory.max", "50M"))
1518 		goto cleanup;
1519 
1520 	if (cg_write(memcg, "memory.swap.max", "0"))
1521 		goto cleanup;
1522 
1523 	if (cg_write(memcg, "memory.oom.group", "1"))
1524 		goto cleanup;
1525 
1526 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1527 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1528 		goto cleanup;
1529 
1530 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1531 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1532 		goto cleanup;
1533 
1534 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1535 		goto cleanup;
1536 
1537 	if (kill(safe_pid, SIGKILL))
1538 		goto cleanup;
1539 
1540 	ret = KSFT_PASS;
1541 
1542 cleanup:
1543 	if (memcg)
1544 		cg_destroy(memcg);
1545 	free(memcg);
1546 
1547 	return ret;
1548 }
1549 
1550 #define T(x) { x, #x }
1551 struct memcg_test {
1552 	int (*fn)(const char *root);
1553 	const char *name;
1554 } tests[] = {
1555 	T(test_memcg_subtree_control),
1556 	T(test_memcg_current_peak),
1557 	T(test_memcg_min),
1558 	T(test_memcg_low),
1559 	T(test_memcg_high),
1560 	T(test_memcg_high_sync),
1561 	T(test_memcg_max),
1562 	T(test_memcg_reclaim),
1563 	T(test_memcg_oom_events),
1564 	T(test_memcg_swap_max_peak),
1565 	T(test_memcg_sock),
1566 	T(test_memcg_oom_group_leaf_events),
1567 	T(test_memcg_oom_group_parent_events),
1568 	T(test_memcg_oom_group_score_events),
1569 };
1570 #undef T
1571 
1572 int main(int argc, char **argv)
1573 {
1574 	char root[PATH_MAX];
1575 	int i, proc_status, ret = EXIT_SUCCESS;
1576 
1577 	if (cg_find_unified_root(root, sizeof(root), NULL))
1578 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1579 
1580 	/*
1581 	 * Check that the memory controller is available:
1582 	 * memory is listed in cgroup.controllers.
1583 	 */
1584 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1585 		ksft_exit_skip("memory controller isn't available\n");
1586 
1587 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1588 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1589 			ksft_exit_skip("Failed to set memory controller\n");
1590 
1591 	proc_status = proc_mount_contains("memory_recursiveprot");
1592 	if (proc_status < 0)
1593 		ksft_exit_skip("Failed to query cgroup mount option\n");
1594 	has_recursiveprot = proc_status;
1595 
1596 	proc_status = proc_mount_contains("memory_localevents");
1597 	if (proc_status < 0)
1598 		ksft_exit_skip("Failed to query cgroup mount option\n");
1599 	has_localevents = proc_status;
1600 
1601 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1602 		switch (tests[i].fn(root)) {
1603 		case KSFT_PASS:
1604 			ksft_test_result_pass("%s\n", tests[i].name);
1605 			break;
1606 		case KSFT_SKIP:
1607 			ksft_test_result_skip("%s\n", tests[i].name);
1608 			break;
1609 		default:
1610 			ret = EXIT_FAILURE;
1611 			ksft_test_result_fail("%s\n", tests[i].name);
1612 			break;
1613 		}
1614 	}
1615 
1616 	return ret;
1617 }
1618