/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;

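/*
 * Create an unnamed temporary file in the current directory. O_EXCL with
 * O_TMPFILE prevents the file from ever being linked into the filesystem,
 * and open(2) requires an explicit mode whenever O_TMPFILE is used.
 */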
int get_temp_fd(void)
{
	return open(".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
}

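/*
 * Grow the file behind @fd by @size bytes and read it back, so that the
 * resulting page cache is charged to the calling process's cgroup.
 */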
int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	size_t i;

	if (fstat(fd, &st))
		goto cleanup;

	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	for (i = 0; i < size; i += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;

cleanup:
	return -1;
}

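/*
 * Allocate anonymous memory and touch one byte per page, so that every
 * page is actually faulted in and charged to the cgroup.
 */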
int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}

int is_swap_enabled(void)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	int cnt = 0;
	char *line;

	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

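	/*
	 * The first line of /proc/swaps is the column header, so more
	 * than one line means at least one active swap device.
	 */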
	return cnt > 1;
}

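/*
 * Set /proc/<pid>/oom_score_adj. OOM_SCORE_ADJ_MIN (-1000, from
 * linux/oom.h) exempts a process from OOM killing entirely, which
 * test_memcg_oom_group_score_events below relies on.
 */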
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, len;

	snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	len = dprintf(fd, "%d", score);
	if (len < 0) {
		close(fd);
		return len;
	}

	close(fd);
	return 0;
}

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling the memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
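
/*
 * A note on the return convention used above: the cg_read_strstr() helper
 * from cgroup_util.c returns 0 when the needle IS found, so
 * "if (cg_read_strstr(...))" fails when the string is missing and
 * "if (!cg_read_strstr(...))" fails when it is present. A minimal sketch
 * of the assumed implementation, for illustration only:
 */
#if 0 /* illustrative sketch; the real helper lives in cgroup_util.c */
static int cg_read_strstr_sketch(const char *cgroup, const char *control,
				 const char *needle)
{
	char buf[PAGE_SIZE];

	/* cg_read() fills @buf with the contents of the control file */
	if (cg_read(cgroup, control, buf, sizeof(buf)))
		return -1;

	return strstr(buf, needle) ? 0 : -1;
}
#endif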

static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}
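
/*
 * The checks above lean on values_close() from cgroup_util.c, which
 * compares two values within an error percentage. A sketch of the assumed
 * semantics (the name and rounding are illustrative):
 */
#if 0 /* illustrative sketch; the real helper lives in cgroup_util.c */
static int values_close_sketch(long a, long b, int err_percent)
{
	/* true when |a - b| is within err_percent of the combined magnitude */
	return labs(a - b) <= (a + b) / 100 * err_percent;
}
#endif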

static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates some anonymous memory
 * and some pagecache, and checks memory.current, memory.peak, and some
 * memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * Open a few FDs on the same memory.peak file to exercise the
	 * release path. At least three must be closed in a different order
	 * than the writes occurred, to test the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place. (by
	 * checking the writable bit on the file's st_mode)
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets the watermark; use an explicit one */
	static const char reset_string[] = "reset\n";

	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/*
	 * Make sure a completely independent read isn't affected by our
	 * FD-local reset above.
	 */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

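/*
 * The *_noexit helpers below keep the child process (and thus the memory
 * charged to its cgroup) alive until the parent test process exits, which
 * they detect by watching getppid() change.
 */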
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until all processes in the cgroup are killed asynchronously by the
 * OOM killer. If we exceed the timeout (10 polls, 100ms apart, so roughly
 * one second), fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *				 undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for the origin of the numbers, see the model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory available in A, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevented anon memory allocation\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if the cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
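
/*
 * For the origin of the ~29M and ~21M figures checked above: when the
 * children of A/B together claim more protection than A/B itself carries
 * (75M + 25M + 0 + 500M > 50M), each child's effective protection is,
 * roughly,
 *
 *   eprot(child) = eprot(parent) * min(usage, prot) /
 *                  sum over siblings of min(usage, prot)
 *
 * recomputed as usages shrink during reclaim. The exact steady-state
 * values come from the model in memcg_protection.m, which is why the
 * numbers are only checked within generous error margins here.
 */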

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

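/*
 * mlock() forces the whole mapping to be populated within a single
 * syscall, which is what makes the allocation below a single-shot
 * charge from the kernel's point of view.
 */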
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
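		/*
		 * cg_write() is expected to propagate -errno here; writing
		 * to memory.reclaim fails with -EAGAIN when the kernel
		 * could not reclaim the full requested amount.
		 */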
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}
	return reclaimed;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M; this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}

static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place. (by checking the writable bit on the file's st_mode)
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode */
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * Open, reset and close memory.swap.peak and memory.peak on other
	 * FDs to make sure multiple extant FDs don't corrupt the linked list.
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

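	/* Should be killed by OOM killer */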
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

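/*
 * Server half of the socket accounting test. The control pipe protocol:
 * on bind() failure the server writes errno into the pipe, after a
 * successful listen() it writes 0; the parent reads exactly one int
 * either way. After accept(), the server writes 1M chunks until the
 * client resets the connection.
 */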
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

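/*
 * Client half of the socket accounting test: connect to the server and
 * interleave 1M reads with checks that the growth of memory.current
 * (relative to the usage sampled before connecting) stays close to the
 * "sock" counter in memory.stat.
 */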
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and memory.stat's "sock" counter stay close to each
 * other.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}


/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf cgroup were killed. It also checks that the
 * oom_kill events were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf cgroups were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that the memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}