/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "kselftest.h"
#include "cgroup_util.h"

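/*
 * Number of polling attempts for the "sock " counter in memory.stat;
 * together with DEFAULT_WAIT_INTERVAL_US this bounds the poll at roughly
 * 3 seconds (see the comment in test_memcg_sock()).
 */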
#define MEMCG_SOCKSTAT_WAIT_RETRIES        30

static bool has_localevents;
static bool has_recursiveprot;

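/*
 * Return an fd for an unlinked temporary file in the current directory.
 * With O_TMPFILE, O_EXCL prevents the file from ever being linked into
 * the filesystem.
 */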
int get_temp_fd(void)
{
	/* O_TMPFILE requires a mode argument; 0600 keeps the file private. */
	return open(".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
}

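/*
 * Grow the file behind @fd by @size bytes and read it back, so the new
 * pages are charged to the caller's cgroup as page cache.
 */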
int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	int i;

	if (fstat(fd, &st))
		goto cleanup;

	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	for (i = 0; i < size; i += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;

cleanup:
	return -1;
}

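/* Allocate @arg bytes of anonymous memory, touching one byte per page. */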
int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}

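/*
 * /proc/swaps always contains a header line, so more than one line
 * means at least one active swap device.
 */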
int is_swap_enabled(void)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	int cnt = 0;
	char *line;

	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

	return cnt > 1;
}

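/*
 * Write @score to /proc/<pid>/oom_score_adj. OOM_SCORE_ADJ_MIN makes the
 * process effectively immune to the OOM killer.
 */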
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, len;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	len = dprintf(fd, "%d", score);
	if (len < 0) {
		close(fd);
		return len;
	}

	close(fd);
	return 0;
}

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling the memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

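/*
 * Allocate 50M of anonymous memory and check that both memory.current
 * and the "anon" counter in memory.stat are within a few percent of it.
 */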
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

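/*
 * Allocate 50M of page cache and check that memory.current and the
 * "file" counter in memory.stat roughly agree.
 */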
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache,
 * and checks memory.current, memory.peak, and some memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the
	 * free path. We need at least three to be closed in a different order
	 * than the writes occurred to test the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place (by
	 * checking the writable bit on the file's st_mode).
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* Any non-empty string resets the watermark; use an explicit one. */
	static const char reset_string[] = "reset\n";

	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our FD-local reset above */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

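/*
 * Allocate 50M of page cache, then linger until the parent exits, so the
 * charge stays visible while the test inspects counters
 * (alloc_anon_noexit() below does the same for anonymous memory).
 */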
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed the timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and generates significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *				 undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

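	/*
	 * Wait a bounded number of seconds for the three 50M page cache
	 * allocations (150M in total) to settle before applying pressure.
	 */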
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

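	/*
	 * Allocate 148M of anon memory in A's other child (A/G): together
	 * with the ~150M of protected page cache this exceeds A's 200M
	 * limit and forces reclaim from the protected subtree.
	 */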
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if the cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

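/*
 * Try to allocate 50M of page cache in a cgroup limited to 30M (via
 * either memory.high or memory.max) and check that usage converges to
 * roughly 30M.
 */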
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

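/*
 * mmap() and mlock() @arg bytes in one go, so the whole charge happens
 * within a single kernel entry.
 */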
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   0, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

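	/*
	 * Arm a waiter on memory.events (see memcg_prepare_for_wait() in
	 * cgroup_util) so cg_wait_for() below blocks until an event is
	 * recorded.
	 */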
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}
	return reclaimed;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M; this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}

static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place (by checking the writable bit on the file's st_mode).
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode */
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;
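
	/* All four peak values should be close to the 30M limits; allow ~1M of slack. */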
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * Open, reset, and close the peak swap file on another FD to make
	 * sure multiple extant FDs don't corrupt the linked list.
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

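/*
 * Arguments for the forked TCP server: the port to listen on and a
 * pipe used to report the bind() result back to the test.
 */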
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude memory not related to the socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between
 * 1000 and 61000. Once it gets a client connection, it starts writing
 * to its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and the "sock" counter in memory.stat stay close.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;
	long sock_post = -1;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/*
	 * memory.stat is updated asynchronously via the memcg rstat
	 * flushing worker, which runs periodically (every 2 seconds,
	 * see FLUSH_TIME). On a busy system, the "sock " counter may
	 * stay non-zero for a short period of time after the TCP
	 * connection is closed and all socket memory has been
	 * uncharged.
	 *
	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
	 * scheduling slack) and require that the "sock " counter
	 * eventually drops to zero.
	 */
	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
					 MEMCG_SOCKSTAT_WAIT_RETRIES,
					 DEFAULT_WAIT_INTERVAL_US);
	if (sock_post)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_kill
 * events were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

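/* Pair each test function with its name so the runner can report results. */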
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status;

	ksft_print_header();
	ksft_set_plan(ARRAY_SIZE(tests));
	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that the memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	ksft_finished();
}