/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <signal.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;

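/*
 * Create an unlinked temporary file in the current directory; O_TMPFILE
 * never appears in the filesystem, so no cleanup is needed.
 */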
int get_temp_fd(void)
{
	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
}

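/*
 * Grow the file behind @fd by @size bytes and read the new range back
 * page by page, so that it ends up charged as pagecache.
 */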
int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	size_t i;

	if (fstat(fd, &st))
		goto cleanup;

	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	/* Read the file back to bring the new range into the pagecache */
	for (i = 0; i < size; i += sizeof(buf))
		read(fd, buf, sizeof(buf));

	return 0;

cleanup:
	return -1;
}

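/*
 * Allocate @arg bytes of anonymous memory, touch every page so it is
 * actually charged, then free it again.
 */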
int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}

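/*
 * Returns 1 if at least one swap device is active (i.e. /proc/swaps
 * contains more than just its header line), 0 if not, -1 on error.
 */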
int is_swap_enabled(void)
{
	char buf[PAGE_SIZE];
	const char delim[] = "\n";
	int cnt = 0;
	char *line;

	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
		return -1;

	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
		cnt++;

	return cnt > 1;
}

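/* Write @score to /proc/@pid/oom_score_adj */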
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, len;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	len = dprintf(fd, "%d", score);
	if (len < 0) {
		close(fd);
		return len;
	}

	close(fd);
	return 0;
}

/*
 * This test creates two pairs of nested cgroups: one with the memory
 * controller enabled in the parent, and one without it.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling the memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

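/*
 * Allocate 50M of anonymous memory and check that both memory.current
 * and the "anon" counter in memory.stat reflect it (within 3%).
 */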
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

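/*
 * Allocate 50M of pagecache and check that both memory.current and the
 * "file" counter in memory.stat reflect it.
 */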
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates some anonymous memory
 * and some pagecache, and checks memory.current, memory.peak, and some
 * memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the
	 * free path. We need at least three to be closed in a different order
	 * than the writes occurred, to test the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place (by
	 * checking the writable bit on the file's st_mode).
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* Any non-empty string resets the peak, but make the intent clear */
	static const char reset_string[] = "reset\n";

	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our FD-local resets above */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

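/*
 * Allocate 50M of pagecache, then block until the parent process exits
 * (detected via getppid() changing), keeping the memory charged to the
 * cgroup while the caller inspects it.
 */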
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

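/*
 * Allocate @arg bytes of anonymous memory and keep it charged until the
 * parent process exits.
 */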
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed the timeout (about a second), fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

static bool reclaim_until(const char *memcg, long goal);

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * The usage is all pagecache, and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates significant
 * memory pressure in A.
 *
 * Then it checks the actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for the origin of these numbers, see the model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

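/*
 * Try to allocate 50M of pagecache in a cgroup limited (via memory.high
 * or memory.max) to 30M, and check that usage settles near the limit.
 */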
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

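/*
 * mmap() and mlock() @arg bytes of anonymous memory; the mlock() faults
 * the whole range in within a single kernel entry.
 */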
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   0, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}
	return reclaimed;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}

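/*
 * Allocate 50M of anonymous memory with memory.max set to @arg and
 * check that the overflow went to swap: memory.current should sit near
 * the limit, and memory.current + memory.swap.current near 50M.
 */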
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place (by checking the writable bit on the file's st_mode).
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode */
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * open, reset and close the peak swap on another FD to make sure
	 * multiple extant fds don't corrupt the linked-list
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

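/*
 * Run a TCP server: bind to ->port, report the bind result over the
 * ->ctl pipe, then stream data to the first accepted client until the
 * connection is reset.
 */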
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

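/*
 * Connect to the server on localhost:@port and keep reading from the
 * socket until memory.current and the "sock" counter in memory.stat
 * converge (or we run out of retries).
 */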
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to the socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between
 * 1000 and 61000. Once it gets a client connection, it starts writing
 * to its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and the "sock" counter in memory.stat stay close.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

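/* Table of test functions and their printable names, iterated by main() */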
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that the memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}