xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision 3dc7c001169d112b3e514cacff6c93091c57af9a)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/inotify.h>
14 #include <sys/socket.h>
15 #include <sys/wait.h>
16 #include <arpa/inet.h>
17 #include <netinet/in.h>
18 #include <netdb.h>
19 #include <errno.h>
20 #include <sys/mman.h>
21 
22 #include "kselftest.h"
23 #include "cgroup_util.h"
24 
/*
 * Retry budget; judging by the name it bounds how often memcg socket
 * statistics are polled. NOTE(review): the user is not in this part of
 * the file - confirm.
 */
#define MEMCG_SOCKSTAT_WAIT_RETRIES        30

/*
 * NOTE(review): presumably records whether the cgroup2 mount provides
 * memory.events.local; neither the assignment nor a reader is visible
 * here - confirm.
 */
static bool has_localevents;
/*
 * Read by test_memcg_protection(): when true, the "low" event count of
 * the child with memory.low=0 is not checked, because protection may be
 * distributed down from its parent (memory_recursiveprot mount option).
 */
static bool has_recursiveprot;
29 
30 int get_temp_fd(void)
31 {
32 	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
33 }
34 
35 int alloc_pagecache(int fd, size_t size)
36 {
37 	char buf[PAGE_SIZE];
38 	struct stat st;
39 	int i;
40 
41 	if (fstat(fd, &st))
42 		goto cleanup;
43 
44 	size += st.st_size;
45 
46 	if (ftruncate(fd, size))
47 		goto cleanup;
48 
49 	for (i = 0; i < size; i += sizeof(buf))
50 		read(fd, buf, sizeof(buf));
51 
52 	return 0;
53 
54 cleanup:
55 	return -1;
56 }
57 
58 static char *alloc_and_populate_anon(size_t size)
59 {
60 	char *buf, *ptr;
61 
62 	buf = malloc(size);
63 	if (buf == NULL) {
64 		fprintf(stderr, "malloc() failed\n");
65 		return NULL;
66 	}
67 
68 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
69 		*ptr = 0;
70 
71 	return buf;
72 }
73 
/*
 * cg_run()-style callback: allocate and touch anonymous memory, then
 * release it again.
 *
 * @cgroup: unused
 * @arg:    the allocation size in bytes, smuggled through the pointer
 *
 * Returns 0 on success, -1 if the allocation failed.
 */
int alloc_anon(const char *cgroup, void *arg)
{
	char *mem = alloc_and_populate_anon((unsigned long)arg);

	if (mem == NULL)
		return -1;

	free(mem);
	return 0;
}
86 
87 int is_swap_enabled(void)
88 {
89 	char buf[PAGE_SIZE];
90 	const char delim[] = "\n";
91 	int cnt = 0;
92 	char *line;
93 
94 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
95 		return -1;
96 
97 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
98 		cnt++;
99 
100 	return cnt > 1;
101 }
102 
/*
 * Write @score to /proc/@pid/oom_score_adj.
 *
 * Returns 0 on success, or the negative result of the failing open() or
 * dprintf() call.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	/* The descriptor is closed whether or not the write succeeded. */
	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
123 
124 /*
125  * This test creates two nested cgroups with and without enabling
126  * the memory controller.
127  */
128 static int test_memcg_subtree_control(const char *root)
129 {
130 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
131 	int ret = KSFT_FAIL;
132 	char buf[PAGE_SIZE];
133 
134 	/* Create two nested cgroups with the memory controller enabled */
135 	parent = cg_name(root, "memcg_test_0");
136 	child = cg_name(root, "memcg_test_0/memcg_test_1");
137 	if (!parent || !child)
138 		goto cleanup_free;
139 
140 	if (cg_create(parent))
141 		goto cleanup_free;
142 
143 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
144 		goto cleanup_parent;
145 
146 	if (cg_create(child))
147 		goto cleanup_parent;
148 
149 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
150 		goto cleanup_child;
151 
152 	/* Create two nested cgroups without enabling memory controller */
153 	parent2 = cg_name(root, "memcg_test_1");
154 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
155 	if (!parent2 || !child2)
156 		goto cleanup_free2;
157 
158 	if (cg_create(parent2))
159 		goto cleanup_free2;
160 
161 	if (cg_create(child2))
162 		goto cleanup_parent2;
163 
164 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
165 		goto cleanup_all;
166 
167 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
168 		goto cleanup_all;
169 
170 	ret = KSFT_PASS;
171 
172 cleanup_all:
173 	cg_destroy(child2);
174 cleanup_parent2:
175 	cg_destroy(parent2);
176 cleanup_free2:
177 	free(parent2);
178 	free(child2);
179 cleanup_child:
180 	cg_destroy(child);
181 cleanup_parent:
182 	cg_destroy(parent);
183 cleanup_free:
184 	free(parent);
185 	free(child);
186 
187 	return ret;
188 }
189 
/*
 * cg_run()-style callback: allocate and touch 50M of anonymous memory,
 * then verify that the cgroup accounts for it.
 *
 * Checks that memory.current is at least 50M and within 3% of it, and
 * that the "anon" entry of memory.stat is within 3% of memory.current.
 *
 * @cgroup: cgroup whose counters are inspected
 * @arg:    unused
 *
 * Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long anon, current;
	int ret = -1;
	char *buf;

	buf = alloc_and_populate_anon(size);
	if (!buf)
		return -1;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Reject negative readings explicitly: compared directly against a
	 * size_t they would be converted to a huge unsigned value and pass.
	 */
	if (current < 0 || (size_t)current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}
220 
/*
 * cg_run()-style callback: populate 50M of pagecache through a temporary
 * file, then verify that the cgroup accounts for it.
 *
 * Checks that memory.current is at least 50M and that the "file" entry
 * of memory.stat is within 10% of memory.current.
 *
 * @cgroup: cgroup whose counters are inspected
 * @arg:    unused
 *
 * Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long current, file;
	int ret = -1;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Reject negative readings explicitly: compared directly against a
	 * size_t they would be converted to a huge unsigned value and pass.
	 */
	if (current < 0 || (size_t)current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
252 
253 /*
254  * This test create a memory cgroup, allocates
255  * some anonymous memory and some pagecache
256  * and checks memory.current, memory.peak, and some memory.stat values.
257  */
258 static int test_memcg_current_peak(const char *root)
259 {
260 	int ret = KSFT_FAIL;
261 	long current, peak, peak_reset;
262 	char *memcg;
263 	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
264 	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
265 	struct stat ss;
266 
267 	memcg = cg_name(root, "memcg_test");
268 	if (!memcg)
269 		goto cleanup;
270 
271 	if (cg_create(memcg))
272 		goto cleanup;
273 
274 	current = cg_read_long(memcg, "memory.current");
275 	if (current != 0)
276 		goto cleanup;
277 
278 	peak = cg_read_long(memcg, "memory.peak");
279 	if (peak != 0)
280 		goto cleanup;
281 
282 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
283 		goto cleanup;
284 
285 	peak = cg_read_long(memcg, "memory.peak");
286 	if (peak < MB(50))
287 		goto cleanup;
288 
289 	/*
290 	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
291 	 * We need at least three to be closed in a different order than writes occurred to test
292 	 * the linked-list handling.
293 	 */
294 	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
295 
296 	if (peak_fd == -1) {
297 		if (errno == ENOENT)
298 			ret = KSFT_SKIP;
299 		goto cleanup;
300 	}
301 
302 	/*
303 	 * Before we try to use memory.peak's fd, try to figure out whether
304 	 * this kernel supports writing to that file in the first place. (by
305 	 * checking the writable bit on the file's st_mode)
306 	 */
307 	if (fstat(peak_fd, &ss))
308 		goto cleanup;
309 
310 	if ((ss.st_mode & S_IWUSR) == 0) {
311 		ret = KSFT_SKIP;
312 		goto cleanup;
313 	}
314 
315 	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
316 
317 	if (peak_fd2 == -1)
318 		goto cleanup;
319 
320 	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
321 
322 	if (peak_fd3 == -1)
323 		goto cleanup;
324 
325 	/* any non-empty string resets, but make it clear */
326 	static const char reset_string[] = "reset\n";
327 
328 	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
329 	if (peak_reset != sizeof(reset_string))
330 		goto cleanup;
331 
332 	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
333 	if (peak_reset != sizeof(reset_string))
334 		goto cleanup;
335 
336 	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
337 	if (peak_reset != sizeof(reset_string))
338 		goto cleanup;
339 
340 	/* Make sure a completely independent read isn't affected by our  FD-local reset above*/
341 	peak = cg_read_long(memcg, "memory.peak");
342 	if (peak < MB(50))
343 		goto cleanup;
344 
345 	fd2_closed = true;
346 	if (close(peak_fd2))
347 		goto cleanup;
348 
349 	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
350 
351 	if (peak_fd4 == -1)
352 		goto cleanup;
353 
354 	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
355 	if (peak_reset != sizeof(reset_string))
356 		goto cleanup;
357 
358 	peak = cg_read_long_fd(peak_fd);
359 	if (peak > MB(30) || peak < 0)
360 		goto cleanup;
361 
362 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
363 		goto cleanup;
364 
365 	peak = cg_read_long(memcg, "memory.peak");
366 	if (peak < MB(50))
367 		goto cleanup;
368 
369 	/* Make sure everything is back to normal */
370 	peak = cg_read_long_fd(peak_fd);
371 	if (peak < MB(50))
372 		goto cleanup;
373 
374 	peak = cg_read_long_fd(peak_fd4);
375 	if (peak < MB(50))
376 		goto cleanup;
377 
378 	fd3_closed = true;
379 	if (close(peak_fd3))
380 		goto cleanup;
381 
382 	fd4_closed = true;
383 	if (close(peak_fd4))
384 		goto cleanup;
385 
386 	ret = KSFT_PASS;
387 
388 cleanup:
389 	close(peak_fd);
390 	if (!fd2_closed)
391 		close(peak_fd2);
392 	if (!fd3_closed)
393 		close(peak_fd3);
394 	if (!fd4_closed)
395 		close(peak_fd4);
396 	cg_destroy(memcg);
397 	free(memcg);
398 
399 	return ret;
400 }
401 
/*
 * cg_run_nowait()-style callback: populate 50M of pagecache through the
 * descriptor passed in @arg, then stay alive until the parent process
 * changes (i.e. the original parent is gone).
 *
 * @cgroup: unused
 * @arg:    file descriptor, carried in the pointer value
 *
 * Returns -1 if the pagecache could not be populated, 0 once the parent
 * has changed.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	const int parent_pid = getppid();
	const int fd = (long)arg;

	if (alloc_pagecache(fd, MB(50)) != 0)
		return -1;

	/* Poll once a second until getppid() reports a different parent. */
	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
415 
/*
 * cg_run_nowait()-style callback: allocate and touch anonymous memory,
 * then keep holding it until the parent process changes.
 *
 * @cgroup: unused
 * @arg:    allocation size in bytes, carried in the pointer value
 *
 * Returns -1 if the allocation failed, 0 once the parent has changed
 * and the memory has been freed.
 */
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	const int parent_pid = getppid();
	char *mem = alloc_and_populate_anon((unsigned long)arg);

	if (mem == NULL)
		return -1;

	/* Poll once a second until getppid() reports a different parent. */
	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	free(mem);
	return 0;
}
432 
/*
 * The OOM killer works asynchronously, so poll cgroup.procs until it
 * reads back empty. Ten polls are made, 100ms apart.
 *
 * Returns 0 as soon as the cgroup has no processes, -1 on timeout.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempt = 0;

	while (attempt < 10) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
		attempt++;
	}

	return -1;
}
449 
450 static bool reclaim_until(const char *memcg, long goal);
451 
452 /*
453  * First, this test creates the following hierarchy:
454  * A       memory.min = 0,    memory.max = 200M
455  * A/B     memory.min = 50M
456  * A/B/C   memory.min = 75M,  memory.current = 50M
457  * A/B/D   memory.min = 25M,  memory.current = 50M
458  * A/B/E   memory.min = 0,    memory.current = 50M
459  * A/B/F   memory.min = 500M, memory.current = 0
460  *
461  * (or memory.low if we test soft protection)
462  *
463  * Usages are pagecache and the test keeps a running
464  * process in every leaf cgroup.
465  * Then it creates A/G and creates a significant
466  * memory pressure in A.
467  *
468  * Then it checks actual memory usages and expects that:
469  * A/B    memory.current ~= 50M
470  * A/B/C  memory.current ~= 29M [memory.events:low > 0]
471  * A/B/D  memory.current ~= 21M [memory.events:low > 0]
472  * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
473  *				 undefined otherwise]
474  * A/B/F  memory.current  = 0   [memory.events:low == 0]
475  * (for origin of the numbers, see model in memcg_protection.m.)
476  *
477  * After that it tries to allocate more than there is
478  * unprotected memory in A available, and checks that:
479  * a) memory.min protects pagecache even in this case,
480  * b) memory.low allows reclaiming page cache with low events.
481  *
482  * Then we try to reclaim from A/B/C using memory.reclaim until its
483  * usage reaches 10M.
484  * This makes sure that:
485  * (a) We ignore the protection of the reclaim target memcg.
486  * (b) The previously calculated emin value (~29M) should be dismissed.
487  */
488 static int test_memcg_protection(const char *root, bool min)
489 {
490 	int ret = KSFT_FAIL, rc;
491 	char *parent[3] = {NULL};
492 	char *children[4] = {NULL};
493 	const char *attribute = min ? "memory.min" : "memory.low";
494 	long c[4];
495 	long current;
496 	int i, attempts;
497 	int fd;
498 
499 	fd = get_temp_fd();
500 	if (fd < 0)
501 		goto cleanup;
502 
503 	parent[0] = cg_name(root, "memcg_test_0");
504 	if (!parent[0])
505 		goto cleanup;
506 
507 	parent[1] = cg_name(parent[0], "memcg_test_1");
508 	if (!parent[1])
509 		goto cleanup;
510 
511 	parent[2] = cg_name(parent[0], "memcg_test_2");
512 	if (!parent[2])
513 		goto cleanup;
514 
515 	if (cg_create(parent[0]))
516 		goto cleanup;
517 
518 	if (cg_read_long(parent[0], attribute)) {
519 		/* No memory.min on older kernels is fine */
520 		if (min)
521 			ret = KSFT_SKIP;
522 		goto cleanup;
523 	}
524 
525 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
526 		goto cleanup;
527 
528 	if (cg_write(parent[0], "memory.max", "200M"))
529 		goto cleanup;
530 
531 	if (cg_write(parent[0], "memory.swap.max", "0"))
532 		goto cleanup;
533 
534 	if (cg_create(parent[1]))
535 		goto cleanup;
536 
537 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
538 		goto cleanup;
539 
540 	if (cg_create(parent[2]))
541 		goto cleanup;
542 
543 	for (i = 0; i < ARRAY_SIZE(children); i++) {
544 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
545 		if (!children[i])
546 			goto cleanup;
547 
548 		if (cg_create(children[i]))
549 			goto cleanup;
550 
551 		if (i > 2)
552 			continue;
553 
554 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
555 			      (void *)(long)fd);
556 	}
557 
558 	if (cg_write(parent[1],   attribute, "50M"))
559 		goto cleanup;
560 	if (cg_write(children[0], attribute, "75M"))
561 		goto cleanup;
562 	if (cg_write(children[1], attribute, "25M"))
563 		goto cleanup;
564 	if (cg_write(children[2], attribute, "0"))
565 		goto cleanup;
566 	if (cg_write(children[3], attribute, "500M"))
567 		goto cleanup;
568 
569 	attempts = 0;
570 	while (!values_close(cg_read_long(parent[1], "memory.current"),
571 			     MB(150), 3)) {
572 		if (attempts++ > 5)
573 			break;
574 		sleep(1);
575 	}
576 
577 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
578 		goto cleanup;
579 
580 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
581 		goto cleanup;
582 
583 	for (i = 0; i < ARRAY_SIZE(children); i++)
584 		c[i] = cg_read_long(children[i], "memory.current");
585 
586 	if (!values_close(c[0], MB(29), 15))
587 		goto cleanup;
588 
589 	if (!values_close(c[1], MB(21), 20))
590 		goto cleanup;
591 
592 	if (c[3] != 0)
593 		goto cleanup;
594 
595 	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
596 	if (min && !rc)
597 		goto cleanup;
598 	else if (!min && rc) {
599 		fprintf(stderr,
600 			"memory.low prevents from allocating anon memory\n");
601 		goto cleanup;
602 	}
603 
604 	current = min ? MB(50) : MB(30);
605 	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
606 		goto cleanup;
607 
608 	if (!reclaim_until(children[0], MB(10)))
609 		goto cleanup;
610 
611 	if (min) {
612 		ret = KSFT_PASS;
613 		goto cleanup;
614 	}
615 
616 	/*
617 	 * Child 2 has memory.low=0, but some low protection may still be
618 	 * distributed down from its parent with memory.low=50M if cgroup2
619 	 * memory_recursiveprot mount option is enabled. Ignore the low
620 	 * event count in this case.
621 	 */
622 	for (i = 0; i < ARRAY_SIZE(children); i++) {
623 		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
624 		int no_low_events_index = 1;
625 		long low, oom;
626 
627 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
628 		low = cg_read_key_long(children[i], "memory.events", "low ");
629 
630 		if (oom)
631 			goto cleanup;
632 		if (i == ignore_low_events_index)
633 			continue;
634 		if (i <= no_low_events_index && low <= 0)
635 			goto cleanup;
636 		if (i > no_low_events_index && low)
637 			goto cleanup;
638 
639 	}
640 
641 	ret = KSFT_PASS;
642 
643 cleanup:
644 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
645 		if (!children[i])
646 			continue;
647 
648 		cg_destroy(children[i]);
649 		free(children[i]);
650 	}
651 
652 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
653 		if (!parent[i])
654 			continue;
655 
656 		cg_destroy(parent[i]);
657 		free(parent[i]);
658 	}
659 	close(fd);
660 	return ret;
661 }
662 
/* Run the protection test against the hard limit, memory.min. */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
667 
/* Run the protection test against the soft limit, memory.low. */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
672 
/*
 * cg_run()-style callback: with memory.high or memory.max set to 30M,
 * try to populate 50M of pagecache and check that memory.current stays
 * within 5% of 30M.
 *
 * @cgroup: cgroup whose limits and usage are inspected
 * @arg:    unused
 *
 * Returns 0 if usage was capped as expected, -1 otherwise (including
 * when neither limit is 30M).
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long high_limit, max_limit, usage;
	int ret = -1;
	int fd;

	high_limit = cg_read_long(cgroup, "memory.high");
	max_limit = cg_read_long(cgroup, "memory.max");
	/* At least one of the two limits has to be the expected 30M */
	if (!(high_limit == MB(30) || max_limit == MB(30)))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (!alloc_pagecache(fd, size)) {
		usage = cg_read_long(cgroup, "memory.current");
		if (values_close(usage, MB(30), 5))
			ret = 0;
	}

	close(fd);
	return ret;
}
703 
704 /*
705  * This test checks that memory.high limits the amount of
706  * memory which can be consumed by either anonymous memory
707  * or pagecache.
708  */
709 static int test_memcg_high(const char *root)
710 {
711 	int ret = KSFT_FAIL;
712 	char *memcg;
713 	long high;
714 
715 	memcg = cg_name(root, "memcg_test");
716 	if (!memcg)
717 		goto cleanup;
718 
719 	if (cg_create(memcg))
720 		goto cleanup;
721 
722 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
723 		goto cleanup;
724 
725 	if (cg_write(memcg, "memory.swap.max", "0"))
726 		goto cleanup;
727 
728 	if (cg_write(memcg, "memory.high", "30M"))
729 		goto cleanup;
730 
731 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
732 		goto cleanup;
733 
734 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
735 		goto cleanup;
736 
737 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
738 		goto cleanup;
739 
740 	high = cg_read_key_long(memcg, "memory.events", "high ");
741 	if (high <= 0)
742 		goto cleanup;
743 
744 	ret = KSFT_PASS;
745 
746 cleanup:
747 	cg_destroy(memcg);
748 	free(memcg);
749 
750 	return ret;
751 }
752 
753 static int alloc_anon_mlock(const char *cgroup, void *arg)
754 {
755 	size_t size = (size_t)arg;
756 	void *buf;
757 
758 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
759 		   0, 0);
760 	if (buf == MAP_FAILED)
761 		return -1;
762 
763 	mlock(buf, size);
764 	munmap(buf, size);
765 	return 0;
766 }
767 
768 /*
769  * This test checks that memory.high is able to throttle big single shot
770  * allocation i.e. large allocation within one kernel entry.
771  */
772 static int test_memcg_high_sync(const char *root)
773 {
774 	int ret = KSFT_FAIL, pid, fd = -1;
775 	char *memcg;
776 	long pre_high, pre_max;
777 	long post_high, post_max;
778 
779 	memcg = cg_name(root, "memcg_test");
780 	if (!memcg)
781 		goto cleanup;
782 
783 	if (cg_create(memcg))
784 		goto cleanup;
785 
786 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
787 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
788 	if (pre_high < 0 || pre_max < 0)
789 		goto cleanup;
790 
791 	if (cg_write(memcg, "memory.swap.max", "0"))
792 		goto cleanup;
793 
794 	if (cg_write(memcg, "memory.high", "30M"))
795 		goto cleanup;
796 
797 	if (cg_write(memcg, "memory.max", "140M"))
798 		goto cleanup;
799 
800 	fd = memcg_prepare_for_wait(memcg);
801 	if (fd < 0)
802 		goto cleanup;
803 
804 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
805 	if (pid < 0)
806 		goto cleanup;
807 
808 	cg_wait_for(fd);
809 
810 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
811 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
812 	if (post_high < 0 || post_max < 0)
813 		goto cleanup;
814 
815 	if (pre_high == post_high || pre_max != post_max)
816 		goto cleanup;
817 
818 	ret = KSFT_PASS;
819 
820 cleanup:
821 	if (fd >= 0)
822 		close(fd);
823 	cg_destroy(memcg);
824 	free(memcg);
825 
826 	return ret;
827 }
828 
829 /*
830  * This test checks that memory.max limits the amount of
831  * memory which can be consumed by either anonymous memory
832  * or pagecache.
833  */
834 static int test_memcg_max(const char *root)
835 {
836 	int ret = KSFT_FAIL;
837 	char *memcg;
838 	long current, max;
839 
840 	memcg = cg_name(root, "memcg_test");
841 	if (!memcg)
842 		goto cleanup;
843 
844 	if (cg_create(memcg))
845 		goto cleanup;
846 
847 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
848 		goto cleanup;
849 
850 	if (cg_write(memcg, "memory.swap.max", "0"))
851 		goto cleanup;
852 
853 	if (cg_write(memcg, "memory.max", "30M"))
854 		goto cleanup;
855 
856 	/* Should be killed by OOM killer */
857 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
858 		goto cleanup;
859 
860 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
861 		goto cleanup;
862 
863 	current = cg_read_long(memcg, "memory.current");
864 	if (current > MB(30) || !current)
865 		goto cleanup;
866 
867 	max = cg_read_key_long(memcg, "memory.events", "max ");
868 	if (max <= 0)
869 		goto cleanup;
870 
871 	ret = KSFT_PASS;
872 
873 cleanup:
874 	cg_destroy(memcg);
875 	free(memcg);
876 
877 	return ret;
878 }
879 
/*
 * Write to memory.reclaim of @memcg until memory.current drops to
 * @goal (below it, or within 3% of it). At most five rounds are made.
 *
 * Returns false when usage was already at or below the goal on entry,
 * since nothing was reclaimed in that case.
 *
 * The caller has to guarantee that memory.reclaim is the only thing
 * changing memory.current while this runs (no concurrent allocations
 * or reclaim).
 *
 * The return codes of memory.reclaim are sanity-checked as well: any
 * error other than -EAGAIN, or a successful write that did not bring
 * usage down to the goal, makes the function return false.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char amount[64];
	bool wrote_ok = false;
	int attempt, err;
	long usage;

	for (attempt = 0; attempt < 5; attempt++) {
		usage = cg_read_long(memcg, "memory.current");

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* Did memory.reclaim return 0 incorrectly? */
		if (wrote_ok)
			return false;

		/* Ask for exactly the excess over the goal */
		snprintf(amount, sizeof(amount), "%ld", usage - goal);
		err = cg_write(memcg, "memory.reclaim", amount);
		if (err == 0)
			wrote_ok = true;
		else if (err != -EAGAIN)
			return false;
	}

	return wrote_ok;
}
921 
922 /*
923  * This test checks that memory.reclaim reclaims the given
924  * amount of memory (from both anon and file, if possible).
925  */
926 static int test_memcg_reclaim(const char *root)
927 {
928 	int ret = KSFT_FAIL;
929 	int fd = -1;
930 	int retries;
931 	char *memcg;
932 	long current, expected_usage;
933 
934 	memcg = cg_name(root, "memcg_test");
935 	if (!memcg)
936 		goto cleanup;
937 
938 	if (cg_create(memcg))
939 		goto cleanup;
940 
941 	current = cg_read_long(memcg, "memory.current");
942 	if (current != 0)
943 		goto cleanup;
944 
945 	fd = get_temp_fd();
946 	if (fd < 0)
947 		goto cleanup;
948 
949 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
950 
951 	/*
952 	 * If swap is enabled, try to reclaim from both anon and file, else try
953 	 * to reclaim from file only.
954 	 */
955 	if (is_swap_enabled()) {
956 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
957 		expected_usage = MB(100);
958 	} else
959 		expected_usage = MB(50);
960 
961 	/*
962 	 * Wait until current usage reaches the expected usage (or we run out of
963 	 * retries).
964 	 */
965 	retries = 5;
966 	while (!values_close(cg_read_long(memcg, "memory.current"),
967 			    expected_usage, 10)) {
968 		if (retries--) {
969 			sleep(1);
970 			continue;
971 		} else {
972 			fprintf(stderr,
973 				"failed to allocate %ld for memcg reclaim test\n",
974 				expected_usage);
975 			goto cleanup;
976 		}
977 	}
978 
979 	/*
980 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
981 	 * and file if swap is enabled.
982 	 */
983 	if (!reclaim_until(memcg, MB(30)))
984 		goto cleanup;
985 
986 	ret = KSFT_PASS;
987 cleanup:
988 	cg_destroy(memcg);
989 	free(memcg);
990 	close(fd);
991 
992 	return ret;
993 }
994 
/*
 * cg_run()-style callback: allocate and touch 50M of anonymous memory
 * in a cgroup whose memory.max is given in @arg, then check that
 * memory.current sits within 3% of that limit and that memory.current
 * plus memory.swap.current adds up to roughly (3%) the 50M allocated.
 *
 * @cgroup: cgroup whose counters are inspected
 * @arg:    expected memory.current in bytes, carried in the pointer
 *
 * Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	const long limit = (long)arg;
	const size_t size = MB(50);
	long in_mem, in_swap;
	int ret = -1;
	char *mem;

	mem = alloc_and_populate_anon(size);
	if (mem == NULL)
		return -1;

	in_mem = cg_read_long(cgroup, "memory.current");
	if (in_mem && values_close(in_mem, limit, 3)) {
		in_swap = cg_read_long(cgroup, "memory.swap.current");
		/* Some of it must have gone to swap, and nothing got lost */
		if (in_swap && values_close(in_mem + in_swap, size, 3))
			ret = 0;
	}

	free(mem);
	return ret;
}
1021 
1022 /*
1023  * This test checks that memory.swap.max limits the amount of
1024  * anonymous memory which can be swapped out. Additionally, it verifies that
1025  * memory.swap.peak reflects the high watermark and can be reset.
1026  */
1027 static int test_memcg_swap_max_peak(const char *root)
1028 {
1029 	int ret = KSFT_FAIL;
1030 	char *memcg;
1031 	long max, peak;
1032 	struct stat ss;
1033 	int swap_peak_fd = -1, mem_peak_fd = -1;
1034 
1035 	/* any non-empty string resets */
1036 	static const char reset_string[] = "foobarbaz";
1037 
1038 	if (!is_swap_enabled())
1039 		return KSFT_SKIP;
1040 
1041 	memcg = cg_name(root, "memcg_test");
1042 	if (!memcg)
1043 		goto cleanup;
1044 
1045 	if (cg_create(memcg))
1046 		goto cleanup;
1047 
1048 	if (cg_read_long(memcg, "memory.swap.current")) {
1049 		ret = KSFT_SKIP;
1050 		goto cleanup;
1051 	}
1052 
1053 	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
1054 			       O_RDWR | O_APPEND | O_CLOEXEC);
1055 
1056 	if (swap_peak_fd == -1) {
1057 		if (errno == ENOENT)
1058 			ret = KSFT_SKIP;
1059 		goto cleanup;
1060 	}
1061 
1062 	/*
1063 	 * Before we try to use memory.swap.peak's fd, try to figure out
1064 	 * whether this kernel supports writing to that file in the first
1065 	 * place. (by checking the writable bit on the file's st_mode)
1066 	 */
1067 	if (fstat(swap_peak_fd, &ss))
1068 		goto cleanup;
1069 
1070 	if ((ss.st_mode & S_IWUSR) == 0) {
1071 		ret = KSFT_SKIP;
1072 		goto cleanup;
1073 	}
1074 
1075 	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
1076 
1077 	if (mem_peak_fd == -1)
1078 		goto cleanup;
1079 
1080 	if (cg_read_long(memcg, "memory.swap.peak"))
1081 		goto cleanup;
1082 
1083 	if (cg_read_long_fd(swap_peak_fd))
1084 		goto cleanup;
1085 
1086 	/* switch the swap and mem fds into local-peak tracking mode*/
1087 	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1088 
1089 	if (peak_reset != sizeof(reset_string))
1090 		goto cleanup;
1091 
1092 	if (cg_read_long_fd(swap_peak_fd))
1093 		goto cleanup;
1094 
1095 	if (cg_read_long(memcg, "memory.peak"))
1096 		goto cleanup;
1097 
1098 	if (cg_read_long_fd(mem_peak_fd))
1099 		goto cleanup;
1100 
1101 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1102 	if (peak_reset != sizeof(reset_string))
1103 		goto cleanup;
1104 
1105 	if (cg_read_long_fd(mem_peak_fd))
1106 		goto cleanup;
1107 
1108 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
1109 		goto cleanup;
1110 
1111 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
1112 		goto cleanup;
1113 
1114 	if (cg_write(memcg, "memory.swap.max", "30M"))
1115 		goto cleanup;
1116 
1117 	if (cg_write(memcg, "memory.max", "30M"))
1118 		goto cleanup;
1119 
1120 	/* Should be killed by OOM killer */
1121 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1122 		goto cleanup;
1123 
1124 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1125 		goto cleanup;
1126 
1127 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1128 		goto cleanup;
1129 
1130 	peak = cg_read_long(memcg, "memory.peak");
1131 	if (peak < MB(29))
1132 		goto cleanup;
1133 
1134 	peak = cg_read_long(memcg, "memory.swap.peak");
1135 	if (peak < MB(29))
1136 		goto cleanup;
1137 
1138 	peak = cg_read_long_fd(mem_peak_fd);
1139 	if (peak < MB(29))
1140 		goto cleanup;
1141 
1142 	peak = cg_read_long_fd(swap_peak_fd);
1143 	if (peak < MB(29))
1144 		goto cleanup;
1145 
1146 	/*
1147 	 * open, reset and close the peak swap on another FD to make sure
1148 	 * multiple extant fds don't corrupt the linked-list
1149 	 */
1150 	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
1151 	if (peak_reset)
1152 		goto cleanup;
1153 
1154 	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
1155 	if (peak_reset)
1156 		goto cleanup;
1157 
1158 	/* actually reset on the fds */
1159 	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1160 	if (peak_reset != sizeof(reset_string))
1161 		goto cleanup;
1162 
1163 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1164 	if (peak_reset != sizeof(reset_string))
1165 		goto cleanup;
1166 
1167 	peak = cg_read_long_fd(swap_peak_fd);
1168 	if (peak > MB(10))
1169 		goto cleanup;
1170 
1171 	/*
1172 	 * The cgroup is now empty, but there may be a page or two associated
1173 	 * with the open FD accounted to it.
1174 	 */
1175 	peak = cg_read_long_fd(mem_peak_fd);
1176 	if (peak > MB(1))
1177 		goto cleanup;
1178 
1179 	if (cg_read_long(memcg, "memory.peak") < MB(29))
1180 		goto cleanup;
1181 
1182 	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
1183 		goto cleanup;
1184 
1185 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
1186 		goto cleanup;
1187 
1188 	max = cg_read_key_long(memcg, "memory.events", "max ");
1189 	if (max <= 0)
1190 		goto cleanup;
1191 
1192 	peak = cg_read_long(memcg, "memory.peak");
1193 	if (peak < MB(29))
1194 		goto cleanup;
1195 
1196 	peak = cg_read_long(memcg, "memory.swap.peak");
1197 	if (peak < MB(29))
1198 		goto cleanup;
1199 
1200 	peak = cg_read_long_fd(mem_peak_fd);
1201 	if (peak < MB(29))
1202 		goto cleanup;
1203 
1204 	peak = cg_read_long_fd(swap_peak_fd);
1205 	if (peak < MB(19))
1206 		goto cleanup;
1207 
1208 	ret = KSFT_PASS;
1209 
1210 cleanup:
1211 	if (mem_peak_fd != -1 && close(mem_peak_fd))
1212 		ret = KSFT_FAIL;
1213 	if (swap_peak_fd != -1 && close(swap_peak_fd))
1214 		ret = KSFT_FAIL;
1215 	cg_destroy(memcg);
1216 	free(memcg);
1217 
1218 	return ret;
1219 }
1220 
1221 /*
1222  * This test disables swapping and tries to allocate anonymous memory
1223  * up to OOM. Then it checks for oom and oom_kill events in
1224  * memory.events.
1225  */
1226 static int test_memcg_oom_events(const char *root)
1227 {
1228 	int ret = KSFT_FAIL;
1229 	char *memcg;
1230 
1231 	memcg = cg_name(root, "memcg_test");
1232 	if (!memcg)
1233 		goto cleanup;
1234 
1235 	if (cg_create(memcg))
1236 		goto cleanup;
1237 
1238 	if (cg_write(memcg, "memory.max", "30M"))
1239 		goto cleanup;
1240 
1241 	if (cg_write(memcg, "memory.swap.max", "0"))
1242 		goto cleanup;
1243 
1244 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1245 		goto cleanup;
1246 
1247 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1248 		goto cleanup;
1249 
1250 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1251 		goto cleanup;
1252 
1253 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1254 		goto cleanup;
1255 
1256 	ret = KSFT_PASS;
1257 
1258 cleanup:
1259 	cg_destroy(memcg);
1260 	free(memcg);
1261 
1262 	return ret;
1263 }
1264 
/* Arguments handed to tcp_server() through the opaque pointer of cg_run_nowait(). */
struct tcp_server_args {
	/* Port to bind, in host byte order (tcp_server() applies htons()). */
	unsigned short port;
	/*
	 * Control pipe: tcp_server() closes ctl[0] and writes its startup
	 * status (0 once listening, or an errno value) to ctl[1].
	 */
	int ctl[2];
};
1269 
1270 static int tcp_server(const char *cgroup, void *arg)
1271 {
1272 	struct tcp_server_args *srv_args = arg;
1273 	struct sockaddr_in6 saddr = { 0 };
1274 	socklen_t slen = sizeof(saddr);
1275 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1276 
1277 	close(srv_args->ctl[0]);
1278 	ctl_fd = srv_args->ctl[1];
1279 
1280 	saddr.sin6_family = AF_INET6;
1281 	saddr.sin6_addr = in6addr_any;
1282 	saddr.sin6_port = htons(srv_args->port);
1283 
1284 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1285 	if (sk < 0) {
1286 		/* Pass back errno to the ctl_fd */
1287 		write(ctl_fd, &errno, sizeof(errno));
1288 		return ret;
1289 	}
1290 
1291 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1292 		goto cleanup;
1293 
1294 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1295 		write(ctl_fd, &errno, sizeof(errno));
1296 		goto cleanup;
1297 	}
1298 
1299 	if (listen(sk, 1))
1300 		goto cleanup;
1301 
1302 	ret = 0;
1303 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1304 		ret = -1;
1305 		goto cleanup;
1306 	}
1307 
1308 	client_sk = accept(sk, NULL, NULL);
1309 	if (client_sk < 0)
1310 		goto cleanup;
1311 
1312 	ret = -1;
1313 	for (;;) {
1314 		uint8_t buf[0x100000];
1315 
1316 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1317 			if (errno == ECONNRESET)
1318 				ret = 0;
1319 			break;
1320 		}
1321 	}
1322 
1323 	close(client_sk);
1324 
1325 cleanup:
1326 	close(sk);
1327 	return ret;
1328 }
1329 
/*
 * TCP client side of the socket accounting test.
 *
 * @cgroup: cgroup whose memory.current and memory.stat are inspected.
 * @port:   port on "localhost" to connect to.
 *
 * Records memory.current before connecting, then connects and repeatedly
 * reads up to 1M from the socket. After each read it compares the growth
 * of memory.current since the baseline against the "sock " entry of
 * memory.stat using values_close() with a tolerance argument of 10.
 * The comparison is retried a bounded number of times.
 *
 * Returns KSFT_PASS once the two values are close, the non-zero
 * getaddrinfo() error code if name resolution fails, and KSFT_FAIL on
 * any other failure or when the retries are exhausted.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	if (allocated < 0)
		return KSFT_FAIL;

	/*
	 * The port is unsigned: "%hd" would print ports above 32767 as
	 * negative numbers, which do not even fit into servport.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	/*
	 * getaddrinfo() left ret at 0, which equals KSFT_PASS. Reset it so
	 * that a failing socket() or connect() is not reported as success.
	 */
	ret = KSFT_FAIL;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	if (connect(sk, ai->ai_addr, ai->ai_addrlen) < 0)
		goto close_sk;

	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1380 
1381 /*
1382  * This test checks socket memory accounting.
1383  * The test forks a TCP server listens on a random port between 1000
1384  * and 61000. Once it gets a client connection, it starts writing to
1385  * its socket.
1386  * The TCP client interleaves reads from the socket with check whether
1387  * memory.current and memory.stat.sock are similar.
1388  */
1389 static int test_memcg_sock(const char *root)
1390 {
1391 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1392 	unsigned short port;
1393 	char *memcg;
1394 	long sock_post = -1;
1395 
1396 	memcg = cg_name(root, "memcg_test");
1397 	if (!memcg)
1398 		goto cleanup;
1399 
1400 	if (cg_create(memcg))
1401 		goto cleanup;
1402 
1403 	while (bind_retries--) {
1404 		struct tcp_server_args args;
1405 
1406 		if (pipe(args.ctl))
1407 			goto cleanup;
1408 
1409 		port = args.port = 1000 + rand() % 60000;
1410 
1411 		pid = cg_run_nowait(memcg, tcp_server, &args);
1412 		if (pid < 0)
1413 			goto cleanup;
1414 
1415 		close(args.ctl[1]);
1416 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1417 			goto cleanup;
1418 		close(args.ctl[0]);
1419 
1420 		/* Skip if address family not supported by protocol */
1421 		if (err == EAFNOSUPPORT) {
1422 			ret = KSFT_SKIP;
1423 			goto cleanup;
1424 		}
1425 
1426 		if (!err)
1427 			break;
1428 		if (err != EADDRINUSE)
1429 			goto cleanup;
1430 
1431 		waitpid(pid, NULL, 0);
1432 	}
1433 
1434 	if (err == EADDRINUSE) {
1435 		ret = KSFT_SKIP;
1436 		goto cleanup;
1437 	}
1438 
1439 	if (tcp_client(memcg, port) != KSFT_PASS)
1440 		goto cleanup;
1441 
1442 	waitpid(pid, &err, 0);
1443 	if (WEXITSTATUS(err))
1444 		goto cleanup;
1445 
1446 	if (cg_read_long(memcg, "memory.current") < 0)
1447 		goto cleanup;
1448 
1449 	/*
1450 	 * memory.stat is updated asynchronously via the memcg rstat
1451 	 * flushing worker, which runs periodically (every 2 seconds,
1452 	 * see FLUSH_TIME). On a busy system, the "sock " counter may
1453 	 * stay non-zero for a short period of time after the TCP
1454 	 * connection is closed and all socket memory has been
1455 	 * uncharged.
1456 	 *
1457 	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
1458 	 * scheduling slack) and require that the "sock " counter
1459 	 * eventually drops to zero.
1460 	 */
1461 	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
1462 					 MEMCG_SOCKSTAT_WAIT_RETRIES,
1463 					 DEFAULT_WAIT_INTERVAL_US);
1464 	if (sock_post)
1465 		goto cleanup;
1466 
1467 	ret = KSFT_PASS;
1468 
1469 cleanup:
1470 	cg_destroy(memcg);
1471 	free(memcg);
1472 
1473 	return ret;
1474 }
1475 
1476 /*
1477  * This test disables swapping and tries to allocate anonymous memory
1478  * up to OOM with memory.group.oom set. Then it checks that all
1479  * processes in the leaf were killed. It also checks that oom_events
1480  * were propagated to the parent level.
1481  */
1482 static int test_memcg_oom_group_leaf_events(const char *root)
1483 {
1484 	int ret = KSFT_FAIL;
1485 	char *parent, *child;
1486 	long parent_oom_events;
1487 
1488 	parent = cg_name(root, "memcg_test_0");
1489 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1490 
1491 	if (!parent || !child)
1492 		goto cleanup;
1493 
1494 	if (cg_create(parent))
1495 		goto cleanup;
1496 
1497 	if (cg_create(child))
1498 		goto cleanup;
1499 
1500 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1501 		goto cleanup;
1502 
1503 	if (cg_write(child, "memory.max", "50M"))
1504 		goto cleanup;
1505 
1506 	if (cg_write(child, "memory.swap.max", "0"))
1507 		goto cleanup;
1508 
1509 	if (cg_write(child, "memory.oom.group", "1"))
1510 		goto cleanup;
1511 
1512 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1513 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1514 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1515 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1516 		goto cleanup;
1517 
1518 	if (cg_test_proc_killed(child))
1519 		goto cleanup;
1520 
1521 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1522 		goto cleanup;
1523 
1524 	parent_oom_events = cg_read_key_long(
1525 			parent, "memory.events", "oom_kill ");
1526 	/*
1527 	 * If memory_localevents is not enabled (the default), the parent should
1528 	 * count OOM events in its children groups. Otherwise, it should not
1529 	 * have observed any events.
1530 	 */
1531 	if (has_localevents && parent_oom_events != 0)
1532 		goto cleanup;
1533 	else if (!has_localevents && parent_oom_events <= 0)
1534 		goto cleanup;
1535 
1536 	ret = KSFT_PASS;
1537 
1538 cleanup:
1539 	if (child)
1540 		cg_destroy(child);
1541 	if (parent)
1542 		cg_destroy(parent);
1543 	free(child);
1544 	free(parent);
1545 
1546 	return ret;
1547 }
1548 
1549 /*
1550  * This test disables swapping and tries to allocate anonymous memory
1551  * up to OOM with memory.group.oom set. Then it checks that all
1552  * processes in the parent and leaf were killed.
1553  */
1554 static int test_memcg_oom_group_parent_events(const char *root)
1555 {
1556 	int ret = KSFT_FAIL;
1557 	char *parent, *child;
1558 
1559 	parent = cg_name(root, "memcg_test_0");
1560 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1561 
1562 	if (!parent || !child)
1563 		goto cleanup;
1564 
1565 	if (cg_create(parent))
1566 		goto cleanup;
1567 
1568 	if (cg_create(child))
1569 		goto cleanup;
1570 
1571 	if (cg_write(parent, "memory.max", "80M"))
1572 		goto cleanup;
1573 
1574 	if (cg_write(parent, "memory.swap.max", "0"))
1575 		goto cleanup;
1576 
1577 	if (cg_write(parent, "memory.oom.group", "1"))
1578 		goto cleanup;
1579 
1580 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1581 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1582 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1583 
1584 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1585 		goto cleanup;
1586 
1587 	if (cg_test_proc_killed(child))
1588 		goto cleanup;
1589 	if (cg_test_proc_killed(parent))
1590 		goto cleanup;
1591 
1592 	ret = KSFT_PASS;
1593 
1594 cleanup:
1595 	if (child)
1596 		cg_destroy(child);
1597 	if (parent)
1598 		cg_destroy(parent);
1599 	free(child);
1600 	free(parent);
1601 
1602 	return ret;
1603 }
1604 
1605 /*
1606  * This test disables swapping and tries to allocate anonymous memory
1607  * up to OOM with memory.group.oom set. Then it checks that all
1608  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1609  */
1610 static int test_memcg_oom_group_score_events(const char *root)
1611 {
1612 	int ret = KSFT_FAIL;
1613 	char *memcg;
1614 	int safe_pid;
1615 
1616 	memcg = cg_name(root, "memcg_test_0");
1617 
1618 	if (!memcg)
1619 		goto cleanup;
1620 
1621 	if (cg_create(memcg))
1622 		goto cleanup;
1623 
1624 	if (cg_write(memcg, "memory.max", "50M"))
1625 		goto cleanup;
1626 
1627 	if (cg_write(memcg, "memory.swap.max", "0"))
1628 		goto cleanup;
1629 
1630 	if (cg_write(memcg, "memory.oom.group", "1"))
1631 		goto cleanup;
1632 
1633 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1634 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1635 		goto cleanup;
1636 
1637 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1638 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1639 		goto cleanup;
1640 
1641 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1642 		goto cleanup;
1643 
1644 	if (kill(safe_pid, SIGKILL))
1645 		goto cleanup;
1646 
1647 	ret = KSFT_PASS;
1648 
1649 cleanup:
1650 	if (memcg)
1651 		cg_destroy(memcg);
1652 	free(memcg);
1653 
1654 	return ret;
1655 }
1656 
/*
 * Reads a single struct inotify_event from @inotify_fd and checks it
 * against the expected mask and watch descriptor.
 *
 * Returns 0 when a full event was read and both fields match. Returns -1
 * when fewer than sizeof(struct inotify_event) bytes could be read, or
 * when the event differs (in which case a diagnostic goes to stderr).
 */
static int read_event(int inotify_fd, int expected_event, int expected_wd)
{
	struct inotify_event ev;
	ssize_t nread = read(inotify_fd, &ev, sizeof(ev));

	/* Error, EOF or a truncated event. */
	if (nread < (ssize_t)sizeof(ev))
		return -1;

	if (ev.mask == expected_event && ev.wd == expected_wd)
		return 0;

	fprintf(stderr,
		"event does not match expected values: mask %d (expected %d) wd %d (expected %d)\n",
		ev.mask, expected_event, ev.wd, expected_wd);
	return -1;
}
1675 
1676 static int test_memcg_inotify_delete_file(const char *root)
1677 {
1678 	int ret = KSFT_FAIL;
1679 	char *memcg = NULL;
1680 	int fd, wd;
1681 
1682 	memcg = cg_name(root, "memcg_test_0");
1683 
1684 	if (!memcg)
1685 		goto cleanup;
1686 
1687 	if (cg_create(memcg))
1688 		goto cleanup;
1689 
1690 	fd = inotify_init1(0);
1691 	if (fd == -1)
1692 		goto cleanup;
1693 
1694 	wd = inotify_add_watch(fd, cg_control(memcg, "memory.events"), IN_DELETE_SELF);
1695 	if (wd == -1)
1696 		goto cleanup;
1697 
1698 	if (cg_destroy(memcg))
1699 		goto cleanup;
1700 	free(memcg);
1701 	memcg = NULL;
1702 
1703 	if (read_event(fd, IN_DELETE_SELF, wd))
1704 		goto cleanup;
1705 
1706 	if (read_event(fd, IN_IGNORED, wd))
1707 		goto cleanup;
1708 
1709 	ret = KSFT_PASS;
1710 
1711 cleanup:
1712 	if (fd >= 0)
1713 		close(fd);
1714 	if (memcg)
1715 		cg_destroy(memcg);
1716 	free(memcg);
1717 
1718 	return ret;
1719 }
1720 
1721 static int test_memcg_inotify_delete_dir(const char *root)
1722 {
1723 	int ret = KSFT_FAIL;
1724 	char *memcg = NULL;
1725 	int fd, wd;
1726 
1727 	memcg = cg_name(root, "memcg_test_0");
1728 
1729 	if (!memcg)
1730 		goto cleanup;
1731 
1732 	if (cg_create(memcg))
1733 		goto cleanup;
1734 
1735 	fd = inotify_init1(0);
1736 	if (fd == -1)
1737 		goto cleanup;
1738 
1739 	wd = inotify_add_watch(fd, memcg, IN_DELETE_SELF);
1740 	if (wd == -1)
1741 		goto cleanup;
1742 
1743 	if (cg_destroy(memcg))
1744 		goto cleanup;
1745 	free(memcg);
1746 	memcg = NULL;
1747 
1748 	if (read_event(fd, IN_DELETE_SELF, wd))
1749 		goto cleanup;
1750 
1751 	if (read_event(fd, IN_IGNORED, wd))
1752 		goto cleanup;
1753 
1754 	ret = KSFT_PASS;
1755 
1756 cleanup:
1757 	if (fd >= 0)
1758 		close(fd);
1759 	if (memcg)
1760 		cg_destroy(memcg);
1761 	free(memcg);
1762 
1763 	return ret;
1764 }
1765 
1766 #define T(x) { x, #x }
1767 struct memcg_test {
1768 	int (*fn)(const char *root);
1769 	const char *name;
1770 } tests[] = {
1771 	T(test_memcg_subtree_control),
1772 	T(test_memcg_current_peak),
1773 	T(test_memcg_min),
1774 	T(test_memcg_low),
1775 	T(test_memcg_high),
1776 	T(test_memcg_high_sync),
1777 	T(test_memcg_max),
1778 	T(test_memcg_reclaim),
1779 	T(test_memcg_oom_events),
1780 	T(test_memcg_swap_max_peak),
1781 	T(test_memcg_sock),
1782 	T(test_memcg_oom_group_leaf_events),
1783 	T(test_memcg_oom_group_parent_events),
1784 	T(test_memcg_oom_group_score_events),
1785 	T(test_memcg_inotify_delete_file),
1786 	T(test_memcg_inotify_delete_dir),
1787 };
1788 #undef T
1789 
1790 int main(int argc, char **argv)
1791 {
1792 	char root[PATH_MAX];
1793 	int i, proc_status;
1794 
1795 	ksft_print_header();
1796 	ksft_set_plan(ARRAY_SIZE(tests));
1797 	if (cg_find_unified_root(root, sizeof(root), NULL))
1798 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1799 
1800 	/*
1801 	 * Check that memory controller is available:
1802 	 * memory is listed in cgroup.controllers
1803 	 */
1804 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1805 		ksft_exit_skip("memory controller isn't available\n");
1806 
1807 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1808 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1809 			ksft_exit_skip("Failed to set memory controller\n");
1810 
1811 	proc_status = proc_mount_contains("memory_recursiveprot");
1812 	if (proc_status < 0)
1813 		ksft_exit_skip("Failed to query cgroup mount option\n");
1814 	has_recursiveprot = proc_status;
1815 
1816 	proc_status = proc_mount_contains("memory_localevents");
1817 	if (proc_status < 0)
1818 		ksft_exit_skip("Failed to query cgroup mount option\n");
1819 	has_localevents = proc_status;
1820 
1821 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1822 		switch (tests[i].fn(root)) {
1823 		case KSFT_PASS:
1824 			ksft_test_result_pass("%s\n", tests[i].name);
1825 			break;
1826 		case KSFT_SKIP:
1827 			ksft_test_result_skip("%s\n", tests[i].name);
1828 			break;
1829 		default:
1830 			ksft_test_result_fail("%s\n", tests[i].name);
1831 			break;
1832 		}
1833 	}
1834 
1835 	ksft_finished();
1836 }
1837