xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/inotify.h>
14 #include <sys/socket.h>
15 #include <sys/wait.h>
16 #include <arpa/inet.h>
17 #include <netinet/in.h>
18 #include <netdb.h>
19 #include <errno.h>
20 #include <sys/mman.h>
21 
22 #include "kselftest.h"
23 #include "cgroup_util.h"
24 
/* Retry budget for socket-stat polling; not used in this part of the file. */
#define MEMCG_SOCKSTAT_WAIT_RETRIES        30

/* NOTE(review): presumably set from the cgroup2 mount options - confirm. */
static bool has_localevents;
/* When true, test_memcg_protection() ignores the low-event count of child 2. */
static bool has_recursiveprot;
/* Stride, in bytes, used by alloc_and_populate_anon() to touch memory. */
static int page_size;
30 
31 int get_temp_fd(void)
32 {
33 	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
34 }
35 
36 int alloc_pagecache(int fd, size_t size)
37 {
38 	char buf[BUF_SIZE];
39 	struct stat st;
40 	int i;
41 
42 	if (fstat(fd, &st))
43 		goto cleanup;
44 
45 	size += st.st_size;
46 
47 	if (ftruncate(fd, size))
48 		goto cleanup;
49 
50 	for (i = 0; i < size; i += sizeof(buf))
51 		read(fd, buf, sizeof(buf));
52 
53 	return 0;
54 
55 cleanup:
56 	return -1;
57 }
58 
59 static char *alloc_and_populate_anon(size_t size)
60 {
61 	char *buf, *ptr;
62 
63 	buf = malloc(size);
64 	if (buf == NULL) {
65 		fprintf(stderr, "malloc() failed\n");
66 		return NULL;
67 	}
68 
69 	for (ptr = buf; ptr < buf + size; ptr += page_size)
70 		*ptr = 0;
71 
72 	return buf;
73 }
74 
/*
 * Allocate and touch an anonymous buffer, then free it again.
 *
 * @cgroup is unused. @arg carries the allocation size in bytes, cast to a
 * pointer. Returns 0 on success, -1 if the allocation fails.
 */
int alloc_anon(const char *cgroup, void *arg)
{
	char *mem = alloc_and_populate_anon((unsigned long)arg);

	if (mem == NULL)
		return -1;

	free(mem);
	return 0;
}
87 
88 int is_swap_enabled(void)
89 {
90 	char buf[BUF_SIZE];
91 	const char delim[] = "\n";
92 	int cnt = 0;
93 	char *line;
94 
95 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
96 		return -1;
97 
98 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
99 		cnt++;
100 
101 	return cnt > 1;
102 }
103 
/*
 * Write @score to /proc/@pid/oom_score_adj.
 *
 * Returns 0 on success, or the negative value returned by open() or
 * dprintf() on failure.
 */
int set_oom_adj_score(int pid, int score)
{
	char proc_path[PATH_MAX];
	int fd, written;

	sprintf(proc_path, "/proc/%d/oom_score_adj", pid);

	fd = open(proc_path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
124 
125 /*
126  * This test creates two nested cgroups with and without enabling
127  * the memory controller.
128  */
129 static int test_memcg_subtree_control(const char *root)
130 {
131 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
132 	int ret = KSFT_FAIL;
133 	char buf[BUF_SIZE];
134 
135 	/* Create two nested cgroups with the memory controller enabled */
136 	parent = cg_name(root, "memcg_test_0");
137 	child = cg_name(root, "memcg_test_0/memcg_test_1");
138 	if (!parent || !child)
139 		goto cleanup_free;
140 
141 	if (cg_create(parent))
142 		goto cleanup_free;
143 
144 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
145 		goto cleanup_parent;
146 
147 	if (cg_create(child))
148 		goto cleanup_parent;
149 
150 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
151 		goto cleanup_child;
152 
153 	/* Create two nested cgroups without enabling memory controller */
154 	parent2 = cg_name(root, "memcg_test_1");
155 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
156 	if (!parent2 || !child2)
157 		goto cleanup_free2;
158 
159 	if (cg_create(parent2))
160 		goto cleanup_free2;
161 
162 	if (cg_create(child2))
163 		goto cleanup_parent2;
164 
165 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
166 		goto cleanup_all;
167 
168 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
169 		goto cleanup_all;
170 
171 	ret = KSFT_PASS;
172 
173 cleanup_all:
174 	cg_destroy(child2);
175 cleanup_parent2:
176 	cg_destroy(parent2);
177 cleanup_free2:
178 	free(parent2);
179 	free(child2);
180 cleanup_child:
181 	cg_destroy(child);
182 cleanup_parent:
183 	cg_destroy(parent);
184 cleanup_free:
185 	free(parent);
186 	free(child);
187 
188 	return ret;
189 }
190 
/*
 * Allocate and touch 50M of anonymous memory, then verify that the
 * cgroup's accounting reflects it.
 *
 * Checks that memory.current is at least 50M and within 3% of it, and that
 * the "anon" entry of memory.stat is within 3% of memory.current.
 *
 * @arg is unused. Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	/*
	 * Kept signed so that a negative error value from cg_read_long()
	 * fails the lower-bound check instead of being converted to a huge
	 * unsigned number.
	 */
	const long size = MB(50);
	char *buf;
	long anon, current;
	int ret = -1;

	buf = alloc_and_populate_anon(size);
	if (!buf)
		return -1;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}
221 
/*
 * Populate 50M of page cache through a temporary file, then verify that
 * the cgroup's accounting reflects it.
 *
 * Checks that memory.current is at least 50M and that the "file" entry of
 * memory.stat is within 10% of memory.current.
 *
 * @arg is unused. Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	/*
	 * Kept signed so that a negative error value from cg_read_long()
	 * fails the lower-bound check instead of being converted to a huge
	 * unsigned number.
	 */
	const long size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
253 
/*
 * This test creates a memory cgroup, allocates
 * some anonymous memory and some pagecache
 * and checks memory.current, memory.peak, and some memory.stat values.
 *
 * It also opens several file descriptors on memory.peak, writes a reset
 * string to each, and closes them in a different order than they were
 * written, checking that the values read through the descriptors and
 * through a fresh open of the file stay consistent.
 *
 * Returns KSFT_PASS on success, KSFT_SKIP if memory.peak does not exist or
 * is not writable by its owner, and KSFT_FAIL otherwise.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	/* Track which descriptors were already closed to avoid a double close. */
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A freshly created cgroup must report no usage and no peak. */
	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
	 * We need at least three to be closed in a different order than writes occurred to test
	 * the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place. (by
	 * checking the writable bit on the file's st_mode)
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets, but make it clear */
	static const char reset_string[] = "reset\n";

	/* sizeof() includes the terminating NUL, so that byte is written too. */
	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/*
	 * Make sure a completely independent read isn't affected by our
	 * FD-local reset above.
	 */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Close the second descriptor first, out of write order. */
	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* The value seen through the reset descriptor must be low but valid. */
	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* Descriptors that were never opened are still -1 here. */
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
402 
/*
 * Populate 50M of page cache through the file descriptor passed in @arg,
 * then keep the process alive until its parent changes.
 *
 * @cgroup is unused. Returns -1 if the page cache could not be populated,
 * and 0 once the original parent is gone.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int parent_pid = getppid();
	int fd = (long)arg;

	if (alloc_pagecache(fd, MB(50)) != 0)
		return -1;

	/* Poll once per second until we get reparented. */
	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
416 
/*
 * Allocate and touch the number of anonymous bytes passed in @arg, then
 * keep the process (and the memory) alive until its parent changes.
 *
 * @cgroup is unused. Returns -1 if the allocation fails, and 0 once the
 * original parent is gone and the buffer has been freed.
 */
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int parent_pid = getppid();
	char *mem;

	mem = alloc_and_populate_anon((unsigned long)arg);
	if (mem == NULL)
		return -1;

	/* Poll once per second until we get reparented. */
	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	free(mem);
	return 0;
}
433 
/*
 * Poll @cgroup's cgroup.procs until it reads as empty, which is how the
 * test notices that the OOM killer has asynchronously removed every
 * process. Ten polls are made, 100ms apart.
 *
 * Returns 0 as soon as the cgroup is empty, -1 if it never becomes empty.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int tries_left = 10;

	while (tries_left-- > 0) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}

	return -1;
}
450 
451 static bool reclaim_until(const char *memcg, long goal);
452 
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *				 undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 *
 * In the code below, A is parent[0], A/B is parent[1], A/G is parent[2]
 * and C..F are children[0..3].
 *
 * @min selects memory.min (true) or memory.low (false) as the attribute
 * under test. Returns KSFT_PASS, KSFT_FAIL, or KSFT_SKIP (only when @min is
 * set and reading the attribute from A yields a non-zero value).
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	/* One temporary file is shared by all pagecache-allocating children. */
	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	/* Any non-zero result (including a read error) ends the test here. */
	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* The last child stays empty; only the first three get a process. */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/*
	 * Give the children a few seconds to reach ~150M in total. Running
	 * out of attempts is not fatal by itself; the checks below decide.
	 */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create memory pressure in A by allocating anon memory in A/G. */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	/*
	 * Over-allocate in A/G: with memory.min the allocation is expected
	 * to fail, with memory.low it is expected to succeed.
	 */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	/* The event checks below apply to memory.low only. */
	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		/* Children up to this index must have low events; later ones none. */
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* Tear down in reverse creation order: children first, then parents. */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
663 
/* Run the protection scenario against memory.min (hard protection). */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
668 
/* Run the protection scenario against memory.low (soft protection). */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
673 
/*
 * Try to populate 50M of page cache in a cgroup whose memory.high or
 * memory.max is 30M, and check that memory.current ends up within 5% of
 * 30M.
 *
 * @arg is unused. Returns 0 if the usage settles near 30M; -1 if neither
 * limit is 30M, the temporary file cannot be created, the page cache cannot
 * be populated, or the usage is off.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	long high_limit, max_limit, usage;
	int rc = -1;
	int fd;

	high_limit = cg_read_long(cgroup, "memory.high");
	max_limit = cg_read_long(cgroup, "memory.max");
	if (!(high_limit == MB(30) || max_limit == MB(30)))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, MB(50)) == 0) {
		usage = cg_read_long(cgroup, "memory.current");
		if (values_close(usage, MB(30), 5))
			rc = 0;
	}

	close(fd);
	return rc;
}
704 
705 /*
706  * This test checks that memory.high limits the amount of
707  * memory which can be consumed by either anonymous memory
708  * or pagecache.
709  */
710 static int test_memcg_high(const char *root)
711 {
712 	int ret = KSFT_FAIL;
713 	char *memcg;
714 	long high;
715 
716 	memcg = cg_name(root, "memcg_test");
717 	if (!memcg)
718 		goto cleanup;
719 
720 	if (cg_create(memcg))
721 		goto cleanup;
722 
723 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
724 		goto cleanup;
725 
726 	if (cg_write(memcg, "memory.swap.max", "0"))
727 		goto cleanup;
728 
729 	if (cg_write(memcg, "memory.high", "30M"))
730 		goto cleanup;
731 
732 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
733 		goto cleanup;
734 
735 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
736 		goto cleanup;
737 
738 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
739 		goto cleanup;
740 
741 	high = cg_read_key_long(memcg, "memory.events", "high ");
742 	if (high <= 0)
743 		goto cleanup;
744 
745 	ret = KSFT_PASS;
746 
747 cleanup:
748 	cg_destroy(memcg);
749 	free(memcg);
750 
751 	return ret;
752 }
753 
754 static int alloc_anon_mlock(const char *cgroup, void *arg)
755 {
756 	size_t size = (size_t)arg;
757 	void *buf;
758 
759 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
760 		   0, 0);
761 	if (buf == MAP_FAILED)
762 		return -1;
763 
764 	mlock(buf, size);
765 	munmap(buf, size);
766 	return 0;
767 }
768 
769 /*
770  * This test checks that memory.high is able to throttle big single shot
771  * allocation i.e. large allocation within one kernel entry.
772  */
773 static int test_memcg_high_sync(const char *root)
774 {
775 	int ret = KSFT_FAIL, pid, fd = -1;
776 	char *memcg;
777 	long pre_high, pre_max;
778 	long post_high, post_max;
779 
780 	memcg = cg_name(root, "memcg_test");
781 	if (!memcg)
782 		goto cleanup;
783 
784 	if (cg_create(memcg))
785 		goto cleanup;
786 
787 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
788 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
789 	if (pre_high < 0 || pre_max < 0)
790 		goto cleanup;
791 
792 	if (cg_write(memcg, "memory.swap.max", "0"))
793 		goto cleanup;
794 
795 	if (cg_write(memcg, "memory.high", "30M"))
796 		goto cleanup;
797 
798 	if (cg_write(memcg, "memory.max", "140M"))
799 		goto cleanup;
800 
801 	fd = memcg_prepare_for_wait(memcg);
802 	if (fd < 0)
803 		goto cleanup;
804 
805 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
806 	if (pid < 0)
807 		goto cleanup;
808 
809 	cg_wait_for(fd);
810 
811 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
812 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
813 	if (post_high < 0 || post_max < 0)
814 		goto cleanup;
815 
816 	if (pre_high == post_high || pre_max != post_max)
817 		goto cleanup;
818 
819 	ret = KSFT_PASS;
820 
821 cleanup:
822 	if (fd >= 0)
823 		close(fd);
824 	cg_destroy(memcg);
825 	free(memcg);
826 
827 	return ret;
828 }
829 
830 /*
831  * This test checks that memory.max limits the amount of
832  * memory which can be consumed by either anonymous memory
833  * or pagecache.
834  */
835 static int test_memcg_max(const char *root)
836 {
837 	int ret = KSFT_FAIL;
838 	char *memcg;
839 	long current, max;
840 
841 	memcg = cg_name(root, "memcg_test");
842 	if (!memcg)
843 		goto cleanup;
844 
845 	if (cg_create(memcg))
846 		goto cleanup;
847 
848 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
849 		goto cleanup;
850 
851 	if (cg_write(memcg, "memory.swap.max", "0"))
852 		goto cleanup;
853 
854 	if (cg_write(memcg, "memory.max", "30M"))
855 		goto cleanup;
856 
857 	/* Should be killed by OOM killer */
858 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
859 		goto cleanup;
860 
861 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
862 		goto cleanup;
863 
864 	current = cg_read_long(memcg, "memory.current");
865 	if (current > MB(30) || !current)
866 		goto cleanup;
867 
868 	max = cg_read_key_long(memcg, "memory.events", "max ");
869 	if (max <= 0)
870 		goto cleanup;
871 
872 	ret = KSFT_PASS;
873 
874 cleanup:
875 	cg_destroy(memcg);
876 	free(memcg);
877 
878 	return ret;
879 }
880 
/*
 * Write to @memcg's memory.reclaim until memory.current drops to @goal
 * (below it, or within 3% of it). At most five rounds are made.
 *
 * Returns true only if at least one write to memory.reclaim succeeded and
 * the usage did not remain above the goal after a successful write.
 * Returns false if:
 *  - the usage was already at or below the goal on the first read,
 *  - a write failed with anything other than -EAGAIN,
 *  - a write reported success but the usage is still above the goal,
 *  - every round ended in -EAGAIN.
 *
 * The function assumes nothing else changes memory.current concurrently.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool wrote_ok = false;
	int attempts_left = 5;
	char amount[64];

	while (attempts_left-- > 0) {
		long usage = cg_read_long(memcg, "memory.current");
		int rc;

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* A previous write claimed success but did not get us there. */
		if (wrote_ok)
			return false;

		snprintf(amount, sizeof(amount), "%ld", usage - goal);
		rc = cg_write(memcg, "memory.reclaim", amount);
		if (rc == 0)
			wrote_ok = true;
		else if (rc != -EAGAIN)
			return false;
	}

	return wrote_ok;
}
922 
923 /*
924  * This test checks that memory.reclaim reclaims the given
925  * amount of memory (from both anon and file, if possible).
926  */
927 static int test_memcg_reclaim(const char *root)
928 {
929 	int ret = KSFT_FAIL;
930 	int fd = -1;
931 	int retries;
932 	char *memcg;
933 	long current, expected_usage;
934 
935 	memcg = cg_name(root, "memcg_test");
936 	if (!memcg)
937 		goto cleanup;
938 
939 	if (cg_create(memcg))
940 		goto cleanup;
941 
942 	current = cg_read_long(memcg, "memory.current");
943 	if (current != 0)
944 		goto cleanup;
945 
946 	fd = get_temp_fd();
947 	if (fd < 0)
948 		goto cleanup;
949 
950 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
951 
952 	/*
953 	 * If swap is enabled, try to reclaim from both anon and file, else try
954 	 * to reclaim from file only.
955 	 */
956 	if (is_swap_enabled()) {
957 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
958 		expected_usage = MB(100);
959 	} else
960 		expected_usage = MB(50);
961 
962 	/*
963 	 * Wait until current usage reaches the expected usage (or we run out of
964 	 * retries).
965 	 */
966 	retries = 5;
967 	while (!values_close(cg_read_long(memcg, "memory.current"),
968 			    expected_usage, 10)) {
969 		if (retries--) {
970 			sleep(1);
971 			continue;
972 		} else {
973 			fprintf(stderr,
974 				"failed to allocate %ld for memcg reclaim test\n",
975 				expected_usage);
976 			goto cleanup;
977 		}
978 	}
979 
980 	/*
981 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
982 	 * and file if swap is enabled.
983 	 */
984 	if (!reclaim_until(memcg, MB(30)))
985 		goto cleanup;
986 
987 	ret = KSFT_PASS;
988 cleanup:
989 	cg_destroy(memcg);
990 	free(memcg);
991 	close(fd);
992 
993 	return ret;
994 }
995 
/*
 * Allocate and touch 50M of anonymous memory in a cgroup whose memory
 * limit (passed in @arg, in bytes) is smaller, and check how it is split
 * between memory and swap.
 *
 * Passes when memory.current is non-zero and within 3% of the limit, and
 * memory.swap.current is non-zero with memory plus swap within 3% of 50M.
 *
 * Returns 0 if the checks pass, -1 otherwise.
 */
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long limit = (long)arg;
	long mem_usage, swap_usage;
	int rc = -1;
	char *mem;

	mem = alloc_and_populate_anon(size);
	if (mem == NULL)
		return -1;

	mem_usage = cg_read_long(cgroup, "memory.current");
	if (mem_usage && values_close(mem_usage, limit, 3)) {
		swap_usage = cg_read_long(cgroup, "memory.swap.current");
		if (swap_usage &&
		    values_close(mem_usage + swap_usage, size, 3))
			rc = 0;
	}

	free(mem);
	return rc;
}
1022 
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 *
 * Two long-lived descriptors (one on memory.swap.peak, one on memory.peak)
 * are reset by writing to them, and the values read through them are
 * compared with values read through a fresh open of the same files.
 *
 * Returns KSFT_PASS on success, KSFT_FAIL on failure, and KSFT_SKIP when
 * is_swap_enabled() returns 0, memory.swap.current reads as non-zero,
 * memory.swap.peak does not exist, or it is not writable by its owner.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	/* NOTE(review): a -1 error return is non-zero and does not skip here. */
	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Any non-zero result (including a read error) skips the test. */
	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place. (by checking the writable bit on the file's st_mode)
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	/* Nothing has run in the cgroup yet, so every peak must read as zero. */
	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode */
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	/* Both limits must still be at their default before we set them. */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/* All four views of the peaks must now show roughly the 30M limits. */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * open, reset and close the peak swap on another FD to make sure
	 * multiple extant fds don't corrupt the linked-list
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	/* A fresh open of the files must still show the earlier peaks. */
	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/* 50M allocated against a 30M memory limit leaves about 20M in swap. */
	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* A failing close() turns the result into a failure as well. */
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1221 
1222 /*
1223  * This test disables swapping and tries to allocate anonymous memory
1224  * up to OOM. Then it checks for oom and oom_kill events in
1225  * memory.events.
1226  */
1227 static int test_memcg_oom_events(const char *root)
1228 {
1229 	int ret = KSFT_FAIL;
1230 	char *memcg;
1231 
1232 	memcg = cg_name(root, "memcg_test");
1233 	if (!memcg)
1234 		goto cleanup;
1235 
1236 	if (cg_create(memcg))
1237 		goto cleanup;
1238 
1239 	if (cg_write(memcg, "memory.max", "30M"))
1240 		goto cleanup;
1241 
1242 	if (cg_write(memcg, "memory.swap.max", "0"))
1243 		goto cleanup;
1244 
1245 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1246 		goto cleanup;
1247 
1248 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1249 		goto cleanup;
1250 
1251 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1252 		goto cleanup;
1253 
1254 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1255 		goto cleanup;
1256 
1257 	ret = KSFT_PASS;
1258 
1259 cleanup:
1260 	cg_destroy(memcg);
1261 	free(memcg);
1262 
1263 	return ret;
1264 }
1265 
/* Arguments passed from test_memcg_sock() to tcp_server(). */
struct tcp_server_args {
	/* port to bind, in host byte order (tcp_server() applies htons()) */
	unsigned short port;
	/*
	 * pipe created by the test: tcp_server() closes ctl[0] and reports
	 * its setup status by writing an int to ctl[1]
	 */
	int ctl[2];
};
1270 
1271 static int tcp_server(const char *cgroup, void *arg)
1272 {
1273 	struct tcp_server_args *srv_args = arg;
1274 	struct sockaddr_in6 saddr = { 0 };
1275 	socklen_t slen = sizeof(saddr);
1276 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1277 
1278 	close(srv_args->ctl[0]);
1279 	ctl_fd = srv_args->ctl[1];
1280 
1281 	saddr.sin6_family = AF_INET6;
1282 	saddr.sin6_addr = in6addr_any;
1283 	saddr.sin6_port = htons(srv_args->port);
1284 
1285 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1286 	if (sk < 0) {
1287 		/* Pass back errno to the ctl_fd */
1288 		write(ctl_fd, &errno, sizeof(errno));
1289 		return ret;
1290 	}
1291 
1292 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1293 		goto cleanup;
1294 
1295 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1296 		write(ctl_fd, &errno, sizeof(errno));
1297 		goto cleanup;
1298 	}
1299 
1300 	if (listen(sk, 1))
1301 		goto cleanup;
1302 
1303 	ret = 0;
1304 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1305 		ret = -1;
1306 		goto cleanup;
1307 	}
1308 
1309 	client_sk = accept(sk, NULL, NULL);
1310 	if (client_sk < 0)
1311 		goto cleanup;
1312 
1313 	ret = -1;
1314 	for (;;) {
1315 		uint8_t buf[0x100000];
1316 
1317 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1318 			if (errno == ECONNRESET)
1319 				ret = 0;
1320 			break;
1321 		}
1322 	}
1323 
1324 	close(client_sk);
1325 
1326 cleanup:
1327 	close(sk);
1328 	return ret;
1329 }
1330 
/*
 * Client half of the socket accounting test.
 *
 * @cgroup: cgroup whose memory.current and memory.stat "sock " are sampled
 * @port:   TCP port on "localhost" to connect to
 *
 * Records memory.current up front, connects to the server and then reads
 * from the socket up to 16 times. After each read, the growth of
 * memory.current since the start is compared with the "sock " counter of
 * memory.stat using values_close() with a tolerance argument of 10.
 *
 * Returns KSFT_PASS on the first successful comparison, the non-zero
 * getaddrinfo() result if name resolution fails, and KSFT_FAIL in every
 * other case (socket/connect/read failure, failed cgroup query, or retries
 * exhausted).
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];	/* "65535" plus the terminating NUL */
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");

	/*
	 * port is unsigned: "%hd" would print values above 32767 as negative
	 * numbers, which then get truncated in servport and resolve to the
	 * wrong port.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	/*
	 * ret is 0 after a successful getaddrinfo(). Reset it so that a
	 * socket() or connect() failure cannot be returned as a zero
	 * (passing) result.
	 */
	ret = KSFT_FAIL;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	if (connect(sk, ai->ai_addr, ai->ai_addrlen) < 0)
		goto close_sk;

	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1381 
1382 /*
1383  * This test checks socket memory accounting.
1384  * The test forks a TCP server listens on a random port between 1000
1385  * and 61000. Once it gets a client connection, it starts writing to
1386  * its socket.
1387  * The TCP client interleaves reads from the socket with check whether
1388  * memory.current and memory.stat.sock are similar.
1389  */
1390 static int test_memcg_sock(const char *root)
1391 {
1392 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1393 	unsigned short port;
1394 	char *memcg;
1395 	long sock_post = -1;
1396 
1397 	memcg = cg_name(root, "memcg_test");
1398 	if (!memcg)
1399 		goto cleanup;
1400 
1401 	if (cg_create(memcg))
1402 		goto cleanup;
1403 
1404 	while (bind_retries--) {
1405 		struct tcp_server_args args;
1406 
1407 		if (pipe(args.ctl))
1408 			goto cleanup;
1409 
1410 		port = args.port = 1000 + rand() % 60000;
1411 
1412 		pid = cg_run_nowait(memcg, tcp_server, &args);
1413 		if (pid < 0)
1414 			goto cleanup;
1415 
1416 		close(args.ctl[1]);
1417 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1418 			goto cleanup;
1419 		close(args.ctl[0]);
1420 
1421 		/* Skip if address family not supported by protocol */
1422 		if (err == EAFNOSUPPORT) {
1423 			ret = KSFT_SKIP;
1424 			goto cleanup;
1425 		}
1426 
1427 		if (!err)
1428 			break;
1429 		if (err != EADDRINUSE)
1430 			goto cleanup;
1431 
1432 		waitpid(pid, NULL, 0);
1433 	}
1434 
1435 	if (err == EADDRINUSE) {
1436 		ret = KSFT_SKIP;
1437 		goto cleanup;
1438 	}
1439 
1440 	if (tcp_client(memcg, port) != KSFT_PASS)
1441 		goto cleanup;
1442 
1443 	waitpid(pid, &err, 0);
1444 	if (WEXITSTATUS(err))
1445 		goto cleanup;
1446 
1447 	if (cg_read_long(memcg, "memory.current") < 0)
1448 		goto cleanup;
1449 
1450 	/*
1451 	 * memory.stat is updated asynchronously via the memcg rstat
1452 	 * flushing worker, which runs periodically (every 2 seconds,
1453 	 * see FLUSH_TIME). On a busy system, the "sock " counter may
1454 	 * stay non-zero for a short period of time after the TCP
1455 	 * connection is closed and all socket memory has been
1456 	 * uncharged.
1457 	 *
1458 	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
1459 	 * scheduling slack) and require that the "sock " counter
1460 	 * eventually drops to zero.
1461 	 */
1462 	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
1463 					 MEMCG_SOCKSTAT_WAIT_RETRIES,
1464 					 DEFAULT_WAIT_INTERVAL_US);
1465 	if (sock_post)
1466 		goto cleanup;
1467 
1468 	ret = KSFT_PASS;
1469 
1470 cleanup:
1471 	cg_destroy(memcg);
1472 	free(memcg);
1473 
1474 	return ret;
1475 }
1476 
1477 /*
1478  * This test disables swapping and tries to allocate anonymous memory
1479  * up to OOM with memory.group.oom set. Then it checks that all
1480  * processes in the leaf were killed. It also checks that oom_events
1481  * were propagated to the parent level.
1482  */
1483 static int test_memcg_oom_group_leaf_events(const char *root)
1484 {
1485 	int ret = KSFT_FAIL;
1486 	char *parent, *child;
1487 	long parent_oom_events;
1488 
1489 	parent = cg_name(root, "memcg_test_0");
1490 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1491 
1492 	if (!parent || !child)
1493 		goto cleanup;
1494 
1495 	if (cg_create(parent))
1496 		goto cleanup;
1497 
1498 	if (cg_create(child))
1499 		goto cleanup;
1500 
1501 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1502 		goto cleanup;
1503 
1504 	if (cg_write(child, "memory.max", "50M"))
1505 		goto cleanup;
1506 
1507 	if (cg_write(child, "memory.swap.max", "0"))
1508 		goto cleanup;
1509 
1510 	if (cg_write(child, "memory.oom.group", "1"))
1511 		goto cleanup;
1512 
1513 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1514 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1515 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1516 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1517 		goto cleanup;
1518 
1519 	if (cg_test_proc_killed(child))
1520 		goto cleanup;
1521 
1522 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1523 		goto cleanup;
1524 
1525 	parent_oom_events = cg_read_key_long(
1526 			parent, "memory.events", "oom_kill ");
1527 	/*
1528 	 * If memory_localevents is not enabled (the default), the parent should
1529 	 * count OOM events in its children groups. Otherwise, it should not
1530 	 * have observed any events.
1531 	 */
1532 	if (has_localevents && parent_oom_events != 0)
1533 		goto cleanup;
1534 	else if (!has_localevents && parent_oom_events <= 0)
1535 		goto cleanup;
1536 
1537 	ret = KSFT_PASS;
1538 
1539 cleanup:
1540 	if (child)
1541 		cg_destroy(child);
1542 	if (parent)
1543 		cg_destroy(parent);
1544 	free(child);
1545 	free(parent);
1546 
1547 	return ret;
1548 }
1549 
1550 /*
1551  * This test disables swapping and tries to allocate anonymous memory
1552  * up to OOM with memory.group.oom set. Then it checks that all
1553  * processes in the parent and leaf were killed.
1554  */
1555 static int test_memcg_oom_group_parent_events(const char *root)
1556 {
1557 	int ret = KSFT_FAIL;
1558 	char *parent, *child;
1559 
1560 	parent = cg_name(root, "memcg_test_0");
1561 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1562 
1563 	if (!parent || !child)
1564 		goto cleanup;
1565 
1566 	if (cg_create(parent))
1567 		goto cleanup;
1568 
1569 	if (cg_create(child))
1570 		goto cleanup;
1571 
1572 	if (cg_write(parent, "memory.max", "80M"))
1573 		goto cleanup;
1574 
1575 	if (cg_write(parent, "memory.swap.max", "0"))
1576 		goto cleanup;
1577 
1578 	if (cg_write(parent, "memory.oom.group", "1"))
1579 		goto cleanup;
1580 
1581 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1582 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1583 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1584 
1585 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1586 		goto cleanup;
1587 
1588 	if (cg_test_proc_killed(child))
1589 		goto cleanup;
1590 	if (cg_test_proc_killed(parent))
1591 		goto cleanup;
1592 
1593 	ret = KSFT_PASS;
1594 
1595 cleanup:
1596 	if (child)
1597 		cg_destroy(child);
1598 	if (parent)
1599 		cg_destroy(parent);
1600 	free(child);
1601 	free(parent);
1602 
1603 	return ret;
1604 }
1605 
1606 /*
1607  * This test disables swapping and tries to allocate anonymous memory
1608  * up to OOM with memory.group.oom set. Then it checks that all
1609  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1610  */
1611 static int test_memcg_oom_group_score_events(const char *root)
1612 {
1613 	int ret = KSFT_FAIL;
1614 	char *memcg;
1615 	int safe_pid;
1616 
1617 	memcg = cg_name(root, "memcg_test_0");
1618 
1619 	if (!memcg)
1620 		goto cleanup;
1621 
1622 	if (cg_create(memcg))
1623 		goto cleanup;
1624 
1625 	if (cg_write(memcg, "memory.max", "50M"))
1626 		goto cleanup;
1627 
1628 	if (cg_write(memcg, "memory.swap.max", "0"))
1629 		goto cleanup;
1630 
1631 	if (cg_write(memcg, "memory.oom.group", "1"))
1632 		goto cleanup;
1633 
1634 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1635 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1636 		goto cleanup;
1637 
1638 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1639 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1640 		goto cleanup;
1641 
1642 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1643 		goto cleanup;
1644 
1645 	if (kill(safe_pid, SIGKILL))
1646 		goto cleanup;
1647 
1648 	ret = KSFT_PASS;
1649 
1650 cleanup:
1651 	if (memcg)
1652 		cg_destroy(memcg);
1653 	free(memcg);
1654 
1655 	return ret;
1656 }
1657 
/*
 * Read one inotify event from @inotify_fd and check it against the
 * expected mask and watch descriptor.
 *
 * Returns 0 if a full struct inotify_event was read and both its mask and
 * wd match; -1 if the read returned fewer bytes than the structure size or
 * if either field differs (a diagnostic is printed to stderr in the latter
 * case).
 */
static int read_event(int inotify_fd, int expected_event, int expected_wd)
{
	struct inotify_event ev;
	ssize_t nread;

	nread = read(inotify_fd, &ev, sizeof(ev));
	if (nread < (ssize_t)sizeof(ev))
		return -1;

	if (ev.mask == expected_event && ev.wd == expected_wd)
		return 0;

	fprintf(stderr,
		"event does not match expected values: mask %d (expected %d) wd %d (expected %d)\n",
		ev.mask, expected_event, ev.wd, expected_wd);
	return -1;
}
1676 
1677 static int test_memcg_inotify_delete_file(const char *root)
1678 {
1679 	int ret = KSFT_FAIL;
1680 	char *memcg = NULL;
1681 	int fd, wd;
1682 
1683 	memcg = cg_name(root, "memcg_test_0");
1684 
1685 	if (!memcg)
1686 		goto cleanup;
1687 
1688 	if (cg_create(memcg))
1689 		goto cleanup;
1690 
1691 	fd = inotify_init1(0);
1692 	if (fd == -1)
1693 		goto cleanup;
1694 
1695 	wd = inotify_add_watch(fd, cg_control(memcg, "memory.events"), IN_DELETE_SELF);
1696 	if (wd == -1)
1697 		goto cleanup;
1698 
1699 	if (cg_destroy(memcg))
1700 		goto cleanup;
1701 	free(memcg);
1702 	memcg = NULL;
1703 
1704 	if (read_event(fd, IN_DELETE_SELF, wd))
1705 		goto cleanup;
1706 
1707 	if (read_event(fd, IN_IGNORED, wd))
1708 		goto cleanup;
1709 
1710 	ret = KSFT_PASS;
1711 
1712 cleanup:
1713 	if (fd >= 0)
1714 		close(fd);
1715 	if (memcg)
1716 		cg_destroy(memcg);
1717 	free(memcg);
1718 
1719 	return ret;
1720 }
1721 
1722 static int test_memcg_inotify_delete_dir(const char *root)
1723 {
1724 	int ret = KSFT_FAIL;
1725 	char *memcg = NULL;
1726 	int fd, wd;
1727 
1728 	memcg = cg_name(root, "memcg_test_0");
1729 
1730 	if (!memcg)
1731 		goto cleanup;
1732 
1733 	if (cg_create(memcg))
1734 		goto cleanup;
1735 
1736 	fd = inotify_init1(0);
1737 	if (fd == -1)
1738 		goto cleanup;
1739 
1740 	wd = inotify_add_watch(fd, memcg, IN_DELETE_SELF);
1741 	if (wd == -1)
1742 		goto cleanup;
1743 
1744 	if (cg_destroy(memcg))
1745 		goto cleanup;
1746 	free(memcg);
1747 	memcg = NULL;
1748 
1749 	if (read_event(fd, IN_DELETE_SELF, wd))
1750 		goto cleanup;
1751 
1752 	if (read_event(fd, IN_IGNORED, wd))
1753 		goto cleanup;
1754 
1755 	ret = KSFT_PASS;
1756 
1757 cleanup:
1758 	if (fd >= 0)
1759 		close(fd);
1760 	if (memcg)
1761 		cg_destroy(memcg);
1762 	free(memcg);
1763 
1764 	return ret;
1765 }
1766 
1767 #define T(x) { x, #x }
1768 struct memcg_test {
1769 	int (*fn)(const char *root);
1770 	const char *name;
1771 } tests[] = {
1772 	T(test_memcg_subtree_control),
1773 	T(test_memcg_current_peak),
1774 	T(test_memcg_min),
1775 	T(test_memcg_low),
1776 	T(test_memcg_high),
1777 	T(test_memcg_high_sync),
1778 	T(test_memcg_max),
1779 	T(test_memcg_reclaim),
1780 	T(test_memcg_oom_events),
1781 	T(test_memcg_swap_max_peak),
1782 	T(test_memcg_sock),
1783 	T(test_memcg_oom_group_leaf_events),
1784 	T(test_memcg_oom_group_parent_events),
1785 	T(test_memcg_oom_group_score_events),
1786 	T(test_memcg_inotify_delete_file),
1787 	T(test_memcg_inotify_delete_dir),
1788 };
1789 #undef T
1790 
1791 int main(int argc, char **argv)
1792 {
1793 	char root[PATH_MAX];
1794 	int i, proc_status;
1795 
1796 	page_size = sysconf(_SC_PAGE_SIZE);
1797 	if (page_size <= 0)
1798 		page_size = BUF_SIZE;
1799 
1800 	ksft_print_header();
1801 	ksft_set_plan(ARRAY_SIZE(tests));
1802 	if (cg_find_unified_root(root, sizeof(root), NULL))
1803 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1804 
1805 	/*
1806 	 * Check that memory controller is available:
1807 	 * memory is listed in cgroup.controllers
1808 	 */
1809 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1810 		ksft_exit_skip("memory controller isn't available\n");
1811 
1812 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1813 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1814 			ksft_exit_skip("Failed to set memory controller\n");
1815 
1816 	proc_status = proc_mount_contains("memory_recursiveprot");
1817 	if (proc_status < 0)
1818 		ksft_exit_skip("Failed to query cgroup mount option\n");
1819 	has_recursiveprot = proc_status;
1820 
1821 	proc_status = proc_mount_contains("memory_localevents");
1822 	if (proc_status < 0)
1823 		ksft_exit_skip("Failed to query cgroup mount option\n");
1824 	has_localevents = proc_status;
1825 
1826 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1827 		switch (tests[i].fn(root)) {
1828 		case KSFT_PASS:
1829 			ksft_test_result_pass("%s\n", tests[i].name);
1830 			break;
1831 		case KSFT_SKIP:
1832 			ksft_test_result_skip("%s\n", tests[i].name);
1833 			break;
1834 		default:
1835 			ksft_test_result_fail("%s\n", tests[i].name);
1836 			break;
1837 		}
1838 	}
1839 
1840 	ksft_finished();
1841 }
1842