xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision 8c2c7df58b5433f614d603bbdffd85f2a392b74a)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/inotify.h>
14 #include <sys/socket.h>
15 #include <sys/wait.h>
16 #include <arpa/inet.h>
17 #include <netinet/in.h>
18 #include <netdb.h>
19 #include <errno.h>
20 #include <sys/mman.h>
21 
22 #include "kselftest.h"
23 #include "cgroup_util.h"
24 
25 #define MEMCG_SOCKSTAT_WAIT_RETRIES        30
26 
27 static bool has_localevents;
28 static bool has_recursiveprot;
29 static int page_size;
30 
31 int get_temp_fd(void)
32 {
33 	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
34 }
35 
36 int alloc_pagecache(int fd, size_t size)
37 {
38 	char buf[BUF_SIZE];
39 	struct stat st;
40 	int i;
41 
42 	if (fstat(fd, &st))
43 		goto cleanup;
44 
45 	size += st.st_size;
46 
47 	if (ftruncate(fd, size))
48 		goto cleanup;
49 
50 	for (i = 0; i < size; i += sizeof(buf))
51 		read(fd, buf, sizeof(buf));
52 
53 	return 0;
54 
55 cleanup:
56 	return -1;
57 }
58 
59 int alloc_anon(const char *cgroup, void *arg)
60 {
61 	size_t size = (unsigned long)arg;
62 	char *buf, *ptr;
63 
64 	buf = malloc(size);
65 	for (ptr = buf; ptr < buf + size; ptr += page_size)
66 		*ptr = 0;
67 
68 	free(buf);
69 	return 0;
70 }
71 
72 int is_swap_enabled(void)
73 {
74 	char buf[BUF_SIZE];
75 	const char delim[] = "\n";
76 	int cnt = 0;
77 	char *line;
78 
79 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
80 		return -1;
81 
82 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
83 		cnt++;
84 
85 	return cnt > 1;
86 }
87 
/*
 * Write @score, formatted as a decimal integer, to
 * /proc/@pid/oom_score_adj.
 *
 * Returns 0 on success, or the negative value returned by open() or
 * dprintf() on failure.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;

	sprintf(path, "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
108 
109 /*
110  * This test creates two nested cgroups with and without enabling
111  * the memory controller.
112  */
113 static int test_memcg_subtree_control(const char *root)
114 {
115 	char *parent, *child, *parent2 = NULL, *child2 = NULL;
116 	int ret = KSFT_FAIL;
117 	char buf[BUF_SIZE];
118 
119 	/* Create two nested cgroups with the memory controller enabled */
120 	parent = cg_name(root, "memcg_test_0");
121 	child = cg_name(root, "memcg_test_0/memcg_test_1");
122 	if (!parent || !child)
123 		goto cleanup_free;
124 
125 	if (cg_create(parent))
126 		goto cleanup_free;
127 
128 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
129 		goto cleanup_parent;
130 
131 	if (cg_create(child))
132 		goto cleanup_parent;
133 
134 	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
135 		goto cleanup_child;
136 
137 	/* Create two nested cgroups without enabling memory controller */
138 	parent2 = cg_name(root, "memcg_test_1");
139 	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
140 	if (!parent2 || !child2)
141 		goto cleanup_free2;
142 
143 	if (cg_create(parent2))
144 		goto cleanup_free2;
145 
146 	if (cg_create(child2))
147 		goto cleanup_parent2;
148 
149 	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
150 		goto cleanup_all;
151 
152 	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
153 		goto cleanup_all;
154 
155 	ret = KSFT_PASS;
156 
157 cleanup_all:
158 	cg_destroy(child2);
159 cleanup_parent2:
160 	cg_destroy(parent2);
161 cleanup_free2:
162 	free(parent2);
163 	free(child2);
164 cleanup_child:
165 	cg_destroy(child);
166 cleanup_parent:
167 	cg_destroy(parent);
168 cleanup_free:
169 	free(parent);
170 	free(child);
171 
172 	return ret;
173 }
174 
175 static int alloc_anon_50M_check(const char *cgroup, void *arg)
176 {
177 	size_t size = MB(50);
178 	char *buf, *ptr;
179 	long anon, current;
180 	int ret = -1;
181 
182 	buf = malloc(size);
183 	if (buf == NULL) {
184 		fprintf(stderr, "malloc() failed\n");
185 		return -1;
186 	}
187 
188 	for (ptr = buf; ptr < buf + size; ptr += page_size)
189 		*ptr = 0;
190 
191 	current = cg_read_long(cgroup, "memory.current");
192 	if (current < size)
193 		goto cleanup;
194 
195 	if (!values_close(size, current, 3))
196 		goto cleanup;
197 
198 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
199 	if (anon < 0)
200 		goto cleanup;
201 
202 	if (!values_close(anon, current, 3))
203 		goto cleanup;
204 
205 	ret = 0;
206 cleanup:
207 	free(buf);
208 	return ret;
209 }
210 
/*
 * Run alloc_pagecache() for 50M on a fresh temporary file and then
 * verify the accounting of @cgroup:
 *  - memory.current is at least 50M,
 *  - the "file" entry of memory.stat is non-negative and within 10% of
 *    memory.current.
 *
 * @cgroup: cgroup whose counters are checked
 * @arg:    unused
 *
 * Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	/*
	 * Compare as signed values: a negative (error) result must not be
	 * converted to a huge unsigned number that passes this check.
	 */
	if (current < (long)size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}
242 
243 /*
244  * This test create a memory cgroup, allocates
245  * some anonymous memory and some pagecache
246  * and checks memory.current, memory.peak, and some memory.stat values.
247  */
248 static int test_memcg_current_peak(const char *root)
249 {
250 	int ret = KSFT_FAIL;
251 	long current, peak, peak_reset;
252 	char *memcg;
253 	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
254 	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
255 	struct stat ss;
256 
257 	memcg = cg_name(root, "memcg_test");
258 	if (!memcg)
259 		goto cleanup;
260 
261 	if (cg_create(memcg))
262 		goto cleanup;
263 
264 	current = cg_read_long(memcg, "memory.current");
265 	if (current != 0)
266 		goto cleanup;
267 
268 	peak = cg_read_long(memcg, "memory.peak");
269 	if (peak != 0)
270 		goto cleanup;
271 
272 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
273 		goto cleanup;
274 
275 	peak = cg_read_long(memcg, "memory.peak");
276 	if (peak < MB(50))
277 		goto cleanup;
278 
279 	/*
280 	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
281 	 * We need at least three to be closed in a different order than writes occurred to test
282 	 * the linked-list handling.
283 	 */
284 	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
285 
286 	if (peak_fd == -1) {
287 		if (errno == ENOENT)
288 			ret = KSFT_SKIP;
289 		goto cleanup;
290 	}
291 
292 	/*
293 	 * Before we try to use memory.peak's fd, try to figure out whether
294 	 * this kernel supports writing to that file in the first place. (by
295 	 * checking the writable bit on the file's st_mode)
296 	 */
297 	if (fstat(peak_fd, &ss))
298 		goto cleanup;
299 
300 	if ((ss.st_mode & S_IWUSR) == 0) {
301 		ret = KSFT_SKIP;
302 		goto cleanup;
303 	}
304 
305 	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
306 
307 	if (peak_fd2 == -1)
308 		goto cleanup;
309 
310 	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
311 
312 	if (peak_fd3 == -1)
313 		goto cleanup;
314 
315 	/* any non-empty string resets, but make it clear */
316 	static const char reset_string[] = "reset\n";
317 
318 	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
319 	if (peak_reset != sizeof(reset_string))
320 		goto cleanup;
321 
322 	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
323 	if (peak_reset != sizeof(reset_string))
324 		goto cleanup;
325 
326 	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
327 	if (peak_reset != sizeof(reset_string))
328 		goto cleanup;
329 
330 	/* Make sure a completely independent read isn't affected by our  FD-local reset above*/
331 	peak = cg_read_long(memcg, "memory.peak");
332 	if (peak < MB(50))
333 		goto cleanup;
334 
335 	fd2_closed = true;
336 	if (close(peak_fd2))
337 		goto cleanup;
338 
339 	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
340 
341 	if (peak_fd4 == -1)
342 		goto cleanup;
343 
344 	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
345 	if (peak_reset != sizeof(reset_string))
346 		goto cleanup;
347 
348 	peak = cg_read_long_fd(peak_fd);
349 	if (peak > MB(30) || peak < 0)
350 		goto cleanup;
351 
352 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
353 		goto cleanup;
354 
355 	peak = cg_read_long(memcg, "memory.peak");
356 	if (peak < MB(50))
357 		goto cleanup;
358 
359 	/* Make sure everything is back to normal */
360 	peak = cg_read_long_fd(peak_fd);
361 	if (peak < MB(50))
362 		goto cleanup;
363 
364 	peak = cg_read_long_fd(peak_fd4);
365 	if (peak < MB(50))
366 		goto cleanup;
367 
368 	fd3_closed = true;
369 	if (close(peak_fd3))
370 		goto cleanup;
371 
372 	fd4_closed = true;
373 	if (close(peak_fd4))
374 		goto cleanup;
375 
376 	ret = KSFT_PASS;
377 
378 cleanup:
379 	close(peak_fd);
380 	if (!fd2_closed)
381 		close(peak_fd2);
382 	if (!fd3_closed)
383 		close(peak_fd3);
384 	if (!fd4_closed)
385 		close(peak_fd4);
386 	cg_destroy(memcg);
387 	free(memcg);
388 
389 	return ret;
390 }
391 
/*
 * Run alloc_pagecache() for 50M on the descriptor passed in @arg and
 * then keep sleeping, one second at a time, for as long as getppid()
 * keeps returning the parent pid recorded at entry.
 *
 * @cgroup: unused
 * @arg:    file descriptor, carried in the pointer value
 *
 * Returns -1 if alloc_pagecache() fails, 0 once the parent pid changed.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	const int fd = (long)arg;
	const int parent_pid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	for (;;) {
		if (getppid() != parent_pid)
			break;
		sleep(1);
	}

	return 0;
}
405 
406 static int alloc_anon_noexit(const char *cgroup, void *arg)
407 {
408 	int ppid = getppid();
409 	size_t size = (unsigned long)arg;
410 	char *buf, *ptr;
411 
412 	buf = malloc(size);
413 	if (buf == NULL) {
414 		fprintf(stderr, "malloc() failed\n");
415 		return -1;
416 	}
417 
418 	for (ptr = buf; ptr < buf + size; ptr += page_size)
419 		*ptr = 0;
420 
421 	while (getppid() == ppid)
422 		sleep(1);
423 
424 	free(buf);
425 	return 0;
426 }
427 
/*
 * Wait until processes are killed asynchronously by the OOM killer.
 *
 * Polls cgroup.procs of @cgroup up to ten times, sleeping 100ms after
 * each unsuccessful poll.  Returns 0 as soon as cg_read_strcmp() reports
 * that the file matches the empty string, or -1 once the attempts are
 * used up.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempts_left = 10;

	while (attempts_left-- > 0) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}

	return -1;
}
444 
445 static bool reclaim_until(const char *memcg, long goal);
446 
447 /*
448  * First, this test creates the following hierarchy:
449  * A       memory.min = 0,    memory.max = 200M
450  * A/B     memory.min = 50M
451  * A/B/C   memory.min = 75M,  memory.current = 50M
452  * A/B/D   memory.min = 25M,  memory.current = 50M
453  * A/B/E   memory.min = 0,    memory.current = 50M
454  * A/B/F   memory.min = 500M, memory.current = 0
455  *
456  * (or memory.low if we test soft protection)
457  *
458  * Usages are pagecache and the test keeps a running
459  * process in every leaf cgroup.
460  * Then it creates A/G and creates a significant
461  * memory pressure in A.
462  *
463  * Then it checks actual memory usages and expects that:
464  * A/B    memory.current ~= 50M
465  * A/B/C  memory.current ~= 29M [memory.events:low > 0]
466  * A/B/D  memory.current ~= 21M [memory.events:low > 0]
467  * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
468  *				 undefined otherwise]
469  * A/B/F  memory.current  = 0   [memory.events:low == 0]
470  * (for origin of the numbers, see model in memcg_protection.m.)
471  *
472  * After that it tries to allocate more than there is
473  * unprotected memory in A available, and checks that:
474  * a) memory.min protects pagecache even in this case,
475  * b) memory.low allows reclaiming page cache with low events.
476  *
477  * Then we try to reclaim from A/B/C using memory.reclaim until its
478  * usage reaches 10M.
479  * This makes sure that:
480  * (a) We ignore the protection of the reclaim target memcg.
481  * (b) The previously calculated emin value (~29M) should be dismissed.
482  */
483 static int test_memcg_protection(const char *root, bool min)
484 {
485 	int ret = KSFT_FAIL, rc;
486 	char *parent[3] = {NULL};
487 	char *children[4] = {NULL};
488 	const char *attribute = min ? "memory.min" : "memory.low";
489 	long c[4];
490 	long current;
491 	int i, attempts;
492 	int fd;
493 
494 	fd = get_temp_fd();
495 	if (fd < 0)
496 		goto cleanup;
497 
498 	parent[0] = cg_name(root, "memcg_test_0");
499 	if (!parent[0])
500 		goto cleanup;
501 
502 	parent[1] = cg_name(parent[0], "memcg_test_1");
503 	if (!parent[1])
504 		goto cleanup;
505 
506 	parent[2] = cg_name(parent[0], "memcg_test_2");
507 	if (!parent[2])
508 		goto cleanup;
509 
510 	if (cg_create(parent[0]))
511 		goto cleanup;
512 
513 	if (cg_read_long(parent[0], attribute)) {
514 		/* No memory.min on older kernels is fine */
515 		if (min)
516 			ret = KSFT_SKIP;
517 		goto cleanup;
518 	}
519 
520 	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
521 		goto cleanup;
522 
523 	if (cg_write(parent[0], "memory.max", "200M"))
524 		goto cleanup;
525 
526 	if (cg_write(parent[0], "memory.swap.max", "0"))
527 		goto cleanup;
528 
529 	if (cg_create(parent[1]))
530 		goto cleanup;
531 
532 	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
533 		goto cleanup;
534 
535 	if (cg_create(parent[2]))
536 		goto cleanup;
537 
538 	for (i = 0; i < ARRAY_SIZE(children); i++) {
539 		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
540 		if (!children[i])
541 			goto cleanup;
542 
543 		if (cg_create(children[i]))
544 			goto cleanup;
545 
546 		if (i > 2)
547 			continue;
548 
549 		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
550 			      (void *)(long)fd);
551 	}
552 
553 	if (cg_write(parent[1],   attribute, "50M"))
554 		goto cleanup;
555 	if (cg_write(children[0], attribute, "75M"))
556 		goto cleanup;
557 	if (cg_write(children[1], attribute, "25M"))
558 		goto cleanup;
559 	if (cg_write(children[2], attribute, "0"))
560 		goto cleanup;
561 	if (cg_write(children[3], attribute, "500M"))
562 		goto cleanup;
563 
564 	attempts = 0;
565 	while (!values_close(cg_read_long(parent[1], "memory.current"),
566 			     MB(150), 3)) {
567 		if (attempts++ > 5)
568 			break;
569 		sleep(1);
570 	}
571 
572 	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
573 		goto cleanup;
574 
575 	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
576 		goto cleanup;
577 
578 	for (i = 0; i < ARRAY_SIZE(children); i++)
579 		c[i] = cg_read_long(children[i], "memory.current");
580 
581 	if (!values_close(c[0], MB(29), 15))
582 		goto cleanup;
583 
584 	if (!values_close(c[1], MB(21), 20))
585 		goto cleanup;
586 
587 	if (c[3] != 0)
588 		goto cleanup;
589 
590 	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
591 	if (min && !rc)
592 		goto cleanup;
593 	else if (!min && rc) {
594 		fprintf(stderr,
595 			"memory.low prevents from allocating anon memory\n");
596 		goto cleanup;
597 	}
598 
599 	current = min ? MB(50) : MB(30);
600 	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
601 		goto cleanup;
602 
603 	if (!reclaim_until(children[0], MB(10)))
604 		goto cleanup;
605 
606 	if (min) {
607 		ret = KSFT_PASS;
608 		goto cleanup;
609 	}
610 
611 	/*
612 	 * Child 2 has memory.low=0, but some low protection may still be
613 	 * distributed down from its parent with memory.low=50M if cgroup2
614 	 * memory_recursiveprot mount option is enabled. Ignore the low
615 	 * event count in this case.
616 	 */
617 	for (i = 0; i < ARRAY_SIZE(children); i++) {
618 		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
619 		int no_low_events_index = 1;
620 		long low, oom;
621 
622 		oom = cg_read_key_long(children[i], "memory.events", "oom ");
623 		low = cg_read_key_long(children[i], "memory.events", "low ");
624 
625 		if (oom)
626 			goto cleanup;
627 		if (i == ignore_low_events_index)
628 			continue;
629 		if (i <= no_low_events_index && low <= 0)
630 			goto cleanup;
631 		if (i > no_low_events_index && low)
632 			goto cleanup;
633 
634 	}
635 
636 	ret = KSFT_PASS;
637 
638 cleanup:
639 	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
640 		if (!children[i])
641 			continue;
642 
643 		cg_destroy(children[i]);
644 		free(children[i]);
645 	}
646 
647 	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
648 		if (!parent[i])
649 			continue;
650 
651 		cg_destroy(parent[i]);
652 		free(parent[i]);
653 	}
654 	close(fd);
655 	return ret;
656 }
657 
/* Run test_memcg_protection() in its memory.min variant. */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
662 
/* Run test_memcg_protection() in its memory.low variant. */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
667 
/*
 * Requires that memory.high or memory.max of @cgroup reads back as 30M,
 * then runs alloc_pagecache() for 50M on a temporary file and checks
 * that memory.current ends up within 5% of 30M.
 *
 * @cgroup: cgroup whose limits and usage are inspected
 * @arg:    unused
 *
 * Returns 0 if all checks pass, -1 otherwise.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long high, max, usage;
	int ret = -1;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (!alloc_pagecache(fd, size)) {
		usage = cg_read_long(cgroup, "memory.current");
		if (values_close(usage, MB(30), 5))
			ret = 0;
	}

	close(fd);
	return ret;
}
698 
699 /*
700  * This test checks that memory.high limits the amount of
701  * memory which can be consumed by either anonymous memory
702  * or pagecache.
703  */
704 static int test_memcg_high(const char *root)
705 {
706 	int ret = KSFT_FAIL;
707 	char *memcg;
708 	long high;
709 
710 	memcg = cg_name(root, "memcg_test");
711 	if (!memcg)
712 		goto cleanup;
713 
714 	if (cg_create(memcg))
715 		goto cleanup;
716 
717 	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
718 		goto cleanup;
719 
720 	if (cg_write(memcg, "memory.swap.max", "0"))
721 		goto cleanup;
722 
723 	if (cg_write(memcg, "memory.high", "30M"))
724 		goto cleanup;
725 
726 	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
727 		goto cleanup;
728 
729 	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
730 		goto cleanup;
731 
732 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
733 		goto cleanup;
734 
735 	high = cg_read_key_long(memcg, "memory.events", "high ");
736 	if (high <= 0)
737 		goto cleanup;
738 
739 	ret = KSFT_PASS;
740 
741 cleanup:
742 	cg_destroy(memcg);
743 	free(memcg);
744 
745 	return ret;
746 }
747 
748 static int alloc_anon_mlock(const char *cgroup, void *arg)
749 {
750 	size_t size = (size_t)arg;
751 	void *buf;
752 
753 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
754 		   0, 0);
755 	if (buf == MAP_FAILED)
756 		return -1;
757 
758 	mlock(buf, size);
759 	munmap(buf, size);
760 	return 0;
761 }
762 
763 /*
764  * This test checks that memory.high is able to throttle big single shot
765  * allocation i.e. large allocation within one kernel entry.
766  */
767 static int test_memcg_high_sync(const char *root)
768 {
769 	int ret = KSFT_FAIL, pid, fd = -1;
770 	char *memcg;
771 	long pre_high, pre_max;
772 	long post_high, post_max;
773 
774 	memcg = cg_name(root, "memcg_test");
775 	if (!memcg)
776 		goto cleanup;
777 
778 	if (cg_create(memcg))
779 		goto cleanup;
780 
781 	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
782 	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
783 	if (pre_high < 0 || pre_max < 0)
784 		goto cleanup;
785 
786 	if (cg_write(memcg, "memory.swap.max", "0"))
787 		goto cleanup;
788 
789 	if (cg_write(memcg, "memory.high", "30M"))
790 		goto cleanup;
791 
792 	if (cg_write(memcg, "memory.max", "140M"))
793 		goto cleanup;
794 
795 	fd = memcg_prepare_for_wait(memcg);
796 	if (fd < 0)
797 		goto cleanup;
798 
799 	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
800 	if (pid < 0)
801 		goto cleanup;
802 
803 	cg_wait_for(fd);
804 
805 	post_high = cg_read_key_long(memcg, "memory.events", "high ");
806 	post_max = cg_read_key_long(memcg, "memory.events", "max ");
807 	if (post_high < 0 || post_max < 0)
808 		goto cleanup;
809 
810 	if (pre_high == post_high || pre_max != post_max)
811 		goto cleanup;
812 
813 	ret = KSFT_PASS;
814 
815 cleanup:
816 	if (fd >= 0)
817 		close(fd);
818 	cg_destroy(memcg);
819 	free(memcg);
820 
821 	return ret;
822 }
823 
824 /*
825  * This test checks that memory.max limits the amount of
826  * memory which can be consumed by either anonymous memory
827  * or pagecache.
828  */
829 static int test_memcg_max(const char *root)
830 {
831 	int ret = KSFT_FAIL;
832 	char *memcg;
833 	long current, max;
834 
835 	memcg = cg_name(root, "memcg_test");
836 	if (!memcg)
837 		goto cleanup;
838 
839 	if (cg_create(memcg))
840 		goto cleanup;
841 
842 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
843 		goto cleanup;
844 
845 	if (cg_write(memcg, "memory.swap.max", "0"))
846 		goto cleanup;
847 
848 	if (cg_write(memcg, "memory.max", "30M"))
849 		goto cleanup;
850 
851 	/* Should be killed by OOM killer */
852 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
853 		goto cleanup;
854 
855 	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
856 		goto cleanup;
857 
858 	current = cg_read_long(memcg, "memory.current");
859 	if (current > MB(30) || !current)
860 		goto cleanup;
861 
862 	max = cg_read_key_long(memcg, "memory.events", "max ");
863 	if (max <= 0)
864 		goto cleanup;
865 
866 	ret = KSFT_PASS;
867 
868 cleanup:
869 	cg_destroy(memcg);
870 	free(memcg);
871 
872 	return ret;
873 }
874 
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.  At most five rounds are attempted.
 *
 * Returns false if the usage is already below (or within 3% of) the
 * goal before anything was written.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * It also checks that memory.reclaim is sane and returns false if its
 * error codes do not make sense, even if the usage goal was satisfied:
 * a write may only fail with -EAGAIN, and a write that reported success
 * must have brought the usage to the goal.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool reclaimed = false;
	int attempts_left = 5;
	char request[64];

	while (attempts_left-- > 0) {
		long usage = cg_read_long(memcg, "memory.current");
		int err;

		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* Did memory.reclaim return 0 incorrectly? */
		if (reclaimed)
			return false;

		/* Ask for exactly the amount still above the goal. */
		snprintf(request, sizeof(request), "%ld", usage - goal);
		err = cg_write(memcg, "memory.reclaim", request);
		if (err == 0)
			reclaimed = true;
		else if (err != -EAGAIN)
			return false;
	}

	return reclaimed;
}
916 
917 /*
918  * This test checks that memory.reclaim reclaims the given
919  * amount of memory (from both anon and file, if possible).
920  */
921 static int test_memcg_reclaim(const char *root)
922 {
923 	int ret = KSFT_FAIL;
924 	int fd = -1;
925 	int retries;
926 	char *memcg;
927 	long current, expected_usage;
928 
929 	memcg = cg_name(root, "memcg_test");
930 	if (!memcg)
931 		goto cleanup;
932 
933 	if (cg_create(memcg))
934 		goto cleanup;
935 
936 	current = cg_read_long(memcg, "memory.current");
937 	if (current != 0)
938 		goto cleanup;
939 
940 	fd = get_temp_fd();
941 	if (fd < 0)
942 		goto cleanup;
943 
944 	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
945 
946 	/*
947 	 * If swap is enabled, try to reclaim from both anon and file, else try
948 	 * to reclaim from file only.
949 	 */
950 	if (is_swap_enabled()) {
951 		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
952 		expected_usage = MB(100);
953 	} else
954 		expected_usage = MB(50);
955 
956 	/*
957 	 * Wait until current usage reaches the expected usage (or we run out of
958 	 * retries).
959 	 */
960 	retries = 5;
961 	while (!values_close(cg_read_long(memcg, "memory.current"),
962 			    expected_usage, 10)) {
963 		if (retries--) {
964 			sleep(1);
965 			continue;
966 		} else {
967 			fprintf(stderr,
968 				"failed to allocate %ld for memcg reclaim test\n",
969 				expected_usage);
970 			goto cleanup;
971 		}
972 	}
973 
974 	/*
975 	 * Reclaim until current reaches 30M, this makes sure we hit both anon
976 	 * and file if swap is enabled.
977 	 */
978 	if (!reclaim_until(memcg, MB(30)))
979 		goto cleanup;
980 
981 	ret = KSFT_PASS;
982 cleanup:
983 	cg_destroy(memcg);
984 	free(memcg);
985 	close(fd);
986 
987 	return ret;
988 }
989 
990 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
991 {
992 	long mem_max = (long)arg;
993 	size_t size = MB(50);
994 	char *buf, *ptr;
995 	long mem_current, swap_current;
996 	int ret = -1;
997 
998 	buf = malloc(size);
999 	if (buf == NULL) {
1000 		fprintf(stderr, "malloc() failed\n");
1001 		return -1;
1002 	}
1003 
1004 	for (ptr = buf; ptr < buf + size; ptr += page_size)
1005 		*ptr = 0;
1006 
1007 	mem_current = cg_read_long(cgroup, "memory.current");
1008 	if (!mem_current || !values_close(mem_current, mem_max, 3))
1009 		goto cleanup;
1010 
1011 	swap_current = cg_read_long(cgroup, "memory.swap.current");
1012 	if (!swap_current ||
1013 	    !values_close(mem_current + swap_current, size, 3))
1014 		goto cleanup;
1015 
1016 	ret = 0;
1017 cleanup:
1018 	free(buf);
1019 	return ret;
1020 }
1021 
1022 /*
1023  * This test checks that memory.swap.max limits the amount of
1024  * anonymous memory which can be swapped out. Additionally, it verifies that
1025  * memory.swap.peak reflects the high watermark and can be reset.
1026  */
1027 static int test_memcg_swap_max_peak(const char *root)
1028 {
1029 	int ret = KSFT_FAIL;
1030 	char *memcg;
1031 	long max, peak;
1032 	struct stat ss;
1033 	int swap_peak_fd = -1, mem_peak_fd = -1;
1034 
1035 	/* any non-empty string resets */
1036 	static const char reset_string[] = "foobarbaz";
1037 
1038 	if (!is_swap_enabled())
1039 		return KSFT_SKIP;
1040 
1041 	memcg = cg_name(root, "memcg_test");
1042 	if (!memcg)
1043 		goto cleanup;
1044 
1045 	if (cg_create(memcg))
1046 		goto cleanup;
1047 
1048 	if (cg_read_long(memcg, "memory.swap.current")) {
1049 		ret = KSFT_SKIP;
1050 		goto cleanup;
1051 	}
1052 
1053 	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
1054 			       O_RDWR | O_APPEND | O_CLOEXEC);
1055 
1056 	if (swap_peak_fd == -1) {
1057 		if (errno == ENOENT)
1058 			ret = KSFT_SKIP;
1059 		goto cleanup;
1060 	}
1061 
1062 	/*
1063 	 * Before we try to use memory.swap.peak's fd, try to figure out
1064 	 * whether this kernel supports writing to that file in the first
1065 	 * place. (by checking the writable bit on the file's st_mode)
1066 	 */
1067 	if (fstat(swap_peak_fd, &ss))
1068 		goto cleanup;
1069 
1070 	if ((ss.st_mode & S_IWUSR) == 0) {
1071 		ret = KSFT_SKIP;
1072 		goto cleanup;
1073 	}
1074 
1075 	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
1076 
1077 	if (mem_peak_fd == -1)
1078 		goto cleanup;
1079 
1080 	if (cg_read_long(memcg, "memory.swap.peak"))
1081 		goto cleanup;
1082 
1083 	if (cg_read_long_fd(swap_peak_fd))
1084 		goto cleanup;
1085 
1086 	/* switch the swap and mem fds into local-peak tracking mode*/
1087 	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1088 
1089 	if (peak_reset != sizeof(reset_string))
1090 		goto cleanup;
1091 
1092 	if (cg_read_long_fd(swap_peak_fd))
1093 		goto cleanup;
1094 
1095 	if (cg_read_long(memcg, "memory.peak"))
1096 		goto cleanup;
1097 
1098 	if (cg_read_long_fd(mem_peak_fd))
1099 		goto cleanup;
1100 
1101 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1102 	if (peak_reset != sizeof(reset_string))
1103 		goto cleanup;
1104 
1105 	if (cg_read_long_fd(mem_peak_fd))
1106 		goto cleanup;
1107 
1108 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
1109 		goto cleanup;
1110 
1111 	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
1112 		goto cleanup;
1113 
1114 	if (cg_write(memcg, "memory.swap.max", "30M"))
1115 		goto cleanup;
1116 
1117 	if (cg_write(memcg, "memory.max", "30M"))
1118 		goto cleanup;
1119 
1120 	/* Should be killed by OOM killer */
1121 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1122 		goto cleanup;
1123 
1124 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1125 		goto cleanup;
1126 
1127 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1128 		goto cleanup;
1129 
1130 	peak = cg_read_long(memcg, "memory.peak");
1131 	if (peak < MB(29))
1132 		goto cleanup;
1133 
1134 	peak = cg_read_long(memcg, "memory.swap.peak");
1135 	if (peak < MB(29))
1136 		goto cleanup;
1137 
1138 	peak = cg_read_long_fd(mem_peak_fd);
1139 	if (peak < MB(29))
1140 		goto cleanup;
1141 
1142 	peak = cg_read_long_fd(swap_peak_fd);
1143 	if (peak < MB(29))
1144 		goto cleanup;
1145 
1146 	/*
1147 	 * open, reset and close the peak swap on another FD to make sure
1148 	 * multiple extant fds don't corrupt the linked-list
1149 	 */
1150 	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
1151 	if (peak_reset)
1152 		goto cleanup;
1153 
1154 	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
1155 	if (peak_reset)
1156 		goto cleanup;
1157 
1158 	/* actually reset on the fds */
1159 	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
1160 	if (peak_reset != sizeof(reset_string))
1161 		goto cleanup;
1162 
1163 	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
1164 	if (peak_reset != sizeof(reset_string))
1165 		goto cleanup;
1166 
1167 	peak = cg_read_long_fd(swap_peak_fd);
1168 	if (peak > MB(10))
1169 		goto cleanup;
1170 
1171 	/*
1172 	 * The cgroup is now empty, but there may be a page or two associated
1173 	 * with the open FD accounted to it.
1174 	 */
1175 	peak = cg_read_long_fd(mem_peak_fd);
1176 	if (peak > MB(1))
1177 		goto cleanup;
1178 
1179 	if (cg_read_long(memcg, "memory.peak") < MB(29))
1180 		goto cleanup;
1181 
1182 	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
1183 		goto cleanup;
1184 
1185 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
1186 		goto cleanup;
1187 
1188 	max = cg_read_key_long(memcg, "memory.events", "max ");
1189 	if (max <= 0)
1190 		goto cleanup;
1191 
1192 	peak = cg_read_long(memcg, "memory.peak");
1193 	if (peak < MB(29))
1194 		goto cleanup;
1195 
1196 	peak = cg_read_long(memcg, "memory.swap.peak");
1197 	if (peak < MB(29))
1198 		goto cleanup;
1199 
1200 	peak = cg_read_long_fd(mem_peak_fd);
1201 	if (peak < MB(29))
1202 		goto cleanup;
1203 
1204 	peak = cg_read_long_fd(swap_peak_fd);
1205 	if (peak < MB(19))
1206 		goto cleanup;
1207 
1208 	ret = KSFT_PASS;
1209 
1210 cleanup:
1211 	if (mem_peak_fd != -1 && close(mem_peak_fd))
1212 		ret = KSFT_FAIL;
1213 	if (swap_peak_fd != -1 && close(swap_peak_fd))
1214 		ret = KSFT_FAIL;
1215 	cg_destroy(memcg);
1216 	free(memcg);
1217 
1218 	return ret;
1219 }
1220 
1221 /*
1222  * This test disables swapping and tries to allocate anonymous memory
1223  * up to OOM. Then it checks for oom and oom_kill events in
1224  * memory.events.
1225  */
1226 static int test_memcg_oom_events(const char *root)
1227 {
1228 	int ret = KSFT_FAIL;
1229 	char *memcg;
1230 
1231 	memcg = cg_name(root, "memcg_test");
1232 	if (!memcg)
1233 		goto cleanup;
1234 
1235 	if (cg_create(memcg))
1236 		goto cleanup;
1237 
1238 	if (cg_write(memcg, "memory.max", "30M"))
1239 		goto cleanup;
1240 
1241 	if (cg_write(memcg, "memory.swap.max", "0"))
1242 		goto cleanup;
1243 
1244 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1245 		goto cleanup;
1246 
1247 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1248 		goto cleanup;
1249 
1250 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1251 		goto cleanup;
1252 
1253 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1254 		goto cleanup;
1255 
1256 	ret = KSFT_PASS;
1257 
1258 cleanup:
1259 	cg_destroy(memcg);
1260 	free(memcg);
1261 
1262 	return ret;
1263 }
1264 
/*
 * Arguments passed from test_memcg_sock() to tcp_server().
 */
struct tcp_server_args {
	/* Port to bind, in host byte order; tcp_server() applies htons(). */
	unsigned short port;
	/*
	 * Pipe created with pipe(): tcp_server() closes ctl[0] and writes an
	 * int status to ctl[1]; test_memcg_sock() reads it from ctl[0].
	 */
	int ctl[2];
};
1269 
1270 static int tcp_server(const char *cgroup, void *arg)
1271 {
1272 	struct tcp_server_args *srv_args = arg;
1273 	struct sockaddr_in6 saddr = { 0 };
1274 	socklen_t slen = sizeof(saddr);
1275 	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
1276 
1277 	close(srv_args->ctl[0]);
1278 	ctl_fd = srv_args->ctl[1];
1279 
1280 	saddr.sin6_family = AF_INET6;
1281 	saddr.sin6_addr = in6addr_any;
1282 	saddr.sin6_port = htons(srv_args->port);
1283 
1284 	sk = socket(AF_INET6, SOCK_STREAM, 0);
1285 	if (sk < 0) {
1286 		/* Pass back errno to the ctl_fd */
1287 		write(ctl_fd, &errno, sizeof(errno));
1288 		return ret;
1289 	}
1290 
1291 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
1292 		goto cleanup;
1293 
1294 	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
1295 		write(ctl_fd, &errno, sizeof(errno));
1296 		goto cleanup;
1297 	}
1298 
1299 	if (listen(sk, 1))
1300 		goto cleanup;
1301 
1302 	ret = 0;
1303 	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
1304 		ret = -1;
1305 		goto cleanup;
1306 	}
1307 
1308 	client_sk = accept(sk, NULL, NULL);
1309 	if (client_sk < 0)
1310 		goto cleanup;
1311 
1312 	ret = -1;
1313 	for (;;) {
1314 		uint8_t buf[0x100000];
1315 
1316 		if (write(client_sk, buf, sizeof(buf)) <= 0) {
1317 			if (errno == ECONNRESET)
1318 				ret = 0;
1319 			break;
1320 		}
1321 	}
1322 
1323 	close(client_sk);
1324 
1325 cleanup:
1326 	close(sk);
1327 	return ret;
1328 }
1329 
/*
 * TCP client half of the socket accounting test.
 *
 * @cgroup: cgroup whose memory.current and memory.stat are sampled.
 * @port:   TCP port on "localhost" to connect to.
 *
 * Records memory.current on entry, connects to the server and then reads
 * from the socket up to 16 times. After each read it compares the growth
 * of memory.current since entry with the "sock " entry of memory.stat
 * using values_close(..., 10); the first match ends the loop.
 *
 * Returns KSFT_PASS on a match, the non-zero getaddrinfo() error code when
 * name resolution fails, and KSFT_FAIL otherwise.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * The port is unsigned, so it must be formatted with %hu: with %hd a
	 * port above 32767 is printed as a negative number, which also does
	 * not fit into servport.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	/*
	 * Every exit below must report failure unless the loop finds a match.
	 * Without this, a failing socket() returned getaddrinfo()'s 0.
	 */
	ret = KSFT_FAIL;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	if (connect(sk, ai->ai_addr, ai->ai_addrlen) < 0)
		goto close_sk;

	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1380 
1381 /*
1382  * This test checks socket memory accounting.
1383  * The test forks a TCP server listens on a random port between 1000
1384  * and 61000. Once it gets a client connection, it starts writing to
1385  * its socket.
1386  * The TCP client interleaves reads from the socket with check whether
1387  * memory.current and memory.stat.sock are similar.
1388  */
1389 static int test_memcg_sock(const char *root)
1390 {
1391 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
1392 	unsigned short port;
1393 	char *memcg;
1394 	long sock_post = -1;
1395 
1396 	memcg = cg_name(root, "memcg_test");
1397 	if (!memcg)
1398 		goto cleanup;
1399 
1400 	if (cg_create(memcg))
1401 		goto cleanup;
1402 
1403 	while (bind_retries--) {
1404 		struct tcp_server_args args;
1405 
1406 		if (pipe(args.ctl))
1407 			goto cleanup;
1408 
1409 		port = args.port = 1000 + rand() % 60000;
1410 
1411 		pid = cg_run_nowait(memcg, tcp_server, &args);
1412 		if (pid < 0)
1413 			goto cleanup;
1414 
1415 		close(args.ctl[1]);
1416 		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
1417 			goto cleanup;
1418 		close(args.ctl[0]);
1419 
1420 		/* Skip if address family not supported by protocol */
1421 		if (err == EAFNOSUPPORT) {
1422 			ret = KSFT_SKIP;
1423 			goto cleanup;
1424 		}
1425 
1426 		if (!err)
1427 			break;
1428 		if (err != EADDRINUSE)
1429 			goto cleanup;
1430 
1431 		waitpid(pid, NULL, 0);
1432 	}
1433 
1434 	if (err == EADDRINUSE) {
1435 		ret = KSFT_SKIP;
1436 		goto cleanup;
1437 	}
1438 
1439 	if (tcp_client(memcg, port) != KSFT_PASS)
1440 		goto cleanup;
1441 
1442 	waitpid(pid, &err, 0);
1443 	if (WEXITSTATUS(err))
1444 		goto cleanup;
1445 
1446 	if (cg_read_long(memcg, "memory.current") < 0)
1447 		goto cleanup;
1448 
1449 	/*
1450 	 * memory.stat is updated asynchronously via the memcg rstat
1451 	 * flushing worker, which runs periodically (every 2 seconds,
1452 	 * see FLUSH_TIME). On a busy system, the "sock " counter may
1453 	 * stay non-zero for a short period of time after the TCP
1454 	 * connection is closed and all socket memory has been
1455 	 * uncharged.
1456 	 *
1457 	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
1458 	 * scheduling slack) and require that the "sock " counter
1459 	 * eventually drops to zero.
1460 	 */
1461 	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
1462 					 MEMCG_SOCKSTAT_WAIT_RETRIES,
1463 					 DEFAULT_WAIT_INTERVAL_US);
1464 	if (sock_post)
1465 		goto cleanup;
1466 
1467 	ret = KSFT_PASS;
1468 
1469 cleanup:
1470 	cg_destroy(memcg);
1471 	free(memcg);
1472 
1473 	return ret;
1474 }
1475 
1476 /*
1477  * This test disables swapping and tries to allocate anonymous memory
1478  * up to OOM with memory.group.oom set. Then it checks that all
1479  * processes in the leaf were killed. It also checks that oom_events
1480  * were propagated to the parent level.
1481  */
1482 static int test_memcg_oom_group_leaf_events(const char *root)
1483 {
1484 	int ret = KSFT_FAIL;
1485 	char *parent, *child;
1486 	long parent_oom_events;
1487 
1488 	parent = cg_name(root, "memcg_test_0");
1489 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1490 
1491 	if (!parent || !child)
1492 		goto cleanup;
1493 
1494 	if (cg_create(parent))
1495 		goto cleanup;
1496 
1497 	if (cg_create(child))
1498 		goto cleanup;
1499 
1500 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1501 		goto cleanup;
1502 
1503 	if (cg_write(child, "memory.max", "50M"))
1504 		goto cleanup;
1505 
1506 	if (cg_write(child, "memory.swap.max", "0"))
1507 		goto cleanup;
1508 
1509 	if (cg_write(child, "memory.oom.group", "1"))
1510 		goto cleanup;
1511 
1512 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1513 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1514 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1515 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1516 		goto cleanup;
1517 
1518 	if (cg_test_proc_killed(child))
1519 		goto cleanup;
1520 
1521 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1522 		goto cleanup;
1523 
1524 	parent_oom_events = cg_read_key_long(
1525 			parent, "memory.events", "oom_kill ");
1526 	/*
1527 	 * If memory_localevents is not enabled (the default), the parent should
1528 	 * count OOM events in its children groups. Otherwise, it should not
1529 	 * have observed any events.
1530 	 */
1531 	if (has_localevents && parent_oom_events != 0)
1532 		goto cleanup;
1533 	else if (!has_localevents && parent_oom_events <= 0)
1534 		goto cleanup;
1535 
1536 	ret = KSFT_PASS;
1537 
1538 cleanup:
1539 	if (child)
1540 		cg_destroy(child);
1541 	if (parent)
1542 		cg_destroy(parent);
1543 	free(child);
1544 	free(parent);
1545 
1546 	return ret;
1547 }
1548 
1549 /*
1550  * This test disables swapping and tries to allocate anonymous memory
1551  * up to OOM with memory.group.oom set. Then it checks that all
1552  * processes in the parent and leaf were killed.
1553  */
1554 static int test_memcg_oom_group_parent_events(const char *root)
1555 {
1556 	int ret = KSFT_FAIL;
1557 	char *parent, *child;
1558 
1559 	parent = cg_name(root, "memcg_test_0");
1560 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1561 
1562 	if (!parent || !child)
1563 		goto cleanup;
1564 
1565 	if (cg_create(parent))
1566 		goto cleanup;
1567 
1568 	if (cg_create(child))
1569 		goto cleanup;
1570 
1571 	if (cg_write(parent, "memory.max", "80M"))
1572 		goto cleanup;
1573 
1574 	if (cg_write(parent, "memory.swap.max", "0"))
1575 		goto cleanup;
1576 
1577 	if (cg_write(parent, "memory.oom.group", "1"))
1578 		goto cleanup;
1579 
1580 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1581 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1582 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1583 
1584 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1585 		goto cleanup;
1586 
1587 	if (cg_test_proc_killed(child))
1588 		goto cleanup;
1589 	if (cg_test_proc_killed(parent))
1590 		goto cleanup;
1591 
1592 	ret = KSFT_PASS;
1593 
1594 cleanup:
1595 	if (child)
1596 		cg_destroy(child);
1597 	if (parent)
1598 		cg_destroy(parent);
1599 	free(child);
1600 	free(parent);
1601 
1602 	return ret;
1603 }
1604 
1605 /*
1606  * This test disables swapping and tries to allocate anonymous memory
1607  * up to OOM with memory.group.oom set. Then it checks that all
1608  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1609  */
1610 static int test_memcg_oom_group_score_events(const char *root)
1611 {
1612 	int ret = KSFT_FAIL;
1613 	char *memcg;
1614 	int safe_pid;
1615 
1616 	memcg = cg_name(root, "memcg_test_0");
1617 
1618 	if (!memcg)
1619 		goto cleanup;
1620 
1621 	if (cg_create(memcg))
1622 		goto cleanup;
1623 
1624 	if (cg_write(memcg, "memory.max", "50M"))
1625 		goto cleanup;
1626 
1627 	if (cg_write(memcg, "memory.swap.max", "0"))
1628 		goto cleanup;
1629 
1630 	if (cg_write(memcg, "memory.oom.group", "1"))
1631 		goto cleanup;
1632 
1633 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1634 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1635 		goto cleanup;
1636 
1637 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1638 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1639 		goto cleanup;
1640 
1641 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1642 		goto cleanup;
1643 
1644 	if (kill(safe_pid, SIGKILL))
1645 		goto cleanup;
1646 
1647 	ret = KSFT_PASS;
1648 
1649 cleanup:
1650 	if (memcg)
1651 		cg_destroy(memcg);
1652 	free(memcg);
1653 
1654 	return ret;
1655 }
1656 
/*
 * Read a single inotify event and compare it with the expected one.
 *
 * @inotify_fd:     descriptor to read the event from.
 * @expected_event: mask the event must carry.
 * @expected_wd:    watch descriptor the event must refer to.
 *
 * Returns 0 when a full struct inotify_event was read and both fields
 * match. Returns -1 on a failed or short read, or on a mismatch (which is
 * also reported on stderr).
 */
static int read_event(int inotify_fd, int expected_event, int expected_wd)
{
	struct inotify_event ev;
	ssize_t nbytes = read(inotify_fd, &ev, sizeof(ev));

	/* Covers read errors (-1) as well as truncated events. */
	if (nbytes < (ssize_t)sizeof(ev))
		return -1;

	if (ev.mask == expected_event && ev.wd == expected_wd)
		return 0;

	fprintf(stderr,
		"event does not match expected values: mask %d (expected %d) wd %d (expected %d)\n",
		ev.mask, expected_event, ev.wd, expected_wd);
	return -1;
}
1675 
1676 static int test_memcg_inotify_delete_file(const char *root)
1677 {
1678 	int ret = KSFT_FAIL;
1679 	char *memcg = NULL;
1680 	int fd, wd;
1681 
1682 	memcg = cg_name(root, "memcg_test_0");
1683 
1684 	if (!memcg)
1685 		goto cleanup;
1686 
1687 	if (cg_create(memcg))
1688 		goto cleanup;
1689 
1690 	fd = inotify_init1(0);
1691 	if (fd == -1)
1692 		goto cleanup;
1693 
1694 	wd = inotify_add_watch(fd, cg_control(memcg, "memory.events"), IN_DELETE_SELF);
1695 	if (wd == -1)
1696 		goto cleanup;
1697 
1698 	if (cg_destroy(memcg))
1699 		goto cleanup;
1700 	free(memcg);
1701 	memcg = NULL;
1702 
1703 	if (read_event(fd, IN_DELETE_SELF, wd))
1704 		goto cleanup;
1705 
1706 	if (read_event(fd, IN_IGNORED, wd))
1707 		goto cleanup;
1708 
1709 	ret = KSFT_PASS;
1710 
1711 cleanup:
1712 	if (fd >= 0)
1713 		close(fd);
1714 	if (memcg)
1715 		cg_destroy(memcg);
1716 	free(memcg);
1717 
1718 	return ret;
1719 }
1720 
1721 static int test_memcg_inotify_delete_dir(const char *root)
1722 {
1723 	int ret = KSFT_FAIL;
1724 	char *memcg = NULL;
1725 	int fd, wd;
1726 
1727 	memcg = cg_name(root, "memcg_test_0");
1728 
1729 	if (!memcg)
1730 		goto cleanup;
1731 
1732 	if (cg_create(memcg))
1733 		goto cleanup;
1734 
1735 	fd = inotify_init1(0);
1736 	if (fd == -1)
1737 		goto cleanup;
1738 
1739 	wd = inotify_add_watch(fd, memcg, IN_DELETE_SELF);
1740 	if (wd == -1)
1741 		goto cleanup;
1742 
1743 	if (cg_destroy(memcg))
1744 		goto cleanup;
1745 	free(memcg);
1746 	memcg = NULL;
1747 
1748 	if (read_event(fd, IN_DELETE_SELF, wd))
1749 		goto cleanup;
1750 
1751 	if (read_event(fd, IN_IGNORED, wd))
1752 		goto cleanup;
1753 
1754 	ret = KSFT_PASS;
1755 
1756 cleanup:
1757 	if (fd >= 0)
1758 		close(fd);
1759 	if (memcg)
1760 		cg_destroy(memcg);
1761 	free(memcg);
1762 
1763 	return ret;
1764 }
1765 
/*
 * Table of test cases, run by main() in the order listed here.
 *
 * T(x) builds one entry from a function identifier: the function pointer
 * plus its stringified name, which main() prints with the result.
 */
#define T(x) { x, #x }
struct memcg_test {
	/* Test body; receives the cgroup root path and returns a KSFT_* code. */
	int (*fn)(const char *root);
	/* Name reported for this test. */
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
	T(test_memcg_inotify_delete_file),
	T(test_memcg_inotify_delete_dir),
};
/* T() is only meant for building the table above. */
#undef T
1789 
1790 int main(int argc, char **argv)
1791 {
1792 	char root[PATH_MAX];
1793 	int i, proc_status;
1794 
1795 	page_size = sysconf(_SC_PAGE_SIZE);
1796 	if (page_size <= 0)
1797 		page_size = BUF_SIZE;
1798 
1799 	ksft_print_header();
1800 	ksft_set_plan(ARRAY_SIZE(tests));
1801 	if (cg_find_unified_root(root, sizeof(root), NULL))
1802 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1803 
1804 	/*
1805 	 * Check that memory controller is available:
1806 	 * memory is listed in cgroup.controllers
1807 	 */
1808 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1809 		ksft_exit_skip("memory controller isn't available\n");
1810 
1811 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1812 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1813 			ksft_exit_skip("Failed to set memory controller\n");
1814 
1815 	proc_status = proc_mount_contains("memory_recursiveprot");
1816 	if (proc_status < 0)
1817 		ksft_exit_skip("Failed to query cgroup mount option\n");
1818 	has_recursiveprot = proc_status;
1819 
1820 	proc_status = proc_mount_contains("memory_localevents");
1821 	if (proc_status < 0)
1822 		ksft_exit_skip("Failed to query cgroup mount option\n");
1823 	has_localevents = proc_status;
1824 
1825 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1826 		switch (tests[i].fn(root)) {
1827 		case KSFT_PASS:
1828 			ksft_test_result_pass("%s\n", tests[i].name);
1829 			break;
1830 		case KSFT_SKIP:
1831 			ksft_test_result_skip("%s\n", tests[i].name);
1832 			break;
1833 		default:
1834 			ksft_test_result_fail("%s\n", tests[i].name);
1835 			break;
1836 		}
1837 	}
1838 
1839 	ksft_finished();
1840 }
1841