xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision 70a663205d5085f1d82f7058e9419ff7612e9396)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #include <linux/limits.h>
3 #include <linux/oom.h>
4 #include <fcntl.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/stat.h>
9 #include <sys/types.h>
10 #include <unistd.h>
11 #include <sys/socket.h>
12 #include <sys/wait.h>
13 #include <arpa/inet.h>
14 #include <netinet/in.h>
15 #include <netdb.h>
16 #include <errno.h>
17 #include <sys/mman.h>
18 
19 #include "../kselftest.h"
20 #include "cgroup_util.h"
21 
/*
 * Cgroup v2 mount-option flags, set once in main() via
 * proc_mount_contains() and consulted by individual tests.
 */
static bool has_localevents;	/* "memory_localevents" mount option */
static bool has_recursiveprot;	/* "memory_recursiveprot" mount option */
24 
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	/* With +memory delegated, the child must list "memory" */
	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	/* cgroup.controllers must be readable even when no controller is on */
	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* Without +memory in the parent, "memory" must NOT be listed */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/*
	 * NOTE: the labels below fall through on purpose; each entry point
	 * unwinds everything created from that stage onwards.
	 */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
90 
91 static int alloc_anon_50M_check(const char *cgroup, void *arg)
92 {
93 	size_t size = MB(50);
94 	char *buf, *ptr;
95 	long anon, current;
96 	int ret = -1;
97 
98 	buf = malloc(size);
99 	if (buf == NULL) {
100 		fprintf(stderr, "malloc() failed\n");
101 		return -1;
102 	}
103 
104 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
105 		*ptr = 0;
106 
107 	current = cg_read_long(cgroup, "memory.current");
108 	if (current < size)
109 		goto cleanup;
110 
111 	if (!values_close(size, current, 3))
112 		goto cleanup;
113 
114 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
115 	if (anon < 0)
116 		goto cleanup;
117 
118 	if (!values_close(anon, current, 3))
119 		goto cleanup;
120 
121 	ret = 0;
122 cleanup:
123 	free(buf);
124 	return ret;
125 }
126 
/*
 * cg_run() helper: populate 50M of pagecache inside @cgroup and verify
 * that memory.current and the "file" field of memory.stat reflect it.
 * Returns 0 on success.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	const size_t total = MB(50);
	long usage, file_stat;
	int tmp_fd, err = -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, total))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (usage < total)
		goto out;

	file_stat = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file_stat < 0)
		goto out;

	if (!values_close(file_stat, usage, 10))
		goto out;

	err = 0;
out:
	close(tmp_fd);
	return err;
}
158 
159 /*
160  * This test create a memory cgroup, allocates
161  * some anonymous memory and some pagecache
162  * and check memory.current and some memory.stat values.
163  */
164 static int test_memcg_current(const char *root)
165 {
166 	int ret = KSFT_FAIL;
167 	long current;
168 	char *memcg;
169 
170 	memcg = cg_name(root, "memcg_test");
171 	if (!memcg)
172 		goto cleanup;
173 
174 	if (cg_create(memcg))
175 		goto cleanup;
176 
177 	current = cg_read_long(memcg, "memory.current");
178 	if (current != 0)
179 		goto cleanup;
180 
181 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
182 		goto cleanup;
183 
184 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
185 		goto cleanup;
186 
187 	ret = KSFT_PASS;
188 
189 cleanup:
190 	cg_destroy(memcg);
191 	free(memcg);
192 
193 	return ret;
194 }
195 
/*
 * cg_run_nowait() helper: populate 50M of pagecache via the fd passed in
 * @arg and then linger, keeping the charge alive, until the parent test
 * process exits (detected by the reparenting of this child).
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int tmp_fd = (long)arg;
	int parent = getppid();

	if (alloc_pagecache(tmp_fd, MB(50)))
		return -1;

	/* Hold the pagecache until our parent (the test) goes away */
	for (;;) {
		if (getppid() != parent)
			break;
		sleep(1);
	}

	return 0;
}
209 
210 static int alloc_anon_noexit(const char *cgroup, void *arg)
211 {
212 	int ppid = getppid();
213 	size_t size = (unsigned long)arg;
214 	char *buf, *ptr;
215 
216 	buf = malloc(size);
217 	if (buf == NULL) {
218 		fprintf(stderr, "malloc() failed\n");
219 		return -1;
220 	}
221 
222 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
223 		*ptr = 0;
224 
225 	while (getppid() == ppid)
226 		sleep(1);
227 
228 	free(buf);
229 	return 0;
230 }
231 
/*
 * Wait until processes are killed asynchronously by the OOM killer
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempt;

	/* Poll cgroup.procs every 100ms, for up to ~1 second total */
	for (attempt = 0; attempt < 10; attempt++) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
248 
249 static bool reclaim_until(const char *memcg, long goal);
250 
251 /*
252  * First, this test creates the following hierarchy:
253  * A       memory.min = 0,    memory.max = 200M
254  * A/B     memory.min = 50M
255  * A/B/C   memory.min = 75M,  memory.current = 50M
256  * A/B/D   memory.min = 25M,  memory.current = 50M
257  * A/B/E   memory.min = 0,    memory.current = 50M
258  * A/B/F   memory.min = 500M, memory.current = 0
259  *
260  * (or memory.low if we test soft protection)
261  *
262  * Usages are pagecache and the test keeps a running
263  * process in every leaf cgroup.
264  * Then it creates A/G and creates a significant
265  * memory pressure in A.
266  *
267  * Then it checks actual memory usages and expects that:
268  * A/B    memory.current ~= 50M
269  * A/B/C  memory.current ~= 29M
270  * A/B/D  memory.current ~= 21M
271  * A/B/E  memory.current ~= 0
272  * A/B/F  memory.current  = 0
273  * (for origin of the numbers, see model in memcg_protection.m.)
274  *
275  * After that it tries to allocate more than there is
276  * unprotected memory in A available, and checks that:
277  * a) memory.min protects pagecache even in this case,
278  * b) memory.low allows reclaiming page cache with low events.
279  *
280  * Then we try to reclaim from A/B/C using memory.reclaim until its
281  * usage reaches 10M.
282  * This makes sure that:
283  * (a) We ignore the protection of the reclaim target memcg.
284  * (b) The previously calculated emin value (~29M) should be dismissed.
285  */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	/* One body tests both hard (memory.min) and soft (memory.low) limits */
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	/* parent[0] = A, parent[1] = A/B, parent[2] = A/G (the memory hog) */
	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* children[3] (F) stays empty: protected but unused */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* Wait (up to ~6s) for the background children to charge 150M */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create significant memory pressure in A via the sibling A/G */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	/* Overcommit: memory.min must make this fail, memory.low must not */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	/* Reclaim C down to 10M: the target's own protection is ignored */
	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/* Only the protected children (indexes 0 and 1) may see low events */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* Destroy children before parents; skip names that never allocated */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
451 
/* Hard protection variant of test_memcg_protection(): memory.min */
static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}
456 
/* Soft protection variant of test_memcg_protection(): memory.low */
static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}
461 
/*
 * cg_run() helper: with either memory.high or memory.max preset to 30M
 * by the caller, try to cache 50M and verify that actual usage settles
 * near the 30M limit. Returns 0 on success.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t total = MB(50);
	long usage, high_limit, max_limit;
	int tmp_fd, err = -1;

	/* Sanity-check the precondition: one of the two limits is 30M */
	high_limit = cg_read_long(cgroup, "memory.high");
	max_limit = cg_read_long(cgroup, "memory.max");
	if (high_limit != MB(30) && max_limit != MB(30))
		return -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, total))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (!values_close(usage, MB(30), 5))
		goto out;

	err = 0;
out:
	close(tmp_fd);
	return err;
}
492 
/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* The default memory.high must be "max" */
	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	/* memory.high throttles but does not kill: 31M anon must succeed */
	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	/* The 50M pagecache check must FAIL: usage is capped near 30M */
	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	/* Crossing the limit must have generated "high" events */
	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
541 
542 static int alloc_anon_mlock(const char *cgroup, void *arg)
543 {
544 	size_t size = (size_t)arg;
545 	void *buf;
546 
547 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
548 		   0, 0);
549 	if (buf == MAP_FAILED)
550 		return -1;
551 
552 	mlock(buf, size);
553 	munmap(buf, size);
554 	return 0;
555 }
556 
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Snapshot event counters before generating pressure */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* Arm the memory.events listener before starting the allocator */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/* "high" events must have fired; "max" events must not have */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
617 
/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* The default memory.max must be "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	/* Some memory is still cached, but never more than the limit */
	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	/* Hitting the limit must be reflected in "max" events */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
668 
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * Returns false if usage is already below @goal, if memory.reclaim
 * reports an unexpected error code, or if a "successful" write failed
 * to actually bring usage down (memory.reclaim returning 0 incorrectly).
 *
 * Assumes that writing to memory.reclaim is the only source of change
 * in memory.current (no concurrent allocations or reclaim).
 */
static bool reclaim_until(const char *memcg, long goal)
{
	bool made_progress = false;
	int attempt;

	for (attempt = 0; attempt < 5; attempt++) {
		char request[64];
		long usage;
		int err;

		usage = cg_read_long(memcg, "memory.current");
		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* A prior reclaim reported success but usage is unchanged */
		if (made_progress)
			return false;

		snprintf(request, sizeof(request), "%ld", usage - goal);
		err = cg_write(memcg, "memory.reclaim", request);
		if (!err)
			made_progress = true;
		else if (err != -EAGAIN)
			return false;
	}
	return made_progress;
}
710 
/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A freshly created cgroup must start with no charged memory */
	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}
783 
784 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
785 {
786 	long mem_max = (long)arg;
787 	size_t size = MB(50);
788 	char *buf, *ptr;
789 	long mem_current, swap_current;
790 	int ret = -1;
791 
792 	buf = malloc(size);
793 	if (buf == NULL) {
794 		fprintf(stderr, "malloc() failed\n");
795 		return -1;
796 	}
797 
798 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
799 		*ptr = 0;
800 
801 	mem_current = cg_read_long(cgroup, "memory.current");
802 	if (!mem_current || !values_close(mem_current, mem_max, 3))
803 		goto cleanup;
804 
805 	swap_current = cg_read_long(cgroup, "memory.swap.current");
806 	if (!swap_current ||
807 	    !values_close(mem_current + swap_current, size, 3))
808 		goto cleanup;
809 
810 	ret = 0;
811 cleanup:
812 	free(buf);
813 	return ret;
814 }
815 
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Skip if this cgroup already has swapped-out memory */
	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	/* Both limits must default to "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/* 50M anon must fit within 30M RAM + 30M swap without OOM */
	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	/* Hitting the RAM limit must have generated "max" events */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
878 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	/* 100M against a 30M hard limit must end in an OOM kill */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* The killed task must be gone from cgroup.procs */
	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
922 
/* Arguments handed to the forked tcp_server() child */
struct tcp_server_args {
	unsigned short port;	/* port to bind and listen on */
	int ctl[2];		/* pipe; server reports bind status on ctl[1] */
};
927 
/*
 * cg_run_nowait() helper: IPv6 TCP server. Binds to the requested port,
 * reports the bind status to the parent over the control pipe, then
 * streams 1M chunks to the first client until it disconnects.
 * Returns 0 on a clean client hang-up, -1 on any other error.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* Keep only the write end of the control pipe */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/* Report the bind errno (e.g. EADDRINUSE) to the parent */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* Signal successful bind+listen with a zero status */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	/* Stream until the client hangs up; ECONNRESET counts as success */
	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
984 
/*
 * Connect to the tcp_server() on @port and read from the socket while
 * repeatedly checking that memory.current (minus a pre-connection
 * baseline) tracks the "sock" counter of memory.stat.
 * Returns KSFT_PASS once the counters converge, KSFT_FAIL otherwise
 * (or a getaddrinfo() error code on resolution failure).
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	/* Baseline so the checks can exclude memory unrelated to the socket */
	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * %hu, not %hd: port is unsigned and can exceed 32767; a signed
	 * conversion would render it negative and break getaddrinfo().
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1035 
/*
 * This test checks socket memory accounting.
 * The test forks a TCP server listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with check whether
 * memory.current and memory.stat.sock are similar.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Retry a few random ports in case a chosen one is already taken */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		/* Block until the server reports its bind status */
		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	/* Every attempt hit a busy port: skip rather than fail */
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/* After teardown, no socket memory should remain charged */
	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1108 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_events
 * were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	/* Group OOM: a single OOM event kills every task in the child */
	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	/* 100M against the child's 50M limit must end in an OOM kill */
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
1181 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	/* The limit and group-OOM policy live on the PARENT this time */
	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* A parent-level group OOM must empty both cgroups */
	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}
1237 
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.group.oom set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	/* Exempt this task from the group kill via OOM_SCORE_ADJ_MIN */
	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* Exactly three oom_kill events are expected from the group kill */
	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	/* The exempted task must still be alive; reap it ourselves */
	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}
1289 
/* T() pairs a test function with its printable name */
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);	/* returns KSFT_PASS/SKIP/FAIL */
	const char *name;		/* reported via ksft_test_result_*() */
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T
1311 
int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root), NULL))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	/* Enable the memory controller for children if not already enabled */
	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	/* Record cgroup mount options that alter event accounting */
	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}
1358