1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 4 #include <linux/limits.h> 5 #include <fcntl.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 #include <sys/stat.h> 10 #include <sys/types.h> 11 #include <unistd.h> 12 #include <sys/wait.h> 13 #include <errno.h> 14 #include <sys/sysinfo.h> 15 #include <pthread.h> 16 17 #include "kselftest.h" 18 #include "cgroup_util.h" 19 20 21 /* 22 * Memory cgroup charging is performed using percpu batches 64 pages 23 * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So 24 * the maximum discrepancy between charge and vmstat entries is number 25 * of cpus multiplied by 64 pages. 26 */ 27 #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) 28 29 #define KMEM_DEAD_WAIT_RETRIES 80 30 31 static int alloc_dcache(const char *cgroup, void *arg) 32 { 33 unsigned long i; 34 struct stat st; 35 char buf[128]; 36 37 for (i = 0; i < (unsigned long)arg; i++) { 38 snprintf(buf, sizeof(buf), 39 "/something-non-existent-with-a-long-name-%64lu-%d", 40 i, getpid()); 41 stat(buf, &st); 42 } 43 44 return 0; 45 } 46 47 /* 48 * This test allocates 100000 of negative dentries with long names. 49 * Then it checks that "slab" in memory.stat is larger than 1M. 50 * Then it sets memory.high to 1M and checks that at least 1/2 51 * of slab memory has been reclaimed. 52 */ 53 static int test_kmem_basic(const char *root) 54 { 55 int ret = KSFT_FAIL; 56 char *cg = NULL; 57 long slab0, slab1, current; 58 59 cg = cg_name(root, "kmem_basic_test"); 60 if (!cg) 61 goto cleanup; 62 63 if (cg_create(cg)) 64 goto cleanup; 65 66 if (cg_run(cg, alloc_dcache, (void *)100000)) 67 goto cleanup; 68 69 slab0 = cg_read_key_long(cg, "memory.stat", "slab "); 70 if (slab0 < (1 << 20)) 71 goto cleanup; 72 73 cg_write(cg, "memory.high", "1M"); 74 75 /* wait for RCU freeing */ 76 sleep(1); 77 78 slab1 = cg_read_key_long(cg, "memory.stat", "slab "); 79 if (slab1 < 0) 80 goto cleanup; 81 82 current = cg_read_long(cg, "memory.current"); 83 if (current < 0) 84 goto cleanup; 85 86 if (slab1 < slab0 / 2 && current < slab0 / 2) 87 ret = KSFT_PASS; 88 cleanup: 89 cg_destroy(cg); 90 free(cg); 91 92 return ret; 93 } 94 95 static void *alloc_kmem_fn(void *arg) 96 { 97 alloc_dcache(NULL, (void *)100); 98 return NULL; 99 } 100 101 static int alloc_kmem_smp(const char *cgroup, void *arg) 102 { 103 int nr_threads = 2 * get_nprocs(); 104 pthread_t *tinfo; 105 unsigned long i; 106 int ret = -1; 107 108 tinfo = calloc(nr_threads, sizeof(pthread_t)); 109 if (tinfo == NULL) 110 return -1; 111 112 for (i = 0; i < nr_threads; i++) { 113 if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn, 114 (void *)i)) { 115 free(tinfo); 116 return -1; 117 } 118 } 119 120 for (i = 0; i < nr_threads; i++) { 121 ret = pthread_join(tinfo[i], NULL); 122 if (ret) 123 break; 124 } 125 126 free(tinfo); 127 return ret; 128 } 129 130 static int cg_run_in_subcgroups(const char *parent, 131 int (*fn)(const char *cgroup, void *arg), 132 void *arg, int times) 133 { 134 char *child; 135 int i; 136 137 for (i = 0; i < times; i++) { 138 child = cg_name_indexed(parent, "child", i); 139 if (!child) 140 return -1; 141 142 if (cg_create(child)) { 143 cg_destroy(child); 144 free(child); 145 return -1; 146 } 147 148 if (cg_run(child, fn, NULL)) { 149 cg_destroy(child); 150 free(child); 151 return -1; 152 } 153 154 cg_destroy(child); 155 free(child); 156 } 157 158 return 0; 159 } 160 161 /* 162 * The test creates and destroys a large number of cgroups. In each cgroup it 163 * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS 164 * threads. Then it checks the sanity of numbers on the parent level: 165 * the total size of the cgroups should be roughly equal to 166 * anon + file + kernel + sock. 167 */ 168 static int test_kmem_memcg_deletion(const char *root) 169 { 170 long current, anon, file, kernel, sock, sum; 171 int ret = KSFT_FAIL; 172 char *parent; 173 174 parent = cg_name(root, "kmem_memcg_deletion_test"); 175 if (!parent) 176 goto cleanup; 177 178 if (cg_create(parent)) 179 goto cleanup; 180 181 if (cg_write(parent, "cgroup.subtree_control", "+memory")) 182 goto cleanup; 183 184 if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100)) 185 goto cleanup; 186 187 current = cg_read_long(parent, "memory.current"); 188 anon = cg_read_key_long(parent, "memory.stat", "anon "); 189 file = cg_read_key_long(parent, "memory.stat", "file "); 190 kernel = cg_read_key_long(parent, "memory.stat", "kernel "); 191 sock = cg_read_key_long(parent, "memory.stat", "sock "); 192 if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0) 193 goto cleanup; 194 195 sum = anon + file + kernel + sock; 196 if (labs(sum - current) < MAX_VMSTAT_ERROR) { 197 ret = KSFT_PASS; 198 } else { 199 printf("memory.current = %ld\n", current); 200 printf("anon + file + kernel + sock = %ld\n", sum); 201 printf("anon = %ld\n", anon); 202 printf("file = %ld\n", file); 203 printf("kernel = %ld\n", kernel); 204 printf("sock = %ld\n", sock); 205 } 206 207 cleanup: 208 cg_destroy(parent); 209 free(parent); 210 211 return ret; 212 } 213 214 /* 215 * The test reads the entire /proc/kpagecgroup. If the operation went 216 * successfully (and the kernel didn't panic), the test is treated as passed. 217 */ 218 static int test_kmem_proc_kpagecgroup(const char *root) 219 { 220 unsigned long buf[128]; 221 int ret = KSFT_FAIL; 222 ssize_t len; 223 int fd; 224 225 fd = open("/proc/kpagecgroup", O_RDONLY); 226 if (fd < 0) 227 return ret; 228 229 do { 230 len = read(fd, buf, sizeof(buf)); 231 } while (len > 0); 232 233 if (len == 0) 234 ret = KSFT_PASS; 235 236 close(fd); 237 return ret; 238 } 239 240 static void *pthread_wait_fn(void *arg) 241 { 242 sleep(100); 243 return NULL; 244 } 245 246 static int spawn_1000_threads(const char *cgroup, void *arg) 247 { 248 int nr_threads = 1000; 249 pthread_t *tinfo; 250 unsigned long i; 251 long stack; 252 int ret = -1; 253 254 tinfo = calloc(nr_threads, sizeof(pthread_t)); 255 if (tinfo == NULL) 256 return -1; 257 258 for (i = 0; i < nr_threads; i++) { 259 if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn, 260 (void *)i)) { 261 free(tinfo); 262 return(-1); 263 } 264 } 265 266 stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack "); 267 if (stack >= 4096 * 1000) 268 ret = 0; 269 270 free(tinfo); 271 return ret; 272 } 273 274 /* 275 * The test spawns a process, which spawns 1000 threads. Then it checks 276 * that memory.stat's kernel_stack is at least 1000 pages large. 277 */ 278 static int test_kmem_kernel_stacks(const char *root) 279 { 280 int ret = KSFT_FAIL; 281 char *cg = NULL; 282 283 cg = cg_name(root, "kmem_kernel_stacks_test"); 284 if (!cg) 285 goto cleanup; 286 287 if (cg_create(cg)) 288 goto cleanup; 289 290 if (cg_run(cg, spawn_1000_threads, NULL)) 291 goto cleanup; 292 293 ret = KSFT_PASS; 294 cleanup: 295 cg_destroy(cg); 296 free(cg); 297 298 return ret; 299 } 300 301 /* 302 * This test sequentionally creates 30 child cgroups, allocates some 303 * kernel memory in each of them, and deletes them. Then it checks 304 * that the number of dying cgroups on the parent level is 0. 305 */ 306 static int test_kmem_dead_cgroups(const char *root) 307 { 308 int ret = KSFT_FAIL; 309 char *parent; 310 long dead = -1; 311 312 parent = cg_name(root, "kmem_dead_cgroups_test"); 313 if (!parent) 314 goto cleanup; 315 316 if (cg_create(parent)) 317 goto cleanup; 318 319 if (cg_write(parent, "cgroup.subtree_control", "+memory")) 320 goto cleanup; 321 322 if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) 323 goto cleanup; 324 325 /* 326 * Allow up to ~8s for reclaim of dying descendants to complete. 327 * This is a generous upper bound derived from stress testing, not 328 * from a specific kernel constant, and can be adjusted if reclaim 329 * behavior changes in the future. 330 */ 331 dead = cg_read_key_long_poll(parent, "cgroup.stat", 332 "nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES, 333 DEFAULT_WAIT_INTERVAL_US); 334 if (dead) 335 goto cleanup; 336 337 ret = KSFT_PASS; 338 339 cleanup: 340 cg_destroy(parent); 341 free(parent); 342 343 return ret; 344 } 345 346 /* 347 * This test creates a sub-tree with 1000 memory cgroups. 348 * Then it checks that the memory.current on the parent level 349 * is greater than 0 and approximates matches the percpu value 350 * from memory.stat. 351 */ 352 static int test_percpu_basic(const char *root) 353 { 354 int ret = KSFT_FAIL; 355 char *parent, *child; 356 long current, percpu; 357 int i; 358 359 parent = cg_name(root, "percpu_basic_test"); 360 if (!parent) 361 goto cleanup; 362 363 if (cg_create(parent)) 364 goto cleanup; 365 366 if (cg_write(parent, "cgroup.subtree_control", "+memory")) 367 goto cleanup; 368 369 for (i = 0; i < 1000; i++) { 370 child = cg_name_indexed(parent, "child", i); 371 if (!child) { 372 ret = -1; 373 goto cleanup_children; 374 } 375 376 if (cg_create(child)) { 377 free(child); 378 goto cleanup_children; 379 } 380 381 free(child); 382 } 383 384 current = cg_read_long(parent, "memory.current"); 385 percpu = cg_read_key_long(parent, "memory.stat", "percpu "); 386 387 if (current > 0 && percpu > 0 && labs(current - percpu) < 388 MAX_VMSTAT_ERROR) 389 ret = KSFT_PASS; 390 else 391 printf("memory.current %ld\npercpu %ld\n", 392 current, percpu); 393 394 cleanup_children: 395 for (i = 0; i < 1000; i++) { 396 child = cg_name_indexed(parent, "child", i); 397 cg_destroy(child); 398 free(child); 399 } 400 401 cleanup: 402 cg_destroy(parent); 403 free(parent); 404 405 return ret; 406 } 407 408 #define T(x) { x, #x } 409 struct kmem_test { 410 int (*fn)(const char *root); 411 const char *name; 412 } tests[] = { 413 T(test_kmem_basic), 414 T(test_kmem_memcg_deletion), 415 T(test_kmem_proc_kpagecgroup), 416 T(test_kmem_kernel_stacks), 417 T(test_kmem_dead_cgroups), 418 T(test_percpu_basic), 419 }; 420 #undef T 421 422 int main(int argc, char **argv) 423 { 424 char root[PATH_MAX]; 425 int i; 426 427 ksft_print_header(); 428 ksft_set_plan(ARRAY_SIZE(tests)); 429 if (cg_find_unified_root(root, sizeof(root), NULL)) 430 ksft_exit_skip("cgroup v2 isn't mounted\n"); 431 432 /* 433 * Check that memory controller is available: 434 * memory is listed in cgroup.controllers 435 */ 436 if (cg_read_strstr(root, "cgroup.controllers", "memory")) 437 ksft_exit_skip("memory controller isn't available\n"); 438 439 if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) 440 if (cg_write(root, "cgroup.subtree_control", "+memory")) 441 ksft_exit_skip("Failed to set memory controller\n"); 442 443 for (i = 0; i < ARRAY_SIZE(tests); i++) { 444 switch (tests[i].fn(root)) { 445 case KSFT_PASS: 446 ksft_test_result_pass("%s\n", tests[i].name); 447 break; 448 case KSFT_SKIP: 449 ksft_test_result_skip("%s\n", tests[i].name); 450 break; 451 default: 452 ksft_test_result_fail("%s\n", tests[i].name); 453 break; 454 } 455 } 456 457 ksft_finished(); 458 } 459