1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 4 #include <linux/limits.h> 5 #include <unistd.h> 6 #include <stdio.h> 7 #include <signal.h> 8 #include <sys/sysinfo.h> 9 #include <string.h> 10 #include <sys/wait.h> 11 #include <sys/mman.h> 12 13 #include "../kselftest.h" 14 #include "cgroup_util.h" 15 16 static int read_int(const char *path, size_t *value) 17 { 18 FILE *file; 19 int ret = 0; 20 21 file = fopen(path, "r"); 22 if (!file) 23 return -1; 24 if (fscanf(file, "%ld", value) != 1) 25 ret = -1; 26 fclose(file); 27 return ret; 28 } 29 30 static int set_min_free_kb(size_t value) 31 { 32 FILE *file; 33 int ret; 34 35 file = fopen("/proc/sys/vm/min_free_kbytes", "w"); 36 if (!file) 37 return -1; 38 ret = fprintf(file, "%ld\n", value); 39 fclose(file); 40 return ret; 41 } 42 43 static int read_min_free_kb(size_t *value) 44 { 45 return read_int("/proc/sys/vm/min_free_kbytes", value); 46 } 47 48 static int get_zswap_stored_pages(size_t *value) 49 { 50 return read_int("/sys/kernel/debug/zswap/stored_pages", value); 51 } 52 53 static long get_cg_wb_count(const char *cg) 54 { 55 return cg_read_key_long(cg, "memory.stat", "zswpwb"); 56 } 57 58 static long get_zswpout(const char *cgroup) 59 { 60 return cg_read_key_long(cgroup, "memory.stat", "zswpout "); 61 } 62 63 static int allocate_and_read_bytes(const char *cgroup, void *arg) 64 { 65 size_t size = (size_t)arg; 66 char *mem = (char *)malloc(size); 67 int ret = 0; 68 69 if (!mem) 70 return -1; 71 for (int i = 0; i < size; i += 4095) 72 mem[i] = 'a'; 73 74 /* Go through the allocated memory to (z)swap in and out pages */ 75 for (int i = 0; i < size; i += 4095) { 76 if (mem[i] != 'a') 77 ret = -1; 78 } 79 80 free(mem); 81 return ret; 82 } 83 84 static int allocate_bytes(const char *cgroup, void *arg) 85 { 86 size_t size = (size_t)arg; 87 char *mem = (char *)malloc(size); 88 89 if (!mem) 90 return -1; 91 for (int i = 0; i < size; i += 4095) 92 mem[i] = 'a'; 93 free(mem); 94 return 0; 95 } 96 97 static char *setup_test_group_1M(const char *root, const char *name) 98 { 99 char *group_name = cg_name(root, name); 100 101 if (!group_name) 102 return NULL; 103 if (cg_create(group_name)) 104 goto fail; 105 if (cg_write(group_name, "memory.max", "1M")) { 106 cg_destroy(group_name); 107 goto fail; 108 } 109 return group_name; 110 fail: 111 free(group_name); 112 return NULL; 113 } 114 115 /* 116 * Sanity test to check that pages are written into zswap. 117 */ 118 static int test_zswap_usage(const char *root) 119 { 120 long zswpout_before, zswpout_after; 121 int ret = KSFT_FAIL; 122 char *test_group; 123 124 test_group = cg_name(root, "no_shrink_test"); 125 if (!test_group) 126 goto out; 127 if (cg_create(test_group)) 128 goto out; 129 if (cg_write(test_group, "memory.max", "1M")) 130 goto out; 131 132 zswpout_before = get_zswpout(test_group); 133 if (zswpout_before < 0) { 134 ksft_print_msg("Failed to get zswpout\n"); 135 goto out; 136 } 137 138 /* Allocate more than memory.max to push memory into zswap */ 139 if (cg_run(test_group, allocate_bytes, (void *)MB(4))) 140 goto out; 141 142 /* Verify that pages come into zswap */ 143 zswpout_after = get_zswpout(test_group); 144 if (zswpout_after <= zswpout_before) { 145 ksft_print_msg("zswpout does not increase after test program\n"); 146 goto out; 147 } 148 ret = KSFT_PASS; 149 150 out: 151 cg_destroy(test_group); 152 free(test_group); 153 return ret; 154 } 155 156 /* 157 * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for 158 * the cgroup. 159 */ 160 static int test_swapin_nozswap(const char *root) 161 { 162 int ret = KSFT_FAIL; 163 char *test_group; 164 long swap_peak, zswpout; 165 166 test_group = cg_name(root, "no_zswap_test"); 167 if (!test_group) 168 goto out; 169 if (cg_create(test_group)) 170 goto out; 171 if (cg_write(test_group, "memory.max", "8M")) 172 goto out; 173 if (cg_write(test_group, "memory.zswap.max", "0")) 174 goto out; 175 176 /* Allocate and read more than memory.max to trigger swapin */ 177 if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) 178 goto out; 179 180 /* Verify that pages are swapped out, but no zswap happened */ 181 swap_peak = cg_read_long(test_group, "memory.swap.peak"); 182 if (swap_peak < 0) { 183 ksft_print_msg("failed to get cgroup's swap_peak\n"); 184 goto out; 185 } 186 187 if (swap_peak < MB(24)) { 188 ksft_print_msg("at least 24MB of memory should be swapped out\n"); 189 goto out; 190 } 191 192 zswpout = get_zswpout(test_group); 193 if (zswpout < 0) { 194 ksft_print_msg("failed to get zswpout\n"); 195 goto out; 196 } 197 198 if (zswpout > 0) { 199 ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n"); 200 goto out; 201 } 202 203 ret = KSFT_PASS; 204 205 out: 206 cg_destroy(test_group); 207 free(test_group); 208 return ret; 209 } 210 211 /* Simple test to verify the (z)swapin code paths */ 212 static int test_zswapin(const char *root) 213 { 214 int ret = KSFT_FAIL; 215 char *test_group; 216 long zswpin; 217 218 test_group = cg_name(root, "zswapin_test"); 219 if (!test_group) 220 goto out; 221 if (cg_create(test_group)) 222 goto out; 223 if (cg_write(test_group, "memory.max", "8M")) 224 goto out; 225 if (cg_write(test_group, "memory.zswap.max", "max")) 226 goto out; 227 228 /* Allocate and read more than memory.max to trigger (z)swap in */ 229 if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) 230 goto out; 231 232 zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin "); 233 if (zswpin < 0) { 234 ksft_print_msg("failed to get zswpin\n"); 235 goto out; 236 } 237 238 if (zswpin < MB(24) / PAGE_SIZE) { 239 ksft_print_msg("at least 24MB should be brought back from zswap\n"); 240 goto out; 241 } 242 243 ret = KSFT_PASS; 244 245 out: 246 cg_destroy(test_group); 247 free(test_group); 248 return ret; 249 } 250 251 /* 252 * Attempt writeback with the following steps: 253 * 1. Allocate memory. 254 * 2. Reclaim memory equal to the amount that was allocated in step 1. 255 This will move it into zswap. 256 * 3. Save current zswap usage. 257 * 4. Move the memory allocated in step 1 back in from zswap. 258 * 5. Set zswap.max to half the amount that was recorded in step 3. 259 * 6. Attempt to reclaim memory equal to the amount that was allocated, 260 this will either trigger writeback if it's enabled, or reclamation 261 will fail if writeback is disabled as there isn't enough zswap space. 262 */ 263 static int attempt_writeback(const char *cgroup, void *arg) 264 { 265 long pagesize = sysconf(_SC_PAGESIZE); 266 char *test_group = arg; 267 size_t memsize = MB(4); 268 char buf[pagesize]; 269 long zswap_usage; 270 bool wb_enabled; 271 int ret = -1; 272 char *mem; 273 274 wb_enabled = cg_read_long(test_group, "memory.zswap.writeback"); 275 mem = (char *)malloc(memsize); 276 if (!mem) 277 return ret; 278 279 /* 280 * Fill half of each page with increasing data, and keep other 281 * half empty, this will result in data that is still compressible 282 * and ends up in zswap, with material zswap usage. 283 */ 284 for (int i = 0; i < pagesize; i++) 285 buf[i] = i < pagesize/2 ? (char) i : 0; 286 287 for (int i = 0; i < memsize; i += pagesize) 288 memcpy(&mem[i], buf, pagesize); 289 290 /* Try and reclaim allocated memory */ 291 if (cg_write_numeric(test_group, "memory.reclaim", memsize)) { 292 ksft_print_msg("Failed to reclaim all of the requested memory\n"); 293 goto out; 294 } 295 296 zswap_usage = cg_read_long(test_group, "memory.zswap.current"); 297 298 /* zswpin */ 299 for (int i = 0; i < memsize; i += pagesize) { 300 if (memcmp(&mem[i], buf, pagesize)) { 301 ksft_print_msg("invalid memory\n"); 302 goto out; 303 } 304 } 305 306 if (cg_write_numeric(test_group, "memory.zswap.max", zswap_usage/2)) 307 goto out; 308 309 /* 310 * If writeback is enabled, trying to reclaim memory now will trigger a 311 * writeback as zswap.max is half of what was needed when reclaim ran the first time. 312 * If writeback is disabled, memory reclaim will fail as zswap is limited and 313 * it can't writeback to swap. 314 */ 315 ret = cg_write_numeric(test_group, "memory.reclaim", memsize); 316 if (!wb_enabled) 317 ret = (ret == -EAGAIN) ? 0 : -1; 318 319 out: 320 free(mem); 321 return ret; 322 } 323 324 /* Test to verify the zswap writeback path */ 325 static int test_zswap_writeback(const char *root, bool wb) 326 { 327 long zswpwb_before, zswpwb_after; 328 int ret = KSFT_FAIL; 329 char *test_group; 330 331 test_group = cg_name(root, "zswap_writeback_test"); 332 if (!test_group) 333 goto out; 334 if (cg_create(test_group)) 335 goto out; 336 if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) 337 goto out; 338 339 zswpwb_before = get_cg_wb_count(test_group); 340 if (zswpwb_before != 0) { 341 ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); 342 goto out; 343 } 344 345 if (cg_run(test_group, attempt_writeback, (void *) test_group)) 346 goto out; 347 348 /* Verify that zswap writeback occurred only if writeback was enabled */ 349 zswpwb_after = get_cg_wb_count(test_group); 350 if (zswpwb_after < 0) 351 goto out; 352 353 if (wb != !!zswpwb_after) { 354 ksft_print_msg("zswpwb_after is %ld while wb is %s", 355 zswpwb_after, wb ? "enabled" : "disabled"); 356 goto out; 357 } 358 359 ret = KSFT_PASS; 360 361 out: 362 cg_destroy(test_group); 363 free(test_group); 364 return ret; 365 } 366 367 static int test_zswap_writeback_enabled(const char *root) 368 { 369 return test_zswap_writeback(root, true); 370 } 371 372 static int test_zswap_writeback_disabled(const char *root) 373 { 374 return test_zswap_writeback(root, false); 375 } 376 377 /* 378 * When trying to store a memcg page in zswap, if the memcg hits its memory 379 * limit in zswap, writeback should affect only the zswapped pages of that 380 * memcg. 381 */ 382 static int test_no_invasive_cgroup_shrink(const char *root) 383 { 384 int ret = KSFT_FAIL; 385 size_t control_allocation_size = MB(10); 386 char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; 387 388 wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); 389 if (!wb_group) 390 return KSFT_FAIL; 391 if (cg_write(wb_group, "memory.zswap.max", "10K")) 392 goto out; 393 control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); 394 if (!control_group) 395 goto out; 396 397 /* Push some test_group2 memory into zswap */ 398 if (cg_enter_current(control_group)) 399 goto out; 400 control_allocation = malloc(control_allocation_size); 401 for (int i = 0; i < control_allocation_size; i += 4095) 402 control_allocation[i] = 'a'; 403 if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) 404 goto out; 405 406 /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ 407 if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) 408 goto out; 409 410 /* Verify that only zswapped memory from gwb_group has been written back */ 411 if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) 412 ret = KSFT_PASS; 413 out: 414 cg_enter_current(root); 415 if (control_group) { 416 cg_destroy(control_group); 417 free(control_group); 418 } 419 cg_destroy(wb_group); 420 free(wb_group); 421 if (control_allocation) 422 free(control_allocation); 423 return ret; 424 } 425 426 struct no_kmem_bypass_child_args { 427 size_t target_alloc_bytes; 428 size_t child_allocated; 429 }; 430 431 static int no_kmem_bypass_child(const char *cgroup, void *arg) 432 { 433 struct no_kmem_bypass_child_args *values = arg; 434 void *allocation; 435 436 allocation = malloc(values->target_alloc_bytes); 437 if (!allocation) { 438 values->child_allocated = true; 439 return -1; 440 } 441 for (long i = 0; i < values->target_alloc_bytes; i += 4095) 442 ((char *)allocation)[i] = 'a'; 443 values->child_allocated = true; 444 pause(); 445 free(allocation); 446 return 0; 447 } 448 449 /* 450 * When pages owned by a memcg are pushed to zswap by kswapd, they should be 451 * charged to that cgroup. This wasn't the case before commit 452 * cd08d80ecdac("mm: correctly charge compressed memory to its memcg"). 453 * 454 * The test first allocates memory in a memcg, then raises min_free_kbytes to 455 * a very high value so that the allocation falls below low wm, then makes 456 * another allocation to trigger kswapd that should push the memcg-owned pages 457 * to zswap and verifies that the zswap pages are correctly charged. 458 * 459 * To be run on a VM with at most 4G of memory. 460 */ 461 static int test_no_kmem_bypass(const char *root) 462 { 463 size_t min_free_kb_high, min_free_kb_low, min_free_kb_original; 464 struct no_kmem_bypass_child_args *values; 465 size_t trigger_allocation_size; 466 int wait_child_iteration = 0; 467 long stored_pages_threshold; 468 struct sysinfo sys_info; 469 int ret = KSFT_FAIL; 470 int child_status; 471 char *test_group = NULL; 472 pid_t child_pid; 473 474 /* Read sys info and compute test values accordingly */ 475 if (sysinfo(&sys_info) != 0) 476 return KSFT_FAIL; 477 if (sys_info.totalram > 5000000000) 478 return KSFT_SKIP; 479 values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ | 480 PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 481 if (values == MAP_FAILED) 482 return KSFT_FAIL; 483 if (read_min_free_kb(&min_free_kb_original)) 484 return KSFT_FAIL; 485 min_free_kb_high = sys_info.totalram / 2000; 486 min_free_kb_low = sys_info.totalram / 500000; 487 values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + 488 sys_info.totalram * 5 / 100; 489 stored_pages_threshold = sys_info.totalram / 5 / 4096; 490 trigger_allocation_size = sys_info.totalram / 20; 491 492 /* Set up test memcg */ 493 test_group = cg_name(root, "kmem_bypass_test"); 494 if (!test_group) 495 goto out; 496 497 /* Spawn memcg child and wait for it to allocate */ 498 set_min_free_kb(min_free_kb_low); 499 if (cg_create(test_group)) 500 goto out; 501 values->child_allocated = false; 502 child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values); 503 if (child_pid < 0) 504 goto out; 505 while (!values->child_allocated && wait_child_iteration++ < 10000) 506 usleep(1000); 507 508 /* Try to wakeup kswapd and let it push child memory to zswap */ 509 set_min_free_kb(min_free_kb_high); 510 for (int i = 0; i < 20; i++) { 511 size_t stored_pages; 512 char *trigger_allocation = malloc(trigger_allocation_size); 513 514 if (!trigger_allocation) 515 break; 516 for (int i = 0; i < trigger_allocation_size; i += 4095) 517 trigger_allocation[i] = 'b'; 518 usleep(100000); 519 free(trigger_allocation); 520 if (get_zswap_stored_pages(&stored_pages)) 521 break; 522 if (stored_pages < 0) 523 break; 524 /* If memory was pushed to zswap, verify it belongs to memcg */ 525 if (stored_pages > stored_pages_threshold) { 526 int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); 527 int delta = stored_pages * 4096 - zswapped; 528 int result_ok = delta < stored_pages * 4096 / 4; 529 530 ret = result_ok ? KSFT_PASS : KSFT_FAIL; 531 break; 532 } 533 } 534 535 kill(child_pid, SIGTERM); 536 waitpid(child_pid, &child_status, 0); 537 out: 538 set_min_free_kb(min_free_kb_original); 539 cg_destroy(test_group); 540 free(test_group); 541 return ret; 542 } 543 544 #define T(x) { x, #x } 545 struct zswap_test { 546 int (*fn)(const char *root); 547 const char *name; 548 } tests[] = { 549 T(test_zswap_usage), 550 T(test_swapin_nozswap), 551 T(test_zswapin), 552 T(test_zswap_writeback_enabled), 553 T(test_zswap_writeback_disabled), 554 T(test_no_kmem_bypass), 555 T(test_no_invasive_cgroup_shrink), 556 }; 557 #undef T 558 559 static bool zswap_configured(void) 560 { 561 return access("/sys/module/zswap", F_OK) == 0; 562 } 563 564 int main(int argc, char **argv) 565 { 566 char root[PATH_MAX]; 567 int i, ret = EXIT_SUCCESS; 568 569 if (cg_find_unified_root(root, sizeof(root), NULL)) 570 ksft_exit_skip("cgroup v2 isn't mounted\n"); 571 572 if (!zswap_configured()) 573 ksft_exit_skip("zswap isn't configured\n"); 574 575 /* 576 * Check that memory controller is available: 577 * memory is listed in cgroup.controllers 578 */ 579 if (cg_read_strstr(root, "cgroup.controllers", "memory")) 580 ksft_exit_skip("memory controller isn't available\n"); 581 582 if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) 583 if (cg_write(root, "cgroup.subtree_control", "+memory")) 584 ksft_exit_skip("Failed to set memory controller\n"); 585 586 for (i = 0; i < ARRAY_SIZE(tests); i++) { 587 switch (tests[i].fn(root)) { 588 case KSFT_PASS: 589 ksft_test_result_pass("%s\n", tests[i].name); 590 break; 591 case KSFT_SKIP: 592 ksft_test_result_skip("%s\n", tests[i].name); 593 break; 594 default: 595 ret = EXIT_FAILURE; 596 ksft_test_result_fail("%s\n", tests[i].name); 597 break; 598 } 599 } 600 601 return ret; 602 } 603