1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 4 #include <linux/limits.h> 5 #include <unistd.h> 6 #include <stdio.h> 7 #include <signal.h> 8 #include <sys/sysinfo.h> 9 #include <string.h> 10 #include <sys/wait.h> 11 #include <sys/mman.h> 12 13 #include "../kselftest.h" 14 #include "cgroup_util.h" 15 16 static int read_int(const char *path, size_t *value) 17 { 18 FILE *file; 19 int ret = 0; 20 21 file = fopen(path, "r"); 22 if (!file) 23 return -1; 24 if (fscanf(file, "%ld", value) != 1) 25 ret = -1; 26 fclose(file); 27 return ret; 28 } 29 30 static int set_min_free_kb(size_t value) 31 { 32 FILE *file; 33 int ret; 34 35 file = fopen("/proc/sys/vm/min_free_kbytes", "w"); 36 if (!file) 37 return -1; 38 ret = fprintf(file, "%ld\n", value); 39 fclose(file); 40 return ret; 41 } 42 43 static int read_min_free_kb(size_t *value) 44 { 45 return read_int("/proc/sys/vm/min_free_kbytes", value); 46 } 47 48 static int get_zswap_stored_pages(size_t *value) 49 { 50 return read_int("/sys/kernel/debug/zswap/stored_pages", value); 51 } 52 53 static long get_cg_wb_count(const char *cg) 54 { 55 return cg_read_key_long(cg, "memory.stat", "zswpwb"); 56 } 57 58 static long get_zswpout(const char *cgroup) 59 { 60 return cg_read_key_long(cgroup, "memory.stat", "zswpout "); 61 } 62 63 static int allocate_and_read_bytes(const char *cgroup, void *arg) 64 { 65 size_t size = (size_t)arg; 66 char *mem = (char *)malloc(size); 67 int ret = 0; 68 69 if (!mem) 70 return -1; 71 for (int i = 0; i < size; i += 4095) 72 mem[i] = 'a'; 73 74 /* Go through the allocated memory to (z)swap in and out pages */ 75 for (int i = 0; i < size; i += 4095) { 76 if (mem[i] != 'a') 77 ret = -1; 78 } 79 80 free(mem); 81 return ret; 82 } 83 84 static int allocate_bytes(const char *cgroup, void *arg) 85 { 86 size_t size = (size_t)arg; 87 char *mem = (char *)malloc(size); 88 89 if (!mem) 90 return -1; 91 for (int i = 0; i < size; i += 4095) 92 mem[i] = 'a'; 93 free(mem); 94 return 0; 95 } 96 97 static char *setup_test_group_1M(const char *root, const char *name) 98 { 99 char *group_name = cg_name(root, name); 100 101 if (!group_name) 102 return NULL; 103 if (cg_create(group_name)) 104 goto fail; 105 if (cg_write(group_name, "memory.max", "1M")) { 106 cg_destroy(group_name); 107 goto fail; 108 } 109 return group_name; 110 fail: 111 free(group_name); 112 return NULL; 113 } 114 115 /* 116 * Sanity test to check that pages are written into zswap. 117 */ 118 static int test_zswap_usage(const char *root) 119 { 120 long zswpout_before, zswpout_after; 121 int ret = KSFT_FAIL; 122 char *test_group; 123 124 test_group = cg_name(root, "no_shrink_test"); 125 if (!test_group) 126 goto out; 127 if (cg_create(test_group)) 128 goto out; 129 if (cg_write(test_group, "memory.max", "1M")) 130 goto out; 131 132 zswpout_before = get_zswpout(test_group); 133 if (zswpout_before < 0) { 134 ksft_print_msg("Failed to get zswpout\n"); 135 goto out; 136 } 137 138 /* Allocate more than memory.max to push memory into zswap */ 139 if (cg_run(test_group, allocate_bytes, (void *)MB(4))) 140 goto out; 141 142 /* Verify that pages come into zswap */ 143 zswpout_after = get_zswpout(test_group); 144 if (zswpout_after <= zswpout_before) { 145 ksft_print_msg("zswpout does not increase after test program\n"); 146 goto out; 147 } 148 ret = KSFT_PASS; 149 150 out: 151 cg_destroy(test_group); 152 free(test_group); 153 return ret; 154 } 155 156 /* 157 * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for 158 * the cgroup. 159 */ 160 static int test_swapin_nozswap(const char *root) 161 { 162 int ret = KSFT_FAIL; 163 char *test_group; 164 long swap_peak, zswpout; 165 166 test_group = cg_name(root, "no_zswap_test"); 167 if (!test_group) 168 goto out; 169 if (cg_create(test_group)) 170 goto out; 171 if (cg_write(test_group, "memory.max", "8M")) 172 goto out; 173 if (cg_write(test_group, "memory.zswap.max", "0")) 174 goto out; 175 176 /* Allocate and read more than memory.max to trigger swapin */ 177 if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) 178 goto out; 179 180 /* Verify that pages are swapped out, but no zswap happened */ 181 swap_peak = cg_read_long(test_group, "memory.swap.peak"); 182 if (swap_peak < 0) { 183 ksft_print_msg("failed to get cgroup's swap_peak\n"); 184 goto out; 185 } 186 187 if (swap_peak < MB(24)) { 188 ksft_print_msg("at least 24MB of memory should be swapped out\n"); 189 goto out; 190 } 191 192 zswpout = get_zswpout(test_group); 193 if (zswpout < 0) { 194 ksft_print_msg("failed to get zswpout\n"); 195 goto out; 196 } 197 198 if (zswpout > 0) { 199 ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n"); 200 goto out; 201 } 202 203 ret = KSFT_PASS; 204 205 out: 206 cg_destroy(test_group); 207 free(test_group); 208 return ret; 209 } 210 211 /* Simple test to verify the (z)swapin code paths */ 212 static int test_zswapin(const char *root) 213 { 214 int ret = KSFT_FAIL; 215 char *test_group; 216 long zswpin; 217 218 test_group = cg_name(root, "zswapin_test"); 219 if (!test_group) 220 goto out; 221 if (cg_create(test_group)) 222 goto out; 223 if (cg_write(test_group, "memory.max", "8M")) 224 goto out; 225 if (cg_write(test_group, "memory.zswap.max", "max")) 226 goto out; 227 228 /* Allocate and read more than memory.max to trigger (z)swap in */ 229 if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) 230 goto out; 231 232 zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin "); 233 if (zswpin < 0) { 234 ksft_print_msg("failed to get zswpin\n"); 235 goto out; 236 } 237 238 if (zswpin < MB(24) / PAGE_SIZE) { 239 ksft_print_msg("at least 24MB should be brought back from zswap\n"); 240 goto out; 241 } 242 243 ret = KSFT_PASS; 244 245 out: 246 cg_destroy(test_group); 247 free(test_group); 248 return ret; 249 } 250 251 /* 252 * Attempt writeback with the following steps: 253 * 1. Allocate memory. 254 * 2. Reclaim memory equal to the amount that was allocated in step 1. 255 This will move it into zswap. 256 * 3. Save current zswap usage. 257 * 4. Move the memory allocated in step 1 back in from zswap. 258 * 5. Set zswap.max to half the amount that was recorded in step 3. 259 * 6. Attempt to reclaim memory equal to the amount that was allocated, 260 this will either trigger writeback if it's enabled, or reclamation 261 will fail if writeback is disabled as there isn't enough zswap space. 262 */ 263 static int attempt_writeback(const char *cgroup, void *arg) 264 { 265 long pagesize = sysconf(_SC_PAGESIZE); 266 size_t memsize = MB(4); 267 char buf[pagesize]; 268 long zswap_usage; 269 bool wb_enabled = *(bool *) arg; 270 int ret = -1; 271 char *mem; 272 273 mem = (char *)malloc(memsize); 274 if (!mem) 275 return ret; 276 277 /* 278 * Fill half of each page with increasing data, and keep other 279 * half empty, this will result in data that is still compressible 280 * and ends up in zswap, with material zswap usage. 281 */ 282 for (int i = 0; i < pagesize; i++) 283 buf[i] = i < pagesize/2 ? (char) i : 0; 284 285 for (int i = 0; i < memsize; i += pagesize) 286 memcpy(&mem[i], buf, pagesize); 287 288 /* Try and reclaim allocated memory */ 289 if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { 290 ksft_print_msg("Failed to reclaim all of the requested memory\n"); 291 goto out; 292 } 293 294 zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); 295 296 /* zswpin */ 297 for (int i = 0; i < memsize; i += pagesize) { 298 if (memcmp(&mem[i], buf, pagesize)) { 299 ksft_print_msg("invalid memory\n"); 300 goto out; 301 } 302 } 303 304 if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) 305 goto out; 306 307 /* 308 * If writeback is enabled, trying to reclaim memory now will trigger a 309 * writeback as zswap.max is half of what was needed when reclaim ran the first time. 310 * If writeback is disabled, memory reclaim will fail as zswap is limited and 311 * it can't writeback to swap. 312 */ 313 ret = cg_write_numeric(cgroup, "memory.reclaim", memsize); 314 if (!wb_enabled) 315 ret = (ret == -EAGAIN) ? 0 : -1; 316 317 out: 318 free(mem); 319 return ret; 320 } 321 322 static int test_zswap_writeback_one(const char *cgroup, bool wb) 323 { 324 long zswpwb_before, zswpwb_after; 325 326 zswpwb_before = get_cg_wb_count(cgroup); 327 if (zswpwb_before != 0) { 328 ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before); 329 return -1; 330 } 331 332 if (cg_run(cgroup, attempt_writeback, (void *) &wb)) 333 return -1; 334 335 /* Verify that zswap writeback occurred only if writeback was enabled */ 336 zswpwb_after = get_cg_wb_count(cgroup); 337 if (zswpwb_after < 0) 338 return -1; 339 340 if (wb != !!zswpwb_after) { 341 ksft_print_msg("zswpwb_after is %ld while wb is %s", 342 zswpwb_after, wb ? "enabled" : "disabled"); 343 return -1; 344 } 345 346 return 0; 347 } 348 349 /* Test to verify the zswap writeback path */ 350 static int test_zswap_writeback(const char *root, bool wb) 351 { 352 int ret = KSFT_FAIL; 353 char *test_group, *test_group_child = NULL; 354 355 if (cg_read_strcmp(root, "memory.zswap.writeback", "1")) 356 return KSFT_SKIP; 357 358 test_group = cg_name(root, "zswap_writeback_test"); 359 if (!test_group) 360 goto out; 361 if (cg_create(test_group)) 362 goto out; 363 if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0")) 364 goto out; 365 366 if (test_zswap_writeback_one(test_group, wb)) 367 goto out; 368 369 /* Reset memory.zswap.max to max (modified by attempt_writeback), and 370 * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1. 371 * Thus, the parent's setting shall be what's in effect. */ 372 if (cg_write(test_group, "memory.zswap.max", "max")) 373 goto out; 374 if (cg_write(test_group, "cgroup.subtree_control", "+memory")) 375 goto out; 376 377 test_group_child = cg_name(test_group, "zswap_writeback_test_child"); 378 if (!test_group_child) 379 goto out; 380 if (cg_create(test_group_child)) 381 goto out; 382 if (cg_write(test_group_child, "memory.zswap.writeback", "1")) 383 goto out; 384 385 if (test_zswap_writeback_one(test_group_child, wb)) 386 goto out; 387 388 ret = KSFT_PASS; 389 390 out: 391 if (test_group_child) { 392 cg_destroy(test_group_child); 393 free(test_group_child); 394 } 395 cg_destroy(test_group); 396 free(test_group); 397 return ret; 398 } 399 400 static int test_zswap_writeback_enabled(const char *root) 401 { 402 return test_zswap_writeback(root, true); 403 } 404 405 static int test_zswap_writeback_disabled(const char *root) 406 { 407 return test_zswap_writeback(root, false); 408 } 409 410 /* 411 * When trying to store a memcg page in zswap, if the memcg hits its memory 412 * limit in zswap, writeback should affect only the zswapped pages of that 413 * memcg. 414 */ 415 static int test_no_invasive_cgroup_shrink(const char *root) 416 { 417 int ret = KSFT_FAIL; 418 size_t control_allocation_size = MB(10); 419 char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; 420 421 wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); 422 if (!wb_group) 423 return KSFT_FAIL; 424 if (cg_write(wb_group, "memory.zswap.max", "10K")) 425 goto out; 426 control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); 427 if (!control_group) 428 goto out; 429 430 /* Push some test_group2 memory into zswap */ 431 if (cg_enter_current(control_group)) 432 goto out; 433 control_allocation = malloc(control_allocation_size); 434 for (int i = 0; i < control_allocation_size; i += 4095) 435 control_allocation[i] = 'a'; 436 if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) 437 goto out; 438 439 /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ 440 if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) 441 goto out; 442 443 /* Verify that only zswapped memory from gwb_group has been written back */ 444 if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) 445 ret = KSFT_PASS; 446 out: 447 cg_enter_current(root); 448 if (control_group) { 449 cg_destroy(control_group); 450 free(control_group); 451 } 452 cg_destroy(wb_group); 453 free(wb_group); 454 if (control_allocation) 455 free(control_allocation); 456 return ret; 457 } 458 459 struct no_kmem_bypass_child_args { 460 size_t target_alloc_bytes; 461 size_t child_allocated; 462 }; 463 464 static int no_kmem_bypass_child(const char *cgroup, void *arg) 465 { 466 struct no_kmem_bypass_child_args *values = arg; 467 void *allocation; 468 469 allocation = malloc(values->target_alloc_bytes); 470 if (!allocation) { 471 values->child_allocated = true; 472 return -1; 473 } 474 for (long i = 0; i < values->target_alloc_bytes; i += 4095) 475 ((char *)allocation)[i] = 'a'; 476 values->child_allocated = true; 477 pause(); 478 free(allocation); 479 return 0; 480 } 481 482 /* 483 * When pages owned by a memcg are pushed to zswap by kswapd, they should be 484 * charged to that cgroup. This wasn't the case before commit 485 * cd08d80ecdac("mm: correctly charge compressed memory to its memcg"). 486 * 487 * The test first allocates memory in a memcg, then raises min_free_kbytes to 488 * a very high value so that the allocation falls below low wm, then makes 489 * another allocation to trigger kswapd that should push the memcg-owned pages 490 * to zswap and verifies that the zswap pages are correctly charged. 491 * 492 * To be run on a VM with at most 4G of memory. 493 */ 494 static int test_no_kmem_bypass(const char *root) 495 { 496 size_t min_free_kb_high, min_free_kb_low, min_free_kb_original; 497 struct no_kmem_bypass_child_args *values; 498 size_t trigger_allocation_size; 499 int wait_child_iteration = 0; 500 long stored_pages_threshold; 501 struct sysinfo sys_info; 502 int ret = KSFT_FAIL; 503 int child_status; 504 char *test_group = NULL; 505 pid_t child_pid; 506 507 /* Read sys info and compute test values accordingly */ 508 if (sysinfo(&sys_info) != 0) 509 return KSFT_FAIL; 510 if (sys_info.totalram > 5000000000) 511 return KSFT_SKIP; 512 values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ | 513 PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 514 if (values == MAP_FAILED) 515 return KSFT_FAIL; 516 if (read_min_free_kb(&min_free_kb_original)) 517 return KSFT_FAIL; 518 min_free_kb_high = sys_info.totalram / 2000; 519 min_free_kb_low = sys_info.totalram / 500000; 520 values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + 521 sys_info.totalram * 5 / 100; 522 stored_pages_threshold = sys_info.totalram / 5 / 4096; 523 trigger_allocation_size = sys_info.totalram / 20; 524 525 /* Set up test memcg */ 526 test_group = cg_name(root, "kmem_bypass_test"); 527 if (!test_group) 528 goto out; 529 530 /* Spawn memcg child and wait for it to allocate */ 531 set_min_free_kb(min_free_kb_low); 532 if (cg_create(test_group)) 533 goto out; 534 values->child_allocated = false; 535 child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values); 536 if (child_pid < 0) 537 goto out; 538 while (!values->child_allocated && wait_child_iteration++ < 10000) 539 usleep(1000); 540 541 /* Try to wakeup kswapd and let it push child memory to zswap */ 542 set_min_free_kb(min_free_kb_high); 543 for (int i = 0; i < 20; i++) { 544 size_t stored_pages; 545 char *trigger_allocation = malloc(trigger_allocation_size); 546 547 if (!trigger_allocation) 548 break; 549 for (int i = 0; i < trigger_allocation_size; i += 4095) 550 trigger_allocation[i] = 'b'; 551 usleep(100000); 552 free(trigger_allocation); 553 if (get_zswap_stored_pages(&stored_pages)) 554 break; 555 if (stored_pages < 0) 556 break; 557 /* If memory was pushed to zswap, verify it belongs to memcg */ 558 if (stored_pages > stored_pages_threshold) { 559 int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); 560 int delta = stored_pages * 4096 - zswapped; 561 int result_ok = delta < stored_pages * 4096 / 4; 562 563 ret = result_ok ? KSFT_PASS : KSFT_FAIL; 564 break; 565 } 566 } 567 568 kill(child_pid, SIGTERM); 569 waitpid(child_pid, &child_status, 0); 570 out: 571 set_min_free_kb(min_free_kb_original); 572 cg_destroy(test_group); 573 free(test_group); 574 return ret; 575 } 576 577 #define T(x) { x, #x } 578 struct zswap_test { 579 int (*fn)(const char *root); 580 const char *name; 581 } tests[] = { 582 T(test_zswap_usage), 583 T(test_swapin_nozswap), 584 T(test_zswapin), 585 T(test_zswap_writeback_enabled), 586 T(test_zswap_writeback_disabled), 587 T(test_no_kmem_bypass), 588 T(test_no_invasive_cgroup_shrink), 589 }; 590 #undef T 591 592 static bool zswap_configured(void) 593 { 594 return access("/sys/module/zswap", F_OK) == 0; 595 } 596 597 int main(int argc, char **argv) 598 { 599 char root[PATH_MAX]; 600 int i, ret = EXIT_SUCCESS; 601 602 if (cg_find_unified_root(root, sizeof(root), NULL)) 603 ksft_exit_skip("cgroup v2 isn't mounted\n"); 604 605 if (!zswap_configured()) 606 ksft_exit_skip("zswap isn't configured\n"); 607 608 /* 609 * Check that memory controller is available: 610 * memory is listed in cgroup.controllers 611 */ 612 if (cg_read_strstr(root, "cgroup.controllers", "memory")) 613 ksft_exit_skip("memory controller isn't available\n"); 614 615 if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) 616 if (cg_write(root, "cgroup.subtree_control", "+memory")) 617 ksft_exit_skip("Failed to set memory controller\n"); 618 619 for (i = 0; i < ARRAY_SIZE(tests); i++) { 620 switch (tests[i].fn(root)) { 621 case KSFT_PASS: 622 ksft_test_result_pass("%s\n", tests[i].name); 623 break; 624 case KSFT_SKIP: 625 ksft_test_result_skip("%s\n", tests[i].name); 626 break; 627 default: 628 ret = EXIT_FAILURE; 629 ksft_test_result_fail("%s\n", tests[i].name); 630 break; 631 } 632 } 633 634 return ret; 635 } 636