1 #define _GNU_SOURCE 2 #include <ctype.h> 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <limits.h> 6 #include <dirent.h> 7 #include <signal.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <stdbool.h> 11 #include <string.h> 12 #include <unistd.h> 13 14 #include <linux/mman.h> 15 #include <sys/mman.h> 16 #include <sys/wait.h> 17 #include <sys/types.h> 18 #include <sys/stat.h> 19 #include <sys/sysmacros.h> 20 #include <sys/vfs.h> 21 22 #include "linux/magic.h" 23 24 #include "vm_util.h" 25 #include "thp_settings.h" 26 27 #define BASE_ADDR ((void *)(1UL << 30)) 28 static unsigned long hpage_pmd_size; 29 static unsigned long page_size; 30 static int hpage_pmd_nr; 31 static int anon_order; 32 33 #define PID_SMAPS "/proc/self/smaps" 34 #define TEST_FILE "collapse_test_file" 35 36 #define MAX_LINE_LENGTH 500 37 38 enum vma_type { 39 VMA_ANON, 40 VMA_FILE, 41 VMA_SHMEM, 42 }; 43 44 struct mem_ops { 45 void *(*setup_area)(int nr_hpages); 46 void (*cleanup_area)(void *p, unsigned long size); 47 void (*fault)(void *p, unsigned long start, unsigned long end); 48 bool (*check_huge)(void *addr, int nr_hpages); 49 const char *name; 50 }; 51 52 static struct mem_ops *file_ops; 53 static struct mem_ops *anon_ops; 54 static struct mem_ops *shmem_ops; 55 56 struct collapse_context { 57 void (*collapse)(const char *msg, char *p, int nr_hpages, 58 struct mem_ops *ops, bool expect); 59 bool enforce_pte_scan_limits; 60 const char *name; 61 }; 62 63 static struct collapse_context *khugepaged_context; 64 static struct collapse_context *madvise_context; 65 66 struct file_info { 67 const char *dir; 68 char path[PATH_MAX]; 69 enum vma_type type; 70 int fd; 71 char dev_queue_read_ahead_path[PATH_MAX]; 72 }; 73 74 static struct file_info finfo; 75 static bool skip_settings_restore; 76 static int exit_status; 77 78 static void success(const char *msg) 79 { 80 printf(" \e[32m%s\e[0m\n", msg); 81 } 82 83 static void fail(const char *msg) 84 { 85 printf(" \e[31m%s\e[0m\n", msg); 86 exit_status++; 87 } 88 89 static void skip(const char *msg) 90 { 91 printf(" \e[33m%s\e[0m\n", msg); 92 } 93 94 static void restore_settings_atexit(void) 95 { 96 if (skip_settings_restore) 97 return; 98 99 printf("Restore THP and khugepaged settings..."); 100 thp_restore_settings(); 101 success("OK"); 102 103 skip_settings_restore = true; 104 } 105 106 static void restore_settings(int sig) 107 { 108 /* exit() will invoke the restore_settings_atexit handler. */ 109 exit(sig ? EXIT_FAILURE : exit_status); 110 } 111 112 static void save_settings(void) 113 { 114 printf("Save THP and khugepaged settings..."); 115 if (file_ops && finfo.type == VMA_FILE) 116 thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); 117 thp_save_settings(); 118 119 success("OK"); 120 121 atexit(restore_settings_atexit); 122 signal(SIGTERM, restore_settings); 123 signal(SIGINT, restore_settings); 124 signal(SIGHUP, restore_settings); 125 signal(SIGQUIT, restore_settings); 126 } 127 128 static void get_finfo(const char *dir) 129 { 130 struct stat path_stat; 131 struct statfs fs; 132 char buf[1 << 10]; 133 char path[PATH_MAX]; 134 char *str, *end; 135 136 finfo.dir = dir; 137 stat(finfo.dir, &path_stat); 138 if (!S_ISDIR(path_stat.st_mode)) { 139 printf("%s: Not a directory (%s)\n", __func__, finfo.dir); 140 exit(EXIT_FAILURE); 141 } 142 if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, 143 finfo.dir) >= sizeof(finfo.path)) { 144 printf("%s: Pathname is too long\n", __func__); 145 exit(EXIT_FAILURE); 146 } 147 if (statfs(finfo.dir, &fs)) { 148 perror("statfs()"); 149 exit(EXIT_FAILURE); 150 } 151 finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; 152 if (finfo.type == VMA_SHMEM) 153 return; 154 155 /* Find owning device's queue/read_ahead_kb control */ 156 if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", 157 major(path_stat.st_dev), minor(path_stat.st_dev)) 158 >= sizeof(path)) { 159 printf("%s: Pathname is too long\n", __func__); 160 exit(EXIT_FAILURE); 161 } 162 if (read_file(path, buf, sizeof(buf)) < 0) { 163 perror("read_file(read_num)"); 164 exit(EXIT_FAILURE); 165 } 166 if (strstr(buf, "DEVTYPE=disk")) { 167 /* Found it */ 168 if (snprintf(finfo.dev_queue_read_ahead_path, 169 sizeof(finfo.dev_queue_read_ahead_path), 170 "/sys/dev/block/%d:%d/queue/read_ahead_kb", 171 major(path_stat.st_dev), minor(path_stat.st_dev)) 172 >= sizeof(finfo.dev_queue_read_ahead_path)) { 173 printf("%s: Pathname is too long\n", __func__); 174 exit(EXIT_FAILURE); 175 } 176 return; 177 } 178 if (!strstr(buf, "DEVTYPE=partition")) { 179 printf("%s: Unknown device type: %s\n", __func__, path); 180 exit(EXIT_FAILURE); 181 } 182 /* 183 * Partition of block device - need to find actual device. 184 * Using naming convention that devnameN is partition of 185 * device devname. 186 */ 187 str = strstr(buf, "DEVNAME="); 188 if (!str) { 189 printf("%s: Could not read: %s", __func__, path); 190 exit(EXIT_FAILURE); 191 } 192 str += 8; 193 end = str; 194 while (*end) { 195 if (isdigit(*end)) { 196 *end = '\0'; 197 if (snprintf(finfo.dev_queue_read_ahead_path, 198 sizeof(finfo.dev_queue_read_ahead_path), 199 "/sys/block/%s/queue/read_ahead_kb", 200 str) >= sizeof(finfo.dev_queue_read_ahead_path)) { 201 printf("%s: Pathname is too long\n", __func__); 202 exit(EXIT_FAILURE); 203 } 204 return; 205 } 206 ++end; 207 } 208 printf("%s: Could not read: %s\n", __func__, path); 209 exit(EXIT_FAILURE); 210 } 211 212 static bool check_swap(void *addr, unsigned long size) 213 { 214 bool swap = false; 215 int ret; 216 FILE *fp; 217 char buffer[MAX_LINE_LENGTH]; 218 char addr_pattern[MAX_LINE_LENGTH]; 219 220 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", 221 (unsigned long) addr); 222 if (ret >= MAX_LINE_LENGTH) { 223 printf("%s: Pattern is too long\n", __func__); 224 exit(EXIT_FAILURE); 225 } 226 227 228 fp = fopen(PID_SMAPS, "r"); 229 if (!fp) { 230 printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); 231 exit(EXIT_FAILURE); 232 } 233 if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) 234 goto err_out; 235 236 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", 237 size >> 10); 238 if (ret >= MAX_LINE_LENGTH) { 239 printf("%s: Pattern is too long\n", __func__); 240 exit(EXIT_FAILURE); 241 } 242 /* 243 * Fetch the Swap: in the same block and check whether it got 244 * the expected number of hugeepages next. 245 */ 246 if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) 247 goto err_out; 248 249 if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) 250 goto err_out; 251 252 swap = true; 253 err_out: 254 fclose(fp); 255 return swap; 256 } 257 258 static void *alloc_mapping(int nr) 259 { 260 void *p; 261 262 p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, 263 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 264 if (p != BASE_ADDR) { 265 printf("Failed to allocate VMA at %p\n", BASE_ADDR); 266 exit(EXIT_FAILURE); 267 } 268 269 return p; 270 } 271 272 static void fill_memory(int *p, unsigned long start, unsigned long end) 273 { 274 int i; 275 276 for (i = start / page_size; i < end / page_size; i++) 277 p[i * page_size / sizeof(*p)] = i + 0xdead0000; 278 } 279 280 /* 281 * MADV_COLLAPSE is a best-effort request and may fail if an internal 282 * resource is temporarily unavailable, in which case it will set errno to 283 * EAGAIN. In such a case, immediately reattempt the operation one more 284 * time. 285 */ 286 static int madvise_collapse_retry(void *p, unsigned long size) 287 { 288 bool retry = true; 289 int ret; 290 291 retry: 292 ret = madvise(p, size, MADV_COLLAPSE); 293 if (ret && errno == EAGAIN && retry) { 294 retry = false; 295 goto retry; 296 } 297 return ret; 298 } 299 300 /* 301 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with 302 * validate_memory()'able contents. 303 */ 304 static void *alloc_hpage(struct mem_ops *ops) 305 { 306 void *p = ops->setup_area(1); 307 308 ops->fault(p, 0, hpage_pmd_size); 309 310 /* 311 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. 312 * The latter is ineligible for collapse by MADV_COLLAPSE 313 * while the former might cause MADV_COLLAPSE to race with 314 * khugepaged on low-load system (like a test machine), which 315 * would cause MADV_COLLAPSE to fail with EAGAIN. 316 */ 317 printf("Allocate huge page..."); 318 if (madvise_collapse_retry(p, hpage_pmd_size)) { 319 perror("madvise(MADV_COLLAPSE)"); 320 exit(EXIT_FAILURE); 321 } 322 if (!ops->check_huge(p, 1)) { 323 perror("madvise(MADV_COLLAPSE)"); 324 exit(EXIT_FAILURE); 325 } 326 if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { 327 perror("madvise(MADV_HUGEPAGE)"); 328 exit(EXIT_FAILURE); 329 } 330 success("OK"); 331 return p; 332 } 333 334 static void validate_memory(int *p, unsigned long start, unsigned long end) 335 { 336 int i; 337 338 for (i = start / page_size; i < end / page_size; i++) { 339 if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { 340 printf("Page %d is corrupted: %#x\n", 341 i, p[i * page_size / sizeof(*p)]); 342 exit(EXIT_FAILURE); 343 } 344 } 345 } 346 347 static void *anon_setup_area(int nr_hpages) 348 { 349 return alloc_mapping(nr_hpages); 350 } 351 352 static void anon_cleanup_area(void *p, unsigned long size) 353 { 354 munmap(p, size); 355 } 356 357 static void anon_fault(void *p, unsigned long start, unsigned long end) 358 { 359 fill_memory(p, start, end); 360 } 361 362 static bool anon_check_huge(void *addr, int nr_hpages) 363 { 364 return check_huge_anon(addr, nr_hpages, hpage_pmd_size); 365 } 366 367 static void *file_setup_area(int nr_hpages) 368 { 369 int fd; 370 void *p; 371 unsigned long size; 372 373 unlink(finfo.path); /* Cleanup from previous failed tests */ 374 printf("Creating %s for collapse%s...", finfo.path, 375 finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); 376 fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, 377 777); 378 if (fd < 0) { 379 perror("open()"); 380 exit(EXIT_FAILURE); 381 } 382 383 size = nr_hpages * hpage_pmd_size; 384 p = alloc_mapping(nr_hpages); 385 fill_memory(p, 0, size); 386 write(fd, p, size); 387 close(fd); 388 munmap(p, size); 389 success("OK"); 390 391 printf("Opening %s read only for collapse...", finfo.path); 392 finfo.fd = open(finfo.path, O_RDONLY, 777); 393 if (finfo.fd < 0) { 394 perror("open()"); 395 exit(EXIT_FAILURE); 396 } 397 p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, 398 MAP_PRIVATE, finfo.fd, 0); 399 if (p == MAP_FAILED || p != BASE_ADDR) { 400 perror("mmap()"); 401 exit(EXIT_FAILURE); 402 } 403 404 /* Drop page cache */ 405 write_file("/proc/sys/vm/drop_caches", "3", 2); 406 success("OK"); 407 return p; 408 } 409 410 static void file_cleanup_area(void *p, unsigned long size) 411 { 412 munmap(p, size); 413 close(finfo.fd); 414 unlink(finfo.path); 415 } 416 417 static void file_fault(void *p, unsigned long start, unsigned long end) 418 { 419 if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { 420 perror("madvise(MADV_POPULATE_READ"); 421 exit(EXIT_FAILURE); 422 } 423 } 424 425 static bool file_check_huge(void *addr, int nr_hpages) 426 { 427 switch (finfo.type) { 428 case VMA_FILE: 429 return check_huge_file(addr, nr_hpages, hpage_pmd_size); 430 case VMA_SHMEM: 431 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); 432 default: 433 exit(EXIT_FAILURE); 434 return false; 435 } 436 } 437 438 static void *shmem_setup_area(int nr_hpages) 439 { 440 void *p; 441 unsigned long size = nr_hpages * hpage_pmd_size; 442 443 finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); 444 if (finfo.fd < 0) { 445 perror("memfd_create()"); 446 exit(EXIT_FAILURE); 447 } 448 if (ftruncate(finfo.fd, size)) { 449 perror("ftruncate()"); 450 exit(EXIT_FAILURE); 451 } 452 p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, 453 0); 454 if (p != BASE_ADDR) { 455 perror("mmap()"); 456 exit(EXIT_FAILURE); 457 } 458 return p; 459 } 460 461 static void shmem_cleanup_area(void *p, unsigned long size) 462 { 463 munmap(p, size); 464 close(finfo.fd); 465 } 466 467 static bool shmem_check_huge(void *addr, int nr_hpages) 468 { 469 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); 470 } 471 472 static struct mem_ops __anon_ops = { 473 .setup_area = &anon_setup_area, 474 .cleanup_area = &anon_cleanup_area, 475 .fault = &anon_fault, 476 .check_huge = &anon_check_huge, 477 .name = "anon", 478 }; 479 480 static struct mem_ops __file_ops = { 481 .setup_area = &file_setup_area, 482 .cleanup_area = &file_cleanup_area, 483 .fault = &file_fault, 484 .check_huge = &file_check_huge, 485 .name = "file", 486 }; 487 488 static struct mem_ops __shmem_ops = { 489 .setup_area = &shmem_setup_area, 490 .cleanup_area = &shmem_cleanup_area, 491 .fault = &anon_fault, 492 .check_huge = &shmem_check_huge, 493 .name = "shmem", 494 }; 495 496 static void __madvise_collapse(const char *msg, char *p, int nr_hpages, 497 struct mem_ops *ops, bool expect) 498 { 499 int ret; 500 struct thp_settings settings = *thp_current_settings(); 501 502 printf("%s...", msg); 503 504 /* 505 * Prevent khugepaged interference and tests that MADV_COLLAPSE 506 * ignores /sys/kernel/mm/transparent_hugepage/enabled 507 */ 508 settings.thp_enabled = THP_NEVER; 509 settings.shmem_enabled = SHMEM_NEVER; 510 thp_push_settings(&settings); 511 512 /* Clear VM_NOHUGEPAGE */ 513 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); 514 ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); 515 if (((bool)ret) == expect) 516 fail("Fail: Bad return value"); 517 else if (!ops->check_huge(p, expect ? nr_hpages : 0)) 518 fail("Fail: check_huge()"); 519 else 520 success("OK"); 521 522 thp_pop_settings(); 523 } 524 525 static void madvise_collapse(const char *msg, char *p, int nr_hpages, 526 struct mem_ops *ops, bool expect) 527 { 528 /* Sanity check */ 529 if (!ops->check_huge(p, 0)) { 530 printf("Unexpected huge page\n"); 531 exit(EXIT_FAILURE); 532 } 533 __madvise_collapse(msg, p, nr_hpages, ops, expect); 534 } 535 536 #define TICK 500000 537 static bool wait_for_scan(const char *msg, char *p, int nr_hpages, 538 struct mem_ops *ops) 539 { 540 int full_scans; 541 int timeout = 6; /* 3 seconds */ 542 543 /* Sanity check */ 544 if (!ops->check_huge(p, 0)) { 545 printf("Unexpected huge page\n"); 546 exit(EXIT_FAILURE); 547 } 548 549 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); 550 551 /* Wait until the second full_scan completed */ 552 full_scans = thp_read_num("khugepaged/full_scans") + 2; 553 554 printf("%s...", msg); 555 while (timeout--) { 556 if (ops->check_huge(p, nr_hpages)) 557 break; 558 if (thp_read_num("khugepaged/full_scans") >= full_scans) 559 break; 560 printf("."); 561 usleep(TICK); 562 } 563 564 madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); 565 566 return timeout == -1; 567 } 568 569 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, 570 struct mem_ops *ops, bool expect) 571 { 572 if (wait_for_scan(msg, p, nr_hpages, ops)) { 573 if (expect) 574 fail("Timeout"); 575 else 576 success("OK"); 577 return; 578 } 579 580 /* 581 * For file and shmem memory, khugepaged only retracts pte entries after 582 * putting the new hugepage in the page cache. The hugepage must be 583 * subsequently refaulted to install the pmd mapping for the mm. 584 */ 585 if (ops != &__anon_ops) 586 ops->fault(p, 0, nr_hpages * hpage_pmd_size); 587 588 if (ops->check_huge(p, expect ? nr_hpages : 0)) 589 success("OK"); 590 else 591 fail("Fail"); 592 } 593 594 static struct collapse_context __khugepaged_context = { 595 .collapse = &khugepaged_collapse, 596 .enforce_pte_scan_limits = true, 597 .name = "khugepaged", 598 }; 599 600 static struct collapse_context __madvise_context = { 601 .collapse = &madvise_collapse, 602 .enforce_pte_scan_limits = false, 603 .name = "madvise", 604 }; 605 606 static bool is_tmpfs(struct mem_ops *ops) 607 { 608 return ops == &__file_ops && finfo.type == VMA_SHMEM; 609 } 610 611 static bool is_anon(struct mem_ops *ops) 612 { 613 return ops == &__anon_ops; 614 } 615 616 static void alloc_at_fault(void) 617 { 618 struct thp_settings settings = *thp_current_settings(); 619 char *p; 620 621 settings.thp_enabled = THP_ALWAYS; 622 thp_push_settings(&settings); 623 624 p = alloc_mapping(1); 625 *p = 1; 626 printf("Allocate huge page on fault..."); 627 if (check_huge_anon(p, 1, hpage_pmd_size)) 628 success("OK"); 629 else 630 fail("Fail"); 631 632 thp_pop_settings(); 633 634 madvise(p, page_size, MADV_DONTNEED); 635 printf("Split huge PMD on MADV_DONTNEED..."); 636 if (check_huge_anon(p, 0, hpage_pmd_size)) 637 success("OK"); 638 else 639 fail("Fail"); 640 munmap(p, hpage_pmd_size); 641 } 642 643 static void collapse_full(struct collapse_context *c, struct mem_ops *ops) 644 { 645 void *p; 646 int nr_hpages = 4; 647 unsigned long size = nr_hpages * hpage_pmd_size; 648 649 p = ops->setup_area(nr_hpages); 650 ops->fault(p, 0, size); 651 c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, 652 ops, true); 653 validate_memory(p, 0, size); 654 ops->cleanup_area(p, size); 655 } 656 657 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) 658 { 659 void *p; 660 661 p = ops->setup_area(1); 662 c->collapse("Do not collapse empty PTE table", p, 1, ops, false); 663 ops->cleanup_area(p, hpage_pmd_size); 664 } 665 666 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) 667 { 668 void *p; 669 670 p = ops->setup_area(1); 671 ops->fault(p, 0, page_size); 672 c->collapse("Collapse PTE table with single PTE entry present", p, 673 1, ops, true); 674 ops->cleanup_area(p, hpage_pmd_size); 675 } 676 677 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) 678 { 679 int max_ptes_none = hpage_pmd_nr / 2; 680 struct thp_settings settings = *thp_current_settings(); 681 void *p; 682 int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1; 683 684 settings.khugepaged.max_ptes_none = max_ptes_none; 685 thp_push_settings(&settings); 686 687 p = ops->setup_area(1); 688 689 if (is_tmpfs(ops)) { 690 /* shmem pages always in the page cache */ 691 printf("tmpfs..."); 692 skip("Skip"); 693 goto skip; 694 } 695 696 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); 697 c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, 698 ops, !c->enforce_pte_scan_limits); 699 validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); 700 701 if (c->enforce_pte_scan_limits) { 702 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); 703 c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, 704 true); 705 validate_memory(p, 0, 706 (hpage_pmd_nr - max_ptes_none) * page_size); 707 } 708 skip: 709 ops->cleanup_area(p, hpage_pmd_size); 710 thp_pop_settings(); 711 } 712 713 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) 714 { 715 void *p; 716 717 p = ops->setup_area(1); 718 ops->fault(p, 0, hpage_pmd_size); 719 720 printf("Swapout one page..."); 721 if (madvise(p, page_size, MADV_PAGEOUT)) { 722 perror("madvise(MADV_PAGEOUT)"); 723 exit(EXIT_FAILURE); 724 } 725 if (check_swap(p, page_size)) { 726 success("OK"); 727 } else { 728 fail("Fail"); 729 goto out; 730 } 731 732 c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, 733 true); 734 validate_memory(p, 0, hpage_pmd_size); 735 out: 736 ops->cleanup_area(p, hpage_pmd_size); 737 } 738 739 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) 740 { 741 int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"); 742 void *p; 743 744 p = ops->setup_area(1); 745 ops->fault(p, 0, hpage_pmd_size); 746 747 printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); 748 if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { 749 perror("madvise(MADV_PAGEOUT)"); 750 exit(EXIT_FAILURE); 751 } 752 if (check_swap(p, (max_ptes_swap + 1) * page_size)) { 753 success("OK"); 754 } else { 755 fail("Fail"); 756 goto out; 757 } 758 759 c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, 760 !c->enforce_pte_scan_limits); 761 validate_memory(p, 0, hpage_pmd_size); 762 763 if (c->enforce_pte_scan_limits) { 764 ops->fault(p, 0, hpage_pmd_size); 765 printf("Swapout %d of %d pages...", max_ptes_swap, 766 hpage_pmd_nr); 767 if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { 768 perror("madvise(MADV_PAGEOUT)"); 769 exit(EXIT_FAILURE); 770 } 771 if (check_swap(p, max_ptes_swap * page_size)) { 772 success("OK"); 773 } else { 774 fail("Fail"); 775 goto out; 776 } 777 778 c->collapse("Collapse with max_ptes_swap pages swapped out", p, 779 1, ops, true); 780 validate_memory(p, 0, hpage_pmd_size); 781 } 782 out: 783 ops->cleanup_area(p, hpage_pmd_size); 784 } 785 786 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) 787 { 788 void *p; 789 790 p = alloc_hpage(ops); 791 792 if (is_tmpfs(ops)) { 793 /* MADV_DONTNEED won't evict tmpfs pages */ 794 printf("tmpfs..."); 795 skip("Skip"); 796 goto skip; 797 } 798 799 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 800 printf("Split huge page leaving single PTE mapping compound page..."); 801 madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); 802 if (ops->check_huge(p, 0)) 803 success("OK"); 804 else 805 fail("Fail"); 806 807 c->collapse("Collapse PTE table with single PTE mapping compound page", 808 p, 1, ops, true); 809 validate_memory(p, 0, page_size); 810 skip: 811 ops->cleanup_area(p, hpage_pmd_size); 812 } 813 814 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) 815 { 816 void *p; 817 818 p = alloc_hpage(ops); 819 printf("Split huge page leaving single PTE page table full of compound pages..."); 820 madvise(p, page_size, MADV_NOHUGEPAGE); 821 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 822 if (ops->check_huge(p, 0)) 823 success("OK"); 824 else 825 fail("Fail"); 826 827 c->collapse("Collapse PTE table full of compound pages", p, 1, ops, 828 true); 829 validate_memory(p, 0, hpage_pmd_size); 830 ops->cleanup_area(p, hpage_pmd_size); 831 } 832 833 static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) 834 { 835 void *p; 836 int i; 837 838 p = ops->setup_area(1); 839 for (i = 0; i < hpage_pmd_nr; i++) { 840 printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", 841 i + 1, hpage_pmd_nr); 842 843 madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); 844 ops->fault(BASE_ADDR, 0, hpage_pmd_size); 845 if (!ops->check_huge(BASE_ADDR, 1)) { 846 printf("Failed to allocate huge page\n"); 847 exit(EXIT_FAILURE); 848 } 849 madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); 850 851 p = mremap(BASE_ADDR - i * page_size, 852 i * page_size + hpage_pmd_size, 853 (i + 1) * page_size, 854 MREMAP_MAYMOVE | MREMAP_FIXED, 855 BASE_ADDR + 2 * hpage_pmd_size); 856 if (p == MAP_FAILED) { 857 perror("mremap+unmap"); 858 exit(EXIT_FAILURE); 859 } 860 861 p = mremap(BASE_ADDR + 2 * hpage_pmd_size, 862 (i + 1) * page_size, 863 (i + 1) * page_size + hpage_pmd_size, 864 MREMAP_MAYMOVE | MREMAP_FIXED, 865 BASE_ADDR - (i + 1) * page_size); 866 if (p == MAP_FAILED) { 867 perror("mremap+alloc"); 868 exit(EXIT_FAILURE); 869 } 870 } 871 872 ops->cleanup_area(BASE_ADDR, hpage_pmd_size); 873 ops->fault(p, 0, hpage_pmd_size); 874 if (!ops->check_huge(p, 1)) 875 success("OK"); 876 else 877 fail("Fail"); 878 879 c->collapse("Collapse PTE table full of different compound pages", p, 1, 880 ops, true); 881 882 validate_memory(p, 0, hpage_pmd_size); 883 ops->cleanup_area(p, hpage_pmd_size); 884 } 885 886 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) 887 { 888 int wstatus; 889 void *p; 890 891 p = ops->setup_area(1); 892 893 printf("Allocate small page..."); 894 ops->fault(p, 0, page_size); 895 if (ops->check_huge(p, 0)) 896 success("OK"); 897 else 898 fail("Fail"); 899 900 printf("Share small page over fork()..."); 901 if (!fork()) { 902 /* Do not touch settings on child exit */ 903 skip_settings_restore = true; 904 exit_status = 0; 905 906 if (ops->check_huge(p, 0)) 907 success("OK"); 908 else 909 fail("Fail"); 910 911 ops->fault(p, page_size, 2 * page_size); 912 c->collapse("Collapse PTE table with single page shared with parent process", 913 p, 1, ops, true); 914 915 validate_memory(p, 0, page_size); 916 ops->cleanup_area(p, hpage_pmd_size); 917 exit(exit_status); 918 } 919 920 wait(&wstatus); 921 exit_status += WEXITSTATUS(wstatus); 922 923 printf("Check if parent still has small page..."); 924 if (ops->check_huge(p, 0)) 925 success("OK"); 926 else 927 fail("Fail"); 928 validate_memory(p, 0, page_size); 929 ops->cleanup_area(p, hpage_pmd_size); 930 } 931 932 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) 933 { 934 int wstatus; 935 void *p; 936 937 p = alloc_hpage(ops); 938 printf("Share huge page over fork()..."); 939 if (!fork()) { 940 /* Do not touch settings on child exit */ 941 skip_settings_restore = true; 942 exit_status = 0; 943 944 if (ops->check_huge(p, 1)) 945 success("OK"); 946 else 947 fail("Fail"); 948 949 printf("Split huge page PMD in child process..."); 950 madvise(p, page_size, MADV_NOHUGEPAGE); 951 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 952 if (ops->check_huge(p, 0)) 953 success("OK"); 954 else 955 fail("Fail"); 956 ops->fault(p, 0, page_size); 957 958 thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); 959 c->collapse("Collapse PTE table full of compound pages in child", 960 p, 1, ops, true); 961 thp_write_num("khugepaged/max_ptes_shared", 962 thp_current_settings()->khugepaged.max_ptes_shared); 963 964 validate_memory(p, 0, hpage_pmd_size); 965 ops->cleanup_area(p, hpage_pmd_size); 966 exit(exit_status); 967 } 968 969 wait(&wstatus); 970 exit_status += WEXITSTATUS(wstatus); 971 972 printf("Check if parent still has huge page..."); 973 if (ops->check_huge(p, 1)) 974 success("OK"); 975 else 976 fail("Fail"); 977 validate_memory(p, 0, hpage_pmd_size); 978 ops->cleanup_area(p, hpage_pmd_size); 979 } 980 981 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) 982 { 983 int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); 984 int wstatus; 985 void *p; 986 987 p = alloc_hpage(ops); 988 printf("Share huge page over fork()..."); 989 if (!fork()) { 990 /* Do not touch settings on child exit */ 991 skip_settings_restore = true; 992 exit_status = 0; 993 994 if (ops->check_huge(p, 1)) 995 success("OK"); 996 else 997 fail("Fail"); 998 999 printf("Trigger CoW on page %d of %d...", 1000 hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); 1001 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); 1002 if (ops->check_huge(p, 0)) 1003 success("OK"); 1004 else 1005 fail("Fail"); 1006 1007 c->collapse("Maybe collapse with max_ptes_shared exceeded", p, 1008 1, ops, !c->enforce_pte_scan_limits); 1009 1010 if (c->enforce_pte_scan_limits) { 1011 printf("Trigger CoW on page %d of %d...", 1012 hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); 1013 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * 1014 page_size); 1015 if (ops->check_huge(p, 0)) 1016 success("OK"); 1017 else 1018 fail("Fail"); 1019 1020 c->collapse("Collapse with max_ptes_shared PTEs shared", 1021 p, 1, ops, true); 1022 } 1023 1024 validate_memory(p, 0, hpage_pmd_size); 1025 ops->cleanup_area(p, hpage_pmd_size); 1026 exit(exit_status); 1027 } 1028 1029 wait(&wstatus); 1030 exit_status += WEXITSTATUS(wstatus); 1031 1032 printf("Check if parent still has huge page..."); 1033 if (ops->check_huge(p, 1)) 1034 success("OK"); 1035 else 1036 fail("Fail"); 1037 validate_memory(p, 0, hpage_pmd_size); 1038 ops->cleanup_area(p, hpage_pmd_size); 1039 } 1040 1041 static void madvise_collapse_existing_thps(struct collapse_context *c, 1042 struct mem_ops *ops) 1043 { 1044 void *p; 1045 1046 p = ops->setup_area(1); 1047 ops->fault(p, 0, hpage_pmd_size); 1048 c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); 1049 validate_memory(p, 0, hpage_pmd_size); 1050 1051 /* c->collapse() will find a hugepage and complain - call directly. */ 1052 __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); 1053 validate_memory(p, 0, hpage_pmd_size); 1054 ops->cleanup_area(p, hpage_pmd_size); 1055 } 1056 1057 /* 1058 * Test race with khugepaged where page tables have been retracted and 1059 * pmd cleared. 1060 */ 1061 static void madvise_retracted_page_tables(struct collapse_context *c, 1062 struct mem_ops *ops) 1063 { 1064 void *p; 1065 int nr_hpages = 1; 1066 unsigned long size = nr_hpages * hpage_pmd_size; 1067 1068 p = ops->setup_area(nr_hpages); 1069 ops->fault(p, 0, size); 1070 1071 /* Let khugepaged collapse and leave pmd cleared */ 1072 if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, 1073 ops)) { 1074 fail("Timeout"); 1075 return; 1076 } 1077 success("OK"); 1078 c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, 1079 true); 1080 validate_memory(p, 0, size); 1081 ops->cleanup_area(p, size); 1082 } 1083 1084 static void usage(void) 1085 { 1086 fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n"); 1087 fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); 1088 fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); 1089 fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); 1090 fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); 1091 fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); 1092 fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); 1093 fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); 1094 fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); 1095 fprintf(stderr, "\n\tSupported Options:\n"); 1096 fprintf(stderr, "\t\t-h: This help message.\n"); 1097 fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); 1098 fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n"); 1099 exit(1); 1100 } 1101 1102 static void parse_test_type(int argc, char **argv) 1103 { 1104 int opt; 1105 char *buf; 1106 const char *token; 1107 1108 while ((opt = getopt(argc, argv, "s:h")) != -1) { 1109 switch (opt) { 1110 case 's': 1111 anon_order = atoi(optarg); 1112 break; 1113 case 'h': 1114 default: 1115 usage(); 1116 } 1117 } 1118 1119 argv += optind; 1120 argc -= optind; 1121 1122 if (argc == 0) { 1123 /* Backwards compatibility */ 1124 khugepaged_context = &__khugepaged_context; 1125 madvise_context = &__madvise_context; 1126 anon_ops = &__anon_ops; 1127 return; 1128 } 1129 1130 buf = strdup(argv[0]); 1131 token = strsep(&buf, ":"); 1132 1133 if (!strcmp(token, "all")) { 1134 khugepaged_context = &__khugepaged_context; 1135 madvise_context = &__madvise_context; 1136 } else if (!strcmp(token, "khugepaged")) { 1137 khugepaged_context = &__khugepaged_context; 1138 } else if (!strcmp(token, "madvise")) { 1139 madvise_context = &__madvise_context; 1140 } else { 1141 usage(); 1142 } 1143 1144 if (!buf) 1145 usage(); 1146 1147 if (!strcmp(buf, "all")) { 1148 file_ops = &__file_ops; 1149 anon_ops = &__anon_ops; 1150 shmem_ops = &__shmem_ops; 1151 } else if (!strcmp(buf, "anon")) { 1152 anon_ops = &__anon_ops; 1153 } else if (!strcmp(buf, "file")) { 1154 file_ops = &__file_ops; 1155 } else if (!strcmp(buf, "shmem")) { 1156 shmem_ops = &__shmem_ops; 1157 } else { 1158 usage(); 1159 } 1160 1161 if (!file_ops) 1162 return; 1163 1164 if (argc != 2) 1165 usage(); 1166 1167 get_finfo(argv[1]); 1168 } 1169 1170 int main(int argc, char **argv) 1171 { 1172 int hpage_pmd_order; 1173 struct thp_settings default_settings = { 1174 .thp_enabled = THP_MADVISE, 1175 .thp_defrag = THP_DEFRAG_ALWAYS, 1176 .shmem_enabled = SHMEM_ADVISE, 1177 .use_zero_page = 0, 1178 .khugepaged = { 1179 .defrag = 1, 1180 .alloc_sleep_millisecs = 10, 1181 .scan_sleep_millisecs = 10, 1182 }, 1183 /* 1184 * When testing file-backed memory, the collapse path 1185 * looks at how many pages are found in the page cache, not 1186 * what pages are mapped. Disable read ahead optimization so 1187 * pages don't find their way into the page cache unless 1188 * we mem_ops->fault() them in. 1189 */ 1190 .read_ahead_kb = 0, 1191 }; 1192 1193 parse_test_type(argc, argv); 1194 1195 setbuf(stdout, NULL); 1196 1197 page_size = getpagesize(); 1198 hpage_pmd_size = read_pmd_pagesize(); 1199 if (!hpage_pmd_size) { 1200 printf("Reading PMD pagesize failed"); 1201 exit(EXIT_FAILURE); 1202 } 1203 hpage_pmd_nr = hpage_pmd_size / page_size; 1204 hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); 1205 1206 default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; 1207 default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; 1208 default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; 1209 default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; 1210 default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; 1211 default_settings.hugepages[anon_order].enabled = THP_ALWAYS; 1212 default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT; 1213 default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS; 1214 1215 save_settings(); 1216 thp_push_settings(&default_settings); 1217 1218 alloc_at_fault(); 1219 1220 #define TEST(t, c, o) do { \ 1221 if (c && o) { \ 1222 printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ 1223 t(c, o); \ 1224 } \ 1225 } while (0) 1226 1227 TEST(collapse_full, khugepaged_context, anon_ops); 1228 TEST(collapse_full, khugepaged_context, file_ops); 1229 TEST(collapse_full, khugepaged_context, shmem_ops); 1230 TEST(collapse_full, madvise_context, anon_ops); 1231 TEST(collapse_full, madvise_context, file_ops); 1232 TEST(collapse_full, madvise_context, shmem_ops); 1233 1234 TEST(collapse_empty, khugepaged_context, anon_ops); 1235 TEST(collapse_empty, madvise_context, anon_ops); 1236 1237 TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); 1238 TEST(collapse_single_pte_entry, khugepaged_context, file_ops); 1239 TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); 1240 TEST(collapse_single_pte_entry, madvise_context, anon_ops); 1241 TEST(collapse_single_pte_entry, madvise_context, file_ops); 1242 TEST(collapse_single_pte_entry, madvise_context, shmem_ops); 1243 1244 TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); 1245 TEST(collapse_max_ptes_none, khugepaged_context, file_ops); 1246 TEST(collapse_max_ptes_none, madvise_context, anon_ops); 1247 TEST(collapse_max_ptes_none, madvise_context, file_ops); 1248 1249 TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); 1250 TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); 1251 TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); 1252 TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); 1253 1254 TEST(collapse_full_of_compound, khugepaged_context, anon_ops); 1255 TEST(collapse_full_of_compound, khugepaged_context, file_ops); 1256 TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); 1257 TEST(collapse_full_of_compound, madvise_context, anon_ops); 1258 TEST(collapse_full_of_compound, madvise_context, file_ops); 1259 TEST(collapse_full_of_compound, madvise_context, shmem_ops); 1260 1261 TEST(collapse_compound_extreme, khugepaged_context, anon_ops); 1262 TEST(collapse_compound_extreme, madvise_context, anon_ops); 1263 1264 TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); 1265 TEST(collapse_swapin_single_pte, madvise_context, anon_ops); 1266 1267 TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); 1268 TEST(collapse_max_ptes_swap, madvise_context, anon_ops); 1269 1270 TEST(collapse_fork, khugepaged_context, anon_ops); 1271 TEST(collapse_fork, madvise_context, anon_ops); 1272 1273 TEST(collapse_fork_compound, khugepaged_context, anon_ops); 1274 TEST(collapse_fork_compound, madvise_context, anon_ops); 1275 1276 TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); 1277 TEST(collapse_max_ptes_shared, madvise_context, anon_ops); 1278 1279 TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); 1280 TEST(madvise_collapse_existing_thps, madvise_context, file_ops); 1281 TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); 1282 1283 TEST(madvise_retracted_page_tables, madvise_context, file_ops); 1284 TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); 1285 1286 restore_settings(0); 1287 } 1288