#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <dirent.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>

#include "linux/magic.h"

#include "vm_util.h"
#include "thp_settings.h"

#define BASE_ADDR ((void *)(1UL << 30))
static unsigned long hpage_pmd_size;
static unsigned long page_size;
static int hpage_pmd_nr;
static int anon_order;

#define PID_SMAPS "/proc/self/smaps"
#define TEST_FILE "collapse_test_file"

#define MAX_LINE_LENGTH 500

enum vma_type {
	VMA_ANON,
	VMA_FILE,
	VMA_SHMEM,
};

struct mem_ops {
	void *(*setup_area)(int nr_hpages);
	void (*cleanup_area)(void *p, unsigned long size);
	void (*fault)(void *p, unsigned long start, unsigned long end);
	bool (*check_huge)(void *addr, int nr_hpages);
	const char *name;
};

static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;

struct collapse_context {
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	bool enforce_pte_scan_limits;
	const char *name;
};

static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;

struct file_info {
	const char *dir;
	char path[PATH_MAX];
	enum vma_type type;
	int fd;
	char dev_queue_read_ahead_path[PATH_MAX];
};

static struct file_info finfo;
static bool skip_settings_restore;
static int exit_status;

static void success(const char *msg)
{
	printf(" \e[32m%s\e[0m\n", msg);
}

static void fail(const char *msg)
{
	printf(" \e[31m%s\e[0m\n", msg);
	exit_status++;
}

static void skip(const char *msg)
{
	printf(" \e[33m%s\e[0m\n", msg);
}

static void restore_settings_atexit(void)
{
	if (skip_settings_restore)
		return;

	printf("Restore THP and khugepaged settings...");
	thp_restore_settings();
	success("OK");

	skip_settings_restore = true;
}

static void restore_settings(int sig)
{
	/* exit() will invoke the restore_settings_atexit handler. */
	exit(sig ? EXIT_FAILURE : exit_status);
}

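/*
 * Settings save/restore protocol: save_settings() below snapshots the host's
 * THP and khugepaged sysfs state, then registers restore_settings_atexit()
 * both via atexit() and as the handler for fatal signals, so even an
 * interrupted run puts the machine back the way it found it.
 */
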
static void save_settings(void)
{
	printf("Save THP and khugepaged settings...");
	if (file_ops && finfo.type == VMA_FILE)
		thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
	thp_save_settings();

	success("OK");

	atexit(restore_settings_atexit);
	signal(SIGTERM, restore_settings);
	signal(SIGINT, restore_settings);
	signal(SIGHUP, restore_settings);
	signal(SIGQUIT, restore_settings);
}

static void get_finfo(const char *dir)
{
	struct stat path_stat;
	struct statfs fs;
	char buf[1 << 10];
	char path[PATH_MAX];
	char *str, *end;

	finfo.dir = dir;
	stat(finfo.dir, &path_stat);
	if (!S_ISDIR(path_stat.st_mode)) {
		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
		exit(EXIT_FAILURE);
	}
	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
		     finfo.dir) >= sizeof(finfo.path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	if (statfs(finfo.dir, &fs)) {
		perror("statfs()");
		exit(EXIT_FAILURE);
	}
	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
	if (finfo.type == VMA_SHMEM)
		return;

	/* Find owning device's queue/read_ahead_kb control */
	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
		     major(path_stat.st_dev), minor(path_stat.st_dev))
	    >= sizeof(path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	if (read_file(path, buf, sizeof(buf)) < 0) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}
	if (strstr(buf, "DEVTYPE=disk")) {
		/* Found it */
		if (snprintf(finfo.dev_queue_read_ahead_path,
			     sizeof(finfo.dev_queue_read_ahead_path),
			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
			     major(path_stat.st_dev), minor(path_stat.st_dev))
		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
			printf("%s: Pathname is too long\n", __func__);
			exit(EXIT_FAILURE);
		}
		return;
	}
	if (!strstr(buf, "DEVTYPE=partition")) {
		printf("%s: Unknown device type: %s\n", __func__, path);
		exit(EXIT_FAILURE);
	}
	/*
	 * Partition of block device - need to find actual device.
	 * Using naming convention that devnameN is partition of
	 * device devname.
	 */
	str = strstr(buf, "DEVNAME=");
	if (!str) {
		printf("%s: Could not read: %s\n", __func__, path);
		exit(EXIT_FAILURE);
	}
	str += 8;
	end = str;
	while (*end) {
		if (isdigit(*end)) {
			*end = '\0';
			if (snprintf(finfo.dev_queue_read_ahead_path,
				     sizeof(finfo.dev_queue_read_ahead_path),
				     "/sys/block/%s/queue/read_ahead_kb",
				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
				printf("%s: Pathname is too long\n", __func__);
				exit(EXIT_FAILURE);
			}
			return;
		}
		++end;
	}
	printf("%s: Could not read: %s\n", __func__, path);
	exit(EXIT_FAILURE);
}

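/*
 * Illustrative (hypothetical) uevent contents parsed by get_finfo() above,
 * here for a partition such as /dev/sda1:
 *
 *	MAJOR=8
 *	MINOR=1
 *	DEVNAME=sda1
 *	DEVTYPE=partition
 *
 * Truncating DEVNAME at its first digit yields the owning disk ("sda"),
 * whose readahead control then lives at /sys/block/sda/queue/read_ahead_kb.
 */
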
static bool check_swap(void *addr, unsigned long size)
{
	bool swap = false;
	int ret;
	FILE *fp;
	char buffer[MAX_LINE_LENGTH];
	char addr_pattern[MAX_LINE_LENGTH];

	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
		       (unsigned long) addr);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}

	fp = fopen(PID_SMAPS, "r");
	if (!fp) {
		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
		exit(EXIT_FAILURE);
	}
	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
		goto err_out;

	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
		       size >> 10);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	/*
	 * Fetch the Swap: field in the same smaps block and check whether it
	 * reports the expected number of kilobytes swapped out.
	 */
	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
		goto err_out;

	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
		goto err_out;

	swap = true;
err_out:
	fclose(fp);
	return swap;
}

static void *alloc_mapping(int nr)
{
	void *p;

	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (p != BASE_ADDR) {
		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
		exit(EXIT_FAILURE);
	}

	return p;
}

static void fill_memory(int *p, unsigned long start, unsigned long end)
{
	int i;

	for (i = start / page_size; i < end / page_size; i++)
		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
}

/*
 * MADV_COLLAPSE is a best-effort request and may fail if an internal
 * resource is temporarily unavailable, in which case it will set errno to
 * EAGAIN. In such a case, immediately reattempt the operation one more
 * time.
 */
static int madvise_collapse_retry(void *p, unsigned long size)
{
	bool retry = true;
	int ret;

retry:
	ret = madvise(p, size, MADV_COLLAPSE);
	if (ret && errno == EAGAIN && retry) {
		retry = false;
		goto retry;
	}
	return ret;
}

/*
 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
 * validate_memory()'able contents.
 */
static void *alloc_hpage(struct mem_ops *ops)
{
	void *p = ops->setup_area(1);

	ops->fault(p, 0, hpage_pmd_size);

	/*
	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
	 * The latter is ineligible for collapse by MADV_COLLAPSE
	 * while the former might cause MADV_COLLAPSE to race with
	 * khugepaged on a low-load system (like a test machine), which
	 * would cause MADV_COLLAPSE to fail with EAGAIN.
	 */
	printf("Allocate huge page...");
	if (madvise_collapse_retry(p, hpage_pmd_size)) {
		perror("madvise(MADV_COLLAPSE)");
		exit(EXIT_FAILURE);
	}
	if (!ops->check_huge(p, 1)) {
		perror("madvise(MADV_COLLAPSE)");
		exit(EXIT_FAILURE);
	}
	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
		perror("madvise(MADV_HUGEPAGE)");
		exit(EXIT_FAILURE);
	}
	success("OK");
	return p;
}

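/*
 * Data-integrity convention: fill_memory() tags the first int of each page
 * with (page index + 0xdead0000), e.g. page 3 holds 0xdead0003, and
 * validate_memory() below re-checks those tags, so every collapse test also
 * verifies that page contents survived the collapse.
 */
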
static void validate_memory(int *p, unsigned long start, unsigned long end)
{
	int i;

	for (i = start / page_size; i < end / page_size; i++) {
		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
			printf("Page %d is corrupted: %#x\n",
			       i, p[i * page_size / sizeof(*p)]);
			exit(EXIT_FAILURE);
		}
	}
}

static void *anon_setup_area(int nr_hpages)
{
	return alloc_mapping(nr_hpages);
}

static void anon_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
}

static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	fill_memory(p, start, end);
}

static bool anon_check_huge(void *addr, int nr_hpages)
{
	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
}

static void *file_setup_area(int nr_hpages)
{
	int fd;
	void *p;
	unsigned long size;

	unlink(finfo.path);  /* Cleanup from previous failed tests */
	printf("Creating %s for collapse%s...", finfo.path,
	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
		  0777);
	if (fd < 0) {
		perror("open()");
		exit(EXIT_FAILURE);
	}

	size = nr_hpages * hpage_pmd_size;
	p = alloc_mapping(nr_hpages);
	fill_memory(p, 0, size);
	write(fd, p, size);
	close(fd);
	munmap(p, size);
	success("OK");

	printf("Opening %s read only for collapse...", finfo.path);
	finfo.fd = open(finfo.path, O_RDONLY, 0777);
	if (finfo.fd < 0) {
		perror("open()");
		exit(EXIT_FAILURE);
	}
	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
		 MAP_PRIVATE, finfo.fd, 0);
	if (p == MAP_FAILED || p != BASE_ADDR) {
		perror("mmap()");
		exit(EXIT_FAILURE);
	}

	/* Drop page cache */
	write_file("/proc/sys/vm/drop_caches", "3", 2);
	success("OK");
	return p;
}

static void file_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
	unlink(finfo.path);
}

static void file_fault(void *p, unsigned long start, unsigned long end)
{
	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
		perror("madvise(MADV_POPULATE_READ)");
		exit(EXIT_FAILURE);
	}
}

static bool file_check_huge(void *addr, int nr_hpages)
{
	switch (finfo.type) {
	case VMA_FILE:
		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
	case VMA_SHMEM:
		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
	default:
		exit(EXIT_FAILURE);
		return false;
	}
}

static void *shmem_setup_area(int nr_hpages)
{
	void *p;
	unsigned long size = nr_hpages * hpage_pmd_size;

	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
	if (finfo.fd < 0) {
		perror("memfd_create()");
		exit(EXIT_FAILURE);
	}
	if (ftruncate(finfo.fd, size)) {
		perror("ftruncate()");
		exit(EXIT_FAILURE);
	}
	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
		 0);
	if (p != BASE_ADDR) {
		perror("mmap()");
		exit(EXIT_FAILURE);
	}
	return p;
}

static void shmem_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
}

static bool shmem_check_huge(void *addr, int nr_hpages)
{
	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
}

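/*
 * Each backing type is bundled into a mem_ops table below so the same test
 * body can run unmodified against anon, file, and shmem memory. Note that
 * shmem reuses anon_fault(): writing through a MAP_SHARED shmem mapping
 * populates the pages directly, so no MADV_POPULATE_READ pass is needed
 * there.
 */
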
static struct mem_ops __anon_ops = {
	.setup_area = &anon_setup_area,
	.cleanup_area = &anon_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &anon_check_huge,
	.name = "anon",
};

static struct mem_ops __file_ops = {
	.setup_area = &file_setup_area,
	.cleanup_area = &file_cleanup_area,
	.fault = &file_fault,
	.check_huge = &file_check_huge,
	.name = "file",
};

static struct mem_ops __shmem_ops = {
	.setup_area = &shmem_setup_area,
	.cleanup_area = &shmem_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &shmem_check_huge,
	.name = "shmem",
};

static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
			       struct mem_ops *ops, bool expect)
{
	int ret;
	struct thp_settings settings = *thp_current_settings();

	printf("%s...", msg);

	/*
	 * Prevent khugepaged interference and test that MADV_COLLAPSE
	 * ignores /sys/kernel/mm/transparent_hugepage/enabled.
	 */
	settings.thp_enabled = THP_NEVER;
	settings.shmem_enabled = SHMEM_NEVER;
	thp_push_settings(&settings);

	/* Clear VM_NOHUGEPAGE */
	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
	if (((bool)ret) == expect)
		fail("Fail: Bad return value");
	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
		fail("Fail: check_huge()");
	else
		success("OK");

	thp_pop_settings();
}

static void madvise_collapse(const char *msg, char *p, int nr_hpages,
			     struct mem_ops *ops, bool expect)
{
	/* Sanity check */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}
	__madvise_collapse(msg, p, nr_hpages, ops, expect);
}

#define TICK 500000
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
			  struct mem_ops *ops)
{
	int full_scans;
	int timeout = 6; /* 3 seconds */

	/* Sanity check */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}

	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);

	/* Wait until the second full scan has completed */
	full_scans = thp_read_num("khugepaged/full_scans") + 2;

	printf("%s...", msg);
	while (timeout--) {
		if (ops->check_huge(p, nr_hpages))
			break;
		if (thp_read_num("khugepaged/full_scans") >= full_scans)
			break;
		printf(".");
		usleep(TICK);
	}

	return timeout == -1;
}

static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
				struct mem_ops *ops, bool expect)
{
	if (wait_for_scan(msg, p, nr_hpages, ops)) {
		if (expect)
			fail("Timeout");
		else
			success("OK");
		return;
	}

	/*
	 * For file and shmem memory, khugepaged only retracts pte entries after
	 * putting the new hugepage in the page cache. The hugepage must be
	 * subsequently refaulted to install the pmd mapping for the mm.
	 */
	if (ops != &__anon_ops)
		ops->fault(p, 0, nr_hpages * hpage_pmd_size);

	if (ops->check_huge(p, expect ? nr_hpages : 0))
		success("OK");
	else
		fail("Fail");
}

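/*
 * The two collapse contexts below drive the same tests through different
 * paths: khugepaged_collapse() waits for the background daemon to scan and
 * collapse the range, while madvise_collapse() collapses synchronously. Only
 * khugepaged honors the max_ptes_* scan limits, hence
 * enforce_pte_scan_limits.
 */
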
static struct collapse_context __khugepaged_context = {
	.collapse = &khugepaged_collapse,
	.enforce_pte_scan_limits = true,
	.name = "khugepaged",
};

static struct collapse_context __madvise_context = {
	.collapse = &madvise_collapse,
	.enforce_pte_scan_limits = false,
	.name = "madvise",
};

static bool is_tmpfs(struct mem_ops *ops)
{
	return ops == &__file_ops && finfo.type == VMA_SHMEM;
}

static bool is_anon(struct mem_ops *ops)
{
	return ops == &__anon_ops;
}

static void alloc_at_fault(void)
{
	struct thp_settings settings = *thp_current_settings();
	char *p;

	settings.thp_enabled = THP_ALWAYS;
	thp_push_settings(&settings);

	p = alloc_mapping(1);
	*p = 1;
	printf("Allocate huge page on fault...");
	if (check_huge_anon(p, 1, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");

	thp_pop_settings();

	madvise(p, page_size, MADV_DONTNEED);
	printf("Split huge PMD on MADV_DONTNEED...");
	if (check_huge_anon(p, 0, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");
	munmap(p, hpage_pmd_size);
}

static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int nr_hpages = 4;
	unsigned long size = nr_hpages * hpage_pmd_size;

	p = ops->setup_area(nr_hpages);
	ops->fault(p, 0, size);
	c->collapse("Collapse multiple fully populated PTE tables", p, nr_hpages,
		    ops, true);
	validate_memory(p, 0, size);
	ops->cleanup_area(p, size);
}

static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, page_size);
	c->collapse("Collapse PTE table with single PTE entry present", p,
		    1, ops, true);
	ops->cleanup_area(p, hpage_pmd_size);
}

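/*
 * The max_ptes_* tests below exercise khugepaged's scan limits
 * (khugepaged/max_ptes_none, max_ptes_swap, max_ptes_shared). MADV_COLLAPSE
 * deliberately ignores these limits, which is why each test keys its
 * expectations off c->enforce_pte_scan_limits.
 */
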
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_none = hpage_pmd_nr / 2;
	struct thp_settings settings = *thp_current_settings();
	void *p;
	int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;

	settings.khugepaged.max_ptes_none = max_ptes_none;
	thp_push_settings(&settings);

	p = ops->setup_area(1);

	if (is_tmpfs(ops)) {
		/* shmem pages are always in the page cache */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
		    ops, !c->enforce_pte_scan_limits);
	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);

	if (c->enforce_pte_scan_limits) {
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
			    true);
		validate_memory(p, 0,
				(hpage_pmd_nr - max_ptes_none) * page_size);
	}
skip:
	ops->cleanup_area(p, hpage_pmd_size);
	thp_pop_settings();
}

static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout one page...");
	if (madvise(p, page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (check_swap(p, page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
out:
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
		    !c->enforce_pte_scan_limits);
	validate_memory(p, 0, hpage_pmd_size);

	if (c->enforce_pte_scan_limits) {
		ops->fault(p, 0, hpage_pmd_size);
		printf("Swapout %d of %d pages...", max_ptes_swap,
		       hpage_pmd_nr);
		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
			perror("madvise(MADV_PAGEOUT)");
			exit(EXIT_FAILURE);
		}
		if (check_swap(p, max_ptes_swap * page_size)) {
			success("OK");
		} else {
			fail("Fail");
			goto out;
		}

		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
			    1, ops, true);
		validate_memory(p, 0, hpage_pmd_size);
	}
out:
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);

	if (is_tmpfs(ops)) {
		/* MADV_DONTNEED won't evict tmpfs pages */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	printf("Split huge page leaving single PTE mapping compound page...");
	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table with single PTE mapping compound page",
		    p, 1, ops, true);
	validate_memory(p, 0, page_size);
skip:
	ops->cleanup_area(p, hpage_pmd_size);
}

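/*
 * The *_compound tests start from a PMD-mapped hugepage built by
 * alloc_hpage(), split the PMD with MADV_NOHUGEPAGE (plus MADV_DONTNEED
 * where pages must go away entirely), and then verify that the resulting
 * PTE-mapped compound pages can be collapsed back into a huge PMD.
 */
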
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);
	printf("Split huge page leaving single PTE page table full of compound pages...");
	madvise(p, page_size, MADV_NOHUGEPAGE);
	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
		       i + 1, hpage_pmd_nr);

		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		p = mremap(BASE_ADDR - i * page_size,
			   i * page_size + hpage_pmd_size,
			   (i + 1) * page_size,
			   MREMAP_MAYMOVE | MREMAP_FIXED,
			   BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
			   (i + 1) * page_size,
			   (i + 1) * page_size + hpage_pmd_size,
			   MREMAP_MAYMOVE | MREMAP_FIXED,
			   BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	void *p;

	p = ops->setup_area(1);

	printf("Allocate small page...");
	ops->fault(p, 0, page_size);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	printf("Share small page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		ops->fault(p, page_size, 2 * page_size);
		c->collapse("Collapse PTE table with single page shared with parent process",
			    p, 1, ops, true);

		validate_memory(p, 0, page_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has small page...");
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, page_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

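/*
 * The fork() tests verify that collapse copes with pages shared across
 * address spaces: the child must not inherit the parent's settings-restore
 * duties (skip_settings_restore), and its pass/fail count is propagated back
 * through the exit status harvested by wait() in the parent.
 */
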
static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		printf("Split huge page PMD in child process...");
		madvise(p, page_size, MADV_NOHUGEPAGE);
		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");
		ops->fault(p, 0, page_size);

		thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
		c->collapse("Collapse PTE table full of compound pages in child",
			    p, 1, ops, true);
		thp_write_num("khugepaged/max_ptes_shared",
			      thp_current_settings()->khugepaged.max_ptes_shared);

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
	int wstatus;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		printf("Trigger CoW on page %d of %d...",
		       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
			    1, ops, !c->enforce_pte_scan_limits);

		if (c->enforce_pte_scan_limits) {
			printf("Trigger CoW on page %d of %d...",
			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
				   page_size);
			if (ops->check_huge(p, 0))
				success("OK");
			else
				fail("Fail");

			c->collapse("Collapse with max_ptes_shared PTEs shared",
				    p, 1, ops, true);
		}

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

static void madvise_collapse_existing_thps(struct collapse_context *c,
					   struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);
	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
	validate_memory(p, 0, hpage_pmd_size);

	/* c->collapse() will find a hugepage and complain - call directly. */
	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Test race with khugepaged where page tables have been retracted and
 * pmd cleared.
 */
static void madvise_retracted_page_tables(struct collapse_context *c,
					  struct mem_ops *ops)
{
	void *p;
	int nr_hpages = 1;
	unsigned long size = nr_hpages * hpage_pmd_size;

	p = ops->setup_area(nr_hpages);
	ops->fault(p, 0, size);

	/* Let khugepaged collapse and leave pmd cleared */
	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
			  ops)) {
		fail("Timeout");
		return;
	}
	success("OK");
	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
		    true);
	validate_memory(p, 0, size);
	ops->cleanup_area(p, size);
}

static void usage(void)
{
	fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n");
	fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
	fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
	fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
	fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
	fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
	fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
	fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
	fprintf(stderr, "\tmounted with huge=advise option for khugepaged tests to work\n");
	fprintf(stderr, "\n\tSupported Options:\n");
	fprintf(stderr, "\t\t-h: This help message.\n");
	fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n");
	fprintf(stderr, "\t\t    Defaults to 0. Use this size for anon or shmem allocations.\n");
	exit(1);
}

static void parse_test_type(int argc, char **argv)
{
	int opt;
	char *buf;
	const char *token;

	while ((opt = getopt(argc, argv, "s:h")) != -1) {
		switch (opt) {
		case 's':
			anon_order = atoi(optarg);
			break;
		case 'h':
		default:
			usage();
		}
	}

	argv += optind;
	argc -= optind;

	if (argc == 0) {
		/* Backwards compatibility */
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
		anon_ops = &__anon_ops;
		return;
	}

	buf = strdup(argv[0]);
	token = strsep(&buf, ":");

	if (!strcmp(token, "all")) {
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
	} else if (!strcmp(token, "khugepaged")) {
		khugepaged_context = &__khugepaged_context;
	} else if (!strcmp(token, "madvise")) {
		madvise_context = &__madvise_context;
	} else {
		usage();
	}

	if (!buf)
		usage();

	if (!strcmp(buf, "all")) {
		file_ops = &__file_ops;
		anon_ops = &__anon_ops;
		shmem_ops = &__shmem_ops;
	} else if (!strcmp(buf, "anon")) {
		anon_ops = &__anon_ops;
	} else if (!strcmp(buf, "file")) {
		file_ops = &__file_ops;
	} else if (!strcmp(buf, "shmem")) {
		shmem_ops = &__shmem_ops;
	} else {
		usage();
	}

	if (!file_ops)
		return;

	if (argc != 2)
		usage();

	get_finfo(argv[1]);
}

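/*
 * Example invocations accepted by parse_test_type() above:
 *
 *	./khugepaged                       # legacy default: both contexts, anon memory
 *	./khugepaged khugepaged:anon
 *	./khugepaged -s 2 madvise:shmem    # use order-2 mTHP for anon/shmem
 *	./khugepaged all:file /mnt/dir     # "file" and "all" mem_types need [dir]
 */
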
int main(int argc, char **argv)
{
	int hpage_pmd_order;
	struct thp_settings default_settings = {
		.thp_enabled = THP_MADVISE,
		.thp_defrag = THP_DEFRAG_ALWAYS,
		.shmem_enabled = SHMEM_ADVISE,
		.use_zero_page = 0,
		.khugepaged = {
			.defrag = 1,
			.alloc_sleep_millisecs = 10,
			.scan_sleep_millisecs = 10,
		},
		/*
		 * When testing file-backed memory, the collapse path
		 * looks at how many pages are found in the page cache, not
		 * what pages are mapped. Disable read ahead optimization so
		 * pages don't find their way into the page cache unless
		 * we mem_ops->fault() them in.
		 */
		.read_ahead_kb = 0,
	};

	if (!thp_is_enabled()) {
		printf("Transparent Hugepages not available\n");
		return KSFT_SKIP;
	}

	parse_test_type(argc, argv);

	setbuf(stdout, NULL);

	page_size = getpagesize();
	hpage_pmd_size = read_pmd_pagesize();
	if (!hpage_pmd_size) {
		printf("Reading PMD pagesize failed\n");
		exit(EXIT_FAILURE);
	}
	hpage_pmd_nr = hpage_pmd_size / page_size;
	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);

	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
	default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
	default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;

	save_settings();
	thp_push_settings(&default_settings);

	alloc_at_fault();

#define TEST(t, c, o) do { \
	if (c && o) { \
		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
		t(c, o); \
	} \
} while (0)

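	/*
	 * TEST() only runs a case when both its context and mem_ops were
	 * selected on the command line; unselected combinations are silently
	 * skipped, so the matrix below is safe to keep exhaustive.
	 */
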
	TEST(collapse_full, khugepaged_context, anon_ops);
	TEST(collapse_full, khugepaged_context, file_ops);
	TEST(collapse_full, khugepaged_context, shmem_ops);
	TEST(collapse_full, madvise_context, anon_ops);
	TEST(collapse_full, madvise_context, file_ops);
	TEST(collapse_full, madvise_context, shmem_ops);

	TEST(collapse_empty, khugepaged_context, anon_ops);
	TEST(collapse_empty, madvise_context, anon_ops);

	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry, madvise_context, file_ops);
	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);

	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
	TEST(collapse_max_ptes_none, madvise_context, file_ops);

	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);

	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
	TEST(collapse_full_of_compound, madvise_context, anon_ops);
	TEST(collapse_full_of_compound, madvise_context, file_ops);
	TEST(collapse_full_of_compound, madvise_context, shmem_ops);

	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
	TEST(collapse_compound_extreme, madvise_context, anon_ops);

	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);

	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);

	TEST(collapse_fork, khugepaged_context, anon_ops);
	TEST(collapse_fork, madvise_context, anon_ops);

	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
	TEST(collapse_fork_compound, madvise_context, anon_ops);

	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);

	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);

	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);

	restore_settings(0);
}