1 #define _GNU_SOURCE 2 #include <ctype.h> 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <limits.h> 6 #include <dirent.h> 7 #include <signal.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <stdbool.h> 11 #include <string.h> 12 #include <unistd.h> 13 14 #include <linux/mman.h> 15 #include <sys/mman.h> 16 #include <sys/wait.h> 17 #include <sys/types.h> 18 #include <sys/stat.h> 19 #include <sys/sysmacros.h> 20 #include <sys/vfs.h> 21 22 #include "linux/magic.h" 23 24 #include "vm_util.h" 25 #include "thp_settings.h" 26 27 #define BASE_ADDR ((void *)(1UL << 30)) 28 static unsigned long hpage_pmd_size; 29 static unsigned long page_size; 30 static int hpage_pmd_nr; 31 static int anon_order; 32 33 #define PID_SMAPS "/proc/self/smaps" 34 #define TEST_FILE "collapse_test_file" 35 36 #define MAX_LINE_LENGTH 500 37 38 enum vma_type { 39 VMA_ANON, 40 VMA_FILE, 41 VMA_SHMEM, 42 }; 43 44 struct mem_ops { 45 void *(*setup_area)(int nr_hpages); 46 void (*cleanup_area)(void *p, unsigned long size); 47 void (*fault)(void *p, unsigned long start, unsigned long end); 48 bool (*check_huge)(void *addr, int nr_hpages); 49 const char *name; 50 }; 51 52 static struct mem_ops *file_ops; 53 static struct mem_ops *anon_ops; 54 static struct mem_ops *shmem_ops; 55 56 struct collapse_context { 57 void (*collapse)(const char *msg, char *p, int nr_hpages, 58 struct mem_ops *ops, bool expect); 59 bool enforce_pte_scan_limits; 60 const char *name; 61 }; 62 63 static struct collapse_context *khugepaged_context; 64 static struct collapse_context *madvise_context; 65 66 struct file_info { 67 const char *dir; 68 char path[PATH_MAX]; 69 enum vma_type type; 70 int fd; 71 char dev_queue_read_ahead_path[PATH_MAX]; 72 }; 73 74 static struct file_info finfo; 75 static bool skip_settings_restore; 76 static int exit_status; 77 78 static void success(const char *msg) 79 { 80 printf(" \e[32m%s\e[0m\n", msg); 81 } 82 83 static void fail(const char *msg) 84 { 85 printf(" \e[31m%s\e[0m\n", msg); 86 exit_status++; 87 } 88 89 static void skip(const char *msg) 90 { 91 printf(" \e[33m%s\e[0m\n", msg); 92 } 93 94 static void restore_settings_atexit(void) 95 { 96 if (skip_settings_restore) 97 return; 98 99 printf("Restore THP and khugepaged settings..."); 100 thp_restore_settings(); 101 success("OK"); 102 103 skip_settings_restore = true; 104 } 105 106 static void restore_settings(int sig) 107 { 108 /* exit() will invoke the restore_settings_atexit handler. */ 109 exit(sig ? EXIT_FAILURE : exit_status); 110 } 111 112 static void save_settings(void) 113 { 114 printf("Save THP and khugepaged settings..."); 115 if (file_ops && finfo.type == VMA_FILE) 116 thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); 117 thp_save_settings(); 118 119 success("OK"); 120 121 atexit(restore_settings_atexit); 122 signal(SIGTERM, restore_settings); 123 signal(SIGINT, restore_settings); 124 signal(SIGHUP, restore_settings); 125 signal(SIGQUIT, restore_settings); 126 } 127 128 static void get_finfo(const char *dir) 129 { 130 struct stat path_stat; 131 struct statfs fs; 132 char buf[1 << 10]; 133 char path[PATH_MAX]; 134 char *str, *end; 135 136 finfo.dir = dir; 137 stat(finfo.dir, &path_stat); 138 if (!S_ISDIR(path_stat.st_mode)) { 139 printf("%s: Not a directory (%s)\n", __func__, finfo.dir); 140 exit(EXIT_FAILURE); 141 } 142 if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, 143 finfo.dir) >= sizeof(finfo.path)) { 144 printf("%s: Pathname is too long\n", __func__); 145 exit(EXIT_FAILURE); 146 } 147 if (statfs(finfo.dir, &fs)) { 148 perror("statfs()"); 149 exit(EXIT_FAILURE); 150 } 151 finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; 152 if (finfo.type == VMA_SHMEM) 153 return; 154 155 /* Find owning device's queue/read_ahead_kb control */ 156 if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", 157 major(path_stat.st_dev), minor(path_stat.st_dev)) 158 >= sizeof(path)) { 159 printf("%s: Pathname is too long\n", __func__); 160 exit(EXIT_FAILURE); 161 } 162 if (read_file(path, buf, sizeof(buf)) < 0) { 163 perror("read_file(read_num)"); 164 exit(EXIT_FAILURE); 165 } 166 if (strstr(buf, "DEVTYPE=disk")) { 167 /* Found it */ 168 if (snprintf(finfo.dev_queue_read_ahead_path, 169 sizeof(finfo.dev_queue_read_ahead_path), 170 "/sys/dev/block/%d:%d/queue/read_ahead_kb", 171 major(path_stat.st_dev), minor(path_stat.st_dev)) 172 >= sizeof(finfo.dev_queue_read_ahead_path)) { 173 printf("%s: Pathname is too long\n", __func__); 174 exit(EXIT_FAILURE); 175 } 176 return; 177 } 178 if (!strstr(buf, "DEVTYPE=partition")) { 179 printf("%s: Unknown device type: %s\n", __func__, path); 180 exit(EXIT_FAILURE); 181 } 182 /* 183 * Partition of block device - need to find actual device. 184 * Using naming convention that devnameN is partition of 185 * device devname. 186 */ 187 str = strstr(buf, "DEVNAME="); 188 if (!str) { 189 printf("%s: Could not read: %s", __func__, path); 190 exit(EXIT_FAILURE); 191 } 192 str += 8; 193 end = str; 194 while (*end) { 195 if (isdigit(*end)) { 196 *end = '\0'; 197 if (snprintf(finfo.dev_queue_read_ahead_path, 198 sizeof(finfo.dev_queue_read_ahead_path), 199 "/sys/block/%s/queue/read_ahead_kb", 200 str) >= sizeof(finfo.dev_queue_read_ahead_path)) { 201 printf("%s: Pathname is too long\n", __func__); 202 exit(EXIT_FAILURE); 203 } 204 return; 205 } 206 ++end; 207 } 208 printf("%s: Could not read: %s\n", __func__, path); 209 exit(EXIT_FAILURE); 210 } 211 212 static bool check_swap(void *addr, unsigned long size) 213 { 214 bool swap = false; 215 int ret; 216 FILE *fp; 217 char buffer[MAX_LINE_LENGTH]; 218 char addr_pattern[MAX_LINE_LENGTH]; 219 220 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", 221 (unsigned long) addr); 222 if (ret >= MAX_LINE_LENGTH) { 223 printf("%s: Pattern is too long\n", __func__); 224 exit(EXIT_FAILURE); 225 } 226 227 228 fp = fopen(PID_SMAPS, "r"); 229 if (!fp) { 230 printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); 231 exit(EXIT_FAILURE); 232 } 233 if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) 234 goto err_out; 235 236 ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", 237 size >> 10); 238 if (ret >= MAX_LINE_LENGTH) { 239 printf("%s: Pattern is too long\n", __func__); 240 exit(EXIT_FAILURE); 241 } 242 /* 243 * Fetch the Swap: in the same block and check whether it got 244 * the expected number of hugeepages next. 245 */ 246 if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) 247 goto err_out; 248 249 if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) 250 goto err_out; 251 252 swap = true; 253 err_out: 254 fclose(fp); 255 return swap; 256 } 257 258 static void *alloc_mapping(int nr) 259 { 260 void *p; 261 262 p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, 263 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 264 if (p != BASE_ADDR) { 265 printf("Failed to allocate VMA at %p\n", BASE_ADDR); 266 exit(EXIT_FAILURE); 267 } 268 269 return p; 270 } 271 272 static void fill_memory(int *p, unsigned long start, unsigned long end) 273 { 274 int i; 275 276 for (i = start / page_size; i < end / page_size; i++) 277 p[i * page_size / sizeof(*p)] = i + 0xdead0000; 278 } 279 280 /* 281 * MADV_COLLAPSE is a best-effort request and may fail if an internal 282 * resource is temporarily unavailable, in which case it will set errno to 283 * EAGAIN. In such a case, immediately reattempt the operation one more 284 * time. 285 */ 286 static int madvise_collapse_retry(void *p, unsigned long size) 287 { 288 bool retry = true; 289 int ret; 290 291 retry: 292 ret = madvise(p, size, MADV_COLLAPSE); 293 if (ret && errno == EAGAIN && retry) { 294 retry = false; 295 goto retry; 296 } 297 return ret; 298 } 299 300 /* 301 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with 302 * validate_memory()'able contents. 303 */ 304 static void *alloc_hpage(struct mem_ops *ops) 305 { 306 void *p = ops->setup_area(1); 307 308 ops->fault(p, 0, hpage_pmd_size); 309 310 /* 311 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. 312 * The latter is ineligible for collapse by MADV_COLLAPSE 313 * while the former might cause MADV_COLLAPSE to race with 314 * khugepaged on low-load system (like a test machine), which 315 * would cause MADV_COLLAPSE to fail with EAGAIN. 316 */ 317 printf("Allocate huge page..."); 318 if (madvise_collapse_retry(p, hpage_pmd_size)) { 319 perror("madvise(MADV_COLLAPSE)"); 320 exit(EXIT_FAILURE); 321 } 322 if (!ops->check_huge(p, 1)) { 323 perror("madvise(MADV_COLLAPSE)"); 324 exit(EXIT_FAILURE); 325 } 326 if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { 327 perror("madvise(MADV_HUGEPAGE)"); 328 exit(EXIT_FAILURE); 329 } 330 success("OK"); 331 return p; 332 } 333 334 static void validate_memory(int *p, unsigned long start, unsigned long end) 335 { 336 int i; 337 338 for (i = start / page_size; i < end / page_size; i++) { 339 if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { 340 printf("Page %d is corrupted: %#x\n", 341 i, p[i * page_size / sizeof(*p)]); 342 exit(EXIT_FAILURE); 343 } 344 } 345 } 346 347 static void *anon_setup_area(int nr_hpages) 348 { 349 return alloc_mapping(nr_hpages); 350 } 351 352 static void anon_cleanup_area(void *p, unsigned long size) 353 { 354 munmap(p, size); 355 } 356 357 static void anon_fault(void *p, unsigned long start, unsigned long end) 358 { 359 fill_memory(p, start, end); 360 } 361 362 static bool anon_check_huge(void *addr, int nr_hpages) 363 { 364 return check_huge_anon(addr, nr_hpages, hpage_pmd_size); 365 } 366 367 static void *file_setup_area(int nr_hpages) 368 { 369 int fd; 370 void *p; 371 unsigned long size; 372 373 unlink(finfo.path); /* Cleanup from previous failed tests */ 374 printf("Creating %s for collapse%s...", finfo.path, 375 finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); 376 fd = open(finfo.path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL, 377 777); 378 if (fd < 0) { 379 perror("open()"); 380 exit(EXIT_FAILURE); 381 } 382 383 size = nr_hpages * hpage_pmd_size; 384 if (ftruncate(fd, size)) { 385 perror("ftruncate()"); 386 exit(EXIT_FAILURE); 387 } 388 p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, 389 MAP_SHARED, fd, 0); 390 if (p != BASE_ADDR) { 391 perror("mmap()"); 392 exit(EXIT_FAILURE); 393 } 394 fill_memory(p, 0, size); 395 if (msync(p, size, MS_SYNC)) { 396 perror("msync()"); 397 exit(EXIT_FAILURE); 398 } 399 close(fd); 400 munmap(p, size); 401 success("OK"); 402 403 printf("Opening %s read only for collapse...", finfo.path); 404 finfo.fd = open(finfo.path, O_RDONLY, 777); 405 if (finfo.fd < 0) { 406 perror("open()"); 407 exit(EXIT_FAILURE); 408 } 409 p = mmap(BASE_ADDR, size, PROT_READ, 410 MAP_PRIVATE, finfo.fd, 0); 411 if (p == MAP_FAILED || p != BASE_ADDR) { 412 perror("mmap()"); 413 exit(EXIT_FAILURE); 414 } 415 416 /* Drop page cache */ 417 write_file("/proc/sys/vm/drop_caches", "3", 2); 418 success("OK"); 419 return p; 420 } 421 422 static void file_cleanup_area(void *p, unsigned long size) 423 { 424 munmap(p, size); 425 close(finfo.fd); 426 unlink(finfo.path); 427 } 428 429 static void file_fault(void *p, unsigned long start, unsigned long end) 430 { 431 if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { 432 perror("madvise(MADV_POPULATE_READ"); 433 exit(EXIT_FAILURE); 434 } 435 } 436 437 static bool file_check_huge(void *addr, int nr_hpages) 438 { 439 switch (finfo.type) { 440 case VMA_FILE: 441 return check_huge_file(addr, nr_hpages, hpage_pmd_size); 442 case VMA_SHMEM: 443 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); 444 default: 445 exit(EXIT_FAILURE); 446 return false; 447 } 448 } 449 450 static void *shmem_setup_area(int nr_hpages) 451 { 452 void *p; 453 unsigned long size = nr_hpages * hpage_pmd_size; 454 455 finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); 456 if (finfo.fd < 0) { 457 perror("memfd_create()"); 458 exit(EXIT_FAILURE); 459 } 460 if (ftruncate(finfo.fd, size)) { 461 perror("ftruncate()"); 462 exit(EXIT_FAILURE); 463 } 464 p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, 465 0); 466 if (p != BASE_ADDR) { 467 perror("mmap()"); 468 exit(EXIT_FAILURE); 469 } 470 return p; 471 } 472 473 static void shmem_cleanup_area(void *p, unsigned long size) 474 { 475 munmap(p, size); 476 close(finfo.fd); 477 } 478 479 static bool shmem_check_huge(void *addr, int nr_hpages) 480 { 481 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); 482 } 483 484 static struct mem_ops __anon_ops = { 485 .setup_area = &anon_setup_area, 486 .cleanup_area = &anon_cleanup_area, 487 .fault = &anon_fault, 488 .check_huge = &anon_check_huge, 489 .name = "anon", 490 }; 491 492 static struct mem_ops __file_ops = { 493 .setup_area = &file_setup_area, 494 .cleanup_area = &file_cleanup_area, 495 .fault = &file_fault, 496 .check_huge = &file_check_huge, 497 .name = "file", 498 }; 499 500 static struct mem_ops __shmem_ops = { 501 .setup_area = &shmem_setup_area, 502 .cleanup_area = &shmem_cleanup_area, 503 .fault = &anon_fault, 504 .check_huge = &shmem_check_huge, 505 .name = "shmem", 506 }; 507 508 static void __madvise_collapse(const char *msg, char *p, int nr_hpages, 509 struct mem_ops *ops, bool expect) 510 { 511 int ret; 512 struct thp_settings settings = *thp_current_settings(); 513 514 printf("%s...", msg); 515 516 /* 517 * Prevent khugepaged interference and tests that MADV_COLLAPSE 518 * ignores /sys/kernel/mm/transparent_hugepage/enabled 519 */ 520 settings.thp_enabled = THP_NEVER; 521 settings.shmem_enabled = SHMEM_NEVER; 522 thp_push_settings(&settings); 523 524 /* Clear VM_NOHUGEPAGE */ 525 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); 526 ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); 527 if (((bool)ret) == expect) 528 fail("Fail: Bad return value"); 529 else if (!ops->check_huge(p, expect ? nr_hpages : 0)) 530 fail("Fail: check_huge()"); 531 else 532 success("OK"); 533 534 thp_pop_settings(); 535 } 536 537 static void madvise_collapse(const char *msg, char *p, int nr_hpages, 538 struct mem_ops *ops, bool expect) 539 { 540 /* Sanity check */ 541 if (!ops->check_huge(p, 0)) { 542 printf("Unexpected huge page\n"); 543 exit(EXIT_FAILURE); 544 } 545 __madvise_collapse(msg, p, nr_hpages, ops, expect); 546 } 547 548 #define TICK 500000 549 static bool wait_for_scan(const char *msg, char *p, int nr_hpages, 550 struct mem_ops *ops) 551 { 552 int full_scans; 553 int timeout = 6; /* 3 seconds */ 554 555 /* Sanity check */ 556 if (!ops->check_huge(p, 0)) { 557 printf("Unexpected huge page\n"); 558 exit(EXIT_FAILURE); 559 } 560 561 madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); 562 563 /* Wait until the second full_scan completed */ 564 full_scans = thp_read_num("khugepaged/full_scans") + 2; 565 566 printf("%s...", msg); 567 while (timeout--) { 568 if (ops->check_huge(p, nr_hpages)) 569 break; 570 if (thp_read_num("khugepaged/full_scans") >= full_scans) 571 break; 572 printf("."); 573 usleep(TICK); 574 } 575 576 return timeout == -1; 577 } 578 579 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, 580 struct mem_ops *ops, bool expect) 581 { 582 if (wait_for_scan(msg, p, nr_hpages, ops)) { 583 if (expect) 584 fail("Timeout"); 585 else 586 success("OK"); 587 return; 588 } 589 590 /* 591 * For file and shmem memory, khugepaged only retracts pte entries after 592 * putting the new hugepage in the page cache. The hugepage must be 593 * subsequently refaulted to install the pmd mapping for the mm. 594 */ 595 if (ops != &__anon_ops) 596 ops->fault(p, 0, nr_hpages * hpage_pmd_size); 597 598 if (ops->check_huge(p, expect ? nr_hpages : 0)) 599 success("OK"); 600 else 601 fail("Fail"); 602 } 603 604 static struct collapse_context __khugepaged_context = { 605 .collapse = &khugepaged_collapse, 606 .enforce_pte_scan_limits = true, 607 .name = "khugepaged", 608 }; 609 610 static struct collapse_context __madvise_context = { 611 .collapse = &madvise_collapse, 612 .enforce_pte_scan_limits = false, 613 .name = "madvise", 614 }; 615 616 static bool is_tmpfs(struct mem_ops *ops) 617 { 618 return ops == &__file_ops && finfo.type == VMA_SHMEM; 619 } 620 621 static bool is_anon(struct mem_ops *ops) 622 { 623 return ops == &__anon_ops; 624 } 625 626 static void alloc_at_fault(void) 627 { 628 struct thp_settings settings = *thp_current_settings(); 629 char *p; 630 631 settings.thp_enabled = THP_ALWAYS; 632 thp_push_settings(&settings); 633 634 p = alloc_mapping(1); 635 *p = 1; 636 printf("Allocate huge page on fault..."); 637 if (check_huge_anon(p, 1, hpage_pmd_size)) 638 success("OK"); 639 else 640 fail("Fail"); 641 642 thp_pop_settings(); 643 644 madvise(p, page_size, MADV_DONTNEED); 645 printf("Split huge PMD on MADV_DONTNEED..."); 646 if (check_huge_anon(p, 0, hpage_pmd_size)) 647 success("OK"); 648 else 649 fail("Fail"); 650 munmap(p, hpage_pmd_size); 651 } 652 653 static void collapse_full(struct collapse_context *c, struct mem_ops *ops) 654 { 655 void *p; 656 int nr_hpages = 4; 657 unsigned long size = nr_hpages * hpage_pmd_size; 658 659 p = ops->setup_area(nr_hpages); 660 ops->fault(p, 0, size); 661 c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, 662 ops, true); 663 validate_memory(p, 0, size); 664 ops->cleanup_area(p, size); 665 } 666 667 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) 668 { 669 void *p; 670 671 p = ops->setup_area(1); 672 c->collapse("Do not collapse empty PTE table", p, 1, ops, false); 673 ops->cleanup_area(p, hpage_pmd_size); 674 } 675 676 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) 677 { 678 void *p; 679 680 p = ops->setup_area(1); 681 ops->fault(p, 0, page_size); 682 c->collapse("Collapse PTE table with single PTE entry present", p, 683 1, ops, true); 684 ops->cleanup_area(p, hpage_pmd_size); 685 } 686 687 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) 688 { 689 int max_ptes_none = hpage_pmd_nr / 2; 690 struct thp_settings settings = *thp_current_settings(); 691 void *p; 692 int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1; 693 694 settings.khugepaged.max_ptes_none = max_ptes_none; 695 thp_push_settings(&settings); 696 697 p = ops->setup_area(1); 698 699 if (is_tmpfs(ops)) { 700 /* shmem pages always in the page cache */ 701 printf("tmpfs..."); 702 skip("Skip"); 703 goto skip; 704 } 705 706 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); 707 c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, 708 ops, !c->enforce_pte_scan_limits); 709 validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); 710 711 if (c->enforce_pte_scan_limits) { 712 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); 713 c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, 714 true); 715 validate_memory(p, 0, 716 (hpage_pmd_nr - max_ptes_none) * page_size); 717 } 718 skip: 719 ops->cleanup_area(p, hpage_pmd_size); 720 thp_pop_settings(); 721 } 722 723 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) 724 { 725 void *p; 726 727 p = ops->setup_area(1); 728 ops->fault(p, 0, hpage_pmd_size); 729 730 printf("Swapout one page..."); 731 if (madvise(p, page_size, MADV_PAGEOUT)) { 732 perror("madvise(MADV_PAGEOUT)"); 733 exit(EXIT_FAILURE); 734 } 735 if (check_swap(p, page_size)) { 736 success("OK"); 737 } else { 738 fail("Fail"); 739 goto out; 740 } 741 742 c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, 743 true); 744 validate_memory(p, 0, hpage_pmd_size); 745 out: 746 ops->cleanup_area(p, hpage_pmd_size); 747 } 748 749 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) 750 { 751 int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"); 752 void *p; 753 754 p = ops->setup_area(1); 755 ops->fault(p, 0, hpage_pmd_size); 756 757 printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); 758 if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { 759 perror("madvise(MADV_PAGEOUT)"); 760 exit(EXIT_FAILURE); 761 } 762 if (check_swap(p, (max_ptes_swap + 1) * page_size)) { 763 success("OK"); 764 } else { 765 fail("Fail"); 766 goto out; 767 } 768 769 c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, 770 !c->enforce_pte_scan_limits); 771 validate_memory(p, 0, hpage_pmd_size); 772 773 if (c->enforce_pte_scan_limits) { 774 ops->fault(p, 0, hpage_pmd_size); 775 printf("Swapout %d of %d pages...", max_ptes_swap, 776 hpage_pmd_nr); 777 if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { 778 perror("madvise(MADV_PAGEOUT)"); 779 exit(EXIT_FAILURE); 780 } 781 if (check_swap(p, max_ptes_swap * page_size)) { 782 success("OK"); 783 } else { 784 fail("Fail"); 785 goto out; 786 } 787 788 c->collapse("Collapse with max_ptes_swap pages swapped out", p, 789 1, ops, true); 790 validate_memory(p, 0, hpage_pmd_size); 791 } 792 out: 793 ops->cleanup_area(p, hpage_pmd_size); 794 } 795 796 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) 797 { 798 void *p; 799 800 p = alloc_hpage(ops); 801 802 if (is_tmpfs(ops)) { 803 /* MADV_DONTNEED won't evict tmpfs pages */ 804 printf("tmpfs..."); 805 skip("Skip"); 806 goto skip; 807 } 808 809 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 810 printf("Split huge page leaving single PTE mapping compound page..."); 811 madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); 812 if (ops->check_huge(p, 0)) 813 success("OK"); 814 else 815 fail("Fail"); 816 817 c->collapse("Collapse PTE table with single PTE mapping compound page", 818 p, 1, ops, true); 819 validate_memory(p, 0, page_size); 820 skip: 821 ops->cleanup_area(p, hpage_pmd_size); 822 } 823 824 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) 825 { 826 void *p; 827 828 p = alloc_hpage(ops); 829 printf("Split huge page leaving single PTE page table full of compound pages..."); 830 madvise(p, page_size, MADV_NOHUGEPAGE); 831 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 832 if (ops->check_huge(p, 0)) 833 success("OK"); 834 else 835 fail("Fail"); 836 837 c->collapse("Collapse PTE table full of compound pages", p, 1, ops, 838 true); 839 validate_memory(p, 0, hpage_pmd_size); 840 ops->cleanup_area(p, hpage_pmd_size); 841 } 842 843 static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) 844 { 845 void *p; 846 int i; 847 848 p = ops->setup_area(1); 849 for (i = 0; i < hpage_pmd_nr; i++) { 850 printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", 851 i + 1, hpage_pmd_nr); 852 853 madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); 854 ops->fault(BASE_ADDR, 0, hpage_pmd_size); 855 if (!ops->check_huge(BASE_ADDR, 1)) { 856 printf("Failed to allocate huge page\n"); 857 exit(EXIT_FAILURE); 858 } 859 madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); 860 861 p = mremap(BASE_ADDR - i * page_size, 862 i * page_size + hpage_pmd_size, 863 (i + 1) * page_size, 864 MREMAP_MAYMOVE | MREMAP_FIXED, 865 BASE_ADDR + 2 * hpage_pmd_size); 866 if (p == MAP_FAILED) { 867 perror("mremap+unmap"); 868 exit(EXIT_FAILURE); 869 } 870 871 p = mremap(BASE_ADDR + 2 * hpage_pmd_size, 872 (i + 1) * page_size, 873 (i + 1) * page_size + hpage_pmd_size, 874 MREMAP_MAYMOVE | MREMAP_FIXED, 875 BASE_ADDR - (i + 1) * page_size); 876 if (p == MAP_FAILED) { 877 perror("mremap+alloc"); 878 exit(EXIT_FAILURE); 879 } 880 } 881 882 ops->cleanup_area(BASE_ADDR, hpage_pmd_size); 883 ops->fault(p, 0, hpage_pmd_size); 884 if (!ops->check_huge(p, 1)) 885 success("OK"); 886 else 887 fail("Fail"); 888 889 c->collapse("Collapse PTE table full of different compound pages", p, 1, 890 ops, true); 891 892 validate_memory(p, 0, hpage_pmd_size); 893 ops->cleanup_area(p, hpage_pmd_size); 894 } 895 896 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) 897 { 898 int wstatus; 899 void *p; 900 901 p = ops->setup_area(1); 902 903 printf("Allocate small page..."); 904 ops->fault(p, 0, page_size); 905 if (ops->check_huge(p, 0)) 906 success("OK"); 907 else 908 fail("Fail"); 909 910 printf("Share small page over fork()..."); 911 if (!fork()) { 912 /* Do not touch settings on child exit */ 913 skip_settings_restore = true; 914 exit_status = 0; 915 916 if (ops->check_huge(p, 0)) 917 success("OK"); 918 else 919 fail("Fail"); 920 921 ops->fault(p, page_size, 2 * page_size); 922 c->collapse("Collapse PTE table with single page shared with parent process", 923 p, 1, ops, true); 924 925 validate_memory(p, 0, page_size); 926 ops->cleanup_area(p, hpage_pmd_size); 927 exit(exit_status); 928 } 929 930 wait(&wstatus); 931 exit_status += WEXITSTATUS(wstatus); 932 933 printf("Check if parent still has small page..."); 934 if (ops->check_huge(p, 0)) 935 success("OK"); 936 else 937 fail("Fail"); 938 validate_memory(p, 0, page_size); 939 ops->cleanup_area(p, hpage_pmd_size); 940 } 941 942 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) 943 { 944 int wstatus; 945 void *p; 946 947 p = alloc_hpage(ops); 948 printf("Share huge page over fork()..."); 949 if (!fork()) { 950 /* Do not touch settings on child exit */ 951 skip_settings_restore = true; 952 exit_status = 0; 953 954 if (ops->check_huge(p, 1)) 955 success("OK"); 956 else 957 fail("Fail"); 958 959 printf("Split huge page PMD in child process..."); 960 madvise(p, page_size, MADV_NOHUGEPAGE); 961 madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); 962 if (ops->check_huge(p, 0)) 963 success("OK"); 964 else 965 fail("Fail"); 966 ops->fault(p, 0, page_size); 967 968 thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); 969 c->collapse("Collapse PTE table full of compound pages in child", 970 p, 1, ops, true); 971 thp_write_num("khugepaged/max_ptes_shared", 972 thp_current_settings()->khugepaged.max_ptes_shared); 973 974 validate_memory(p, 0, hpage_pmd_size); 975 ops->cleanup_area(p, hpage_pmd_size); 976 exit(exit_status); 977 } 978 979 wait(&wstatus); 980 exit_status += WEXITSTATUS(wstatus); 981 982 printf("Check if parent still has huge page..."); 983 if (ops->check_huge(p, 1)) 984 success("OK"); 985 else 986 fail("Fail"); 987 validate_memory(p, 0, hpage_pmd_size); 988 ops->cleanup_area(p, hpage_pmd_size); 989 } 990 991 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) 992 { 993 int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); 994 int wstatus; 995 void *p; 996 997 p = alloc_hpage(ops); 998 printf("Share huge page over fork()..."); 999 if (!fork()) { 1000 /* Do not touch settings on child exit */ 1001 skip_settings_restore = true; 1002 exit_status = 0; 1003 1004 if (ops->check_huge(p, 1)) 1005 success("OK"); 1006 else 1007 fail("Fail"); 1008 1009 printf("Trigger CoW on page %d of %d...", 1010 hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); 1011 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); 1012 if (ops->check_huge(p, 0)) 1013 success("OK"); 1014 else 1015 fail("Fail"); 1016 1017 c->collapse("Maybe collapse with max_ptes_shared exceeded", p, 1018 1, ops, !c->enforce_pte_scan_limits); 1019 1020 if (c->enforce_pte_scan_limits) { 1021 printf("Trigger CoW on page %d of %d...", 1022 hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); 1023 ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * 1024 page_size); 1025 if (ops->check_huge(p, 0)) 1026 success("OK"); 1027 else 1028 fail("Fail"); 1029 1030 c->collapse("Collapse with max_ptes_shared PTEs shared", 1031 p, 1, ops, true); 1032 } 1033 1034 validate_memory(p, 0, hpage_pmd_size); 1035 ops->cleanup_area(p, hpage_pmd_size); 1036 exit(exit_status); 1037 } 1038 1039 wait(&wstatus); 1040 exit_status += WEXITSTATUS(wstatus); 1041 1042 printf("Check if parent still has huge page..."); 1043 if (ops->check_huge(p, 1)) 1044 success("OK"); 1045 else 1046 fail("Fail"); 1047 validate_memory(p, 0, hpage_pmd_size); 1048 ops->cleanup_area(p, hpage_pmd_size); 1049 } 1050 1051 static void madvise_collapse_existing_thps(struct collapse_context *c, 1052 struct mem_ops *ops) 1053 { 1054 void *p; 1055 1056 p = ops->setup_area(1); 1057 ops->fault(p, 0, hpage_pmd_size); 1058 c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); 1059 validate_memory(p, 0, hpage_pmd_size); 1060 1061 /* c->collapse() will find a hugepage and complain - call directly. */ 1062 __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); 1063 validate_memory(p, 0, hpage_pmd_size); 1064 ops->cleanup_area(p, hpage_pmd_size); 1065 } 1066 1067 /* 1068 * Test race with khugepaged where page tables have been retracted and 1069 * pmd cleared. 1070 */ 1071 static void madvise_retracted_page_tables(struct collapse_context *c, 1072 struct mem_ops *ops) 1073 { 1074 void *p; 1075 int nr_hpages = 1; 1076 unsigned long size = nr_hpages * hpage_pmd_size; 1077 1078 p = ops->setup_area(nr_hpages); 1079 ops->fault(p, 0, size); 1080 1081 /* Let khugepaged collapse and leave pmd cleared */ 1082 if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, 1083 ops)) { 1084 fail("Timeout"); 1085 return; 1086 } 1087 success("OK"); 1088 c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, 1089 true); 1090 validate_memory(p, 0, size); 1091 ops->cleanup_area(p, size); 1092 } 1093 1094 static void usage(void) 1095 { 1096 fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n"); 1097 fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); 1098 fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); 1099 fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); 1100 fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); 1101 fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); 1102 fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); 1103 fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); 1104 fprintf(stderr, "\tmounted with huge=advise option for khugepaged tests to work\n"); 1105 fprintf(stderr, "\n\tSupported Options:\n"); 1106 fprintf(stderr, "\t\t-h: This help message.\n"); 1107 fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); 1108 fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n"); 1109 exit(1); 1110 } 1111 1112 static void parse_test_type(int argc, char **argv) 1113 { 1114 int opt; 1115 char *buf; 1116 const char *token; 1117 1118 while ((opt = getopt(argc, argv, "s:h")) != -1) { 1119 switch (opt) { 1120 case 's': 1121 anon_order = atoi(optarg); 1122 break; 1123 case 'h': 1124 default: 1125 usage(); 1126 } 1127 } 1128 1129 argv += optind; 1130 argc -= optind; 1131 1132 if (argc == 0) { 1133 /* Backwards compatibility */ 1134 khugepaged_context = &__khugepaged_context; 1135 madvise_context = &__madvise_context; 1136 anon_ops = &__anon_ops; 1137 return; 1138 } 1139 1140 buf = strdup(argv[0]); 1141 token = strsep(&buf, ":"); 1142 1143 if (!strcmp(token, "all")) { 1144 khugepaged_context = &__khugepaged_context; 1145 madvise_context = &__madvise_context; 1146 } else if (!strcmp(token, "khugepaged")) { 1147 khugepaged_context = &__khugepaged_context; 1148 } else if (!strcmp(token, "madvise")) { 1149 madvise_context = &__madvise_context; 1150 } else { 1151 usage(); 1152 } 1153 1154 if (!buf) 1155 usage(); 1156 1157 if (!strcmp(buf, "all")) { 1158 file_ops = &__file_ops; 1159 anon_ops = &__anon_ops; 1160 shmem_ops = &__shmem_ops; 1161 } else if (!strcmp(buf, "anon")) { 1162 anon_ops = &__anon_ops; 1163 } else if (!strcmp(buf, "file")) { 1164 file_ops = &__file_ops; 1165 } else if (!strcmp(buf, "shmem")) { 1166 shmem_ops = &__shmem_ops; 1167 } else { 1168 usage(); 1169 } 1170 1171 if (!file_ops) 1172 return; 1173 1174 if (argc != 2) 1175 usage(); 1176 1177 get_finfo(argv[1]); 1178 } 1179 1180 int main(int argc, char **argv) 1181 { 1182 int hpage_pmd_order; 1183 struct thp_settings default_settings = { 1184 .thp_enabled = THP_MADVISE, 1185 .thp_defrag = THP_DEFRAG_ALWAYS, 1186 .shmem_enabled = SHMEM_ADVISE, 1187 .use_zero_page = 0, 1188 .khugepaged = { 1189 .defrag = 1, 1190 .alloc_sleep_millisecs = 10, 1191 .scan_sleep_millisecs = 10, 1192 }, 1193 /* 1194 * When testing file-backed memory, the collapse path 1195 * looks at how many pages are found in the page cache, not 1196 * what pages are mapped. Disable read ahead optimization so 1197 * pages don't find their way into the page cache unless 1198 * we mem_ops->fault() them in. 1199 */ 1200 .read_ahead_kb = 0, 1201 }; 1202 1203 if (!thp_is_enabled()) { 1204 printf("Transparent Hugepages not available\n"); 1205 return KSFT_SKIP; 1206 } 1207 1208 parse_test_type(argc, argv); 1209 1210 setbuf(stdout, NULL); 1211 1212 page_size = getpagesize(); 1213 hpage_pmd_size = read_pmd_pagesize(); 1214 if (!hpage_pmd_size) { 1215 printf("Reading PMD pagesize failed"); 1216 exit(EXIT_FAILURE); 1217 } 1218 hpage_pmd_nr = hpage_pmd_size / page_size; 1219 hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); 1220 1221 default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; 1222 default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; 1223 default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; 1224 default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; 1225 default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; 1226 default_settings.hugepages[anon_order].enabled = THP_ALWAYS; 1227 default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT; 1228 default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS; 1229 1230 save_settings(); 1231 thp_push_settings(&default_settings); 1232 1233 alloc_at_fault(); 1234 1235 #define TEST(t, c, o) do { \ 1236 if (c && o) { \ 1237 printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ 1238 t(c, o); \ 1239 } \ 1240 } while (0) 1241 1242 TEST(collapse_full, khugepaged_context, anon_ops); 1243 TEST(collapse_full, khugepaged_context, file_ops); 1244 TEST(collapse_full, khugepaged_context, shmem_ops); 1245 TEST(collapse_full, madvise_context, anon_ops); 1246 TEST(collapse_full, madvise_context, file_ops); 1247 TEST(collapse_full, madvise_context, shmem_ops); 1248 1249 TEST(collapse_empty, khugepaged_context, anon_ops); 1250 TEST(collapse_empty, madvise_context, anon_ops); 1251 1252 TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); 1253 TEST(collapse_single_pte_entry, khugepaged_context, file_ops); 1254 TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); 1255 TEST(collapse_single_pte_entry, madvise_context, anon_ops); 1256 TEST(collapse_single_pte_entry, madvise_context, file_ops); 1257 TEST(collapse_single_pte_entry, madvise_context, shmem_ops); 1258 1259 TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); 1260 TEST(collapse_max_ptes_none, khugepaged_context, file_ops); 1261 TEST(collapse_max_ptes_none, madvise_context, anon_ops); 1262 TEST(collapse_max_ptes_none, madvise_context, file_ops); 1263 1264 TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); 1265 TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); 1266 TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); 1267 TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); 1268 1269 TEST(collapse_full_of_compound, khugepaged_context, anon_ops); 1270 TEST(collapse_full_of_compound, khugepaged_context, file_ops); 1271 TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); 1272 TEST(collapse_full_of_compound, madvise_context, anon_ops); 1273 TEST(collapse_full_of_compound, madvise_context, file_ops); 1274 TEST(collapse_full_of_compound, madvise_context, shmem_ops); 1275 1276 TEST(collapse_compound_extreme, khugepaged_context, anon_ops); 1277 TEST(collapse_compound_extreme, madvise_context, anon_ops); 1278 1279 TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); 1280 TEST(collapse_swapin_single_pte, madvise_context, anon_ops); 1281 1282 TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); 1283 TEST(collapse_max_ptes_swap, madvise_context, anon_ops); 1284 1285 TEST(collapse_fork, khugepaged_context, anon_ops); 1286 TEST(collapse_fork, madvise_context, anon_ops); 1287 1288 TEST(collapse_fork_compound, khugepaged_context, anon_ops); 1289 TEST(collapse_fork_compound, madvise_context, anon_ops); 1290 1291 TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); 1292 TEST(collapse_max_ptes_shared, madvise_context, anon_ops); 1293 1294 TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); 1295 TEST(madvise_collapse_existing_thps, madvise_context, file_ops); 1296 TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); 1297 1298 TEST(madvise_retracted_page_tables, madvise_context, file_ops); 1299 TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); 1300 1301 restore_settings(0); 1302 } 1303