// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}
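
/*
 * Child function that takes a R/O pin on the memory via vmsplice(), unmaps
 * it, waits for the parent to modify the memory, and then checks whether
 * the pipe still reads the original content.
 */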
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
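
/*
 * Fork a child that runs "fn" on the memory. Once the child is ready,
 * modify the memory in the parent and let the child check whether the
 * modification leaked into it.
 */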
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}
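
/*
 * Take a R/O pin via vmsplice() in the parent, either before or after
 * fork(), then unmap the memory, let the child modify it and quit, and
 * verify that the parent still reads the original content via the pipe.
 */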
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
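
/*
 * Register the memory as an io_uring fixed buffer (R/W longterm pin) and
 * verify that a modification via the page table is observed when writing
 * the fixed buffer to a file, optionally with a child alive during the
 * test.
 */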
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};
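
/*
 * Take a R/O longterm pin via the gup_test debugfs interface, modify the
 * memory via the page table, and verify that the modification is visible
 * through the pin: the pin must not end up on a COW-shared page.
 */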
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);
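
/*
 * Run the test on a single, freshly mmap'ed anonymous base page,
 * optionally swapping the page out via MADV_PAGEOUT first.
 */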
static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not available on this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};
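
/*
 * Run the test on a THP-backed area, first bringing the THP into the
 * state selected by "thp_run": PMD-mapped, PTE-mapped, partially
 * mremap()'ed, partially shared, and/or swapped out.
 */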
static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};
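
/*
 * PTE-map a THP, collapse it via MADV_COLLAPSE before or after COW-sharing
 * (parts of) it with a child, and check that modifying it in the parent
 * does not leak into the child.
 */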
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}
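
/*
 * Run the test with both mappings backed by the huge shared zeropage,
 * which requires a PMD-aligned area and MADV_HUGEPAGE.
 */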
static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}
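
/*
 * Same as run_with_memfd(), but backed by an unlinked temporary file
 * instead of a memfd.
 */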
static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the file. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}