1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * COW (Copy On Write) tests. 4 * 5 * Copyright 2022, Red Hat, Inc. 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 #define _GNU_SOURCE 10 #include <stdlib.h> 11 #include <string.h> 12 #include <stdbool.h> 13 #include <stdint.h> 14 #include <unistd.h> 15 #include <errno.h> 16 #include <fcntl.h> 17 #include <assert.h> 18 #include <linux/mman.h> 19 #include <sys/mman.h> 20 #include <sys/ioctl.h> 21 #include <sys/wait.h> 22 #include <linux/memfd.h> 23 24 #include "local_config.h" 25 #ifdef LOCAL_CONFIG_HAVE_LIBURING 26 #include <liburing.h> 27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */ 28 29 #include "../../../../mm/gup_test.h" 30 #include "../kselftest.h" 31 #include "vm_util.h" 32 #include "thp_settings.h" 33 34 static size_t pagesize; 35 static int pagemap_fd; 36 static size_t pmdsize; 37 static int nr_thpsizes; 38 static size_t thpsizes[20]; 39 static int nr_hugetlbsizes; 40 static size_t hugetlbsizes[10]; 41 static int gup_fd; 42 static bool has_huge_zeropage; 43 44 static int sz2ord(size_t size) 45 { 46 return __builtin_ctzll(size / pagesize); 47 } 48 49 static int detect_thp_sizes(size_t sizes[], int max) 50 { 51 int count = 0; 52 unsigned long orders; 53 size_t kb; 54 int i; 55 56 /* thp not supported at all. */ 57 if (!pmdsize) 58 return 0; 59 60 orders = 1UL << sz2ord(pmdsize); 61 orders |= thp_supported_orders(); 62 63 for (i = 0; orders && count < max; i++) { 64 if (!(orders & (1UL << i))) 65 continue; 66 orders &= ~(1UL << i); 67 kb = (pagesize >> 10) << i; 68 sizes[count++] = kb * 1024; 69 ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); 70 } 71 72 return count; 73 } 74 75 static void detect_huge_zeropage(void) 76 { 77 int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", 78 O_RDONLY); 79 size_t enabled = 0; 80 char buf[15]; 81 int ret; 82 83 if (fd < 0) 84 return; 85 86 ret = pread(fd, buf, sizeof(buf), 0); 87 if (ret > 0 && ret < sizeof(buf)) { 88 buf[ret] = 0; 89 90 enabled = strtoul(buf, NULL, 10); 91 if (enabled == 1) { 92 has_huge_zeropage = true; 93 ksft_print_msg("[INFO] huge zeropage is enabled\n"); 94 } 95 } 96 97 close(fd); 98 } 99 100 static bool range_is_swapped(void *addr, size_t size) 101 { 102 for (; size; addr += pagesize, size -= pagesize) 103 if (!pagemap_is_swapped(pagemap_fd, addr)) 104 return false; 105 return true; 106 } 107 108 struct comm_pipes { 109 int child_ready[2]; 110 int parent_ready[2]; 111 }; 112 113 static int setup_comm_pipes(struct comm_pipes *comm_pipes) 114 { 115 if (pipe(comm_pipes->child_ready) < 0) 116 return -errno; 117 if (pipe(comm_pipes->parent_ready) < 0) { 118 close(comm_pipes->child_ready[0]); 119 close(comm_pipes->child_ready[1]); 120 return -errno; 121 } 122 123 return 0; 124 } 125 126 static void close_comm_pipes(struct comm_pipes *comm_pipes) 127 { 128 close(comm_pipes->child_ready[0]); 129 close(comm_pipes->child_ready[1]); 130 close(comm_pipes->parent_ready[0]); 131 close(comm_pipes->parent_ready[1]); 132 } 133 134 static int child_memcmp_fn(char *mem, size_t size, 135 struct comm_pipes *comm_pipes) 136 { 137 char *old = malloc(size); 138 char buf; 139 140 /* Backup the original content. */ 141 memcpy(old, mem, size); 142 143 /* Wait until the parent modified the page. */ 144 write(comm_pipes->child_ready[1], "0", 1); 145 while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) 146 ; 147 148 /* See if we still read the old values. */ 149 return memcmp(old, mem, size); 150 } 151 152 static int child_vmsplice_memcmp_fn(char *mem, size_t size, 153 struct comm_pipes *comm_pipes) 154 { 155 struct iovec iov = { 156 .iov_base = mem, 157 .iov_len = size, 158 }; 159 ssize_t cur, total, transferred; 160 char *old, *new; 161 int fds[2]; 162 char buf; 163 164 old = malloc(size); 165 new = malloc(size); 166 167 /* Backup the original content. */ 168 memcpy(old, mem, size); 169 170 if (pipe(fds) < 0) 171 return -errno; 172 173 /* Trigger a read-only pin. */ 174 transferred = vmsplice(fds[1], &iov, 1, 0); 175 if (transferred < 0) 176 return -errno; 177 if (transferred == 0) 178 return -EINVAL; 179 180 /* Unmap it from our page tables. */ 181 if (munmap(mem, size) < 0) 182 return -errno; 183 184 /* Wait until the parent modified it. */ 185 write(comm_pipes->child_ready[1], "0", 1); 186 while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) 187 ; 188 189 /* See if we still read the old values via the pipe. */ 190 for (total = 0; total < transferred; total += cur) { 191 cur = read(fds[0], new + total, transferred - total); 192 if (cur < 0) 193 return -errno; 194 } 195 196 return memcmp(old, new, transferred); 197 } 198 199 typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); 200 201 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, 202 child_fn fn, bool xfail) 203 { 204 struct comm_pipes comm_pipes; 205 char buf; 206 int ret; 207 208 ret = setup_comm_pipes(&comm_pipes); 209 if (ret) { 210 ksft_test_result_fail("pipe() failed\n"); 211 return; 212 } 213 214 ret = fork(); 215 if (ret < 0) { 216 ksft_test_result_fail("fork() failed\n"); 217 goto close_comm_pipes; 218 } else if (!ret) { 219 exit(fn(mem, size, &comm_pipes)); 220 } 221 222 while (read(comm_pipes.child_ready[0], &buf, 1) != 1) 223 ; 224 225 if (do_mprotect) { 226 /* 227 * mprotect() optimizations might try avoiding 228 * write-faults by directly mapping pages writable. 229 */ 230 ret = mprotect(mem, size, PROT_READ); 231 ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); 232 if (ret) { 233 ksft_test_result_fail("mprotect() failed\n"); 234 write(comm_pipes.parent_ready[1], "0", 1); 235 wait(&ret); 236 goto close_comm_pipes; 237 } 238 } 239 240 /* Modify the page. */ 241 memset(mem, 0xff, size); 242 write(comm_pipes.parent_ready[1], "0", 1); 243 244 wait(&ret); 245 if (WIFEXITED(ret)) 246 ret = WEXITSTATUS(ret); 247 else 248 ret = -EINVAL; 249 250 if (!ret) { 251 ksft_test_result_pass("No leak from parent into child\n"); 252 } else if (xfail) { 253 /* 254 * With hugetlb, some vmsplice() tests are currently expected to 255 * fail because (a) harder to fix and (b) nobody really cares. 256 * Flag them as expected failure for now. 257 */ 258 ksft_test_result_xfail("Leak from parent into child\n"); 259 } else { 260 ksft_test_result_fail("Leak from parent into child\n"); 261 } 262 close_comm_pipes: 263 close_comm_pipes(&comm_pipes); 264 } 265 266 static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) 267 { 268 do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); 269 } 270 271 static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) 272 { 273 do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); 274 } 275 276 static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) 277 { 278 do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, 279 is_hugetlb); 280 } 281 282 static void test_vmsplice_in_child_mprotect(char *mem, size_t size, 283 bool is_hugetlb) 284 { 285 do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, 286 is_hugetlb); 287 } 288 289 static void do_test_vmsplice_in_parent(char *mem, size_t size, 290 bool before_fork, bool xfail) 291 { 292 struct iovec iov = { 293 .iov_base = mem, 294 .iov_len = size, 295 }; 296 ssize_t cur, total, transferred; 297 struct comm_pipes comm_pipes; 298 char *old, *new; 299 int ret, fds[2]; 300 char buf; 301 302 old = malloc(size); 303 new = malloc(size); 304 305 memcpy(old, mem, size); 306 307 ret = setup_comm_pipes(&comm_pipes); 308 if (ret) { 309 ksft_test_result_fail("pipe() failed\n"); 310 goto free; 311 } 312 313 if (pipe(fds) < 0) { 314 ksft_test_result_fail("pipe() failed\n"); 315 goto close_comm_pipes; 316 } 317 318 if (before_fork) { 319 transferred = vmsplice(fds[1], &iov, 1, 0); 320 if (transferred <= 0) { 321 ksft_test_result_fail("vmsplice() failed\n"); 322 goto close_pipe; 323 } 324 } 325 326 ret = fork(); 327 if (ret < 0) { 328 ksft_test_result_fail("fork() failed\n"); 329 goto close_pipe; 330 } else if (!ret) { 331 write(comm_pipes.child_ready[1], "0", 1); 332 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) 333 ; 334 /* Modify page content in the child. */ 335 memset(mem, 0xff, size); 336 exit(0); 337 } 338 339 if (!before_fork) { 340 transferred = vmsplice(fds[1], &iov, 1, 0); 341 if (transferred <= 0) { 342 ksft_test_result_fail("vmsplice() failed\n"); 343 wait(&ret); 344 goto close_pipe; 345 } 346 } 347 348 while (read(comm_pipes.child_ready[0], &buf, 1) != 1) 349 ; 350 if (munmap(mem, size) < 0) { 351 ksft_test_result_fail("munmap() failed\n"); 352 goto close_pipe; 353 } 354 write(comm_pipes.parent_ready[1], "0", 1); 355 356 /* Wait until the child is done writing. */ 357 wait(&ret); 358 if (!WIFEXITED(ret)) { 359 ksft_test_result_fail("wait() failed\n"); 360 goto close_pipe; 361 } 362 363 /* See if we still read the old values. */ 364 for (total = 0; total < transferred; total += cur) { 365 cur = read(fds[0], new + total, transferred - total); 366 if (cur < 0) { 367 ksft_test_result_fail("read() failed\n"); 368 goto close_pipe; 369 } 370 } 371 372 if (!memcmp(old, new, transferred)) { 373 ksft_test_result_pass("No leak from child into parent\n"); 374 } else if (xfail) { 375 /* 376 * With hugetlb, some vmsplice() tests are currently expected to 377 * fail because (a) harder to fix and (b) nobody really cares. 378 * Flag them as expected failure for now. 379 */ 380 ksft_test_result_xfail("Leak from child into parent\n"); 381 } else { 382 ksft_test_result_fail("Leak from child into parent\n"); 383 } 384 close_pipe: 385 close(fds[0]); 386 close(fds[1]); 387 close_comm_pipes: 388 close_comm_pipes(&comm_pipes); 389 free: 390 free(old); 391 free(new); 392 } 393 394 static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) 395 { 396 do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); 397 } 398 399 static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) 400 { 401 do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); 402 } 403 404 #ifdef LOCAL_CONFIG_HAVE_LIBURING 405 static void do_test_iouring(char *mem, size_t size, bool use_fork) 406 { 407 struct comm_pipes comm_pipes; 408 struct io_uring_cqe *cqe; 409 struct io_uring_sqe *sqe; 410 struct io_uring ring; 411 ssize_t cur, total; 412 struct iovec iov; 413 char *buf, *tmp; 414 int ret, fd; 415 FILE *file; 416 417 ret = setup_comm_pipes(&comm_pipes); 418 if (ret) { 419 ksft_test_result_fail("pipe() failed\n"); 420 return; 421 } 422 423 file = tmpfile(); 424 if (!file) { 425 ksft_test_result_fail("tmpfile() failed\n"); 426 goto close_comm_pipes; 427 } 428 fd = fileno(file); 429 assert(fd); 430 431 tmp = malloc(size); 432 if (!tmp) { 433 ksft_test_result_fail("malloc() failed\n"); 434 goto close_file; 435 } 436 437 /* Skip on errors, as we might just lack kernel support. */ 438 ret = io_uring_queue_init(1, &ring, 0); 439 if (ret < 0) { 440 ksft_test_result_skip("io_uring_queue_init() failed\n"); 441 goto free_tmp; 442 } 443 444 /* 445 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN 446 * | FOLL_LONGTERM the range. 447 * 448 * Skip on errors, as we might just lack kernel support or might not 449 * have sufficient MEMLOCK permissions. 450 */ 451 iov.iov_base = mem; 452 iov.iov_len = size; 453 ret = io_uring_register_buffers(&ring, &iov, 1); 454 if (ret) { 455 ksft_test_result_skip("io_uring_register_buffers() failed\n"); 456 goto queue_exit; 457 } 458 459 if (use_fork) { 460 /* 461 * fork() and keep the child alive until we're done. Note that 462 * we expect the pinned page to not get shared with the child. 463 */ 464 ret = fork(); 465 if (ret < 0) { 466 ksft_test_result_fail("fork() failed\n"); 467 goto unregister_buffers; 468 } else if (!ret) { 469 write(comm_pipes.child_ready[1], "0", 1); 470 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) 471 ; 472 exit(0); 473 } 474 475 while (read(comm_pipes.child_ready[0], &buf, 1) != 1) 476 ; 477 } else { 478 /* 479 * Map the page R/O into the page table. Enable softdirty 480 * tracking to stop the page from getting mapped R/W immediately 481 * again by mprotect() optimizations. Note that we don't have an 482 * easy way to test if that worked (the pagemap does not export 483 * if the page is mapped R/O vs. R/W). 484 */ 485 ret = mprotect(mem, size, PROT_READ); 486 clear_softdirty(); 487 ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); 488 if (ret) { 489 ksft_test_result_fail("mprotect() failed\n"); 490 goto unregister_buffers; 491 } 492 } 493 494 /* 495 * Modify the page and write page content as observed by the fixed 496 * buffer pin to the file so we can verify it. 497 */ 498 memset(mem, 0xff, size); 499 sqe = io_uring_get_sqe(&ring); 500 if (!sqe) { 501 ksft_test_result_fail("io_uring_get_sqe() failed\n"); 502 goto quit_child; 503 } 504 io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); 505 506 ret = io_uring_submit(&ring); 507 if (ret < 0) { 508 ksft_test_result_fail("io_uring_submit() failed\n"); 509 goto quit_child; 510 } 511 512 ret = io_uring_wait_cqe(&ring, &cqe); 513 if (ret < 0) { 514 ksft_test_result_fail("io_uring_wait_cqe() failed\n"); 515 goto quit_child; 516 } 517 518 if (cqe->res != size) { 519 ksft_test_result_fail("write_fixed failed\n"); 520 goto quit_child; 521 } 522 io_uring_cqe_seen(&ring, cqe); 523 524 /* Read back the file content to the temporary buffer. */ 525 total = 0; 526 while (total < size) { 527 cur = pread(fd, tmp + total, size - total, total); 528 if (cur < 0) { 529 ksft_test_result_fail("pread() failed\n"); 530 goto quit_child; 531 } 532 total += cur; 533 } 534 535 /* Finally, check if we read what we expected. */ 536 ksft_test_result(!memcmp(mem, tmp, size), 537 "Longterm R/W pin is reliable\n"); 538 539 quit_child: 540 if (use_fork) { 541 write(comm_pipes.parent_ready[1], "0", 1); 542 wait(&ret); 543 } 544 unregister_buffers: 545 io_uring_unregister_buffers(&ring); 546 queue_exit: 547 io_uring_queue_exit(&ring); 548 free_tmp: 549 free(tmp); 550 close_file: 551 fclose(file); 552 close_comm_pipes: 553 close_comm_pipes(&comm_pipes); 554 } 555 556 static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) 557 { 558 do_test_iouring(mem, size, false); 559 } 560 561 static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) 562 { 563 do_test_iouring(mem, size, true); 564 } 565 566 #endif /* LOCAL_CONFIG_HAVE_LIBURING */ 567 568 enum ro_pin_test { 569 RO_PIN_TEST, 570 RO_PIN_TEST_SHARED, 571 RO_PIN_TEST_PREVIOUSLY_SHARED, 572 RO_PIN_TEST_RO_EXCLUSIVE, 573 }; 574 575 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, 576 bool fast) 577 { 578 struct pin_longterm_test args; 579 struct comm_pipes comm_pipes; 580 char *tmp, buf; 581 __u64 tmp_val; 582 int ret; 583 584 if (gup_fd < 0) { 585 ksft_test_result_skip("gup_test not available\n"); 586 return; 587 } 588 589 tmp = malloc(size); 590 if (!tmp) { 591 ksft_test_result_fail("malloc() failed\n"); 592 return; 593 } 594 595 ret = setup_comm_pipes(&comm_pipes); 596 if (ret) { 597 ksft_test_result_fail("pipe() failed\n"); 598 goto free_tmp; 599 } 600 601 switch (test) { 602 case RO_PIN_TEST: 603 break; 604 case RO_PIN_TEST_SHARED: 605 case RO_PIN_TEST_PREVIOUSLY_SHARED: 606 /* 607 * Share the pages with our child. As the pages are not pinned, 608 * this should just work. 609 */ 610 ret = fork(); 611 if (ret < 0) { 612 ksft_test_result_fail("fork() failed\n"); 613 goto close_comm_pipes; 614 } else if (!ret) { 615 write(comm_pipes.child_ready[1], "0", 1); 616 while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) 617 ; 618 exit(0); 619 } 620 621 /* Wait until our child is ready. */ 622 while (read(comm_pipes.child_ready[0], &buf, 1) != 1) 623 ; 624 625 if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { 626 /* 627 * Tell the child to quit now and wait until it quit. 628 * The pages should now be mapped R/O into our page 629 * tables, but they are no longer shared. 630 */ 631 write(comm_pipes.parent_ready[1], "0", 1); 632 wait(&ret); 633 if (!WIFEXITED(ret)) 634 ksft_print_msg("[INFO] wait() failed\n"); 635 } 636 break; 637 case RO_PIN_TEST_RO_EXCLUSIVE: 638 /* 639 * Map the page R/O into the page table. Enable softdirty 640 * tracking to stop the page from getting mapped R/W immediately 641 * again by mprotect() optimizations. Note that we don't have an 642 * easy way to test if that worked (the pagemap does not export 643 * if the page is mapped R/O vs. R/W). 644 */ 645 ret = mprotect(mem, size, PROT_READ); 646 clear_softdirty(); 647 ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); 648 if (ret) { 649 ksft_test_result_fail("mprotect() failed\n"); 650 goto close_comm_pipes; 651 } 652 break; 653 default: 654 assert(false); 655 } 656 657 /* Take a R/O pin. This should trigger unsharing. */ 658 args.addr = (__u64)(uintptr_t)mem; 659 args.size = size; 660 args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; 661 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); 662 if (ret) { 663 if (errno == EINVAL) 664 ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); 665 else 666 ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); 667 goto wait; 668 } 669 670 /* Modify the page. */ 671 memset(mem, 0xff, size); 672 673 /* 674 * Read back the content via the pin to the temporary buffer and 675 * test if we observed the modification. 676 */ 677 tmp_val = (__u64)(uintptr_t)tmp; 678 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); 679 if (ret) 680 ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); 681 else 682 ksft_test_result(!memcmp(mem, tmp, size), 683 "Longterm R/O pin is reliable\n"); 684 685 ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); 686 if (ret) 687 ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); 688 wait: 689 switch (test) { 690 case RO_PIN_TEST_SHARED: 691 write(comm_pipes.parent_ready[1], "0", 1); 692 wait(&ret); 693 if (!WIFEXITED(ret)) 694 ksft_print_msg("[INFO] wait() failed\n"); 695 break; 696 default: 697 break; 698 } 699 close_comm_pipes: 700 close_comm_pipes(&comm_pipes); 701 free_tmp: 702 free(tmp); 703 } 704 705 static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) 706 { 707 do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); 708 } 709 710 static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) 711 { 712 do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); 713 } 714 715 static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, 716 bool is_hugetlb) 717 { 718 do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); 719 } 720 721 static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, 722 bool is_hugetlb) 723 { 724 do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); 725 } 726 727 static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, 728 bool is_hugetlb) 729 { 730 do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); 731 } 732 733 static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, 734 bool is_hugetlb) 735 { 736 do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); 737 } 738 739 typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); 740 741 static void do_run_with_base_page(test_fn fn, bool swapout) 742 { 743 char *mem; 744 int ret; 745 746 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, 747 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 748 if (mem == MAP_FAILED) { 749 ksft_test_result_fail("mmap() failed\n"); 750 return; 751 } 752 753 ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); 754 /* Ignore if not around on a kernel. */ 755 if (ret && errno != EINVAL) { 756 ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); 757 goto munmap; 758 } 759 760 /* Populate a base page. */ 761 memset(mem, 0, pagesize); 762 763 if (swapout) { 764 madvise(mem, pagesize, MADV_PAGEOUT); 765 if (!pagemap_is_swapped(pagemap_fd, mem)) { 766 ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); 767 goto munmap; 768 } 769 } 770 771 fn(mem, pagesize, false); 772 munmap: 773 munmap(mem, pagesize); 774 } 775 776 static void run_with_base_page(test_fn fn, const char *desc) 777 { 778 ksft_print_msg("[RUN] %s ... with base page\n", desc); 779 do_run_with_base_page(fn, false); 780 } 781 782 static void run_with_base_page_swap(test_fn fn, const char *desc) 783 { 784 ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); 785 do_run_with_base_page(fn, true); 786 } 787 788 enum thp_run { 789 THP_RUN_PMD, 790 THP_RUN_PMD_SWAPOUT, 791 THP_RUN_PTE, 792 THP_RUN_PTE_SWAPOUT, 793 THP_RUN_SINGLE_PTE, 794 THP_RUN_SINGLE_PTE_SWAPOUT, 795 THP_RUN_PARTIAL_MREMAP, 796 THP_RUN_PARTIAL_SHARED, 797 }; 798 799 static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) 800 { 801 char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; 802 size_t size, mmap_size, mremap_size; 803 int ret; 804 805 /* For alignment purposes, we need twice the thp size. */ 806 mmap_size = 2 * thpsize; 807 mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 808 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 809 if (mmap_mem == MAP_FAILED) { 810 ksft_test_result_fail("mmap() failed\n"); 811 return; 812 } 813 814 /* We need a THP-aligned memory area. */ 815 mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); 816 817 ret = madvise(mem, thpsize, MADV_HUGEPAGE); 818 if (ret) { 819 ksft_test_result_fail("MADV_HUGEPAGE failed\n"); 820 goto munmap; 821 } 822 823 /* 824 * Try to populate a THP. Touch the first sub-page and test if 825 * we get the last sub-page populated automatically. 826 */ 827 mem[0] = 0; 828 if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { 829 ksft_test_result_skip("Did not get a THP populated\n"); 830 goto munmap; 831 } 832 memset(mem, 0, thpsize); 833 834 size = thpsize; 835 switch (thp_run) { 836 case THP_RUN_PMD: 837 case THP_RUN_PMD_SWAPOUT: 838 assert(thpsize == pmdsize); 839 break; 840 case THP_RUN_PTE: 841 case THP_RUN_PTE_SWAPOUT: 842 /* 843 * Trigger PTE-mapping the THP by temporarily mapping a single 844 * subpage R/O. This is a noop if the THP is not pmdsize (and 845 * therefore already PTE-mapped). 846 */ 847 ret = mprotect(mem + pagesize, pagesize, PROT_READ); 848 if (ret) { 849 ksft_test_result_fail("mprotect() failed\n"); 850 goto munmap; 851 } 852 ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); 853 if (ret) { 854 ksft_test_result_fail("mprotect() failed\n"); 855 goto munmap; 856 } 857 break; 858 case THP_RUN_SINGLE_PTE: 859 case THP_RUN_SINGLE_PTE_SWAPOUT: 860 /* 861 * Discard all but a single subpage of that PTE-mapped THP. What 862 * remains is a single PTE mapping a single subpage. 863 */ 864 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); 865 if (ret) { 866 ksft_test_result_fail("MADV_DONTNEED failed\n"); 867 goto munmap; 868 } 869 size = pagesize; 870 break; 871 case THP_RUN_PARTIAL_MREMAP: 872 /* 873 * Remap half of the THP. We need some new memory location 874 * for that. 875 */ 876 mremap_size = thpsize / 2; 877 mremap_mem = mmap(NULL, mremap_size, PROT_NONE, 878 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 879 if (mem == MAP_FAILED) { 880 ksft_test_result_fail("mmap() failed\n"); 881 goto munmap; 882 } 883 tmp = mremap(mem + mremap_size, mremap_size, mremap_size, 884 MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); 885 if (tmp != mremap_mem) { 886 ksft_test_result_fail("mremap() failed\n"); 887 goto munmap; 888 } 889 size = mremap_size; 890 break; 891 case THP_RUN_PARTIAL_SHARED: 892 /* 893 * Share the first page of the THP with a child and quit the 894 * child. This will result in some parts of the THP never 895 * have been shared. 896 */ 897 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); 898 if (ret) { 899 ksft_test_result_fail("MADV_DONTFORK failed\n"); 900 goto munmap; 901 } 902 ret = fork(); 903 if (ret < 0) { 904 ksft_test_result_fail("fork() failed\n"); 905 goto munmap; 906 } else if (!ret) { 907 exit(0); 908 } 909 wait(&ret); 910 /* Allow for sharing all pages again. */ 911 ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); 912 if (ret) { 913 ksft_test_result_fail("MADV_DOFORK failed\n"); 914 goto munmap; 915 } 916 break; 917 default: 918 assert(false); 919 } 920 921 switch (thp_run) { 922 case THP_RUN_PMD_SWAPOUT: 923 case THP_RUN_PTE_SWAPOUT: 924 case THP_RUN_SINGLE_PTE_SWAPOUT: 925 madvise(mem, size, MADV_PAGEOUT); 926 if (!range_is_swapped(mem, size)) { 927 ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); 928 goto munmap; 929 } 930 break; 931 default: 932 break; 933 } 934 935 fn(mem, size, false); 936 munmap: 937 munmap(mmap_mem, mmap_size); 938 if (mremap_mem != MAP_FAILED) 939 munmap(mremap_mem, mremap_size); 940 } 941 942 static void run_with_thp(test_fn fn, const char *desc, size_t size) 943 { 944 ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n", 945 desc, size / 1024); 946 do_run_with_thp(fn, THP_RUN_PMD, size); 947 } 948 949 static void run_with_thp_swap(test_fn fn, const char *desc, size_t size) 950 { 951 ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n", 952 desc, size / 1024); 953 do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size); 954 } 955 956 static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size) 957 { 958 ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n", 959 desc, size / 1024); 960 do_run_with_thp(fn, THP_RUN_PTE, size); 961 } 962 963 static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size) 964 { 965 ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n", 966 desc, size / 1024); 967 do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size); 968 } 969 970 static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size) 971 { 972 ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n", 973 desc, size / 1024); 974 do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size); 975 } 976 977 static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size) 978 { 979 ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n", 980 desc, size / 1024); 981 do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size); 982 } 983 984 static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size) 985 { 986 ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n", 987 desc, size / 1024); 988 do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size); 989 } 990 991 static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size) 992 { 993 ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n", 994 desc, size / 1024); 995 do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size); 996 } 997 998 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) 999 { 1000 int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; 1001 char *mem, *dummy; 1002 1003 ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, 1004 hugetlbsize / 1024); 1005 1006 flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; 1007 1008 mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); 1009 if (mem == MAP_FAILED) { 1010 ksft_test_result_skip("need more free huge pages\n"); 1011 return; 1012 } 1013 1014 /* Populate an huge page. */ 1015 memset(mem, 0, hugetlbsize); 1016 1017 /* 1018 * We need a total of two hugetlb pages to handle COW/unsharing 1019 * properly, otherwise we might get zapped by a SIGBUS. 1020 */ 1021 dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); 1022 if (dummy == MAP_FAILED) { 1023 ksft_test_result_skip("need more free huge pages\n"); 1024 goto munmap; 1025 } 1026 munmap(dummy, hugetlbsize); 1027 1028 fn(mem, hugetlbsize, true); 1029 munmap: 1030 munmap(mem, hugetlbsize); 1031 } 1032 1033 struct test_case { 1034 const char *desc; 1035 test_fn fn; 1036 }; 1037 1038 /* 1039 * Test cases that are specific to anonymous pages: pages in private mappings 1040 * that may get shared via COW during fork(). 1041 */ 1042 static const struct test_case anon_test_cases[] = { 1043 /* 1044 * Basic COW tests for fork() without any GUP. If we miss to break COW, 1045 * either the child can observe modifications by the parent or the 1046 * other way around. 1047 */ 1048 { 1049 "Basic COW after fork()", 1050 test_cow_in_parent, 1051 }, 1052 /* 1053 * Basic test, but do an additional mprotect(PROT_READ)+ 1054 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. 1055 */ 1056 { 1057 "Basic COW after fork() with mprotect() optimization", 1058 test_cow_in_parent_mprotect, 1059 }, 1060 /* 1061 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If 1062 * we miss to break COW, the child observes modifications by the parent. 1063 * This is CVE-2020-29374 reported by Jann Horn. 1064 */ 1065 { 1066 "vmsplice() + unmap in child", 1067 test_vmsplice_in_child, 1068 }, 1069 /* 1070 * vmsplice() test, but do an additional mprotect(PROT_READ)+ 1071 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. 1072 */ 1073 { 1074 "vmsplice() + unmap in child with mprotect() optimization", 1075 test_vmsplice_in_child_mprotect, 1076 }, 1077 /* 1078 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after 1079 * fork(); modify in the child. If we miss to break COW, the parent 1080 * observes modifications by the child. 1081 */ 1082 { 1083 "vmsplice() before fork(), unmap in parent after fork()", 1084 test_vmsplice_before_fork, 1085 }, 1086 /* 1087 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the 1088 * child. If we miss to break COW, the parent observes modifications by 1089 * the child. 1090 */ 1091 { 1092 "vmsplice() + unmap in parent after fork()", 1093 test_vmsplice_after_fork, 1094 }, 1095 #ifdef LOCAL_CONFIG_HAVE_LIBURING 1096 /* 1097 * Take a R/W longterm pin and then map the page R/O into the page 1098 * table to trigger a write fault on next access. When modifying the 1099 * page, the page content must be visible via the pin. 1100 */ 1101 { 1102 "R/O-mapping a page registered as iouring fixed buffer", 1103 test_iouring_ro, 1104 }, 1105 /* 1106 * Take a R/W longterm pin and then fork() a child. When modifying the 1107 * page, the page content must be visible via the pin. We expect the 1108 * pinned page to not get shared with the child. 1109 */ 1110 { 1111 "fork() with an iouring fixed buffer", 1112 test_iouring_fork, 1113 }, 1114 1115 #endif /* LOCAL_CONFIG_HAVE_LIBURING */ 1116 /* 1117 * Take a R/O longterm pin on a R/O-mapped shared anonymous page. 1118 * When modifying the page via the page table, the page content change 1119 * must be visible via the pin. 1120 */ 1121 { 1122 "R/O GUP pin on R/O-mapped shared page", 1123 test_ro_pin_on_shared, 1124 }, 1125 /* Same as above, but using GUP-fast. */ 1126 { 1127 "R/O GUP-fast pin on R/O-mapped shared page", 1128 test_ro_fast_pin_on_shared, 1129 }, 1130 /* 1131 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that 1132 * was previously shared. When modifying the page via the page table, 1133 * the page content change must be visible via the pin. 1134 */ 1135 { 1136 "R/O GUP pin on R/O-mapped previously-shared page", 1137 test_ro_pin_on_ro_previously_shared, 1138 }, 1139 /* Same as above, but using GUP-fast. */ 1140 { 1141 "R/O GUP-fast pin on R/O-mapped previously-shared page", 1142 test_ro_fast_pin_on_ro_previously_shared, 1143 }, 1144 /* 1145 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. 1146 * When modifying the page via the page table, the page content change 1147 * must be visible via the pin. 1148 */ 1149 { 1150 "R/O GUP pin on R/O-mapped exclusive page", 1151 test_ro_pin_on_ro_exclusive, 1152 }, 1153 /* Same as above, but using GUP-fast. */ 1154 { 1155 "R/O GUP-fast pin on R/O-mapped exclusive page", 1156 test_ro_fast_pin_on_ro_exclusive, 1157 }, 1158 }; 1159 1160 static void run_anon_test_case(struct test_case const *test_case) 1161 { 1162 int i; 1163 1164 run_with_base_page(test_case->fn, test_case->desc); 1165 run_with_base_page_swap(test_case->fn, test_case->desc); 1166 for (i = 0; i < nr_thpsizes; i++) { 1167 size_t size = thpsizes[i]; 1168 struct thp_settings settings = *thp_current_settings(); 1169 1170 settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; 1171 settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; 1172 thp_push_settings(&settings); 1173 1174 if (size == pmdsize) { 1175 run_with_thp(test_case->fn, test_case->desc, size); 1176 run_with_thp_swap(test_case->fn, test_case->desc, size); 1177 } 1178 1179 run_with_pte_mapped_thp(test_case->fn, test_case->desc, size); 1180 run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size); 1181 run_with_single_pte_of_thp(test_case->fn, test_case->desc, size); 1182 run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size); 1183 run_with_partial_mremap_thp(test_case->fn, test_case->desc, size); 1184 run_with_partial_shared_thp(test_case->fn, test_case->desc, size); 1185 1186 thp_pop_settings(); 1187 } 1188 for (i = 0; i < nr_hugetlbsizes; i++) 1189 run_with_hugetlb(test_case->fn, test_case->desc, 1190 hugetlbsizes[i]); 1191 } 1192 1193 static void run_anon_test_cases(void) 1194 { 1195 int i; 1196 1197 ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); 1198 1199 for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) 1200 run_anon_test_case(&anon_test_cases[i]); 1201 } 1202 1203 static int tests_per_anon_test_case(void) 1204 { 1205 int tests = 2 + nr_hugetlbsizes; 1206 1207 tests += 6 * nr_thpsizes; 1208 if (pmdsize) 1209 tests += 2; 1210 return tests; 1211 } 1212 1213 enum anon_thp_collapse_test { 1214 ANON_THP_COLLAPSE_UNSHARED, 1215 ANON_THP_COLLAPSE_FULLY_SHARED, 1216 ANON_THP_COLLAPSE_LOWER_SHARED, 1217 ANON_THP_COLLAPSE_UPPER_SHARED, 1218 }; 1219 1220 static void do_test_anon_thp_collapse(char *mem, size_t size, 1221 enum anon_thp_collapse_test test) 1222 { 1223 struct comm_pipes comm_pipes; 1224 char buf; 1225 int ret; 1226 1227 ret = setup_comm_pipes(&comm_pipes); 1228 if (ret) { 1229 ksft_test_result_fail("pipe() failed\n"); 1230 return; 1231 } 1232 1233 /* 1234 * Trigger PTE-mapping the THP by temporarily mapping a single subpage 1235 * R/O, such that we can try collapsing it later. 1236 */ 1237 ret = mprotect(mem + pagesize, pagesize, PROT_READ); 1238 if (ret) { 1239 ksft_test_result_fail("mprotect() failed\n"); 1240 goto close_comm_pipes; 1241 } 1242 ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); 1243 if (ret) { 1244 ksft_test_result_fail("mprotect() failed\n"); 1245 goto close_comm_pipes; 1246 } 1247 1248 switch (test) { 1249 case ANON_THP_COLLAPSE_UNSHARED: 1250 /* Collapse before actually COW-sharing the page. */ 1251 ret = madvise(mem, size, MADV_COLLAPSE); 1252 if (ret) { 1253 ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", 1254 strerror(errno)); 1255 goto close_comm_pipes; 1256 } 1257 break; 1258 case ANON_THP_COLLAPSE_FULLY_SHARED: 1259 /* COW-share the full PTE-mapped THP. */ 1260 break; 1261 case ANON_THP_COLLAPSE_LOWER_SHARED: 1262 /* Don't COW-share the upper part of the THP. */ 1263 ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); 1264 if (ret) { 1265 ksft_test_result_fail("MADV_DONTFORK failed\n"); 1266 goto close_comm_pipes; 1267 } 1268 break; 1269 case ANON_THP_COLLAPSE_UPPER_SHARED: 1270 /* Don't COW-share the lower part of the THP. */ 1271 ret = madvise(mem, size / 2, MADV_DONTFORK); 1272 if (ret) { 1273 ksft_test_result_fail("MADV_DONTFORK failed\n"); 1274 goto close_comm_pipes; 1275 } 1276 break; 1277 default: 1278 assert(false); 1279 } 1280 1281 ret = fork(); 1282 if (ret < 0) { 1283 ksft_test_result_fail("fork() failed\n"); 1284 goto close_comm_pipes; 1285 } else if (!ret) { 1286 switch (test) { 1287 case ANON_THP_COLLAPSE_UNSHARED: 1288 case ANON_THP_COLLAPSE_FULLY_SHARED: 1289 exit(child_memcmp_fn(mem, size, &comm_pipes)); 1290 break; 1291 case ANON_THP_COLLAPSE_LOWER_SHARED: 1292 exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); 1293 break; 1294 case ANON_THP_COLLAPSE_UPPER_SHARED: 1295 exit(child_memcmp_fn(mem + size / 2, size / 2, 1296 &comm_pipes)); 1297 break; 1298 default: 1299 assert(false); 1300 } 1301 } 1302 1303 while (read(comm_pipes.child_ready[0], &buf, 1) != 1) 1304 ; 1305 1306 switch (test) { 1307 case ANON_THP_COLLAPSE_UNSHARED: 1308 break; 1309 case ANON_THP_COLLAPSE_UPPER_SHARED: 1310 case ANON_THP_COLLAPSE_LOWER_SHARED: 1311 /* 1312 * Revert MADV_DONTFORK such that we merge the VMAs and are 1313 * able to actually collapse. 1314 */ 1315 ret = madvise(mem, size, MADV_DOFORK); 1316 if (ret) { 1317 ksft_test_result_fail("MADV_DOFORK failed\n"); 1318 write(comm_pipes.parent_ready[1], "0", 1); 1319 wait(&ret); 1320 goto close_comm_pipes; 1321 } 1322 /* FALLTHROUGH */ 1323 case ANON_THP_COLLAPSE_FULLY_SHARED: 1324 /* Collapse before anyone modified the COW-shared page. */ 1325 ret = madvise(mem, size, MADV_COLLAPSE); 1326 if (ret) { 1327 ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", 1328 strerror(errno)); 1329 write(comm_pipes.parent_ready[1], "0", 1); 1330 wait(&ret); 1331 goto close_comm_pipes; 1332 } 1333 break; 1334 default: 1335 assert(false); 1336 } 1337 1338 /* Modify the page. */ 1339 memset(mem, 0xff, size); 1340 write(comm_pipes.parent_ready[1], "0", 1); 1341 1342 wait(&ret); 1343 if (WIFEXITED(ret)) 1344 ret = WEXITSTATUS(ret); 1345 else 1346 ret = -EINVAL; 1347 1348 ksft_test_result(!ret, "No leak from parent into child\n"); 1349 close_comm_pipes: 1350 close_comm_pipes(&comm_pipes); 1351 } 1352 1353 static void test_anon_thp_collapse_unshared(char *mem, size_t size, 1354 bool is_hugetlb) 1355 { 1356 assert(!is_hugetlb); 1357 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); 1358 } 1359 1360 static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, 1361 bool is_hugetlb) 1362 { 1363 assert(!is_hugetlb); 1364 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); 1365 } 1366 1367 static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, 1368 bool is_hugetlb) 1369 { 1370 assert(!is_hugetlb); 1371 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); 1372 } 1373 1374 static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, 1375 bool is_hugetlb) 1376 { 1377 assert(!is_hugetlb); 1378 do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); 1379 } 1380 1381 /* 1382 * Test cases that are specific to anonymous THP: pages in private mappings 1383 * that may get shared via COW during fork(). 1384 */ 1385 static const struct test_case anon_thp_test_cases[] = { 1386 /* 1387 * Basic COW test for fork() without any GUP when collapsing a THP 1388 * before fork(). 1389 * 1390 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place 1391 * collapse") might easily get COW handling wrong when not collapsing 1392 * exclusivity information properly. 1393 */ 1394 { 1395 "Basic COW after fork() when collapsing before fork()", 1396 test_anon_thp_collapse_unshared, 1397 }, 1398 /* Basic COW test, but collapse after COW-sharing a full THP. */ 1399 { 1400 "Basic COW after fork() when collapsing after fork() (fully shared)", 1401 test_anon_thp_collapse_fully_shared, 1402 }, 1403 /* 1404 * Basic COW test, but collapse after COW-sharing the lower half of a 1405 * THP. 1406 */ 1407 { 1408 "Basic COW after fork() when collapsing after fork() (lower shared)", 1409 test_anon_thp_collapse_lower_shared, 1410 }, 1411 /* 1412 * Basic COW test, but collapse after COW-sharing the upper half of a 1413 * THP. 1414 */ 1415 { 1416 "Basic COW after fork() when collapsing after fork() (upper shared)", 1417 test_anon_thp_collapse_upper_shared, 1418 }, 1419 }; 1420 1421 static void run_anon_thp_test_cases(void) 1422 { 1423 int i; 1424 1425 if (!pmdsize) 1426 return; 1427 1428 ksft_print_msg("[INFO] Anonymous THP tests\n"); 1429 1430 for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { 1431 struct test_case const *test_case = &anon_thp_test_cases[i]; 1432 1433 ksft_print_msg("[RUN] %s\n", test_case->desc); 1434 do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize); 1435 } 1436 } 1437 1438 static int tests_per_anon_thp_test_case(void) 1439 { 1440 return pmdsize ? 1 : 0; 1441 } 1442 1443 typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); 1444 1445 static void test_cow(char *mem, const char *smem, size_t size) 1446 { 1447 char *old = malloc(size); 1448 1449 /* Backup the original content. */ 1450 memcpy(old, smem, size); 1451 1452 /* Modify the page. */ 1453 memset(mem, 0xff, size); 1454 1455 /* See if we still read the old values via the other mapping. */ 1456 ksft_test_result(!memcmp(smem, old, size), 1457 "Other mapping not modified\n"); 1458 free(old); 1459 } 1460 1461 static void test_ro_pin(char *mem, const char *smem, size_t size) 1462 { 1463 do_test_ro_pin(mem, size, RO_PIN_TEST, false); 1464 } 1465 1466 static void test_ro_fast_pin(char *mem, const char *smem, size_t size) 1467 { 1468 do_test_ro_pin(mem, size, RO_PIN_TEST, true); 1469 } 1470 1471 static void run_with_zeropage(non_anon_test_fn fn, const char *desc) 1472 { 1473 char *mem, *smem, tmp; 1474 1475 ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); 1476 1477 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, 1478 MAP_PRIVATE | MAP_ANON, -1, 0); 1479 if (mem == MAP_FAILED) { 1480 ksft_test_result_fail("mmap() failed\n"); 1481 return; 1482 } 1483 1484 smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); 1485 if (mem == MAP_FAILED) { 1486 ksft_test_result_fail("mmap() failed\n"); 1487 goto munmap; 1488 } 1489 1490 /* Read from the page to populate the shared zeropage. */ 1491 tmp = *mem + *smem; 1492 asm volatile("" : "+r" (tmp)); 1493 1494 fn(mem, smem, pagesize); 1495 munmap: 1496 munmap(mem, pagesize); 1497 if (smem != MAP_FAILED) 1498 munmap(smem, pagesize); 1499 } 1500 1501 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) 1502 { 1503 char *mem, *smem, *mmap_mem, *mmap_smem, tmp; 1504 size_t mmap_size; 1505 int ret; 1506 1507 ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); 1508 1509 if (!has_huge_zeropage) { 1510 ksft_test_result_skip("Huge zeropage not enabled\n"); 1511 return; 1512 } 1513 1514 /* For alignment purposes, we need twice the thp size. */ 1515 mmap_size = 2 * pmdsize; 1516 mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, 1517 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 1518 if (mmap_mem == MAP_FAILED) { 1519 ksft_test_result_fail("mmap() failed\n"); 1520 return; 1521 } 1522 mmap_smem = mmap(NULL, mmap_size, PROT_READ, 1523 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 1524 if (mmap_smem == MAP_FAILED) { 1525 ksft_test_result_fail("mmap() failed\n"); 1526 goto munmap; 1527 } 1528 1529 /* We need a THP-aligned memory area. */ 1530 mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); 1531 smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1)); 1532 1533 ret = madvise(mem, pmdsize, MADV_HUGEPAGE); 1534 ret |= madvise(smem, pmdsize, MADV_HUGEPAGE); 1535 if (ret) { 1536 ksft_test_result_fail("MADV_HUGEPAGE failed\n"); 1537 goto munmap; 1538 } 1539 1540 /* 1541 * Read from the memory to populate the huge shared zeropage. Read from 1542 * the first sub-page and test if we get another sub-page populated 1543 * automatically. 1544 */ 1545 tmp = *mem + *smem; 1546 asm volatile("" : "+r" (tmp)); 1547 if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || 1548 !pagemap_is_populated(pagemap_fd, smem + pagesize)) { 1549 ksft_test_result_skip("Did not get THPs populated\n"); 1550 goto munmap; 1551 } 1552 1553 fn(mem, smem, pmdsize); 1554 munmap: 1555 munmap(mmap_mem, mmap_size); 1556 if (mmap_smem != MAP_FAILED) 1557 munmap(mmap_smem, mmap_size); 1558 } 1559 1560 static void run_with_memfd(non_anon_test_fn fn, const char *desc) 1561 { 1562 char *mem, *smem, tmp; 1563 int fd; 1564 1565 ksft_print_msg("[RUN] %s ... with memfd\n", desc); 1566 1567 fd = memfd_create("test", 0); 1568 if (fd < 0) { 1569 ksft_test_result_fail("memfd_create() failed\n"); 1570 return; 1571 } 1572 1573 /* File consists of a single page filled with zeroes. */ 1574 if (fallocate(fd, 0, 0, pagesize)) { 1575 ksft_test_result_fail("fallocate() failed\n"); 1576 goto close; 1577 } 1578 1579 /* Create a private mapping of the memfd. */ 1580 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); 1581 if (mem == MAP_FAILED) { 1582 ksft_test_result_fail("mmap() failed\n"); 1583 goto close; 1584 } 1585 smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); 1586 if (mem == MAP_FAILED) { 1587 ksft_test_result_fail("mmap() failed\n"); 1588 goto munmap; 1589 } 1590 1591 /* Fault the page in. */ 1592 tmp = *mem + *smem; 1593 asm volatile("" : "+r" (tmp)); 1594 1595 fn(mem, smem, pagesize); 1596 munmap: 1597 munmap(mem, pagesize); 1598 if (smem != MAP_FAILED) 1599 munmap(smem, pagesize); 1600 close: 1601 close(fd); 1602 } 1603 1604 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) 1605 { 1606 char *mem, *smem, tmp; 1607 FILE *file; 1608 int fd; 1609 1610 ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); 1611 1612 file = tmpfile(); 1613 if (!file) { 1614 ksft_test_result_fail("tmpfile() failed\n"); 1615 return; 1616 } 1617 1618 fd = fileno(file); 1619 if (fd < 0) { 1620 ksft_test_result_skip("fileno() failed\n"); 1621 return; 1622 } 1623 1624 /* File consists of a single page filled with zeroes. */ 1625 if (fallocate(fd, 0, 0, pagesize)) { 1626 ksft_test_result_fail("fallocate() failed\n"); 1627 goto close; 1628 } 1629 1630 /* Create a private mapping of the memfd. */ 1631 mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); 1632 if (mem == MAP_FAILED) { 1633 ksft_test_result_fail("mmap() failed\n"); 1634 goto close; 1635 } 1636 smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); 1637 if (mem == MAP_FAILED) { 1638 ksft_test_result_fail("mmap() failed\n"); 1639 goto munmap; 1640 } 1641 1642 /* Fault the page in. */ 1643 tmp = *mem + *smem; 1644 asm volatile("" : "+r" (tmp)); 1645 1646 fn(mem, smem, pagesize); 1647 munmap: 1648 munmap(mem, pagesize); 1649 if (smem != MAP_FAILED) 1650 munmap(smem, pagesize); 1651 close: 1652 fclose(file); 1653 } 1654 1655 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, 1656 size_t hugetlbsize) 1657 { 1658 int flags = MFD_HUGETLB; 1659 char *mem, *smem, tmp; 1660 int fd; 1661 1662 ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, 1663 hugetlbsize / 1024); 1664 1665 flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; 1666 1667 fd = memfd_create("test", flags); 1668 if (fd < 0) { 1669 ksft_test_result_skip("memfd_create() failed\n"); 1670 return; 1671 } 1672 1673 /* File consists of a single page filled with zeroes. */ 1674 if (fallocate(fd, 0, 0, hugetlbsize)) { 1675 ksft_test_result_skip("need more free huge pages\n"); 1676 goto close; 1677 } 1678 1679 /* Create a private mapping of the memfd. */ 1680 mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 1681 0); 1682 if (mem == MAP_FAILED) { 1683 ksft_test_result_skip("need more free huge pages\n"); 1684 goto close; 1685 } 1686 smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); 1687 if (mem == MAP_FAILED) { 1688 ksft_test_result_fail("mmap() failed\n"); 1689 goto munmap; 1690 } 1691 1692 /* Fault the page in. */ 1693 tmp = *mem + *smem; 1694 asm volatile("" : "+r" (tmp)); 1695 1696 fn(mem, smem, hugetlbsize); 1697 munmap: 1698 munmap(mem, hugetlbsize); 1699 if (mem != MAP_FAILED) 1700 munmap(smem, hugetlbsize); 1701 close: 1702 close(fd); 1703 } 1704 1705 struct non_anon_test_case { 1706 const char *desc; 1707 non_anon_test_fn fn; 1708 }; 1709 1710 /* 1711 * Test cases that target any pages in private mappings that are not anonymous: 1712 * pages that may get shared via COW ndependent of fork(). This includes 1713 * the shared zeropage(s), pagecache pages, ... 1714 */ 1715 static const struct non_anon_test_case non_anon_test_cases[] = { 1716 /* 1717 * Basic COW test without any GUP. If we miss to break COW, changes are 1718 * visible via other private/shared mappings. 1719 */ 1720 { 1721 "Basic COW", 1722 test_cow, 1723 }, 1724 /* 1725 * Take a R/O longterm pin. When modifying the page via the page table, 1726 * the page content change must be visible via the pin. 1727 */ 1728 { 1729 "R/O longterm GUP pin", 1730 test_ro_pin, 1731 }, 1732 /* Same as above, but using GUP-fast. */ 1733 { 1734 "R/O longterm GUP-fast pin", 1735 test_ro_fast_pin, 1736 }, 1737 }; 1738 1739 static void run_non_anon_test_case(struct non_anon_test_case const *test_case) 1740 { 1741 int i; 1742 1743 run_with_zeropage(test_case->fn, test_case->desc); 1744 run_with_memfd(test_case->fn, test_case->desc); 1745 run_with_tmpfile(test_case->fn, test_case->desc); 1746 if (pmdsize) 1747 run_with_huge_zeropage(test_case->fn, test_case->desc); 1748 for (i = 0; i < nr_hugetlbsizes; i++) 1749 run_with_memfd_hugetlb(test_case->fn, test_case->desc, 1750 hugetlbsizes[i]); 1751 } 1752 1753 static void run_non_anon_test_cases(void) 1754 { 1755 int i; 1756 1757 ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); 1758 1759 for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) 1760 run_non_anon_test_case(&non_anon_test_cases[i]); 1761 } 1762 1763 static int tests_per_non_anon_test_case(void) 1764 { 1765 int tests = 3 + nr_hugetlbsizes; 1766 1767 if (pmdsize) 1768 tests += 1; 1769 return tests; 1770 } 1771 1772 int main(int argc, char **argv) 1773 { 1774 int err; 1775 struct thp_settings default_settings; 1776 1777 ksft_print_header(); 1778 1779 pagesize = getpagesize(); 1780 pmdsize = read_pmd_pagesize(); 1781 if (pmdsize) { 1782 /* Only if THP is supported. */ 1783 thp_read_settings(&default_settings); 1784 default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; 1785 thp_save_settings(); 1786 thp_push_settings(&default_settings); 1787 1788 ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", 1789 pmdsize / 1024); 1790 nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); 1791 } 1792 nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, 1793 ARRAY_SIZE(hugetlbsizes)); 1794 detect_huge_zeropage(); 1795 1796 ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + 1797 ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + 1798 ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); 1799 1800 gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); 1801 pagemap_fd = open("/proc/self/pagemap", O_RDONLY); 1802 if (pagemap_fd < 0) 1803 ksft_exit_fail_msg("opening pagemap failed\n"); 1804 1805 run_anon_test_cases(); 1806 run_anon_thp_test_cases(); 1807 run_non_anon_test_cases(); 1808 1809 if (pmdsize) { 1810 /* Only if THP is supported. */ 1811 thp_restore_settings(); 1812 } 1813 1814 err = ksft_get_fail_cnt(); 1815 if (err) 1816 ksft_exit_fail_msg("%d out of %d tests failed\n", 1817 err, ksft_test_num()); 1818 ksft_exit_pass(); 1819 } 1820