// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

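/*
 * Convert a size into its page order relative to the base page size, e.g.,
 * sz2ord(2 MiB) == 9 with 4 KiB base pages. @size is expected to be a
 * power-of-two multiple of the page size.
 */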
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

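/*
 * Parent/child synchronization via two pipes. Typical usage:
 *
 *	child:	write(child_ready[1], "0", 1);
 *		read(parent_ready[0], ...);	// block until parent is done
 *	parent:	read(child_ready[0], ...);	// block until child is ready
 *		... perform the parent's part of the test ...
 *		write(parent_ready[1], "0", 1);
 */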
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

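/*
 * Like child_memcmp_fn(), but first take a R/O pin on the pages via
 * vmsplice() and unmap them: after the parent's write, verify that the
 * pinned pages, read back through the pipe, still show the old content.
 */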
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

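/*
 * Fork a child running @fn, then modify the memory in the parent and check,
 * via the child's exit status, that the child does not observe the parent's
 * modifications. With @do_mprotect, additionally exercise the mprotect()
 * write-upgrade optimizations before writing.
 */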
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		ksft_test_result_pass("No leak from parent into child\n");
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected to
		 * fail because (a) harder to fix and (b) nobody really cares.
		 * Flag them as expected failure for now.
		 */
		ksft_test_result_xfail("Leak from parent into child\n");
	} else {
		ksft_test_result_fail("Leak from parent into child\n");
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
					    bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

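/*
 * Mirror image of the vmsplice-in-child tests: the parent takes the R/O pin
 * via vmsplice() (either before or after fork()) and unmaps the memory; the
 * child then writes to it, and the parent verifies that the pipe still reads
 * the old content.
 */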
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork, bool xfail)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	if (!memcmp(old, new, transferred)) {
		ksft_test_result_pass("No leak from child into parent\n");
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected to
		 * fail because (a) harder to fix and (b) nobody really cares.
		 * Flag them as expected failure for now.
		 */
		ksft_test_result_xfail("Leak from child into parent\n");
	} else {
		ksft_test_result_fail("Leak from child into parent\n");
	}
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
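/*
 * Register the memory as an io_uring fixed buffer (a longterm R/W pin), then
 * write to it via the page table and verify that the pinned pages observe
 * the modification by flushing them to a file with write_fixed and reading
 * the file back.
 */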
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

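/*
 * R/O pin scenarios: pin an ordinary exclusive page (RO_PIN_TEST), a page
 * currently COW-shared with a child (RO_PIN_TEST_SHARED), an exclusive page
 * still mapped R/O after the sharing child quit
 * (RO_PIN_TEST_PREVIOUSLY_SHARED), or an exclusive page explicitly remapped
 * R/O via mprotect() (RO_PIN_TEST_RO_EXCLUSIVE).
 */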
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
						     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
					bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
					     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore a failure if MADV_NOHUGEPAGE is not available on this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 1, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

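/*
 * THP variants exercised per test: PMD-mapped, PTE-mapped (remapped via
 * mprotect()), and a single remaining PTE of a THP, each optionally swapped
 * out, plus partially mremap()'ed and partially shared THPs.
 */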
enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 1;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 1, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 1, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},
#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

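/*
 * Results emitted per anonymous test case: base page + swapped-out base page
 * + one per hugetlb size, plus six THP variants per THP size and, for the
 * PMD size only, the two additional PMD-mapped runs. This must mirror
 * run_anon_test_case().
 */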
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

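/*
 * MADV_COLLAPSE scenarios for a PTE-mapped THP: collapse while the pages are
 * still exclusive, or after COW-sharing all of them, only the lower half, or
 * only the upper half with a child.
 */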
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size,
					    bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

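/*
 * Non-anonymous tests operate on two mappings of the same backing page:
 * @mem, a private R/W mapping that gets written to, and @smem, a second R/O
 * mapping (of the shared zeropage or of the same file page) used to verify
 * that the write did not leak past COW.
 */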
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

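/*
 * The plan passed to ksft_set_plan() must match the number of results the
 * run_*_test_cases() calls emit; the tests_per_*_test_case() helpers mirror
 * the per-case invocations above.
 */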
int main(int argc, char **argv)
{
	int err;
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	ksft_exit_pass();
}