// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

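/*
 * Convert a size to its order relative to the base page size. For example,
 * with 4 KiB base pages, sz2ord(2 MiB) is 9 (2 MiB / 4 KiB == 512 == 1 << 9).
 * The size is assumed to be a power-of-two multiple of the page size.
 */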
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* THP not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

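	/*
	 * "orders" is a bitmap of THP orders to report: bit i set means a
	 * THP of size pagesize << i is supported. Clear each bit as we
	 * translate it into a size, lowest order first.
	 */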
	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

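/*
 * Pipe pair used as a simple parent/child handshake: the child writes a byte
 * to child_ready[1] once it is set up, and blocks on parent_ready[0] until
 * the parent has done its part (e.g., modified the page under test).
 */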
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0) {
		ksft_perror("pipe()");
		return -errno;
	}
	if (pipe(comm_pipes->parent_ready) < 0) {
		ksft_perror("pipe()");
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Back up the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Back up the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

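/*
 * Child-side test function run after fork(). It returns 0 (becoming the
 * child's exit status) if the child still observes the pre-fork page content
 * after the parent's modification, i.e., if COW was handled correctly.
 */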
typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}

		ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because they are (a) harder to fix and (b) nobody
		 * really cares. Flag them as expected failure for now.
		 */
		log_test_result(KSFT_XFAIL);
	} else {
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
					    bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}


static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork, bool xfail)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred = 0;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_perror("pipe() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_print_msg("vmsplice() failed\n");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_perror("vmsplice() failed");
			log_test_result(KSFT_FAIL);
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_perror("munmap() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_perror("wait() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_perror("read() failed");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	if (!memcmp(old, new, transferred)) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because they are (a) harder to fix and (b) nobody
		 * really cares. Flag them as expected failure for now.
		 */
		log_test_result(KSFT_XFAIL);
	} else {
		log_test_result(KSFT_FAIL);
	}
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_print_msg("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_print_msg("io_uring_queue_init() failed\n");
		log_test_result(KSFT_SKIP);
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_print_msg("io_uring_register_buffers() failed\n");
		log_test_result(KSFT_SKIP);
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}

		clear_softdirty();
		ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_print_msg("io_uring_get_sqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_print_msg("io_uring_submit() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_print_msg("io_uring_wait_cqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_print_msg("write_fixed failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_print_msg("pread() failed\n");
			log_test_result(KSFT_FAIL);
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	if (!memcmp(mem, tmp, size))
		log_test_result(KSFT_PASS);
	else
		log_test_result(KSFT_FAIL);

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

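/*
 * Variants for R/O longterm pinning: pin without any preparation
 * (RO_PIN_TEST), pin while the page is COW-shared with a child
 * (RO_PIN_TEST_SHARED), pin after the sharing child quit
 * (RO_PIN_TEST_PREVIOUSLY_SHARED), or pin an exclusive page that was
 * remapped R/O via mprotect() + softdirty tracking
 * (RO_PIN_TEST_RO_EXCLUSIVE).
 */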
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_print_msg("gup_test not available\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_print_msg("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit. The pages should now be mapped R/O into our
			 * page tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ret = KSFT_SKIP;
		else
			ret = KSFT_FAIL;
		ksft_perror("PIN_LONGTERM_TEST_START failed");
		log_test_result(ret);
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret) {
		ksft_perror("PIN_LONGTERM_TEST_READ failed");
		log_test_result(KSFT_FAIL);
	} else {
		if (!memcmp(mem, tmp, size))
			log_test_result(KSFT_PASS);
		else
			log_test_result(KSFT_FAIL);
	}

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_perror("wait() failed");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
						     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
					bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
					     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

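/*
 * A test_fn gets a prepared, populated mapping. "hugetlb" tells the test
 * whether the memory is hugetlb-backed, so the vmsplice() tests can flag
 * known hugetlb failures as expected (XFAIL).
 */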
typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not supported on this kernel. */
	if (ret && errno != EINVAL) {
		ksft_perror("MADV_NOHUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 1, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	log_test_start("%s ... with base page", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	log_test_start("%s ... with swapped out base page", desc);
	do_run_with_base_page(fn, true);
}

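/*
 * How to prepare the THP before running a test: keep it PMD-mapped, remap it
 * using PTEs, reduce it to a single PTE, swap it out, move half of it via
 * mremap(), or COW-share only parts of it with a child.
 */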
enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the THP size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* We need a THP-aligned memory area. */
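	/*
	 * Rounding mmap_mem up to the next thpsize boundary is always
	 * possible within the 2 * thpsize mapping, leaving at least
	 * thpsize usable bytes at "mem".
	 */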
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 1;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_print_msg("Did not get a THP populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	memset(mem, 1, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_perror("MADV_DONTNEED failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_perror("mmap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_perror("mremap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with PTE-mapped THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially shared THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	log_test_start("%s ... with hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

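	/*
	 * Request this specific hugetlb size: mmap() expects log2 of the
	 * huge page size in the MAP_HUGE_SHIFT bits of the flags, and
	 * __builtin_ctzll() yields exactly that for a power-of-two size.
	 */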
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* Populate a huge page. */
	memset(mem, 1, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications
	 * by the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

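/*
 * What to COW-share with the child before collapsing the PTE-mapped THP back
 * into a PMD-mapped THP via MADV_COLLAPSE: nothing (collapse before fork()),
 * the whole THP, or only the lower/upper half.
 */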
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret)
		log_test_result(KSFT_PASS);
	else
		log_test_result(KSFT_FAIL);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size,
					    bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		log_test_start("%s", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

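/*
 * Non-anonymous tests get two mappings of the same backing page(s): "mem" is
 * a private writable mapping that will trigger COW, "smem" is a separate R/O
 * mapping used to check that modifications of "mem" do not leak into the
 * shared page.
 */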
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Back up the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	if (!memcmp(smem, old, size))
		log_test_result(KSFT_PASS);
	else
		log_test_result(KSFT_FAIL);
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	log_test_start("%s ... with shared zeropage", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	log_test_start("%s ... with huge zeropage", desc);

	if (!has_huge_zeropage) {
		ksft_print_msg("Huge zeropage not enabled\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* For alignment purposes, we need twice the THP size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	if (ret != 0) {
		ksft_perror("madvise()");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}
	ret = madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret != 0) {
		ksft_perror("madvise()");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_print_msg("Did not get THPs populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	log_test_start("%s ... with memfd", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	log_test_start("%s ... with tmpfile", desc);

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_perror("fileno() failed");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	ksft_finished();
}