// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"

static size_t pagesize;
static int pagemap_fd;
static size_t thpsize;
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

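/*
 * Pipes for a simple parent/child handshake: the child signals via
 * child_ready once it is set up and the parent signals via parent_ready
 * once it is done modifying the memory under test.
 */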
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

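/*
 * A child routine executed after fork(); returns 0 if the child still
 * observes the expected (old) memory content, non-zero otherwise.
 */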
typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

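/*
 * Take a R/O pin on the memory via vmsplice() in the parent, either before
 * or after fork(), let the child modify the memory, and verify that the
 * parent still reads the old content through the pipe.
 */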
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
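/*
 * Register the memory as an io_uring fixed buffer (longterm R/W pin),
 * optionally fork() a child, modify the memory, and verify that a
 * write_fixed of the buffer observes the new content.
 */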
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

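/*
 * Variants for taking a longterm R/O pin via the gup_test debugfs
 * interface: on plain memory, on memory currently COW-shared with a child,
 * on memory that was COW-shared in the past, or on exclusive memory
 * temporarily mapped R/O.
 */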
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it exits.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

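/* A test to run on memory prepared by one of the run_with_*() helpers. */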
typedef void (*test_fn)(char *mem, size_t size);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not supported by the kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

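/*
 * THP setups to exercise: PMD-mapped, PTE-mapped and single-PTE variants,
 * each optionally swapped out, plus partially mremap()'ed and partially
 * shared THPs.
 */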
enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if we get
	 * another sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O.
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share only the first page of the THP with a child and quit
		 * the child. This results in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}

static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

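	/*
	 * Request this specific hugetlb size by encoding log2(hugetlbsize)
	 * via MAP_HUGE_SHIFT; __builtin_ctzll() yields log2() for powers
	 * of two.
	 */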
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise the test might get killed by SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	if (thpsize) {
		run_with_thp(test_case->fn, test_case->desc);
		run_with_thp_swap(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
		run_with_partial_shared_thp(test_case->fn, test_case->desc);
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	if (thpsize)
		tests += 8;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!thpsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return thpsize ? 1 : 0;
}

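/*
 * Non-anon tests are handed a R/W private mapping (mem) and a second R/O
 * mapping (smem) backed by the same page(s).
 */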
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
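	/* The empty asm keeps the compiler from optimizing the read away. */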
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, thpsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the file. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

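	/* Encode log2(hugetlbsize) via MFD_HUGE_SHIFT, as with MAP_HUGE_SHIFT. */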
	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (thpsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (thpsize)
		tests += 1;
	return tests;
}

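/*
 * Typically run via the mm kselftests (invocation assumed), e.g.:
 *   make -C tools/testing/selftests TARGETS=mm run_tests
 */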
int main(int argc, char **argv)
{
	int err;

	pagesize = getpagesize();
	thpsize = read_pmd_pagesize();
	if (thpsize)
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
			       thpsize / 1024);
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_print_header();
	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}