// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

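/*
 * Convert an aligned size into a page order, e.g., with 4 KiB base pages,
 * sz2ord(2 MiB) == 9 because 2 MiB / 4 KiB == 512 == 2^9.
 */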
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

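/*
 * Simple parent/child handshake: the child writes a byte to child_ready
 * once it is set up, then blocks until the parent writes a byte to
 * parent_ready after performing its part of the test.
 */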
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

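/*
 * vmsplice() takes a R/O reference (GUP pin) on the pages backing the
 * range and keeps them readable via the pipe even after the child unmaps
 * the range; a COW bug would let the parent's later write leak into the
 * already-pinned pages.
 */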
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

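/*
 * Fork a child that runs "fn" (which snapshots the memory), then write to
 * the range in the parent. If COW is broken correctly, the child keeps
 * reading the old content; any mismatch means the parent's write leaked
 * into the child.
 */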
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
		child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		ksft_test_result_pass("No leak from parent into child\n");
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected to
		 * fail because (a) they are harder to fix and (b) nobody really
		 * cares. Flag them as expected failures for now.
		 */
		ksft_test_result_xfail("Leak from parent into child\n");
	} else {
		ksft_test_result_fail("Leak from parent into child\n");
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

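/*
 * Inverse of the test above: the parent takes the vmsplice() R/O pin
 * (before or after fork()) and unmaps the range, and the child writes to
 * the COW-shared pages. If COW is handled correctly, the pinned pages
 * keep their old content; a mismatch means the child's write leaked into
 * the parent's pipe.
 */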
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork, bool xfail)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	if (!memcmp(old, new, transferred)) {
		ksft_test_result_pass("No leak from child into parent\n");
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected to
		 * fail because (a) they are harder to fix and (b) nobody really
		 * cares. Flag them as expected failures for now.
		 */
		ksft_test_result_xfail("Leak from child into parent\n");
	} else {
		ksft_test_result_fail("Leak from child into parent\n");
	}
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
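/*
 * io_uring fixed buffers are registered via a longterm R/W GUP pin
 * (FOLL_WRITE | FOLL_PIN | FOLL_LONGTERM). Writes through the page table
 * must stay visible through that pin, both after a R/O protection round
 * trip (!use_fork) and after COW-sharing with a child (use_fork).
 */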
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

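/*
 * Exercise longterm R/O pins via the gup_test debugfs interface
 * (/sys/kernel/debug/gup_test); this typically requires a kernel built
 * with CONFIG_GUP_TEST. A R/O pin on an anonymous page must trigger
 * unsharing so that later writes via the page table remain visible
 * through the pin.
 */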
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit. The pages should now be mapped R/O into our
			 * page tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore failure if MADV_NOHUGEPAGE is not supported by this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

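/*
 * mmap() gives no alignment guarantee beyond the base page size, so map
 * twice the THP size and round the start up to the next THP boundary;
 * e.g., for a 2 MiB THP, (addr + 2 MiB) & ~(2 MiB - 1) yields the first
 * 2 MiB-aligned address inside the 4 MiB area.
 */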
static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

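/*
 * The huge page size is passed to mmap() via MAP_HUGE_SHIFT: log2 of the
 * size goes into the bits above MAP_HUGE_SHIFT, e.g., 21 << MAP_HUGE_SHIFT
 * selects 2 MiB pages (matching the MAP_HUGE_2MB definition).
 */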
static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

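/*
 * Per test case: 2 base-page runs, 6 runs per THP size, 2 extra PMD-only
 * runs (PMD-mapped THP with and without swapout), and 1 run per hugetlb
 * size. This must match what run_anon_test_case() actually executes for
 * ksft_set_plan() to add up.
 */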
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

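/*
 * PTE-map a THP, optionally COW-share parts of it with a child, collapse
 * it back to a PMD mapping with MADV_COLLAPSE, and then verify that a
 * write in the parent does not leak into the child: collapsing must
 * preserve the per-page exclusivity information.
 */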
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

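/*
 * Non-anonymous tests get two mappings of the same backing page: "mem",
 * a private (COW) mapping that is written to, and "smem", a separate R/O
 * mapping used to verify that the write did not modify the shared page
 * itself.
 */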
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

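/*
 * The huge zeropage is a single PMD-sized page of zeroes shared by all
 * read faults in THP-eligible ranges; both mappings below should end up
 * referencing it until "mem" is written to.
 */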
static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

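/*
 * Like MAP_HUGE_SHIFT for mmap(), memfd_create() encodes log2 of the
 * desired huge page size in the bits above MFD_HUGE_SHIFT, e.g.,
 * 21 << MFD_HUGE_SHIFT for 2 MiB pages (matching MFD_HUGE_2MB).
 */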
static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}