xref: /linux/tools/testing/selftests/mm/cow.c (revision 20d3fac43608a1d7ef71991935abc4456baa1da7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <assert.h>
18 #include <linux/mman.h>
19 #include <sys/mman.h>
20 #include <sys/ioctl.h>
21 #include <sys/wait.h>
22 #include <linux/memfd.h>
23 
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
26 #include <liburing.h>
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
28 
29 #include "../../../../mm/gup_test.h"
30 #include "kselftest.h"
31 #include "vm_util.h"
32 #include "thp_settings.h"
33 
34 static size_t pagesize;
35 static int pagemap_fd;
36 static size_t pmdsize;
37 static int nr_thpsizes;
38 static size_t thpsizes[20];
39 static int nr_hugetlbsizes;
40 static size_t hugetlbsizes[10];
41 static int gup_fd;
42 static bool has_huge_zeropage;
43 
44 static int detect_thp_sizes(size_t sizes[], int max)
45 {
46 	int count = 0;
47 	unsigned long orders;
48 	size_t kb;
49 	int i;
50 
51 	/* thp not supported at all. */
52 	if (!pmdsize)
53 		return 0;
54 
55 	orders = 1UL << sz2ord(pmdsize, pagesize);
56 	orders |= thp_supported_orders();
57 
58 	for (i = 0; orders && count < max; i++) {
59 		if (!(orders & (1UL << i)))
60 			continue;
61 		orders &= ~(1UL << i);
62 		kb = (pagesize >> 10) << i;
63 		sizes[count++] = kb * 1024;
64 		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
65 	}
66 
67 	return count;
68 }
69 
70 static bool range_is_swapped(void *addr, size_t size)
71 {
72 	for (; size; addr += pagesize, size -= pagesize)
73 		if (!pagemap_is_swapped(pagemap_fd, addr))
74 			return false;
75 	return true;
76 }
77 
78 static bool populate_page_checked(char *addr)
79 {
80 	bool ret;
81 
82 	FORCE_READ(*addr);
83 	ret = pagemap_is_populated(pagemap_fd, addr);
84 	if (!ret)
85 		ksft_print_msg("Failed to populate page\n");
86 
87 	return ret;
88 }
89 
/*
 * Pipe pair used to synchronize a forked child with its parent: the child
 * signals readiness via child_ready[], the parent signals via parent_ready[]
 * when the child may proceed.
 */
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};
94 
95 static int setup_comm_pipes(struct comm_pipes *comm_pipes)
96 {
97 	if (pipe(comm_pipes->child_ready) < 0) {
98 		ksft_perror("pipe() failed");
99 		return -errno;
100 	}
101 	if (pipe(comm_pipes->parent_ready) < 0) {
102 		ksft_perror("pipe() failed");
103 		close(comm_pipes->child_ready[0]);
104 		close(comm_pipes->child_ready[1]);
105 		return -errno;
106 	}
107 
108 	return 0;
109 }
110 
111 static void close_comm_pipes(struct comm_pipes *comm_pipes)
112 {
113 	close(comm_pipes->child_ready[0]);
114 	close(comm_pipes->child_ready[1]);
115 	close(comm_pipes->parent_ready[0]);
116 	close(comm_pipes->parent_ready[1]);
117 }
118 
119 static int child_memcmp_fn(char *mem, size_t size,
120 			   struct comm_pipes *comm_pipes)
121 {
122 	char *old = malloc(size);
123 	char buf;
124 
125 	/* Backup the original content. */
126 	memcpy(old, mem, size);
127 
128 	/* Wait until the parent modified the page. */
129 	write(comm_pipes->child_ready[1], "0", 1);
130 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
131 		;
132 
133 	/* See if we still read the old values. */
134 	return memcmp(old, mem, size);
135 }
136 
137 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
138 				    struct comm_pipes *comm_pipes)
139 {
140 	struct iovec iov = {
141 		.iov_base = mem,
142 		.iov_len = size,
143 	};
144 	ssize_t cur, total, transferred;
145 	char *old, *new;
146 	int fds[2];
147 	char buf;
148 
149 	old = malloc(size);
150 	new = malloc(size);
151 
152 	/* Backup the original content. */
153 	memcpy(old, mem, size);
154 
155 	if (pipe(fds) < 0)
156 		return -errno;
157 
158 	/* Trigger a read-only pin. */
159 	transferred = vmsplice(fds[1], &iov, 1, 0);
160 	if (transferred < 0)
161 		return -errno;
162 	if (transferred == 0)
163 		return -EINVAL;
164 
165 	/* Unmap it from our page tables. */
166 	if (munmap(mem, size) < 0)
167 		return -errno;
168 
169 	/* Wait until the parent modified it. */
170 	write(comm_pipes->child_ready[1], "0", 1);
171 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
172 		;
173 
174 	/* See if we still read the old values via the pipe. */
175 	for (total = 0; total < transferred; total += cur) {
176 		cur = read(fds[0], new + total, transferred - total);
177 		if (cur < 0)
178 			return -errno;
179 	}
180 
181 	return memcmp(old, new, transferred);
182 }
183 
184 typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
185 
/*
 * Fork a child running @fn, then modify @mem in the parent and check (via
 * the child's exit status) that the child did not observe the modification.
 * With @do_mprotect, additionally cycle the protection R/O -> R/W in the
 * parent before writing, to exercise mprotect() write-fault-avoidance
 * optimizations. @xfail flags known-broken (hugetlb vmsplice) combinations
 * as expected failures.
 */
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
		child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		/* Child: run the test body and report through exit status. */
		exit(fn(mem, size, &comm_pipes));
	}

	/* Wait until the child snapshotted the memory. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			/* Unblock the child so wait() can reap it. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}

		ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	/* The child's exit status holds its memcmp() result. */
	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected to
		 * fail because (a) harder to fix and (b) nobody really cares.
		 * Flag them as expected failure for now.
		 */
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_XFAIL);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
262 
/* Basic COW: after fork(), the parent's writes must not leak into the child. */
static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

/* As above, but cycle the protection R/O -> R/W in the parent first. */
static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

/* vmsplice() pin + unmap in the child; hugetlb leaks are expected (xfail). */
static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

/* As above, with the mprotect() R/O -> R/W cycle in the parent. */
static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}
285 
286 static void do_test_vmsplice_in_parent(char *mem, size_t size,
287 				       bool before_fork, bool xfail)
288 {
289 	struct iovec iov = {
290 		.iov_base = mem,
291 		.iov_len = size,
292 	};
293 	ssize_t cur, total, transferred = 0;
294 	struct comm_pipes comm_pipes;
295 	char *old, *new;
296 	int ret, fds[2];
297 	char buf;
298 
299 	old = malloc(size);
300 	new = malloc(size);
301 
302 	memcpy(old, mem, size);
303 
304 	ret = setup_comm_pipes(&comm_pipes);
305 	if (ret) {
306 		log_test_result(KSFT_FAIL);
307 		goto free;
308 	}
309 
310 	if (pipe(fds) < 0) {
311 		ksft_perror("pipe() failed");
312 		log_test_result(KSFT_FAIL);
313 		goto close_comm_pipes;
314 	}
315 
316 	if (before_fork) {
317 		transferred = vmsplice(fds[1], &iov, 1, 0);
318 		if (transferred <= 0) {
319 			ksft_perror("vmsplice() failed\n");
320 			log_test_result(KSFT_FAIL);
321 			goto close_pipe;
322 		}
323 	}
324 
325 	ret = fork();
326 	if (ret < 0) {
327 		ksft_perror("fork() failed\n");
328 		log_test_result(KSFT_FAIL);
329 		goto close_pipe;
330 	} else if (!ret) {
331 		write(comm_pipes.child_ready[1], "0", 1);
332 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
333 			;
334 		/* Modify page content in the child. */
335 		memset(mem, 0xff, size);
336 		exit(0);
337 	}
338 
339 	if (!before_fork) {
340 		transferred = vmsplice(fds[1], &iov, 1, 0);
341 		if (transferred <= 0) {
342 			ksft_perror("vmsplice() failed");
343 			log_test_result(KSFT_FAIL);
344 			wait(&ret);
345 			goto close_pipe;
346 		}
347 	}
348 
349 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
350 		;
351 	if (munmap(mem, size) < 0) {
352 		ksft_perror("munmap() failed");
353 		log_test_result(KSFT_FAIL);
354 		goto close_pipe;
355 	}
356 	write(comm_pipes.parent_ready[1], "0", 1);
357 
358 	/* Wait until the child is done writing. */
359 	wait(&ret);
360 	if (!WIFEXITED(ret)) {
361 		ksft_perror("wait() failed");
362 		log_test_result(KSFT_FAIL);
363 		goto close_pipe;
364 	}
365 
366 	/* See if we still read the old values. */
367 	for (total = 0; total < transferred; total += cur) {
368 		cur = read(fds[0], new + total, transferred - total);
369 		if (cur < 0) {
370 			ksft_perror("read() failed");
371 			log_test_result(KSFT_FAIL);
372 			goto close_pipe;
373 		}
374 	}
375 
376 	if (!memcmp(old, new, transferred)) {
377 		log_test_result(KSFT_PASS);
378 	} else if (xfail) {
379 		/*
380 		 * With hugetlb, some vmsplice() tests are currently expected to
381 		 * fail because (a) harder to fix and (b) nobody really cares.
382 		 * Flag them as expected failure for now.
383 		 */
384 		ksft_print_msg("Leak from child into parent\n");
385 		log_test_result(KSFT_XFAIL);
386 	} else {
387 		ksft_print_msg("Leak from child into parent\n");
388 		log_test_result(KSFT_FAIL);
389 	}
390 close_pipe:
391 	close(fds[0]);
392 	close(fds[1]);
393 close_comm_pipes:
394 	close_comm_pipes(&comm_pipes);
395 free:
396 	free(old);
397 	free(new);
398 }
399 
/* Pin via vmsplice() before fork(); the child's writes must not leak in. */
static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

/* Pin via vmsplice() after fork(), while the pages are shared. */
static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}
409 
410 #ifdef LOCAL_CONFIG_HAVE_LIBURING
411 static void do_test_iouring(char *mem, size_t size, bool use_fork)
412 {
413 	struct comm_pipes comm_pipes;
414 	struct io_uring_cqe *cqe;
415 	struct io_uring_sqe *sqe;
416 	struct io_uring ring;
417 	ssize_t cur, total;
418 	struct iovec iov;
419 	char *buf, *tmp;
420 	int ret, fd;
421 	FILE *file;
422 
423 	ret = setup_comm_pipes(&comm_pipes);
424 	if (ret) {
425 		log_test_result(KSFT_FAIL);
426 		return;
427 	}
428 
429 	file = tmpfile();
430 	if (!file) {
431 		ksft_perror("tmpfile() failed");
432 		log_test_result(KSFT_FAIL);
433 		goto close_comm_pipes;
434 	}
435 	fd = fileno(file);
436 	assert(fd);
437 
438 	tmp = malloc(size);
439 	if (!tmp) {
440 		ksft_print_msg("malloc() failed\n");
441 		log_test_result(KSFT_FAIL);
442 		goto close_file;
443 	}
444 
445 	/* Skip on errors, as we might just lack kernel support. */
446 	ret = io_uring_queue_init(1, &ring, 0);
447 	if (ret < 0) {
448 		ksft_print_msg("io_uring_queue_init() failed\n");
449 		log_test_result(KSFT_SKIP);
450 		goto free_tmp;
451 	}
452 
453 	/*
454 	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
455 	 * | FOLL_LONGTERM the range.
456 	 *
457 	 * Skip on errors, as we might just lack kernel support or might not
458 	 * have sufficient MEMLOCK permissions.
459 	 */
460 	iov.iov_base = mem;
461 	iov.iov_len = size;
462 	ret = io_uring_register_buffers(&ring, &iov, 1);
463 	if (ret) {
464 		ksft_print_msg("io_uring_register_buffers() failed\n");
465 		log_test_result(KSFT_SKIP);
466 		goto queue_exit;
467 	}
468 
469 	if (use_fork) {
470 		/*
471 		 * fork() and keep the child alive until we're done. Note that
472 		 * we expect the pinned page to not get shared with the child.
473 		 */
474 		ret = fork();
475 		if (ret < 0) {
476 			ksft_perror("fork() failed");
477 			log_test_result(KSFT_FAIL);
478 			goto unregister_buffers;
479 		} else if (!ret) {
480 			write(comm_pipes.child_ready[1], "0", 1);
481 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
482 				;
483 			exit(0);
484 		}
485 
486 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
487 			;
488 	} else {
489 		/*
490 		 * Map the page R/O into the page table. Enable softdirty
491 		 * tracking to stop the page from getting mapped R/W immediately
492 		 * again by mprotect() optimizations. Note that we don't have an
493 		 * easy way to test if that worked (the pagemap does not export
494 		 * if the page is mapped R/O vs. R/W).
495 		 */
496 		ret = mprotect(mem, size, PROT_READ);
497 		if (ret) {
498 			ksft_perror("mprotect() failed");
499 			log_test_result(KSFT_FAIL);
500 			goto unregister_buffers;
501 		}
502 
503 		clear_softdirty();
504 		ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
505 		if (ret) {
506 			ksft_perror("mprotect() failed");
507 			log_test_result(KSFT_FAIL);
508 			goto unregister_buffers;
509 		}
510 	}
511 
512 	/*
513 	 * Modify the page and write page content as observed by the fixed
514 	 * buffer pin to the file so we can verify it.
515 	 */
516 	memset(mem, 0xff, size);
517 	sqe = io_uring_get_sqe(&ring);
518 	if (!sqe) {
519 		ksft_print_msg("io_uring_get_sqe() failed\n");
520 		log_test_result(KSFT_FAIL);
521 		goto quit_child;
522 	}
523 	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
524 
525 	ret = io_uring_submit(&ring);
526 	if (ret < 0) {
527 		ksft_print_msg("io_uring_submit() failed\n");
528 		log_test_result(KSFT_FAIL);
529 		goto quit_child;
530 	}
531 
532 	ret = io_uring_wait_cqe(&ring, &cqe);
533 	if (ret < 0) {
534 		ksft_print_msg("io_uring_wait_cqe() failed\n");
535 		log_test_result(KSFT_FAIL);
536 		goto quit_child;
537 	}
538 
539 	if (cqe->res != size) {
540 		ksft_print_msg("write_fixed failed\n");
541 		log_test_result(KSFT_FAIL);
542 		goto quit_child;
543 	}
544 	io_uring_cqe_seen(&ring, cqe);
545 
546 	/* Read back the file content to the temporary buffer. */
547 	total = 0;
548 	while (total < size) {
549 		cur = pread(fd, tmp + total, size - total, total);
550 		if (cur < 0) {
551 			ksft_perror("pread() failed\n");
552 			log_test_result(KSFT_FAIL);
553 			goto quit_child;
554 		}
555 		total += cur;
556 	}
557 
558 	/* Finally, check if we read what we expected. */
559 	if (!memcmp(mem, tmp, size)) {
560 		log_test_result(KSFT_PASS);
561 	} else {
562 		ksft_print_msg("Longtom R/W pin is not reliable\n");
563 		log_test_result(KSFT_FAIL);
564 	}
565 
566 quit_child:
567 	if (use_fork) {
568 		write(comm_pipes.parent_ready[1], "0", 1);
569 		wait(&ret);
570 	}
571 unregister_buffers:
572 	io_uring_unregister_buffers(&ring);
573 queue_exit:
574 	io_uring_queue_exit(&ring);
575 free_tmp:
576 	free(tmp);
577 close_file:
578 	fclose(file);
579 close_comm_pipes:
580 	close_comm_pipes(&comm_pipes);
581 }
582 
/* R/W fixed-buffer pin with the page temporarily mapped R/O. */
static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

/* R/W fixed-buffer pin kept across fork(). */
static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}
592 
593 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
594 
/* How the page is prepared before do_test_ro_pin() takes the R/O pin. */
enum ro_pin_test {
	RO_PIN_TEST,			/* pin the page as-is */
	RO_PIN_TEST_SHARED,		/* page still shared with a child */
	RO_PIN_TEST_PREVIOUSLY_SHARED,	/* was shared; child already quit */
	RO_PIN_TEST_RO_EXCLUSIVE,	/* exclusive, but mapped R/O */
};
601 
/*
 * Take a longterm R/O pin on @mem via the gup_test debugfs interface,
 * after preparing the mapping according to @test, then modify the page
 * through the page table and verify that the modification is visible via
 * the pin. @fast selects GUP-fast. Skips when gup_test is unavailable.
 */
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_print_msg("gup_test not available\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_perror("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		} else if (!ret) {
			/* Child: just stay alive until told to quit. */
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		/* EINVAL indicates missing kernel support, not a failure. */
		if (errno == EINVAL)
			ret = KSFT_SKIP;
		else
			ret = KSFT_FAIL;
		ksft_perror("PIN_LONGTERM_TEST_START failed");
		log_test_result(ret);
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret) {
		ksft_perror("PIN_LONGTERM_TEST_READ failed");
		log_test_result(KSFT_FAIL);
	} else {
		if (!memcmp(mem, tmp, size)) {
			log_test_result(KSFT_PASS);
		} else {
			ksft_print_msg("Longterm R/O pin is not reliable\n");
			log_test_result(KSFT_FAIL);
		}
	}

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
	/* If the child is still alive (RO_PIN_TEST_SHARED), reap it now. */
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_perror("wait() failed");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}
743 
/* R/O pin while the page is still shared with a child. */
static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

/* Same as above, but using GUP-fast. */
static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

/* R/O pin on a R/O-mapped page that used to be shared with a child. */
static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

/* Same as above, but using GUP-fast. */
static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

/* R/O pin on an exclusive page that is mapped R/O. */
static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

/* Same as above, but using GUP-fast. */
static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}
777 
778 typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);
779 
/*
 * Run @fn against a single, freshly populated anonymous base page;
 * with @swapout, the page is swapped out (MADV_PAGEOUT) first.
 */
static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* Make sure we really test a base page, not a THP. */
	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if not around on a kernel. */
	if (ret && errno != EINVAL) {
		ksft_perror("MADV_NOHUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 1, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}
817 
/* Run @fn against an ordinary, resident anonymous base page. */
static void run_with_base_page(test_fn fn, const char *desc)
{
	log_test_start("%s ... with base page", desc);
	do_run_with_base_page(fn, false);
}

/* Run @fn against a swapped-out anonymous base page. */
static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	log_test_start("%s ... with swapped out base page", desc);
	do_run_with_base_page(fn, true);
}
829 
/* Mapping variants exercised by do_run_with_thp() on a populated THP. */
enum thp_run {
	THP_RUN_PMD,			/* PMD-mapped THP */
	THP_RUN_PMD_SWAPOUT,		/* PMD-mapped, then swapped out */
	THP_RUN_PTE,			/* PTE-mapped THP */
	THP_RUN_PTE_SWAPOUT,		/* PTE-mapped, then swapped out */
	THP_RUN_SINGLE_PTE,		/* single remaining PTE of a THP */
	THP_RUN_SINGLE_PTE_SWAPOUT,	/* single PTE, swapped out */
	THP_RUN_PARTIAL_MREMAP,		/* half the THP mremap()'ed away */
	THP_RUN_PARTIAL_SHARED,		/* only first page ever shared */
};
840 
/*
 * Allocate and populate a THP of @thpsize bytes, transform its mapping
 * according to @thp_run (PTE-map it, discard subpages, mremap part of it,
 * partially share it, optionally swap it out), then run @fn against the
 * resulting range.
 */
static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 1;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_print_msg("Did not get a THP populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	memset(mem, 1, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_perror("MADV_DONTNEED failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_perror("mmap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_perror("mremap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * have been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		} else if (!ret) {
			/* Child: only existed to briefly share the mapping. */
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	/* For the *_SWAPOUT variants, push the range out to swap first. */
	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}
995 
/* Run @fn against a PMD-mapped THP. */
static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

/* Run @fn against a swapped-out, PMD-mapped THP. */
static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

/* Run @fn against a PTE-mapped THP. */
static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with PTE-mapped THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

/* Run @fn against a swapped-out, PTE-mapped THP. */
static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

/* Run @fn against the single remaining PTE of a THP. */
static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

/* Run @fn against the single remaining PTE of a swapped-out THP. */
static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

/* Run @fn against a partially mremap()'ed THP. */
static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

/* Run @fn against a THP of which only the first page was ever shared. */
static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially shared THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}
1051 
/*
 * Run @fn against a populated hugetlb page of @hugetlbsize bytes.
 * Skips (rather than fails) when not enough free huge pages of that
 * size are available.
 */
static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	log_test_start("%s ... with hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

	/* Encode the huge page size (log2) into the mmap() flags. */
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* Populate an huge page. */
	memset(mem, 1, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}
1088 
/* A named test: @desc for logging, @fn run against each backing variant. */
struct test_case {
	const char *desc;
	test_fn fn;
};
1093 
1094 /*
1095  * Test cases that are specific to anonymous pages: pages in private mappings
1096  * that may get shared via COW during fork().
1097  */
/*
 * Each entry names one COW scenario; run_anon_test_case() runs it against
 * every supported backing (base pages, detected THP sizes, hugetlb sizes).
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we miss to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we miss to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we miss to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/* io_uring tests are only built when liburing is available (local_config.h). */
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};
1215 
1216 static void run_anon_test_case(struct test_case const *test_case)
1217 {
1218 	int i;
1219 
1220 	run_with_base_page(test_case->fn, test_case->desc);
1221 	run_with_base_page_swap(test_case->fn, test_case->desc);
1222 	for (i = 0; i < nr_thpsizes; i++) {
1223 		size_t size = thpsizes[i];
1224 		struct thp_settings settings = *thp_current_settings();
1225 
1226 		settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_NEVER;
1227 		settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS;
1228 		thp_push_settings(&settings);
1229 
1230 		if (size == pmdsize) {
1231 			run_with_thp(test_case->fn, test_case->desc, size);
1232 			run_with_thp_swap(test_case->fn, test_case->desc, size);
1233 		}
1234 
1235 		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
1236 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
1237 		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
1238 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
1239 		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
1240 		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);
1241 
1242 		thp_pop_settings();
1243 	}
1244 	for (i = 0; i < nr_hugetlbsizes; i++)
1245 		run_with_hugetlb(test_case->fn, test_case->desc,
1246 				 hugetlbsizes[i]);
1247 }
1248 
1249 static void run_anon_test_cases(void)
1250 {
1251 	int i;
1252 
1253 	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1254 
1255 	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1256 		run_anon_test_case(&anon_test_cases[i]);
1257 }
1258 
1259 static int tests_per_anon_test_case(void)
1260 {
1261 	int tests = 2 + nr_hugetlbsizes;
1262 
1263 	tests += 6 * nr_thpsizes;
1264 	if (pmdsize)
1265 		tests += 2;
1266 	return tests;
1267 }
1268 
/* Which parts of a PTE-mapped THP get COW-shared with the child on fork(). */
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,	/* collapse before fork(), nothing shared */
	ANON_THP_COLLAPSE_FULLY_SHARED,	/* whole range COW-shared */
	ANON_THP_COLLAPSE_LOWER_SHARED,	/* only the lower half shared */
	ANON_THP_COLLAPSE_UPPER_SHARED,	/* only the upper half shared */
};
1275 
/*
 * PTE-map a THP, optionally exclude half of it from fork() via
 * MADV_DONTFORK, fork() a child, collapse the range back into a THP with
 * MADV_COLLAPSE (before or after fork() depending on @test), then modify
 * the memory in the parent. The child must not observe the parent's
 * modification through the previously COW-shared part.
 */
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	/* Pre-fork() preparation, depending on the scenario. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		/* Child: run child_memcmp_fn() on the COW-shared part only. */
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	/* Wait until the child signals that it is ready. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	/* Post-fork() collapse, depending on the scenario. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	/* Unblock the child so it can compare its view of the memory. */
	write(comm_pipes.parent_ready[1], "0", 1);

	/* The child's exit status is the memcmp result: 0 means no leak. */
	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
1419 
/* Collapse via MADV_COLLAPSE before fork(); nothing is COW-shared yet. */
static void test_anon_thp_collapse_unshared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}
1426 
/* Collapse after fork() while the whole THP range is COW-shared. */
static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}
1433 
/* Collapse after fork() with only the lower half COW-shared. */
static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}
1440 
/* Collapse after fork() with only the upper half COW-shared. */
static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}
1447 
1448 /*
1449  * Test cases that are specific to anonymous THP: pages in private mappings
1450  * that may get shared via COW during fork().
1451  */
/*
 * Each entry is run PMD-sized only, via do_run_with_thp() in
 * run_anon_thp_test_cases().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};
1487 
1488 static void run_anon_thp_test_cases(void)
1489 {
1490 	int i;
1491 
1492 	if (!pmdsize)
1493 		return;
1494 
1495 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1496 
1497 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1498 		struct test_case const *test_case = &anon_thp_test_cases[i];
1499 
1500 		log_test_start("%s", test_case->desc);
1501 		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
1502 	}
1503 }
1504 
1505 static int tests_per_anon_thp_test_case(void)
1506 {
1507 	return pmdsize ? 1 : 0;
1508 }
1509 
/* A non-anon test: @mem is a writable private mapping, @smem a R/O mapping. */
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
1511 
1512 static void test_cow(char *mem, const char *smem, size_t size)
1513 {
1514 	char *old = malloc(size);
1515 
1516 	/* Backup the original content. */
1517 	memcpy(old, smem, size);
1518 
1519 	/* Modify the page. */
1520 	memset(mem, 0xff, size);
1521 
1522 	/* See if we still read the old values via the other mapping. */
1523 	if (!memcmp(smem, old, size)) {
1524 		log_test_result(KSFT_PASS);
1525 	} else {
1526 		ksft_print_msg("Other mapping modified\n");
1527 		log_test_result(KSFT_FAIL);
1528 	}
1529 	free(old);
1530 }
1531 
/* R/O longterm pin on @mem; modifications must be visible via the pin. */
static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}
1536 
/* Same as test_ro_pin(), but taking the pin via GUP-fast. */
static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}
1541 
1542 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1543 {
1544 	char *mem, *smem;
1545 
1546 	log_test_start("%s ... with shared zeropage", desc);
1547 
1548 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1549 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1550 	if (mem == MAP_FAILED) {
1551 		ksft_perror("mmap() failed");
1552 		log_test_result(KSFT_FAIL);
1553 		return;
1554 	}
1555 
1556 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1557 	if (smem == MAP_FAILED) {
1558 		ksft_perror("mmap() failed");
1559 		log_test_result(KSFT_FAIL);
1560 		goto munmap;
1561 	}
1562 
1563 	/* Read from the page to populate the shared zeropage. */
1564 	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
1565 		log_test_result(KSFT_FAIL);
1566 		goto munmap;
1567 	}
1568 
1569 	fn(mem, smem, pagesize);
1570 munmap:
1571 	munmap(mem, pagesize);
1572 	if (smem != MAP_FAILED)
1573 		munmap(smem, pagesize);
1574 }
1575 
1576 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1577 {
1578 	char *mem, *smem, *mmap_mem, *mmap_smem;
1579 	size_t mmap_size;
1580 	int ret;
1581 
1582 	log_test_start("%s ... with huge zeropage", desc);
1583 
1584 	if (!has_huge_zeropage) {
1585 		ksft_print_msg("Huge zeropage not enabled\n");
1586 		log_test_result(KSFT_SKIP);
1587 		return;
1588 	}
1589 
1590 	/* For alignment purposes, we need twice the thp size. */
1591 	mmap_size = 2 * pmdsize;
1592 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1593 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1594 	if (mmap_mem == MAP_FAILED) {
1595 		ksft_perror("mmap() failed");
1596 		log_test_result(KSFT_FAIL);
1597 		return;
1598 	}
1599 	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1600 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1601 	if (mmap_smem == MAP_FAILED) {
1602 		ksft_perror("mmap() failed");
1603 		log_test_result(KSFT_FAIL);
1604 		goto munmap;
1605 	}
1606 
1607 	/* We need a THP-aligned memory area. */
1608 	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
1609 	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
1610 
1611 	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
1612 	if (ret) {
1613 		ksft_perror("madvise()");
1614 		log_test_result(KSFT_FAIL);
1615 		goto munmap;
1616 	}
1617 	ret = madvise(smem, pmdsize, MADV_HUGEPAGE);
1618 	if (ret) {
1619 		ksft_perror("madvise()");
1620 		log_test_result(KSFT_FAIL);
1621 		goto munmap;
1622 	}
1623 
1624 	/*
1625 	 * Read from the memory to populate the huge shared zeropage. Read from
1626 	 * the first sub-page and test if we get another sub-page populated
1627 	 * automatically.
1628 	 */
1629 	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
1630 		log_test_result(KSFT_FAIL);
1631 		goto munmap;
1632 	}
1633 
1634 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1635 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1636 		ksft_test_result_skip("Did not get THPs populated\n");
1637 		goto munmap;
1638 	}
1639 
1640 	fn(mem, smem, pmdsize);
1641 munmap:
1642 	munmap(mmap_mem, mmap_size);
1643 	if (mmap_smem != MAP_FAILED)
1644 		munmap(mmap_smem, mmap_size);
1645 }
1646 
1647 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1648 {
1649 	char *mem, *smem;
1650 	int fd;
1651 
1652 	log_test_start("%s ... with memfd", desc);
1653 
1654 	fd = memfd_create("test", 0);
1655 	if (fd < 0) {
1656 		ksft_perror("memfd_create() failed");
1657 		log_test_result(KSFT_FAIL);
1658 		return;
1659 	}
1660 
1661 	/* File consists of a single page filled with zeroes. */
1662 	if (fallocate(fd, 0, 0, pagesize)) {
1663 		ksft_perror("fallocate() failed");
1664 		log_test_result(KSFT_FAIL);
1665 		goto close;
1666 	}
1667 
1668 	/* Create a private mapping of the memfd. */
1669 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1670 	if (mem == MAP_FAILED) {
1671 		ksft_perror("mmap() failed");
1672 		log_test_result(KSFT_FAIL);
1673 		goto close;
1674 	}
1675 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1676 	if (smem == MAP_FAILED) {
1677 		ksft_perror("mmap() failed");
1678 		log_test_result(KSFT_FAIL);
1679 		goto munmap;
1680 	}
1681 
1682 	/* Fault the page in. */
1683 	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
1684 		log_test_result(KSFT_FAIL);
1685 		goto munmap;
1686 	}
1687 
1688 	fn(mem, smem, pagesize);
1689 munmap:
1690 	munmap(mem, pagesize);
1691 	if (smem != MAP_FAILED)
1692 		munmap(smem, pagesize);
1693 close:
1694 	close(fd);
1695 }
1696 
1697 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1698 {
1699 	char *mem, *smem;
1700 	FILE *file;
1701 	int fd;
1702 
1703 	log_test_start("%s ... with tmpfile", desc);
1704 
1705 	file = tmpfile();
1706 	if (!file) {
1707 		ksft_perror("tmpfile() failed");
1708 		log_test_result(KSFT_FAIL);
1709 		return;
1710 	}
1711 
1712 	fd = fileno(file);
1713 	if (fd < 0) {
1714 		ksft_perror("fileno() failed");
1715 		log_test_result(KSFT_SKIP);
1716 		return;
1717 	}
1718 
1719 	/* File consists of a single page filled with zeroes. */
1720 	if (fallocate(fd, 0, 0, pagesize)) {
1721 		ksft_perror("fallocate() failed");
1722 		log_test_result(KSFT_FAIL);
1723 		goto close;
1724 	}
1725 
1726 	/* Create a private mapping of the memfd. */
1727 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1728 	if (mem == MAP_FAILED) {
1729 		ksft_perror("mmap() failed");
1730 		log_test_result(KSFT_FAIL);
1731 		goto close;
1732 	}
1733 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1734 	if (smem == MAP_FAILED) {
1735 		ksft_perror("mmap() failed");
1736 		log_test_result(KSFT_FAIL);
1737 		goto munmap;
1738 	}
1739 
1740 	/* Fault the page in. */
1741 	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
1742 		log_test_result(KSFT_FAIL);
1743 		goto munmap;
1744 	}
1745 
1746 	fn(mem, smem, pagesize);
1747 munmap:
1748 	munmap(mem, pagesize);
1749 	if (smem != MAP_FAILED)
1750 		munmap(smem, pagesize);
1751 close:
1752 	fclose(file);
1753 }
1754 
1755 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1756 				   size_t hugetlbsize)
1757 {
1758 	int flags = MFD_HUGETLB;
1759 	char *mem, *smem;
1760 	int fd;
1761 
1762 	log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
1763 		       hugetlbsize / 1024);
1764 
1765 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1766 
1767 	fd = memfd_create("test", flags);
1768 	if (fd < 0) {
1769 		ksft_perror("memfd_create() failed");
1770 		log_test_result(KSFT_SKIP);
1771 		return;
1772 	}
1773 
1774 	/* File consists of a single page filled with zeroes. */
1775 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1776 		ksft_perror("need more free huge pages");
1777 		log_test_result(KSFT_SKIP);
1778 		goto close;
1779 	}
1780 
1781 	/* Create a private mapping of the memfd. */
1782 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1783 		   0);
1784 	if (mem == MAP_FAILED) {
1785 		ksft_perror("need more free huge pages");
1786 		log_test_result(KSFT_SKIP);
1787 		goto close;
1788 	}
1789 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1790 	if (smem == MAP_FAILED) {
1791 		ksft_perror("mmap() failed");
1792 		log_test_result(KSFT_FAIL);
1793 		goto munmap;
1794 	}
1795 
1796 	/* Fault the page in. */
1797 	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
1798 		log_test_result(KSFT_FAIL);
1799 		goto munmap;
1800 	}
1801 
1802 	fn(mem, smem, hugetlbsize);
1803 munmap:
1804 	munmap(mem, hugetlbsize);
1805 	if (smem != MAP_FAILED)
1806 		munmap(smem, hugetlbsize);
1807 close:
1808 	close(fd);
1809 }
1810 
/* A named non-anon COW test; @fn receives (mem, smem, size). */
struct non_anon_test_case {
	const char *desc;	/* human-readable test description */
	non_anon_test_fn fn;	/* test body */
};
1815 
1816 /*
1817  * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
1819  * the shared zeropage(s), pagecache pages, ...
1820  */
/*
 * Each entry is run by run_non_anon_test_case() against every non-anon
 * backing (shared zeropage, memfd, tmpfile, huge zeropage, hugetlb memfd).
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we miss to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};
1844 
1845 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1846 {
1847 	int i;
1848 
1849 	run_with_zeropage(test_case->fn, test_case->desc);
1850 	run_with_memfd(test_case->fn, test_case->desc);
1851 	run_with_tmpfile(test_case->fn, test_case->desc);
1852 	if (pmdsize)
1853 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1854 	for (i = 0; i < nr_hugetlbsizes; i++)
1855 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1856 				       hugetlbsizes[i]);
1857 }
1858 
1859 static void run_non_anon_test_cases(void)
1860 {
1861 	int i;
1862 
1863 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1864 
1865 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1866 		run_non_anon_test_case(&non_anon_test_cases[i]);
1867 }
1868 
1869 static int tests_per_non_anon_test_case(void)
1870 {
1871 	int tests = 3 + nr_hugetlbsizes;
1872 
1873 	if (pmdsize)
1874 		tests += 1;
1875 	return tests;
1876 }
1877 
/*
 * Detect supported page/THP/hugetlb sizes, set the kselftest plan to match
 * the per-case test counts, run all test groups, and restore THP settings.
 */
int main(int argc, char **argv)
{
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	/* 0 when THP is not supported; gates all THP-specific runs. */
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	has_huge_zeropage = detect_huge_zeropage();

	/* The plan must match the tests_per_*() counts the runners log. */
	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	/*
	 * NOTE(review): gup_fd is deliberately not checked here — presumably
	 * tests that need /sys/kernel/debug/gup_test skip when it is -1;
	 * verify against the do_test_ro_pin() implementation.
	 */
	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	/* Report overall results. */
	ksft_finished();
}
1921