// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

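/*
 * Convert a size to its order relative to the base page size:
 * log2(size / pagesize). For example, with 4 KiB base pages,
 * sz2ord(2 MiB) == 9, because 2 MiB / 4 KiB == 512 == 1 << 9.
 */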
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

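/*
 * Build a bitmask of THP orders to test: the PMD order plus any additional
 * orders reported by thp_supported_orders(). Each set bit i then translates
 * to a THP size of pagesize << i.
 */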
static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0) {
		ksft_perror("pipe() failed");
		return -errno;
	}
	if (pipe(comm_pipes->parent_ready) < 0) {
		ksft_perror("pipe() failed");
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Back up the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent has modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

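/*
 * Like child_memcmp_fn(), but first take a R/O GUP pin on the range via
 * vmsplice() and unmap it, then compare what the pipe reads back against the
 * original content. This resembles the scenario of CVE-2020-29374.
 */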
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Back up the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent has modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

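/*
 * Fork a child that records the original page content, modify the pages in
 * the parent, and let the child verify that it still observes the old
 * content: a COW leak from the parent into the child is reported as failure.
 */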
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
		child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}

		ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because (a) they are harder to fix and (b) nobody
		 * really cares. Flag them as expected failure for now.
		 */
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_XFAIL);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

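/*
 * vmsplice() the range into a pipe in the parent (before or after fork()),
 * let the child modify the pages, and verify that the pipe still reads the
 * original content: a COW leak from the child into the parent is reported as
 * failure.
 */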
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork, bool xfail)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred = 0;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_perror("pipe() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_perror("vmsplice() failed");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify the page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_perror("vmsplice() failed");
			log_test_result(KSFT_FAIL);
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_perror("munmap() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_perror("wait() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_perror("read() failed");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	if (!memcmp(old, new, transferred)) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because (a) they are harder to fix and (b) nobody
		 * really cares. Flag them as expected failure for now.
		 */
		ksft_print_msg("Leak from child into parent\n");
		log_test_result(KSFT_XFAIL);
	} else {
		ksft_print_msg("Leak from child into parent\n");
		log_test_result(KSFT_FAIL);
	}
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
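/*
 * Register the test range as an io_uring fixed buffer (a longterm R/W GUP
 * pin), optionally fork() a child to COW-share the pages, then modify the
 * pages and write them to a file through the pin. If the pin ended up
 * pointing at the wrong page, the file content would not match memory.
 */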
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *tmp, buf;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_print_msg("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_print_msg("io_uring_queue_init() failed\n");
		log_test_result(KSFT_SKIP);
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_print_msg("io_uring_register_buffers() failed\n");
		log_test_result(KSFT_SKIP);
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}

		clear_softdirty();
		ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write the page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_print_msg("io_uring_get_sqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_print_msg("io_uring_submit() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_print_msg("io_uring_wait_cqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_print_msg("write_fixed failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_perror("pread() failed");
			log_test_result(KSFT_FAIL);
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	if (!memcmp(mem, tmp, size)) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Longterm R/W pin is not reliable\n");
		log_test_result(KSFT_FAIL);
	}

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

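/*
 * Take a longterm R/O pin on the range via the gup_test debugfs interface
 * (optionally using GUP-fast), modify the pages through the page table, and
 * verify that the modification is observable through the pin. The test
 * variants control whether the pages are currently shared with a child,
 * were previously shared, or are exclusive but mapped R/O.
 */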
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_print_msg("gup_test not available\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_perror("malloc() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit. The pages should now be mapped R/O into our
			 * page tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ret = KSFT_SKIP;
		else
			ret = KSFT_FAIL;
		ksft_perror("PIN_LONGTERM_TEST_START failed");
		log_test_result(ret);
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret) {
		ksft_perror("PIN_LONGTERM_TEST_READ failed");
		log_test_result(KSFT_FAIL);
	} else {
		if (!memcmp(mem, tmp, size)) {
			log_test_result(KSFT_PASS);
		} else {
			ksft_print_msg("Longterm R/O pin is not reliable\n");
			log_test_result(KSFT_FAIL);
		}
	}

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_perror("wait() failed");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
		bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore EINVAL: MADV_NOHUGEPAGE might not be supported by this kernel. */
	if (ret && errno != EINVAL) {
		ksft_perror("MADV_NOHUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 1, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	log_test_start("%s ... with base page", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	log_test_start("%s ... with swapped out base page", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

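/*
 * Populate a THP in a THP-aligned area and then transform it according to
 * thp_run (PTE-map it, reduce it to a single PTE, partially mremap() it,
 * partially share it, and/or swap it out) before handing it to the test.
 */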
static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 1;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_print_msg("Did not get a THP populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	memset(mem, 1, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_perror("MADV_DONTNEED failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need a new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_perror("mmap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_perror("mremap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with PTE-mapped THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially shared THP (%zu kB)",
		desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

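/*
 * Run a test on a freshly populated hugetlb page of the given size. We
 * temporarily map a second (dummy) hugetlb page to check that enough free
 * huge pages are available for COW/unsharing to succeed.
 */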
static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	log_test_start("%s ... with hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* Populate a huge page. */
	memset(mem, 1, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get killed by SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications
	 * by the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

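/*
 * Number of tests run_anon_test_case() performs per test case: two base-page
 * runs (plain and swapped out), six runs per THP size, two extra runs for
 * the PMD size (PMD-mapped THP, plain and swapped out), and one run per
 * hugetlb size.
 */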
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

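/*
 * PTE-map a THP, optionally COW-share all or half of it with a child via
 * fork(), collapse it back into a PMD-mapped THP using MADV_COLLAPSE, and
 * then modify the pages in the parent: the child must still read the old
 * content in the part that was shared with it.
 */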
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
		bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		log_test_start("%s", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Back up the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	if (!memcmp(smem, old, size)) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Other mapping modified\n");
		log_test_result(KSFT_FAIL);
	}
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	log_test_start("%s ... with shared zeropage", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

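/*
 * Like run_with_zeropage(), but with PMD-sized, THP-aligned areas so that
 * reads populate the huge shared zeropage in both mappings.
 */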
static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	log_test_start("%s ... with huge zeropage", desc);

	if (!has_huge_zeropage) {
		ksft_print_msg("Huge zeropage not enabled\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* For alignment purposes, we need twice the PMD size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}
	ret = madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_print_msg("Did not get THPs populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	log_test_start("%s ... with memfd", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	log_test_start("%s ... with tmpfile", desc);

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_perror("fileno() failed");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the file. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

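/*
 * Number of tests run_non_anon_test_case() performs per test case: the
 * zeropage, memfd and tmpfile runs, one run per hugetlb size, and the huge
 * zeropage run if THP is supported.
 */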
static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	ksft_finished();
}