xref: /linux/tools/testing/selftests/mm/cow.c (revision 55a42f78ffd386e01a5404419f8c5ded7db70a21)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <assert.h>
18 #include <linux/mman.h>
19 #include <sys/mman.h>
20 #include <sys/ioctl.h>
21 #include <sys/wait.h>
22 #include <linux/memfd.h>
23 
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
26 #include <liburing.h>
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
28 
29 #include "../../../../mm/gup_test.h"
30 #include "../kselftest.h"
31 #include "vm_util.h"
32 #include "thp_settings.h"
33 
/* Base page size in bytes; the step used when walking ranges page by page. */
static size_t pagesize;
/* File descriptor handed to the pagemap_*() helpers. */
static int pagemap_fd;
/* PMD-sized THP size in bytes; 0 means THP is not supported at all. */
static size_t pmdsize;

/* THP sizes in bytes (presumably filled via detect_thp_sizes()). */
static int nr_thpsizes;
static size_t thpsizes[20];

/* hugetlb sizes in bytes (presumably detected during start-up; not shown here). */
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];

/* gup_test file descriptor; negative when gup_test is not available. */
static int gup_fd;
/* NOTE(review): presumably whether the huge zeropage is enabled - confirm. */
static bool has_huge_zeropage;
43 
44 static int detect_thp_sizes(size_t sizes[], int max)
45 {
46 	int count = 0;
47 	unsigned long orders;
48 	size_t kb;
49 	int i;
50 
51 	/* thp not supported at all. */
52 	if (!pmdsize)
53 		return 0;
54 
55 	orders = 1UL << sz2ord(pmdsize, pagesize);
56 	orders |= thp_supported_orders();
57 
58 	for (i = 0; orders && count < max; i++) {
59 		if (!(orders & (1UL << i)))
60 			continue;
61 		orders &= ~(1UL << i);
62 		kb = (pagesize >> 10) << i;
63 		sizes[count++] = kb * 1024;
64 		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
65 	}
66 
67 	return count;
68 }
69 
70 static bool range_is_swapped(void *addr, size_t size)
71 {
72 	for (; size; addr += pagesize, size -= pagesize)
73 		if (!pagemap_is_swapped(pagemap_fd, addr))
74 			return false;
75 	return true;
76 }
77 
/*
 * Two pipes used as a simple handshake between a parent and its forked
 * child. Each array is filled by pipe(): index 0 is the read end, index 1
 * the write end.
 */
struct comm_pipes {
	/* Written by the child, read by the parent. */
	int child_ready[2];
	/* Written by the parent, read by the child. */
	int parent_ready[2];
};
82 
83 static int setup_comm_pipes(struct comm_pipes *comm_pipes)
84 {
85 	if (pipe(comm_pipes->child_ready) < 0) {
86 		ksft_perror("pipe() failed");
87 		return -errno;
88 	}
89 	if (pipe(comm_pipes->parent_ready) < 0) {
90 		ksft_perror("pipe() failed");
91 		close(comm_pipes->child_ready[0]);
92 		close(comm_pipes->child_ready[1]);
93 		return -errno;
94 	}
95 
96 	return 0;
97 }
98 
99 static void close_comm_pipes(struct comm_pipes *comm_pipes)
100 {
101 	close(comm_pipes->child_ready[0]);
102 	close(comm_pipes->child_ready[1]);
103 	close(comm_pipes->parent_ready[0]);
104 	close(comm_pipes->parent_ready[1]);
105 }
106 
107 static int child_memcmp_fn(char *mem, size_t size,
108 			   struct comm_pipes *comm_pipes)
109 {
110 	char *old = malloc(size);
111 	char buf;
112 
113 	/* Backup the original content. */
114 	memcpy(old, mem, size);
115 
116 	/* Wait until the parent modified the page. */
117 	write(comm_pipes->child_ready[1], "0", 1);
118 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
119 		;
120 
121 	/* See if we still read the old values. */
122 	return memcmp(old, mem, size);
123 }
124 
125 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
126 				    struct comm_pipes *comm_pipes)
127 {
128 	struct iovec iov = {
129 		.iov_base = mem,
130 		.iov_len = size,
131 	};
132 	ssize_t cur, total, transferred;
133 	char *old, *new;
134 	int fds[2];
135 	char buf;
136 
137 	old = malloc(size);
138 	new = malloc(size);
139 
140 	/* Backup the original content. */
141 	memcpy(old, mem, size);
142 
143 	if (pipe(fds) < 0)
144 		return -errno;
145 
146 	/* Trigger a read-only pin. */
147 	transferred = vmsplice(fds[1], &iov, 1, 0);
148 	if (transferred < 0)
149 		return -errno;
150 	if (transferred == 0)
151 		return -EINVAL;
152 
153 	/* Unmap it from our page tables. */
154 	if (munmap(mem, size) < 0)
155 		return -errno;
156 
157 	/* Wait until the parent modified it. */
158 	write(comm_pipes->child_ready[1], "0", 1);
159 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
160 		;
161 
162 	/* See if we still read the old values via the pipe. */
163 	for (total = 0; total < transferred; total += cur) {
164 		cur = read(fds[0], new + total, transferred - total);
165 		if (cur < 0)
166 			return -errno;
167 	}
168 
169 	return memcmp(old, new, transferred);
170 }
171 
/*
 * Routine executed by the forked child: receives the memory under test, its
 * size and the handshake pipes; its return value is used as the child's
 * exit status.
 */
typedef int (*child_fn)(char *, size_t, struct comm_pipes *);
173 
174 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
175 		child_fn fn, bool xfail)
176 {
177 	struct comm_pipes comm_pipes;
178 	char buf;
179 	int ret;
180 
181 	ret = setup_comm_pipes(&comm_pipes);
182 	if (ret) {
183 		log_test_result(KSFT_FAIL);
184 		return;
185 	}
186 
187 	ret = fork();
188 	if (ret < 0) {
189 		ksft_perror("fork() failed");
190 		log_test_result(KSFT_FAIL);
191 		goto close_comm_pipes;
192 	} else if (!ret) {
193 		exit(fn(mem, size, &comm_pipes));
194 	}
195 
196 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
197 		;
198 
199 	if (do_mprotect) {
200 		/*
201 		 * mprotect() optimizations might try avoiding
202 		 * write-faults by directly mapping pages writable.
203 		 */
204 		ret = mprotect(mem, size, PROT_READ);
205 		if (ret) {
206 			ksft_perror("mprotect() failed");
207 			log_test_result(KSFT_FAIL);
208 			write(comm_pipes.parent_ready[1], "0", 1);
209 			wait(&ret);
210 			goto close_comm_pipes;
211 		}
212 
213 		ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
214 		if (ret) {
215 			ksft_perror("mprotect() failed");
216 			log_test_result(KSFT_FAIL);
217 			write(comm_pipes.parent_ready[1], "0", 1);
218 			wait(&ret);
219 			goto close_comm_pipes;
220 		}
221 	}
222 
223 	/* Modify the page. */
224 	memset(mem, 0xff, size);
225 	write(comm_pipes.parent_ready[1], "0", 1);
226 
227 	wait(&ret);
228 	if (WIFEXITED(ret))
229 		ret = WEXITSTATUS(ret);
230 	else
231 		ret = -EINVAL;
232 
233 	if (!ret) {
234 		log_test_result(KSFT_PASS);
235 	} else if (xfail) {
236 		/*
237 		 * With hugetlb, some vmsplice() tests are currently expected to
238 		 * fail because (a) harder to fix and (b) nobody really cares.
239 		 * Flag them as expected failure for now.
240 		 */
241 		ksft_print_msg("Leak from parent into child\n");
242 		log_test_result(KSFT_XFAIL);
243 	} else {
244 		ksft_print_msg("Leak from parent into child\n");
245 		log_test_result(KSFT_FAIL);
246 	}
247 close_comm_pipes:
248 	close_comm_pipes(&comm_pipes);
249 }
250 
251 static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
252 {
253 	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
254 }
255 
256 static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
257 {
258 	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
259 }
260 
261 static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
262 {
263 	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
264 			      is_hugetlb);
265 }
266 
267 static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
268 		bool is_hugetlb)
269 {
270 	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
271 			      is_hugetlb);
272 }
273 
274 static void do_test_vmsplice_in_parent(char *mem, size_t size,
275 				       bool before_fork, bool xfail)
276 {
277 	struct iovec iov = {
278 		.iov_base = mem,
279 		.iov_len = size,
280 	};
281 	ssize_t cur, total, transferred = 0;
282 	struct comm_pipes comm_pipes;
283 	char *old, *new;
284 	int ret, fds[2];
285 	char buf;
286 
287 	old = malloc(size);
288 	new = malloc(size);
289 
290 	memcpy(old, mem, size);
291 
292 	ret = setup_comm_pipes(&comm_pipes);
293 	if (ret) {
294 		log_test_result(KSFT_FAIL);
295 		goto free;
296 	}
297 
298 	if (pipe(fds) < 0) {
299 		ksft_perror("pipe() failed");
300 		log_test_result(KSFT_FAIL);
301 		goto close_comm_pipes;
302 	}
303 
304 	if (before_fork) {
305 		transferred = vmsplice(fds[1], &iov, 1, 0);
306 		if (transferred <= 0) {
307 			ksft_perror("vmsplice() failed\n");
308 			log_test_result(KSFT_FAIL);
309 			goto close_pipe;
310 		}
311 	}
312 
313 	ret = fork();
314 	if (ret < 0) {
315 		ksft_perror("fork() failed\n");
316 		log_test_result(KSFT_FAIL);
317 		goto close_pipe;
318 	} else if (!ret) {
319 		write(comm_pipes.child_ready[1], "0", 1);
320 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
321 			;
322 		/* Modify page content in the child. */
323 		memset(mem, 0xff, size);
324 		exit(0);
325 	}
326 
327 	if (!before_fork) {
328 		transferred = vmsplice(fds[1], &iov, 1, 0);
329 		if (transferred <= 0) {
330 			ksft_perror("vmsplice() failed");
331 			log_test_result(KSFT_FAIL);
332 			wait(&ret);
333 			goto close_pipe;
334 		}
335 	}
336 
337 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
338 		;
339 	if (munmap(mem, size) < 0) {
340 		ksft_perror("munmap() failed");
341 		log_test_result(KSFT_FAIL);
342 		goto close_pipe;
343 	}
344 	write(comm_pipes.parent_ready[1], "0", 1);
345 
346 	/* Wait until the child is done writing. */
347 	wait(&ret);
348 	if (!WIFEXITED(ret)) {
349 		ksft_perror("wait() failed");
350 		log_test_result(KSFT_FAIL);
351 		goto close_pipe;
352 	}
353 
354 	/* See if we still read the old values. */
355 	for (total = 0; total < transferred; total += cur) {
356 		cur = read(fds[0], new + total, transferred - total);
357 		if (cur < 0) {
358 			ksft_perror("read() failed");
359 			log_test_result(KSFT_FAIL);
360 			goto close_pipe;
361 		}
362 	}
363 
364 	if (!memcmp(old, new, transferred)) {
365 		log_test_result(KSFT_PASS);
366 	} else if (xfail) {
367 		/*
368 		 * With hugetlb, some vmsplice() tests are currently expected to
369 		 * fail because (a) harder to fix and (b) nobody really cares.
370 		 * Flag them as expected failure for now.
371 		 */
372 		ksft_print_msg("Leak from child into parent\n");
373 		log_test_result(KSFT_XFAIL);
374 	} else {
375 		ksft_print_msg("Leak from child into parent\n");
376 		log_test_result(KSFT_FAIL);
377 	}
378 close_pipe:
379 	close(fds[0]);
380 	close(fds[1]);
381 close_comm_pipes:
382 	close_comm_pipes(&comm_pipes);
383 free:
384 	free(old);
385 	free(new);
386 }
387 
/* vmsplice() in the parent before fork(); mismatches are xfail on hugetlb. */
static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	const bool before_fork = true;

	do_test_vmsplice_in_parent(mem, size, before_fork, is_hugetlb);
}
392 
/* vmsplice() in the parent after fork(); mismatches are xfail on hugetlb. */
static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	const bool before_fork = false;

	do_test_vmsplice_in_parent(mem, size, before_fork, is_hugetlb);
}
397 
398 #ifdef LOCAL_CONFIG_HAVE_LIBURING
399 static void do_test_iouring(char *mem, size_t size, bool use_fork)
400 {
401 	struct comm_pipes comm_pipes;
402 	struct io_uring_cqe *cqe;
403 	struct io_uring_sqe *sqe;
404 	struct io_uring ring;
405 	ssize_t cur, total;
406 	struct iovec iov;
407 	char *buf, *tmp;
408 	int ret, fd;
409 	FILE *file;
410 
411 	ret = setup_comm_pipes(&comm_pipes);
412 	if (ret) {
413 		log_test_result(KSFT_FAIL);
414 		return;
415 	}
416 
417 	file = tmpfile();
418 	if (!file) {
419 		ksft_perror("tmpfile() failed");
420 		log_test_result(KSFT_FAIL);
421 		goto close_comm_pipes;
422 	}
423 	fd = fileno(file);
424 	assert(fd);
425 
426 	tmp = malloc(size);
427 	if (!tmp) {
428 		ksft_print_msg("malloc() failed\n");
429 		log_test_result(KSFT_FAIL);
430 		goto close_file;
431 	}
432 
433 	/* Skip on errors, as we might just lack kernel support. */
434 	ret = io_uring_queue_init(1, &ring, 0);
435 	if (ret < 0) {
436 		ksft_print_msg("io_uring_queue_init() failed\n");
437 		log_test_result(KSFT_SKIP);
438 		goto free_tmp;
439 	}
440 
441 	/*
442 	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
443 	 * | FOLL_LONGTERM the range.
444 	 *
445 	 * Skip on errors, as we might just lack kernel support or might not
446 	 * have sufficient MEMLOCK permissions.
447 	 */
448 	iov.iov_base = mem;
449 	iov.iov_len = size;
450 	ret = io_uring_register_buffers(&ring, &iov, 1);
451 	if (ret) {
452 		ksft_print_msg("io_uring_register_buffers() failed\n");
453 		log_test_result(KSFT_SKIP);
454 		goto queue_exit;
455 	}
456 
457 	if (use_fork) {
458 		/*
459 		 * fork() and keep the child alive until we're done. Note that
460 		 * we expect the pinned page to not get shared with the child.
461 		 */
462 		ret = fork();
463 		if (ret < 0) {
464 			ksft_perror("fork() failed");
465 			log_test_result(KSFT_FAIL);
466 			goto unregister_buffers;
467 		} else if (!ret) {
468 			write(comm_pipes.child_ready[1], "0", 1);
469 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
470 				;
471 			exit(0);
472 		}
473 
474 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
475 			;
476 	} else {
477 		/*
478 		 * Map the page R/O into the page table. Enable softdirty
479 		 * tracking to stop the page from getting mapped R/W immediately
480 		 * again by mprotect() optimizations. Note that we don't have an
481 		 * easy way to test if that worked (the pagemap does not export
482 		 * if the page is mapped R/O vs. R/W).
483 		 */
484 		ret = mprotect(mem, size, PROT_READ);
485 		if (ret) {
486 			ksft_perror("mprotect() failed");
487 			log_test_result(KSFT_FAIL);
488 			goto unregister_buffers;
489 		}
490 
491 		clear_softdirty();
492 		ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
493 		if (ret) {
494 			ksft_perror("mprotect() failed");
495 			log_test_result(KSFT_FAIL);
496 			goto unregister_buffers;
497 		}
498 	}
499 
500 	/*
501 	 * Modify the page and write page content as observed by the fixed
502 	 * buffer pin to the file so we can verify it.
503 	 */
504 	memset(mem, 0xff, size);
505 	sqe = io_uring_get_sqe(&ring);
506 	if (!sqe) {
507 		ksft_print_msg("io_uring_get_sqe() failed\n");
508 		log_test_result(KSFT_FAIL);
509 		goto quit_child;
510 	}
511 	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
512 
513 	ret = io_uring_submit(&ring);
514 	if (ret < 0) {
515 		ksft_print_msg("io_uring_submit() failed\n");
516 		log_test_result(KSFT_FAIL);
517 		goto quit_child;
518 	}
519 
520 	ret = io_uring_wait_cqe(&ring, &cqe);
521 	if (ret < 0) {
522 		ksft_print_msg("io_uring_wait_cqe() failed\n");
523 		log_test_result(KSFT_FAIL);
524 		goto quit_child;
525 	}
526 
527 	if (cqe->res != size) {
528 		ksft_print_msg("write_fixed failed\n");
529 		log_test_result(KSFT_FAIL);
530 		goto quit_child;
531 	}
532 	io_uring_cqe_seen(&ring, cqe);
533 
534 	/* Read back the file content to the temporary buffer. */
535 	total = 0;
536 	while (total < size) {
537 		cur = pread(fd, tmp + total, size - total, total);
538 		if (cur < 0) {
539 			ksft_perror("pread() failed\n");
540 			log_test_result(KSFT_FAIL);
541 			goto quit_child;
542 		}
543 		total += cur;
544 	}
545 
546 	/* Finally, check if we read what we expected. */
547 	if (!memcmp(mem, tmp, size)) {
548 		log_test_result(KSFT_PASS);
549 	} else {
550 		ksft_print_msg("Longtom R/W pin is not reliable\n");
551 		log_test_result(KSFT_FAIL);
552 	}
553 
554 quit_child:
555 	if (use_fork) {
556 		write(comm_pipes.parent_ready[1], "0", 1);
557 		wait(&ret);
558 	}
559 unregister_buffers:
560 	io_uring_unregister_buffers(&ring);
561 queue_exit:
562 	io_uring_queue_exit(&ring);
563 free_tmp:
564 	free(tmp);
565 close_file:
566 	fclose(file);
567 close_comm_pipes:
568 	close_comm_pipes(&comm_pipes);
569 }
570 
/* io_uring fixed-buffer test using the mprotect() R/O toggle (no fork()). */
static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	const bool use_fork = false;

	(void)is_hugetlb;
	do_test_iouring(mem, size, use_fork);
}
575 
/* io_uring fixed-buffer test with a child forked after registration. */
static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	const bool use_fork = true;

	(void)is_hugetlb;
	do_test_iouring(mem, size, use_fork);
}
580 
581 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
582 
/* Preparation performed on the range before the R/O pin is taken. */
enum ro_pin_test {
	/* No preparation at all. */
	RO_PIN_TEST,
	/* fork() a child that stays alive while the pin is tested. */
	RO_PIN_TEST_SHARED,
	/* fork() a child and let it quit before the pin is taken. */
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	/* Toggle the range R/O and back to R/W with softdirty cleared. */
	RO_PIN_TEST_RO_EXCLUSIVE,
};
589 
590 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
591 			   bool fast)
592 {
593 	struct pin_longterm_test args;
594 	struct comm_pipes comm_pipes;
595 	char *tmp, buf;
596 	__u64 tmp_val;
597 	int ret;
598 
599 	if (gup_fd < 0) {
600 		ksft_print_msg("gup_test not available\n");
601 		log_test_result(KSFT_SKIP);
602 		return;
603 	}
604 
605 	tmp = malloc(size);
606 	if (!tmp) {
607 		ksft_perror("malloc() failed\n");
608 		log_test_result(KSFT_FAIL);
609 		return;
610 	}
611 
612 	ret = setup_comm_pipes(&comm_pipes);
613 	if (ret) {
614 		log_test_result(KSFT_FAIL);
615 		goto free_tmp;
616 	}
617 
618 	switch (test) {
619 	case RO_PIN_TEST:
620 		break;
621 	case RO_PIN_TEST_SHARED:
622 	case RO_PIN_TEST_PREVIOUSLY_SHARED:
623 		/*
624 		 * Share the pages with our child. As the pages are not pinned,
625 		 * this should just work.
626 		 */
627 		ret = fork();
628 		if (ret < 0) {
629 			ksft_perror("fork() failed");
630 			log_test_result(KSFT_FAIL);
631 			goto close_comm_pipes;
632 		} else if (!ret) {
633 			write(comm_pipes.child_ready[1], "0", 1);
634 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
635 				;
636 			exit(0);
637 		}
638 
639 		/* Wait until our child is ready. */
640 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
641 			;
642 
643 		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
644 			/*
645 			 * Tell the child to quit now and wait until it quit.
646 			 * The pages should now be mapped R/O into our page
647 			 * tables, but they are no longer shared.
648 			 */
649 			write(comm_pipes.parent_ready[1], "0", 1);
650 			wait(&ret);
651 			if (!WIFEXITED(ret))
652 				ksft_print_msg("[INFO] wait() failed\n");
653 		}
654 		break;
655 	case RO_PIN_TEST_RO_EXCLUSIVE:
656 		/*
657 		 * Map the page R/O into the page table. Enable softdirty
658 		 * tracking to stop the page from getting mapped R/W immediately
659 		 * again by mprotect() optimizations. Note that we don't have an
660 		 * easy way to test if that worked (the pagemap does not export
661 		 * if the page is mapped R/O vs. R/W).
662 		 */
663 		ret = mprotect(mem, size, PROT_READ);
664 		clear_softdirty();
665 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
666 		if (ret) {
667 			ksft_perror("mprotect() failed");
668 			log_test_result(KSFT_FAIL);
669 			goto close_comm_pipes;
670 		}
671 		break;
672 	default:
673 		assert(false);
674 	}
675 
676 	/* Take a R/O pin. This should trigger unsharing. */
677 	args.addr = (__u64)(uintptr_t)mem;
678 	args.size = size;
679 	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
680 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
681 	if (ret) {
682 		if (errno == EINVAL)
683 			ret = KSFT_SKIP;
684 		else
685 			ret = KSFT_FAIL;
686 		ksft_perror("PIN_LONGTERM_TEST_START failed");
687 		log_test_result(ret);
688 		goto wait;
689 	}
690 
691 	/* Modify the page. */
692 	memset(mem, 0xff, size);
693 
694 	/*
695 	 * Read back the content via the pin to the temporary buffer and
696 	 * test if we observed the modification.
697 	 */
698 	tmp_val = (__u64)(uintptr_t)tmp;
699 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
700 	if (ret) {
701 		ksft_perror("PIN_LONGTERM_TEST_READ failed");
702 		log_test_result(KSFT_FAIL);
703 	} else {
704 		if (!memcmp(mem, tmp, size)) {
705 			log_test_result(KSFT_PASS);
706 		} else {
707 			ksft_print_msg("Longterm R/O pin is not reliable\n");
708 			log_test_result(KSFT_FAIL);
709 		}
710 	}
711 
712 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
713 	if (ret)
714 		ksft_perror("PIN_LONGTERM_TEST_STOP failed");
715 wait:
716 	switch (test) {
717 	case RO_PIN_TEST_SHARED:
718 		write(comm_pipes.parent_ready[1], "0", 1);
719 		wait(&ret);
720 		if (!WIFEXITED(ret))
721 			ksft_perror("wait() failed");
722 		break;
723 	default:
724 		break;
725 	}
726 close_comm_pipes:
727 	close_comm_pipes(&comm_pipes);
728 free_tmp:
729 	free(tmp);
730 }
731 
732 static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
733 {
734 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
735 }
736 
737 static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
738 {
739 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
740 }
741 
742 static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
743 		bool is_hugetlb)
744 {
745 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
746 }
747 
748 static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
749 		bool is_hugetlb)
750 {
751 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
752 }
753 
754 static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
755 		bool is_hugetlb)
756 {
757 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
758 }
759 
760 static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
761 		bool is_hugetlb)
762 {
763 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
764 }
765 
/*
 * A single test body: receives the prepared memory, its size in bytes and
 * whether that memory is hugetlb-backed.
 */
typedef void (*test_fn)(char *, size_t, bool);
767 
768 static void do_run_with_base_page(test_fn fn, bool swapout)
769 {
770 	char *mem;
771 	int ret;
772 
773 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
774 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
775 	if (mem == MAP_FAILED) {
776 		ksft_perror("mmap() failed");
777 		log_test_result(KSFT_FAIL);
778 		return;
779 	}
780 
781 	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
782 	/* Ignore if not around on a kernel. */
783 	if (ret && errno != EINVAL) {
784 		ksft_perror("MADV_NOHUGEPAGE failed");
785 		log_test_result(KSFT_FAIL);
786 		goto munmap;
787 	}
788 
789 	/* Populate a base page. */
790 	memset(mem, 1, pagesize);
791 
792 	if (swapout) {
793 		madvise(mem, pagesize, MADV_PAGEOUT);
794 		if (!pagemap_is_swapped(pagemap_fd, mem)) {
795 			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
796 			log_test_result(KSFT_SKIP);
797 			goto munmap;
798 		}
799 	}
800 
801 	fn(mem, pagesize, false);
802 munmap:
803 	munmap(mem, pagesize);
804 }
805 
806 static void run_with_base_page(test_fn fn, const char *desc)
807 {
808 	log_test_start("%s ... with base page", desc);
809 	do_run_with_base_page(fn, false);
810 }
811 
812 static void run_with_base_page_swap(test_fn fn, const char *desc)
813 {
814 	log_test_start("%s ... with swapped out base page", desc);
815 	do_run_with_base_page(fn, true);
816 }
817 
/* How the THP is prepared before the test body runs. */
enum thp_run {
	/* PMD-sized THP, left as populated. */
	THP_RUN_PMD,
	/* Same, but paged out before the test. */
	THP_RUN_PMD_SWAPOUT,
	/* THP forced to be PTE-mapped via a temporary R/O subpage. */
	THP_RUN_PTE,
	/* Same, but paged out before the test. */
	THP_RUN_PTE_SWAPOUT,
	/* Only the first subpage is kept; the rest is discarded. */
	THP_RUN_SINGLE_PTE,
	/* Same, but paged out before the test. */
	THP_RUN_SINGLE_PTE_SWAPOUT,
	/* Second half of the THP is mremap()'ed elsewhere. */
	THP_RUN_PARTIAL_MREMAP,
	/* Only the first subpage was ever shared with a child. */
	THP_RUN_PARTIAL_SHARED,
};
828 
829 static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
830 {
831 	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
832 	size_t size, mmap_size, mremap_size;
833 	int ret;
834 
835 	/* For alignment purposes, we need twice the thp size. */
836 	mmap_size = 2 * thpsize;
837 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
838 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
839 	if (mmap_mem == MAP_FAILED) {
840 		ksft_perror("mmap() failed");
841 		log_test_result(KSFT_FAIL);
842 		return;
843 	}
844 
845 	/* We need a THP-aligned memory area. */
846 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
847 
848 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
849 	if (ret) {
850 		ksft_perror("MADV_HUGEPAGE failed");
851 		log_test_result(KSFT_FAIL);
852 		goto munmap;
853 	}
854 
855 	/*
856 	 * Try to populate a THP. Touch the first sub-page and test if
857 	 * we get the last sub-page populated automatically.
858 	 */
859 	mem[0] = 1;
860 	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
861 		ksft_print_msg("Did not get a THP populated\n");
862 		log_test_result(KSFT_SKIP);
863 		goto munmap;
864 	}
865 	memset(mem, 1, thpsize);
866 
867 	size = thpsize;
868 	switch (thp_run) {
869 	case THP_RUN_PMD:
870 	case THP_RUN_PMD_SWAPOUT:
871 		assert(thpsize == pmdsize);
872 		break;
873 	case THP_RUN_PTE:
874 	case THP_RUN_PTE_SWAPOUT:
875 		/*
876 		 * Trigger PTE-mapping the THP by temporarily mapping a single
877 		 * subpage R/O. This is a noop if the THP is not pmdsize (and
878 		 * therefore already PTE-mapped).
879 		 */
880 		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
881 		if (ret) {
882 			ksft_perror("mprotect() failed");
883 			log_test_result(KSFT_FAIL);
884 			goto munmap;
885 		}
886 		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
887 		if (ret) {
888 			ksft_perror("mprotect() failed");
889 			log_test_result(KSFT_FAIL);
890 			goto munmap;
891 		}
892 		break;
893 	case THP_RUN_SINGLE_PTE:
894 	case THP_RUN_SINGLE_PTE_SWAPOUT:
895 		/*
896 		 * Discard all but a single subpage of that PTE-mapped THP. What
897 		 * remains is a single PTE mapping a single subpage.
898 		 */
899 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
900 		if (ret) {
901 			ksft_perror("MADV_DONTNEED failed");
902 			log_test_result(KSFT_FAIL);
903 			goto munmap;
904 		}
905 		size = pagesize;
906 		break;
907 	case THP_RUN_PARTIAL_MREMAP:
908 		/*
909 		 * Remap half of the THP. We need some new memory location
910 		 * for that.
911 		 */
912 		mremap_size = thpsize / 2;
913 		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
914 				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
915 		if (mremap_mem == MAP_FAILED) {
916 			ksft_perror("mmap() failed");
917 			log_test_result(KSFT_FAIL);
918 			goto munmap;
919 		}
920 		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
921 			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
922 		if (tmp != mremap_mem) {
923 			ksft_perror("mremap() failed");
924 			log_test_result(KSFT_FAIL);
925 			goto munmap;
926 		}
927 		size = mremap_size;
928 		break;
929 	case THP_RUN_PARTIAL_SHARED:
930 		/*
931 		 * Share the first page of the THP with a child and quit the
932 		 * child. This will result in some parts of the THP never
933 		 * have been shared.
934 		 */
935 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
936 		if (ret) {
937 			ksft_perror("MADV_DONTFORK failed");
938 			log_test_result(KSFT_FAIL);
939 			goto munmap;
940 		}
941 		ret = fork();
942 		if (ret < 0) {
943 			ksft_perror("fork() failed");
944 			log_test_result(KSFT_FAIL);
945 			goto munmap;
946 		} else if (!ret) {
947 			exit(0);
948 		}
949 		wait(&ret);
950 		/* Allow for sharing all pages again. */
951 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
952 		if (ret) {
953 			ksft_perror("MADV_DOFORK failed");
954 			log_test_result(KSFT_FAIL);
955 			goto munmap;
956 		}
957 		break;
958 	default:
959 		assert(false);
960 	}
961 
962 	switch (thp_run) {
963 	case THP_RUN_PMD_SWAPOUT:
964 	case THP_RUN_PTE_SWAPOUT:
965 	case THP_RUN_SINGLE_PTE_SWAPOUT:
966 		madvise(mem, size, MADV_PAGEOUT);
967 		if (!range_is_swapped(mem, size)) {
968 			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
969 			log_test_result(KSFT_SKIP);
970 			goto munmap;
971 		}
972 		break;
973 	default:
974 		break;
975 	}
976 
977 	fn(mem, size, false);
978 munmap:
979 	munmap(mmap_mem, mmap_size);
980 	if (mremap_mem != MAP_FAILED)
981 		munmap(mremap_mem, mremap_size);
982 }
983 
984 static void run_with_thp(test_fn fn, const char *desc, size_t size)
985 {
986 	log_test_start("%s ... with THP (%zu kB)",
987 		desc, size / 1024);
988 	do_run_with_thp(fn, THP_RUN_PMD, size);
989 }
990 
991 static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
992 {
993 	log_test_start("%s ... with swapped-out THP (%zu kB)",
994 		desc, size / 1024);
995 	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
996 }
997 
998 static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
999 {
1000 	log_test_start("%s ... with PTE-mapped THP (%zu kB)",
1001 		desc, size / 1024);
1002 	do_run_with_thp(fn, THP_RUN_PTE, size);
1003 }
1004 
1005 static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
1006 {
1007 	log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
1008 		desc, size / 1024);
1009 	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
1010 }
1011 
1012 static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
1013 {
1014 	log_test_start("%s ... with single PTE of THP (%zu kB)",
1015 		desc, size / 1024);
1016 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
1017 }
1018 
1019 static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
1020 {
1021 	log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
1022 		desc, size / 1024);
1023 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
1024 }
1025 
1026 static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
1027 {
1028 	log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
1029 		desc, size / 1024);
1030 	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
1031 }
1032 
1033 static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
1034 {
1035 	log_test_start("%s ... with partially shared THP (%zu kB)",
1036 		desc, size / 1024);
1037 	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
1038 }
1039 
1040 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
1041 {
1042 	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
1043 	char *mem, *dummy;
1044 
1045 	log_test_start("%s ... with hugetlb (%zu kB)", desc,
1046 		       hugetlbsize / 1024);
1047 
1048 	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
1049 
1050 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
1051 	if (mem == MAP_FAILED) {
1052 		ksft_perror("need more free huge pages");
1053 		log_test_result(KSFT_SKIP);
1054 		return;
1055 	}
1056 
1057 	/* Populate an huge page. */
1058 	memset(mem, 1, hugetlbsize);
1059 
1060 	/*
1061 	 * We need a total of two hugetlb pages to handle COW/unsharing
1062 	 * properly, otherwise we might get zapped by a SIGBUS.
1063 	 */
1064 	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
1065 	if (dummy == MAP_FAILED) {
1066 		ksft_perror("need more free huge pages");
1067 		log_test_result(KSFT_SKIP);
1068 		goto munmap;
1069 	}
1070 	munmap(dummy, hugetlbsize);
1071 
1072 	fn(mem, hugetlbsize, true);
1073 munmap:
1074 	munmap(mem, hugetlbsize);
1075 }
1076 
1077 struct test_case {
1078 	const char *desc;
1079 	test_fn fn;
1080 };
1081 
1082 /*
1083  * Test cases that are specific to anonymous pages: pages in private mappings
1084  * that may get shared via COW during fork().
1085  */
1086 static const struct test_case anon_test_cases[] = {
1087 	/*
1088 	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
1089 	 * either the child can observe modifications by the parent or the
1090 	 * other way around.
1091 	 */
1092 	{
1093 		"Basic COW after fork()",
1094 		test_cow_in_parent,
1095 	},
1096 	/*
1097 	 * Basic test, but do an additional mprotect(PROT_READ)+
1098 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1099 	 */
1100 	{
1101 		"Basic COW after fork() with mprotect() optimization",
1102 		test_cow_in_parent_mprotect,
1103 	},
1104 	/*
1105 	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
1106 	 * we miss to break COW, the child observes modifications by the parent.
1107 	 * This is CVE-2020-29374 reported by Jann Horn.
1108 	 */
1109 	{
1110 		"vmsplice() + unmap in child",
1111 		test_vmsplice_in_child,
1112 	},
1113 	/*
1114 	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
1115 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1116 	 */
1117 	{
1118 		"vmsplice() + unmap in child with mprotect() optimization",
1119 		test_vmsplice_in_child_mprotect,
1120 	},
1121 	/*
1122 	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
1123 	 * fork(); modify in the child. If we miss to break COW, the parent
1124 	 * observes modifications by the child.
1125 	 */
1126 	{
1127 		"vmsplice() before fork(), unmap in parent after fork()",
1128 		test_vmsplice_before_fork,
1129 	},
1130 	/*
1131 	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
1132 	 * child. If we miss to break COW, the parent observes modifications by
1133 	 * the child.
1134 	 */
1135 	{
1136 		"vmsplice() + unmap in parent after fork()",
1137 		test_vmsplice_after_fork,
1138 	},
1139 #ifdef LOCAL_CONFIG_HAVE_LIBURING
1140 	/*
1141 	 * Take a R/W longterm pin and then map the page R/O into the page
1142 	 * table to trigger a write fault on next access. When modifying the
1143 	 * page, the page content must be visible via the pin.
1144 	 */
1145 	{
1146 		"R/O-mapping a page registered as iouring fixed buffer",
1147 		test_iouring_ro,
1148 	},
1149 	/*
1150 	 * Take a R/W longterm pin and then fork() a child. When modifying the
1151 	 * page, the page content must be visible via the pin. We expect the
1152 	 * pinned page to not get shared with the child.
1153 	 */
1154 	{
1155 		"fork() with an iouring fixed buffer",
1156 		test_iouring_fork,
1157 	},
1158 
1159 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
1160 	/*
1161 	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
1162 	 * When modifying the page via the page table, the page content change
1163 	 * must be visible via the pin.
1164 	 */
1165 	{
1166 		"R/O GUP pin on R/O-mapped shared page",
1167 		test_ro_pin_on_shared,
1168 	},
1169 	/* Same as above, but using GUP-fast. */
1170 	{
1171 		"R/O GUP-fast pin on R/O-mapped shared page",
1172 		test_ro_fast_pin_on_shared,
1173 	},
1174 	/*
1175 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
1176 	 * was previously shared. When modifying the page via the page table,
1177 	 * the page content change must be visible via the pin.
1178 	 */
1179 	{
1180 		"R/O GUP pin on R/O-mapped previously-shared page",
1181 		test_ro_pin_on_ro_previously_shared,
1182 	},
1183 	/* Same as above, but using GUP-fast. */
1184 	{
1185 		"R/O GUP-fast pin on R/O-mapped previously-shared page",
1186 		test_ro_fast_pin_on_ro_previously_shared,
1187 	},
1188 	/*
1189 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
1190 	 * When modifying the page via the page table, the page content change
1191 	 * must be visible via the pin.
1192 	 */
1193 	{
1194 		"R/O GUP pin on R/O-mapped exclusive page",
1195 		test_ro_pin_on_ro_exclusive,
1196 	},
1197 	/* Same as above, but using GUP-fast. */
1198 	{
1199 		"R/O GUP-fast pin on R/O-mapped exclusive page",
1200 		test_ro_fast_pin_on_ro_exclusive,
1201 	},
1202 };
1203 
1204 static void run_anon_test_case(struct test_case const *test_case)
1205 {
1206 	int i;
1207 
1208 	run_with_base_page(test_case->fn, test_case->desc);
1209 	run_with_base_page_swap(test_case->fn, test_case->desc);
1210 	for (i = 0; i < nr_thpsizes; i++) {
1211 		size_t size = thpsizes[i];
1212 		struct thp_settings settings = *thp_current_settings();
1213 
1214 		settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_NEVER;
1215 		settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS;
1216 		thp_push_settings(&settings);
1217 
1218 		if (size == pmdsize) {
1219 			run_with_thp(test_case->fn, test_case->desc, size);
1220 			run_with_thp_swap(test_case->fn, test_case->desc, size);
1221 		}
1222 
1223 		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
1224 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
1225 		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
1226 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
1227 		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
1228 		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);
1229 
1230 		thp_pop_settings();
1231 	}
1232 	for (i = 0; i < nr_hugetlbsizes; i++)
1233 		run_with_hugetlb(test_case->fn, test_case->desc,
1234 				 hugetlbsizes[i]);
1235 }
1236 
1237 static void run_anon_test_cases(void)
1238 {
1239 	int i;
1240 
1241 	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1242 
1243 	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1244 		run_anon_test_case(&anon_test_cases[i]);
1245 }
1246 
1247 static int tests_per_anon_test_case(void)
1248 {
1249 	int tests = 2 + nr_hugetlbsizes;
1250 
1251 	tests += 6 * nr_thpsizes;
1252 	if (pmdsize)
1253 		tests += 2;
1254 	return tests;
1255 }
1256 
/* Variants of the anon THP collapse test, selecting what gets COW-shared. */
enum anon_thp_collapse_test {
	/* Collapse before fork(), i.e., before anything is COW-shared. */
	ANON_THP_COLLAPSE_UNSHARED = 0,
	/* COW-share the complete THP, then collapse. */
	ANON_THP_COLLAPSE_FULLY_SHARED,
	/* COW-share only the lower half of the THP, then collapse. */
	ANON_THP_COLLAPSE_LOWER_SHARED,
	/* COW-share only the upper half of the THP, then collapse. */
	ANON_THP_COLLAPSE_UPPER_SHARED,
};
1263 
1264 static void do_test_anon_thp_collapse(char *mem, size_t size,
1265 				      enum anon_thp_collapse_test test)
1266 {
1267 	struct comm_pipes comm_pipes;
1268 	char buf;
1269 	int ret;
1270 
1271 	ret = setup_comm_pipes(&comm_pipes);
1272 	if (ret) {
1273 		log_test_result(KSFT_FAIL);
1274 		return;
1275 	}
1276 
1277 	/*
1278 	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
1279 	 * R/O, such that we can try collapsing it later.
1280 	 */
1281 	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
1282 	if (ret) {
1283 		ksft_perror("mprotect() failed");
1284 		log_test_result(KSFT_FAIL);
1285 		goto close_comm_pipes;
1286 	}
1287 	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
1288 	if (ret) {
1289 		ksft_perror("mprotect() failed");
1290 		log_test_result(KSFT_FAIL);
1291 		goto close_comm_pipes;
1292 	}
1293 
1294 	switch (test) {
1295 	case ANON_THP_COLLAPSE_UNSHARED:
1296 		/* Collapse before actually COW-sharing the page. */
1297 		ret = madvise(mem, size, MADV_COLLAPSE);
1298 		if (ret) {
1299 			ksft_perror("MADV_COLLAPSE failed");
1300 			log_test_result(KSFT_SKIP);
1301 			goto close_comm_pipes;
1302 		}
1303 		break;
1304 	case ANON_THP_COLLAPSE_FULLY_SHARED:
1305 		/* COW-share the full PTE-mapped THP. */
1306 		break;
1307 	case ANON_THP_COLLAPSE_LOWER_SHARED:
1308 		/* Don't COW-share the upper part of the THP. */
1309 		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
1310 		if (ret) {
1311 			ksft_perror("MADV_DONTFORK failed");
1312 			log_test_result(KSFT_FAIL);
1313 			goto close_comm_pipes;
1314 		}
1315 		break;
1316 	case ANON_THP_COLLAPSE_UPPER_SHARED:
1317 		/* Don't COW-share the lower part of the THP. */
1318 		ret = madvise(mem, size / 2, MADV_DONTFORK);
1319 		if (ret) {
1320 			ksft_perror("MADV_DONTFORK failed");
1321 			log_test_result(KSFT_FAIL);
1322 			goto close_comm_pipes;
1323 		}
1324 		break;
1325 	default:
1326 		assert(false);
1327 	}
1328 
1329 	ret = fork();
1330 	if (ret < 0) {
1331 		ksft_perror("fork() failed");
1332 		log_test_result(KSFT_FAIL);
1333 		goto close_comm_pipes;
1334 	} else if (!ret) {
1335 		switch (test) {
1336 		case ANON_THP_COLLAPSE_UNSHARED:
1337 		case ANON_THP_COLLAPSE_FULLY_SHARED:
1338 			exit(child_memcmp_fn(mem, size, &comm_pipes));
1339 			break;
1340 		case ANON_THP_COLLAPSE_LOWER_SHARED:
1341 			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
1342 			break;
1343 		case ANON_THP_COLLAPSE_UPPER_SHARED:
1344 			exit(child_memcmp_fn(mem + size / 2, size / 2,
1345 					     &comm_pipes));
1346 			break;
1347 		default:
1348 			assert(false);
1349 		}
1350 	}
1351 
1352 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
1353 		;
1354 
1355 	switch (test) {
1356 	case ANON_THP_COLLAPSE_UNSHARED:
1357 		break;
1358 	case ANON_THP_COLLAPSE_UPPER_SHARED:
1359 	case ANON_THP_COLLAPSE_LOWER_SHARED:
1360 		/*
1361 		 * Revert MADV_DONTFORK such that we merge the VMAs and are
1362 		 * able to actually collapse.
1363 		 */
1364 		ret = madvise(mem, size, MADV_DOFORK);
1365 		if (ret) {
1366 			ksft_perror("MADV_DOFORK failed");
1367 			log_test_result(KSFT_FAIL);
1368 			write(comm_pipes.parent_ready[1], "0", 1);
1369 			wait(&ret);
1370 			goto close_comm_pipes;
1371 		}
1372 		/* FALLTHROUGH */
1373 	case ANON_THP_COLLAPSE_FULLY_SHARED:
1374 		/* Collapse before anyone modified the COW-shared page. */
1375 		ret = madvise(mem, size, MADV_COLLAPSE);
1376 		if (ret) {
1377 			ksft_perror("MADV_COLLAPSE failed");
1378 			log_test_result(KSFT_SKIP);
1379 			write(comm_pipes.parent_ready[1], "0", 1);
1380 			wait(&ret);
1381 			goto close_comm_pipes;
1382 		}
1383 		break;
1384 	default:
1385 		assert(false);
1386 	}
1387 
1388 	/* Modify the page. */
1389 	memset(mem, 0xff, size);
1390 	write(comm_pipes.parent_ready[1], "0", 1);
1391 
1392 	wait(&ret);
1393 	if (WIFEXITED(ret))
1394 		ret = WEXITSTATUS(ret);
1395 	else
1396 		ret = -EINVAL;
1397 
1398 	if (!ret) {
1399 		log_test_result(KSFT_PASS);
1400 	} else {
1401 		ksft_print_msg("Leak from parent into child\n");
1402 		log_test_result(KSFT_FAIL);
1403 	}
1404 close_comm_pipes:
1405 	close_comm_pipes(&comm_pipes);
1406 }
1407 
1408 static void test_anon_thp_collapse_unshared(char *mem, size_t size,
1409 		bool is_hugetlb)
1410 {
1411 	assert(!is_hugetlb);
1412 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
1413 }
1414 
1415 static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
1416 		bool is_hugetlb)
1417 {
1418 	assert(!is_hugetlb);
1419 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
1420 }
1421 
1422 static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
1423 		bool is_hugetlb)
1424 {
1425 	assert(!is_hugetlb);
1426 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
1427 }
1428 
1429 static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
1430 		bool is_hugetlb)
1431 {
1432 	assert(!is_hugetlb);
1433 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
1434 }
1435 
1436 /*
1437  * Test cases that are specific to anonymous THP: pages in private mappings
1438  * that may get shared via COW during fork().
1439  */
1440 static const struct test_case anon_thp_test_cases[] = {
1441 	/*
1442 	 * Basic COW test for fork() without any GUP when collapsing a THP
1443 	 * before fork().
1444 	 *
1445 	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1446 	 * collapse") might easily get COW handling wrong when not collapsing
1447 	 * exclusivity information properly.
1448 	 */
1449 	{
1450 		"Basic COW after fork() when collapsing before fork()",
1451 		test_anon_thp_collapse_unshared,
1452 	},
1453 	/* Basic COW test, but collapse after COW-sharing a full THP. */
1454 	{
1455 		"Basic COW after fork() when collapsing after fork() (fully shared)",
1456 		test_anon_thp_collapse_fully_shared,
1457 	},
1458 	/*
1459 	 * Basic COW test, but collapse after COW-sharing the lower half of a
1460 	 * THP.
1461 	 */
1462 	{
1463 		"Basic COW after fork() when collapsing after fork() (lower shared)",
1464 		test_anon_thp_collapse_lower_shared,
1465 	},
1466 	/*
1467 	 * Basic COW test, but collapse after COW-sharing the upper half of a
1468 	 * THP.
1469 	 */
1470 	{
1471 		"Basic COW after fork() when collapsing after fork() (upper shared)",
1472 		test_anon_thp_collapse_upper_shared,
1473 	},
1474 };
1475 
1476 static void run_anon_thp_test_cases(void)
1477 {
1478 	int i;
1479 
1480 	if (!pmdsize)
1481 		return;
1482 
1483 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1484 
1485 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1486 		struct test_case const *test_case = &anon_thp_test_cases[i];
1487 
1488 		log_test_start("%s", test_case->desc);
1489 		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
1490 	}
1491 }
1492 
1493 static int tests_per_anon_thp_test_case(void)
1494 {
1495 	return pmdsize ? 1 : 0;
1496 }
1497 
/*
 * Signature of a test for non-anonymous memory: @mem is the mapping the test
 * operates on, @smem a second, read-only view handed in by the runner, and
 * @size the size of both in bytes.
 */
typedef void (*non_anon_test_fn)(char *mem,
				 const char *smem,
				 size_t size);
1499 
1500 static void test_cow(char *mem, const char *smem, size_t size)
1501 {
1502 	char *old = malloc(size);
1503 
1504 	/* Backup the original content. */
1505 	memcpy(old, smem, size);
1506 
1507 	/* Modify the page. */
1508 	memset(mem, 0xff, size);
1509 
1510 	/* See if we still read the old values via the other mapping. */
1511 	if (!memcmp(smem, old, size)) {
1512 		log_test_result(KSFT_PASS);
1513 	} else {
1514 		ksft_print_msg("Other mapping modified\n");
1515 		log_test_result(KSFT_FAIL);
1516 	}
1517 	free(old);
1518 }
1519 
1520 static void test_ro_pin(char *mem, const char *smem, size_t size)
1521 {
1522 	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
1523 }
1524 
1525 static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
1526 {
1527 	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
1528 }
1529 
1530 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1531 {
1532 	char *mem, *smem;
1533 
1534 	log_test_start("%s ... with shared zeropage", desc);
1535 
1536 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1537 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1538 	if (mem == MAP_FAILED) {
1539 		ksft_perror("mmap() failed");
1540 		log_test_result(KSFT_FAIL);
1541 		return;
1542 	}
1543 
1544 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1545 	if (smem == MAP_FAILED) {
1546 		ksft_perror("mmap() failed");
1547 		log_test_result(KSFT_FAIL);
1548 		goto munmap;
1549 	}
1550 
1551 	/* Read from the page to populate the shared zeropage. */
1552 	FORCE_READ(*mem);
1553 	FORCE_READ(*smem);
1554 
1555 	fn(mem, smem, pagesize);
1556 munmap:
1557 	munmap(mem, pagesize);
1558 	if (smem != MAP_FAILED)
1559 		munmap(smem, pagesize);
1560 }
1561 
1562 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1563 {
1564 	char *mem, *smem, *mmap_mem, *mmap_smem;
1565 	size_t mmap_size;
1566 	int ret;
1567 
1568 	log_test_start("%s ... with huge zeropage", desc);
1569 
1570 	if (!has_huge_zeropage) {
1571 		ksft_print_msg("Huge zeropage not enabled\n");
1572 		log_test_result(KSFT_SKIP);
1573 		return;
1574 	}
1575 
1576 	/* For alignment purposes, we need twice the thp size. */
1577 	mmap_size = 2 * pmdsize;
1578 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1579 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1580 	if (mmap_mem == MAP_FAILED) {
1581 		ksft_perror("mmap() failed");
1582 		log_test_result(KSFT_FAIL);
1583 		return;
1584 	}
1585 	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1586 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1587 	if (mmap_smem == MAP_FAILED) {
1588 		ksft_perror("mmap() failed");
1589 		log_test_result(KSFT_FAIL);
1590 		goto munmap;
1591 	}
1592 
1593 	/* We need a THP-aligned memory area. */
1594 	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
1595 	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
1596 
1597 	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
1598 	if (ret) {
1599 		ksft_perror("madvise()");
1600 		log_test_result(KSFT_FAIL);
1601 		goto munmap;
1602 	}
1603 	ret = madvise(smem, pmdsize, MADV_HUGEPAGE);
1604 	if (ret) {
1605 		ksft_perror("madvise()");
1606 		log_test_result(KSFT_FAIL);
1607 		goto munmap;
1608 	}
1609 
1610 	/*
1611 	 * Read from the memory to populate the huge shared zeropage. Read from
1612 	 * the first sub-page and test if we get another sub-page populated
1613 	 * automatically.
1614 	 */
1615 	FORCE_READ(mem);
1616 	FORCE_READ(smem);
1617 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1618 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1619 		ksft_test_result_skip("Did not get THPs populated\n");
1620 		goto munmap;
1621 	}
1622 
1623 	fn(mem, smem, pmdsize);
1624 munmap:
1625 	munmap(mmap_mem, mmap_size);
1626 	if (mmap_smem != MAP_FAILED)
1627 		munmap(mmap_smem, mmap_size);
1628 }
1629 
1630 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1631 {
1632 	char *mem, *smem;
1633 	int fd;
1634 
1635 	log_test_start("%s ... with memfd", desc);
1636 
1637 	fd = memfd_create("test", 0);
1638 	if (fd < 0) {
1639 		ksft_perror("memfd_create() failed");
1640 		log_test_result(KSFT_FAIL);
1641 		return;
1642 	}
1643 
1644 	/* File consists of a single page filled with zeroes. */
1645 	if (fallocate(fd, 0, 0, pagesize)) {
1646 		ksft_perror("fallocate() failed");
1647 		log_test_result(KSFT_FAIL);
1648 		goto close;
1649 	}
1650 
1651 	/* Create a private mapping of the memfd. */
1652 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1653 	if (mem == MAP_FAILED) {
1654 		ksft_perror("mmap() failed");
1655 		log_test_result(KSFT_FAIL);
1656 		goto close;
1657 	}
1658 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1659 	if (smem == MAP_FAILED) {
1660 		ksft_perror("mmap() failed");
1661 		log_test_result(KSFT_FAIL);
1662 		goto munmap;
1663 	}
1664 
1665 	/* Fault the page in. */
1666 	FORCE_READ(mem);
1667 	FORCE_READ(smem);
1668 
1669 	fn(mem, smem, pagesize);
1670 munmap:
1671 	munmap(mem, pagesize);
1672 	if (smem != MAP_FAILED)
1673 		munmap(smem, pagesize);
1674 close:
1675 	close(fd);
1676 }
1677 
1678 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1679 {
1680 	char *mem, *smem;
1681 	FILE *file;
1682 	int fd;
1683 
1684 	log_test_start("%s ... with tmpfile", desc);
1685 
1686 	file = tmpfile();
1687 	if (!file) {
1688 		ksft_perror("tmpfile() failed");
1689 		log_test_result(KSFT_FAIL);
1690 		return;
1691 	}
1692 
1693 	fd = fileno(file);
1694 	if (fd < 0) {
1695 		ksft_perror("fileno() failed");
1696 		log_test_result(KSFT_SKIP);
1697 		return;
1698 	}
1699 
1700 	/* File consists of a single page filled with zeroes. */
1701 	if (fallocate(fd, 0, 0, pagesize)) {
1702 		ksft_perror("fallocate() failed");
1703 		log_test_result(KSFT_FAIL);
1704 		goto close;
1705 	}
1706 
1707 	/* Create a private mapping of the memfd. */
1708 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1709 	if (mem == MAP_FAILED) {
1710 		ksft_perror("mmap() failed");
1711 		log_test_result(KSFT_FAIL);
1712 		goto close;
1713 	}
1714 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1715 	if (smem == MAP_FAILED) {
1716 		ksft_perror("mmap() failed");
1717 		log_test_result(KSFT_FAIL);
1718 		goto munmap;
1719 	}
1720 
1721 	/* Fault the page in. */
1722 	FORCE_READ(mem);
1723 	FORCE_READ(smem);
1724 
1725 	fn(mem, smem, pagesize);
1726 munmap:
1727 	munmap(mem, pagesize);
1728 	if (smem != MAP_FAILED)
1729 		munmap(smem, pagesize);
1730 close:
1731 	fclose(file);
1732 }
1733 
1734 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1735 				   size_t hugetlbsize)
1736 {
1737 	int flags = MFD_HUGETLB;
1738 	char *mem, *smem;
1739 	int fd;
1740 
1741 	log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
1742 		       hugetlbsize / 1024);
1743 
1744 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1745 
1746 	fd = memfd_create("test", flags);
1747 	if (fd < 0) {
1748 		ksft_perror("memfd_create() failed");
1749 		log_test_result(KSFT_SKIP);
1750 		return;
1751 	}
1752 
1753 	/* File consists of a single page filled with zeroes. */
1754 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1755 		ksft_perror("need more free huge pages");
1756 		log_test_result(KSFT_SKIP);
1757 		goto close;
1758 	}
1759 
1760 	/* Create a private mapping of the memfd. */
1761 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1762 		   0);
1763 	if (mem == MAP_FAILED) {
1764 		ksft_perror("need more free huge pages");
1765 		log_test_result(KSFT_SKIP);
1766 		goto close;
1767 	}
1768 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1769 	if (smem == MAP_FAILED) {
1770 		ksft_perror("mmap() failed");
1771 		log_test_result(KSFT_FAIL);
1772 		goto munmap;
1773 	}
1774 
1775 	/* Fault the page in. */
1776 	FORCE_READ(mem);
1777 	FORCE_READ(smem);
1778 
1779 	fn(mem, smem, hugetlbsize);
1780 munmap:
1781 	munmap(mem, hugetlbsize);
1782 	if (smem != MAP_FAILED)
1783 		munmap(smem, hugetlbsize);
1784 close:
1785 	close(fd);
1786 }
1787 
1788 struct non_anon_test_case {
1789 	const char *desc;
1790 	non_anon_test_fn fn;
1791 };
1792 
1793 /*
1794  * Test cases that target any pages in private mappings that are not anonymous:
1795  * pages that may get shared via COW ndependent of fork(). This includes
1796  * the shared zeropage(s), pagecache pages, ...
1797  */
1798 static const struct non_anon_test_case non_anon_test_cases[] = {
1799 	/*
1800 	 * Basic COW test without any GUP. If we miss to break COW, changes are
1801 	 * visible via other private/shared mappings.
1802 	 */
1803 	{
1804 		"Basic COW",
1805 		test_cow,
1806 	},
1807 	/*
1808 	 * Take a R/O longterm pin. When modifying the page via the page table,
1809 	 * the page content change must be visible via the pin.
1810 	 */
1811 	{
1812 		"R/O longterm GUP pin",
1813 		test_ro_pin,
1814 	},
1815 	/* Same as above, but using GUP-fast. */
1816 	{
1817 		"R/O longterm GUP-fast pin",
1818 		test_ro_fast_pin,
1819 	},
1820 };
1821 
1822 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1823 {
1824 	int i;
1825 
1826 	run_with_zeropage(test_case->fn, test_case->desc);
1827 	run_with_memfd(test_case->fn, test_case->desc);
1828 	run_with_tmpfile(test_case->fn, test_case->desc);
1829 	if (pmdsize)
1830 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1831 	for (i = 0; i < nr_hugetlbsizes; i++)
1832 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1833 				       hugetlbsizes[i]);
1834 }
1835 
1836 static void run_non_anon_test_cases(void)
1837 {
1838 	int i;
1839 
1840 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1841 
1842 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1843 		run_non_anon_test_case(&non_anon_test_cases[i]);
1844 }
1845 
1846 static int tests_per_non_anon_test_case(void)
1847 {
1848 	int tests = 3 + nr_hugetlbsizes;
1849 
1850 	if (pmdsize)
1851 		tests += 1;
1852 	return tests;
1853 }
1854 
1855 int main(int argc, char **argv)
1856 {
1857 	struct thp_settings default_settings;
1858 
1859 	ksft_print_header();
1860 
1861 	pagesize = getpagesize();
1862 	pmdsize = read_pmd_pagesize();
1863 	if (pmdsize) {
1864 		/* Only if THP is supported. */
1865 		thp_read_settings(&default_settings);
1866 		default_settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_INHERIT;
1867 		thp_save_settings();
1868 		thp_push_settings(&default_settings);
1869 
1870 		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
1871 			       pmdsize / 1024);
1872 		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
1873 	}
1874 	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
1875 						    ARRAY_SIZE(hugetlbsizes));
1876 	has_huge_zeropage = detect_huge_zeropage();
1877 
1878 	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
1879 		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
1880 		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
1881 
1882 	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
1883 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
1884 	if (pagemap_fd < 0)
1885 		ksft_exit_fail_msg("opening pagemap failed\n");
1886 
1887 	run_anon_test_cases();
1888 	run_anon_thp_test_cases();
1889 	run_non_anon_test_cases();
1890 
1891 	if (pmdsize) {
1892 		/* Only if THP is supported. */
1893 		thp_restore_settings();
1894 	}
1895 
1896 	ksft_finished();
1897 }
1898