xref: /linux/tools/testing/selftests/mm/cow.c (revision 8b6d678fede700db6466d73f11fcbad496fa515e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <assert.h>
18 #include <linux/mman.h>
19 #include <sys/mman.h>
20 #include <sys/ioctl.h>
21 #include <sys/wait.h>
22 #include <linux/memfd.h>
23 
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
26 #include <liburing.h>
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
28 
29 #include "../../../../mm/gup_test.h"
30 #include "../kselftest.h"
31 #include "vm_util.h"
32 #include "thp_settings.h"
33 
/* Base page size in bytes; used as the divisor in sz2ord(). */
static size_t pagesize;
/* PMD-sized THP size in bytes; 0 is treated as "THP not supported at all". */
static size_t pmdsize;

/* THP sizes to test and the number of valid entries. */
static size_t thpsizes[20];
static int nr_thpsizes;

/* hugetlb sizes to test and the number of valid entries. */
static size_t hugetlbsizes[10];
static int nr_hugetlbsizes;

/* Descriptor handed to the pagemap_*() helpers. */
static int pagemap_fd;
/* Descriptor for the gup_test ioctls; negative means gup_test is unavailable. */
static int gup_fd;

/* Set once detect_huge_zeropage() finds use_zero_page enabled. */
static bool has_huge_zeropage;
43 
44 static int sz2ord(size_t size)
45 {
46 	return __builtin_ctzll(size / pagesize);
47 }
48 
49 static int detect_thp_sizes(size_t sizes[], int max)
50 {
51 	int count = 0;
52 	unsigned long orders;
53 	size_t kb;
54 	int i;
55 
56 	/* thp not supported at all. */
57 	if (!pmdsize)
58 		return 0;
59 
60 	orders = 1UL << sz2ord(pmdsize);
61 	orders |= thp_supported_orders();
62 
63 	for (i = 0; orders && count < max; i++) {
64 		if (!(orders & (1UL << i)))
65 			continue;
66 		orders &= ~(1UL << i);
67 		kb = (pagesize >> 10) << i;
68 		sizes[count++] = kb * 1024;
69 		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
70 	}
71 
72 	return count;
73 }
74 
75 static void detect_huge_zeropage(void)
76 {
77 	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
78 		      O_RDONLY);
79 	size_t enabled = 0;
80 	char buf[15];
81 	int ret;
82 
83 	if (fd < 0)
84 		return;
85 
86 	ret = pread(fd, buf, sizeof(buf), 0);
87 	if (ret > 0 && ret < sizeof(buf)) {
88 		buf[ret] = 0;
89 
90 		enabled = strtoul(buf, NULL, 10);
91 		if (enabled == 1) {
92 			has_huge_zeropage = true;
93 			ksft_print_msg("[INFO] huge zeropage is enabled\n");
94 		}
95 	}
96 
97 	close(fd);
98 }
99 
100 static bool range_is_swapped(void *addr, size_t size)
101 {
102 	for (; size; addr += pagesize, size -= pagesize)
103 		if (!pagemap_is_swapped(pagemap_fd, addr))
104 			return false;
105 	return true;
106 }
107 
/*
 * Two pipes used to synchronize a parent with a forked child: one side
 * writes a single byte, the other side blocks in read() until it arrives.
 */
struct comm_pipes {
	int child_ready[2];	/* child -> parent notification */
	int parent_ready[2];	/* parent -> child notification */
};

/*
 * Create both pipes of @comm_pipes.
 *
 * Returns 0 on success or a negative errno value of the failing pipe() call.
 * On failure no descriptor is left open.
 */
static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	int err;

	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		/* close() may overwrite errno; save the pipe() error first. */
		err = errno;
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -err;
	}

	return 0;
}
125 
126 static void close_comm_pipes(struct comm_pipes *comm_pipes)
127 {
128 	close(comm_pipes->child_ready[0]);
129 	close(comm_pipes->child_ready[1]);
130 	close(comm_pipes->parent_ready[0]);
131 	close(comm_pipes->parent_ready[1]);
132 }
133 
134 static int child_memcmp_fn(char *mem, size_t size,
135 			   struct comm_pipes *comm_pipes)
136 {
137 	char *old = malloc(size);
138 	char buf;
139 
140 	/* Backup the original content. */
141 	memcpy(old, mem, size);
142 
143 	/* Wait until the parent modified the page. */
144 	write(comm_pipes->child_ready[1], "0", 1);
145 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
146 		;
147 
148 	/* See if we still read the old values. */
149 	return memcmp(old, mem, size);
150 }
151 
152 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
153 				    struct comm_pipes *comm_pipes)
154 {
155 	struct iovec iov = {
156 		.iov_base = mem,
157 		.iov_len = size,
158 	};
159 	ssize_t cur, total, transferred;
160 	char *old, *new;
161 	int fds[2];
162 	char buf;
163 
164 	old = malloc(size);
165 	new = malloc(size);
166 
167 	/* Backup the original content. */
168 	memcpy(old, mem, size);
169 
170 	if (pipe(fds) < 0)
171 		return -errno;
172 
173 	/* Trigger a read-only pin. */
174 	transferred = vmsplice(fds[1], &iov, 1, 0);
175 	if (transferred < 0)
176 		return -errno;
177 	if (transferred == 0)
178 		return -EINVAL;
179 
180 	/* Unmap it from our page tables. */
181 	if (munmap(mem, size) < 0)
182 		return -errno;
183 
184 	/* Wait until the parent modified it. */
185 	write(comm_pipes->child_ready[1], "0", 1);
186 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
187 		;
188 
189 	/* See if we still read the old values via the pipe. */
190 	for (total = 0; total < transferred; total += cur) {
191 		cur = read(fds[0], new + total, transferred - total);
192 		if (cur < 0)
193 			return -errno;
194 	}
195 
196 	return memcmp(old, new, transferred);
197 }
198 
199 typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
200 
201 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
202 				  child_fn fn)
203 {
204 	struct comm_pipes comm_pipes;
205 	char buf;
206 	int ret;
207 
208 	ret = setup_comm_pipes(&comm_pipes);
209 	if (ret) {
210 		ksft_test_result_fail("pipe() failed\n");
211 		return;
212 	}
213 
214 	ret = fork();
215 	if (ret < 0) {
216 		ksft_test_result_fail("fork() failed\n");
217 		goto close_comm_pipes;
218 	} else if (!ret) {
219 		exit(fn(mem, size, &comm_pipes));
220 	}
221 
222 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
223 		;
224 
225 	if (do_mprotect) {
226 		/*
227 		 * mprotect() optimizations might try avoiding
228 		 * write-faults by directly mapping pages writable.
229 		 */
230 		ret = mprotect(mem, size, PROT_READ);
231 		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
232 		if (ret) {
233 			ksft_test_result_fail("mprotect() failed\n");
234 			write(comm_pipes.parent_ready[1], "0", 1);
235 			wait(&ret);
236 			goto close_comm_pipes;
237 		}
238 	}
239 
240 	/* Modify the page. */
241 	memset(mem, 0xff, size);
242 	write(comm_pipes.parent_ready[1], "0", 1);
243 
244 	wait(&ret);
245 	if (WIFEXITED(ret))
246 		ret = WEXITSTATUS(ret);
247 	else
248 		ret = -EINVAL;
249 
250 	ksft_test_result(!ret, "No leak from parent into child\n");
251 close_comm_pipes:
252 	close_comm_pipes(&comm_pipes);
253 }
254 
255 static void test_cow_in_parent(char *mem, size_t size)
256 {
257 	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
258 }
259 
260 static void test_cow_in_parent_mprotect(char *mem, size_t size)
261 {
262 	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
263 }
264 
265 static void test_vmsplice_in_child(char *mem, size_t size)
266 {
267 	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
268 }
269 
270 static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
271 {
272 	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
273 }
274 
275 static void do_test_vmsplice_in_parent(char *mem, size_t size,
276 				       bool before_fork)
277 {
278 	struct iovec iov = {
279 		.iov_base = mem,
280 		.iov_len = size,
281 	};
282 	ssize_t cur, total, transferred;
283 	struct comm_pipes comm_pipes;
284 	char *old, *new;
285 	int ret, fds[2];
286 	char buf;
287 
288 	old = malloc(size);
289 	new = malloc(size);
290 
291 	memcpy(old, mem, size);
292 
293 	ret = setup_comm_pipes(&comm_pipes);
294 	if (ret) {
295 		ksft_test_result_fail("pipe() failed\n");
296 		goto free;
297 	}
298 
299 	if (pipe(fds) < 0) {
300 		ksft_test_result_fail("pipe() failed\n");
301 		goto close_comm_pipes;
302 	}
303 
304 	if (before_fork) {
305 		transferred = vmsplice(fds[1], &iov, 1, 0);
306 		if (transferred <= 0) {
307 			ksft_test_result_fail("vmsplice() failed\n");
308 			goto close_pipe;
309 		}
310 	}
311 
312 	ret = fork();
313 	if (ret < 0) {
314 		ksft_test_result_fail("fork() failed\n");
315 		goto close_pipe;
316 	} else if (!ret) {
317 		write(comm_pipes.child_ready[1], "0", 1);
318 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
319 			;
320 		/* Modify page content in the child. */
321 		memset(mem, 0xff, size);
322 		exit(0);
323 	}
324 
325 	if (!before_fork) {
326 		transferred = vmsplice(fds[1], &iov, 1, 0);
327 		if (transferred <= 0) {
328 			ksft_test_result_fail("vmsplice() failed\n");
329 			wait(&ret);
330 			goto close_pipe;
331 		}
332 	}
333 
334 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
335 		;
336 	if (munmap(mem, size) < 0) {
337 		ksft_test_result_fail("munmap() failed\n");
338 		goto close_pipe;
339 	}
340 	write(comm_pipes.parent_ready[1], "0", 1);
341 
342 	/* Wait until the child is done writing. */
343 	wait(&ret);
344 	if (!WIFEXITED(ret)) {
345 		ksft_test_result_fail("wait() failed\n");
346 		goto close_pipe;
347 	}
348 
349 	/* See if we still read the old values. */
350 	for (total = 0; total < transferred; total += cur) {
351 		cur = read(fds[0], new + total, transferred - total);
352 		if (cur < 0) {
353 			ksft_test_result_fail("read() failed\n");
354 			goto close_pipe;
355 		}
356 	}
357 
358 	ksft_test_result(!memcmp(old, new, transferred),
359 			 "No leak from child into parent\n");
360 close_pipe:
361 	close(fds[0]);
362 	close(fds[1]);
363 close_comm_pipes:
364 	close_comm_pipes(&comm_pipes);
365 free:
366 	free(old);
367 	free(new);
368 }
369 
/* Parent-side vmsplice() test with the vmsplice() done before fork(). */
static void test_vmsplice_before_fork(char *mem, size_t size)
{
	const bool before_fork = true;

	do_test_vmsplice_in_parent(mem, size, before_fork);
}
374 
/* Parent-side vmsplice() test with the vmsplice() done after fork(). */
static void test_vmsplice_after_fork(char *mem, size_t size)
{
	const bool before_fork = false;

	do_test_vmsplice_in_parent(mem, size, before_fork);
}
379 
380 #ifdef LOCAL_CONFIG_HAVE_LIBURING
381 static void do_test_iouring(char *mem, size_t size, bool use_fork)
382 {
383 	struct comm_pipes comm_pipes;
384 	struct io_uring_cqe *cqe;
385 	struct io_uring_sqe *sqe;
386 	struct io_uring ring;
387 	ssize_t cur, total;
388 	struct iovec iov;
389 	char *buf, *tmp;
390 	int ret, fd;
391 	FILE *file;
392 
393 	ret = setup_comm_pipes(&comm_pipes);
394 	if (ret) {
395 		ksft_test_result_fail("pipe() failed\n");
396 		return;
397 	}
398 
399 	file = tmpfile();
400 	if (!file) {
401 		ksft_test_result_fail("tmpfile() failed\n");
402 		goto close_comm_pipes;
403 	}
404 	fd = fileno(file);
405 	assert(fd);
406 
407 	tmp = malloc(size);
408 	if (!tmp) {
409 		ksft_test_result_fail("malloc() failed\n");
410 		goto close_file;
411 	}
412 
413 	/* Skip on errors, as we might just lack kernel support. */
414 	ret = io_uring_queue_init(1, &ring, 0);
415 	if (ret < 0) {
416 		ksft_test_result_skip("io_uring_queue_init() failed\n");
417 		goto free_tmp;
418 	}
419 
420 	/*
421 	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
422 	 * | FOLL_LONGTERM the range.
423 	 *
424 	 * Skip on errors, as we might just lack kernel support or might not
425 	 * have sufficient MEMLOCK permissions.
426 	 */
427 	iov.iov_base = mem;
428 	iov.iov_len = size;
429 	ret = io_uring_register_buffers(&ring, &iov, 1);
430 	if (ret) {
431 		ksft_test_result_skip("io_uring_register_buffers() failed\n");
432 		goto queue_exit;
433 	}
434 
435 	if (use_fork) {
436 		/*
437 		 * fork() and keep the child alive until we're done. Note that
438 		 * we expect the pinned page to not get shared with the child.
439 		 */
440 		ret = fork();
441 		if (ret < 0) {
442 			ksft_test_result_fail("fork() failed\n");
443 			goto unregister_buffers;
444 		} else if (!ret) {
445 			write(comm_pipes.child_ready[1], "0", 1);
446 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
447 				;
448 			exit(0);
449 		}
450 
451 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
452 			;
453 	} else {
454 		/*
455 		 * Map the page R/O into the page table. Enable softdirty
456 		 * tracking to stop the page from getting mapped R/W immediately
457 		 * again by mprotect() optimizations. Note that we don't have an
458 		 * easy way to test if that worked (the pagemap does not export
459 		 * if the page is mapped R/O vs. R/W).
460 		 */
461 		ret = mprotect(mem, size, PROT_READ);
462 		clear_softdirty();
463 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
464 		if (ret) {
465 			ksft_test_result_fail("mprotect() failed\n");
466 			goto unregister_buffers;
467 		}
468 	}
469 
470 	/*
471 	 * Modify the page and write page content as observed by the fixed
472 	 * buffer pin to the file so we can verify it.
473 	 */
474 	memset(mem, 0xff, size);
475 	sqe = io_uring_get_sqe(&ring);
476 	if (!sqe) {
477 		ksft_test_result_fail("io_uring_get_sqe() failed\n");
478 		goto quit_child;
479 	}
480 	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
481 
482 	ret = io_uring_submit(&ring);
483 	if (ret < 0) {
484 		ksft_test_result_fail("io_uring_submit() failed\n");
485 		goto quit_child;
486 	}
487 
488 	ret = io_uring_wait_cqe(&ring, &cqe);
489 	if (ret < 0) {
490 		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
491 		goto quit_child;
492 	}
493 
494 	if (cqe->res != size) {
495 		ksft_test_result_fail("write_fixed failed\n");
496 		goto quit_child;
497 	}
498 	io_uring_cqe_seen(&ring, cqe);
499 
500 	/* Read back the file content to the temporary buffer. */
501 	total = 0;
502 	while (total < size) {
503 		cur = pread(fd, tmp + total, size - total, total);
504 		if (cur < 0) {
505 			ksft_test_result_fail("pread() failed\n");
506 			goto quit_child;
507 		}
508 		total += cur;
509 	}
510 
511 	/* Finally, check if we read what we expected. */
512 	ksft_test_result(!memcmp(mem, tmp, size),
513 			 "Longterm R/W pin is reliable\n");
514 
515 quit_child:
516 	if (use_fork) {
517 		write(comm_pipes.parent_ready[1], "0", 1);
518 		wait(&ret);
519 	}
520 unregister_buffers:
521 	io_uring_unregister_buffers(&ring);
522 queue_exit:
523 	io_uring_queue_exit(&ring);
524 free_tmp:
525 	free(tmp);
526 close_file:
527 	fclose(file);
528 close_comm_pipes:
529 	close_comm_pipes(&comm_pipes);
530 }
531 
/* Fixed-buffer test using the R/O remap variant (no fork()). */
static void test_iouring_ro(char *mem, size_t size)
{
	const bool use_fork = false;

	do_test_iouring(mem, size, use_fork);
}
536 
/* Fixed-buffer test using the fork() variant. */
static void test_iouring_fork(char *mem, size_t size)
{
	const bool use_fork = true;

	do_test_iouring(mem, size, use_fork);
}
541 
542 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
543 
/* Preparation applied to the memory by do_test_ro_pin() before pinning. */
enum ro_pin_test {
	/* No preparation at all. */
	RO_PIN_TEST = 0,
	/* fork() a child that stays alive while the pin is taken. */
	RO_PIN_TEST_SHARED = 1,
	/* fork() a child and let it exit again before the pin is taken. */
	RO_PIN_TEST_PREVIOUSLY_SHARED = 2,
	/* mprotect() the range R/O and back R/W, clearing softdirty between. */
	RO_PIN_TEST_RO_EXCLUSIVE = 3,
};
550 
551 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
552 			   bool fast)
553 {
554 	struct pin_longterm_test args;
555 	struct comm_pipes comm_pipes;
556 	char *tmp, buf;
557 	__u64 tmp_val;
558 	int ret;
559 
560 	if (gup_fd < 0) {
561 		ksft_test_result_skip("gup_test not available\n");
562 		return;
563 	}
564 
565 	tmp = malloc(size);
566 	if (!tmp) {
567 		ksft_test_result_fail("malloc() failed\n");
568 		return;
569 	}
570 
571 	ret = setup_comm_pipes(&comm_pipes);
572 	if (ret) {
573 		ksft_test_result_fail("pipe() failed\n");
574 		goto free_tmp;
575 	}
576 
577 	switch (test) {
578 	case RO_PIN_TEST:
579 		break;
580 	case RO_PIN_TEST_SHARED:
581 	case RO_PIN_TEST_PREVIOUSLY_SHARED:
582 		/*
583 		 * Share the pages with our child. As the pages are not pinned,
584 		 * this should just work.
585 		 */
586 		ret = fork();
587 		if (ret < 0) {
588 			ksft_test_result_fail("fork() failed\n");
589 			goto close_comm_pipes;
590 		} else if (!ret) {
591 			write(comm_pipes.child_ready[1], "0", 1);
592 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
593 				;
594 			exit(0);
595 		}
596 
597 		/* Wait until our child is ready. */
598 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
599 			;
600 
601 		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
602 			/*
603 			 * Tell the child to quit now and wait until it quit.
604 			 * The pages should now be mapped R/O into our page
605 			 * tables, but they are no longer shared.
606 			 */
607 			write(comm_pipes.parent_ready[1], "0", 1);
608 			wait(&ret);
609 			if (!WIFEXITED(ret))
610 				ksft_print_msg("[INFO] wait() failed\n");
611 		}
612 		break;
613 	case RO_PIN_TEST_RO_EXCLUSIVE:
614 		/*
615 		 * Map the page R/O into the page table. Enable softdirty
616 		 * tracking to stop the page from getting mapped R/W immediately
617 		 * again by mprotect() optimizations. Note that we don't have an
618 		 * easy way to test if that worked (the pagemap does not export
619 		 * if the page is mapped R/O vs. R/W).
620 		 */
621 		ret = mprotect(mem, size, PROT_READ);
622 		clear_softdirty();
623 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
624 		if (ret) {
625 			ksft_test_result_fail("mprotect() failed\n");
626 			goto close_comm_pipes;
627 		}
628 		break;
629 	default:
630 		assert(false);
631 	}
632 
633 	/* Take a R/O pin. This should trigger unsharing. */
634 	args.addr = (__u64)(uintptr_t)mem;
635 	args.size = size;
636 	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
637 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
638 	if (ret) {
639 		if (errno == EINVAL)
640 			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
641 		else
642 			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
643 		goto wait;
644 	}
645 
646 	/* Modify the page. */
647 	memset(mem, 0xff, size);
648 
649 	/*
650 	 * Read back the content via the pin to the temporary buffer and
651 	 * test if we observed the modification.
652 	 */
653 	tmp_val = (__u64)(uintptr_t)tmp;
654 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
655 	if (ret)
656 		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
657 	else
658 		ksft_test_result(!memcmp(mem, tmp, size),
659 				 "Longterm R/O pin is reliable\n");
660 
661 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
662 	if (ret)
663 		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
664 wait:
665 	switch (test) {
666 	case RO_PIN_TEST_SHARED:
667 		write(comm_pipes.parent_ready[1], "0", 1);
668 		wait(&ret);
669 		if (!WIFEXITED(ret))
670 			ksft_print_msg("[INFO] wait() failed\n");
671 		break;
672 	default:
673 		break;
674 	}
675 close_comm_pipes:
676 	close_comm_pipes(&comm_pipes);
677 free_tmp:
678 	free(tmp);
679 }
680 
681 static void test_ro_pin_on_shared(char *mem, size_t size)
682 {
683 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
684 }
685 
686 static void test_ro_fast_pin_on_shared(char *mem, size_t size)
687 {
688 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
689 }
690 
691 static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
692 {
693 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
694 }
695 
696 static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
697 {
698 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
699 }
700 
701 static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
702 {
703 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
704 }
705 
706 static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
707 {
708 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
709 }
710 
711 typedef void (*test_fn)(char *mem, size_t size);
712 
713 static void do_run_with_base_page(test_fn fn, bool swapout)
714 {
715 	char *mem;
716 	int ret;
717 
718 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
719 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
720 	if (mem == MAP_FAILED) {
721 		ksft_test_result_fail("mmap() failed\n");
722 		return;
723 	}
724 
725 	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
726 	/* Ignore if not around on a kernel. */
727 	if (ret && errno != EINVAL) {
728 		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
729 		goto munmap;
730 	}
731 
732 	/* Populate a base page. */
733 	memset(mem, 0, pagesize);
734 
735 	if (swapout) {
736 		madvise(mem, pagesize, MADV_PAGEOUT);
737 		if (!pagemap_is_swapped(pagemap_fd, mem)) {
738 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
739 			goto munmap;
740 		}
741 	}
742 
743 	fn(mem, pagesize);
744 munmap:
745 	munmap(mem, pagesize);
746 }
747 
748 static void run_with_base_page(test_fn fn, const char *desc)
749 {
750 	ksft_print_msg("[RUN] %s ... with base page\n", desc);
751 	do_run_with_base_page(fn, false);
752 }
753 
754 static void run_with_base_page_swap(test_fn fn, const char *desc)
755 {
756 	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
757 	do_run_with_base_page(fn, true);
758 }
759 
/* THP preparation variants understood by do_run_with_thp(). */
enum thp_run {
	/* Leave the populated THP as is (requires thpsize == pmdsize). */
	THP_RUN_PMD = 0,
	THP_RUN_PMD_SWAPOUT = 1,
	/* Toggle one subpage R/O and back to trigger PTE-mapping. */
	THP_RUN_PTE = 2,
	THP_RUN_PTE_SWAPOUT = 3,
	/* MADV_DONTNEED everything but the first subpage. */
	THP_RUN_SINGLE_PTE = 4,
	THP_RUN_SINGLE_PTE_SWAPOUT = 5,
	/* mremap() the second half of the THP elsewhere. */
	THP_RUN_PARTIAL_MREMAP = 6,
	/* Share only the first subpage with a short-lived child. */
	THP_RUN_PARTIAL_SHARED = 7,
};
770 
771 static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
772 {
773 	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
774 	size_t size, mmap_size, mremap_size;
775 	int ret;
776 
777 	/* For alignment purposes, we need twice the thp size. */
778 	mmap_size = 2 * thpsize;
779 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
780 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
781 	if (mmap_mem == MAP_FAILED) {
782 		ksft_test_result_fail("mmap() failed\n");
783 		return;
784 	}
785 
786 	/* We need a THP-aligned memory area. */
787 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
788 
789 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
790 	if (ret) {
791 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
792 		goto munmap;
793 	}
794 
795 	/*
796 	 * Try to populate a THP. Touch the first sub-page and test if
797 	 * we get the last sub-page populated automatically.
798 	 */
799 	mem[0] = 0;
800 	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
801 		ksft_test_result_skip("Did not get a THP populated\n");
802 		goto munmap;
803 	}
804 	memset(mem, 0, thpsize);
805 
806 	size = thpsize;
807 	switch (thp_run) {
808 	case THP_RUN_PMD:
809 	case THP_RUN_PMD_SWAPOUT:
810 		assert(thpsize == pmdsize);
811 		break;
812 	case THP_RUN_PTE:
813 	case THP_RUN_PTE_SWAPOUT:
814 		/*
815 		 * Trigger PTE-mapping the THP by temporarily mapping a single
816 		 * subpage R/O. This is a noop if the THP is not pmdsize (and
817 		 * therefore already PTE-mapped).
818 		 */
819 		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
820 		if (ret) {
821 			ksft_test_result_fail("mprotect() failed\n");
822 			goto munmap;
823 		}
824 		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
825 		if (ret) {
826 			ksft_test_result_fail("mprotect() failed\n");
827 			goto munmap;
828 		}
829 		break;
830 	case THP_RUN_SINGLE_PTE:
831 	case THP_RUN_SINGLE_PTE_SWAPOUT:
832 		/*
833 		 * Discard all but a single subpage of that PTE-mapped THP. What
834 		 * remains is a single PTE mapping a single subpage.
835 		 */
836 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
837 		if (ret) {
838 			ksft_test_result_fail("MADV_DONTNEED failed\n");
839 			goto munmap;
840 		}
841 		size = pagesize;
842 		break;
843 	case THP_RUN_PARTIAL_MREMAP:
844 		/*
845 		 * Remap half of the THP. We need some new memory location
846 		 * for that.
847 		 */
848 		mremap_size = thpsize / 2;
849 		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
850 				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
851 		if (mem == MAP_FAILED) {
852 			ksft_test_result_fail("mmap() failed\n");
853 			goto munmap;
854 		}
855 		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
856 			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
857 		if (tmp != mremap_mem) {
858 			ksft_test_result_fail("mremap() failed\n");
859 			goto munmap;
860 		}
861 		size = mremap_size;
862 		break;
863 	case THP_RUN_PARTIAL_SHARED:
864 		/*
865 		 * Share the first page of the THP with a child and quit the
866 		 * child. This will result in some parts of the THP never
867 		 * have been shared.
868 		 */
869 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
870 		if (ret) {
871 			ksft_test_result_fail("MADV_DONTFORK failed\n");
872 			goto munmap;
873 		}
874 		ret = fork();
875 		if (ret < 0) {
876 			ksft_test_result_fail("fork() failed\n");
877 			goto munmap;
878 		} else if (!ret) {
879 			exit(0);
880 		}
881 		wait(&ret);
882 		/* Allow for sharing all pages again. */
883 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
884 		if (ret) {
885 			ksft_test_result_fail("MADV_DOFORK failed\n");
886 			goto munmap;
887 		}
888 		break;
889 	default:
890 		assert(false);
891 	}
892 
893 	switch (thp_run) {
894 	case THP_RUN_PMD_SWAPOUT:
895 	case THP_RUN_PTE_SWAPOUT:
896 	case THP_RUN_SINGLE_PTE_SWAPOUT:
897 		madvise(mem, size, MADV_PAGEOUT);
898 		if (!range_is_swapped(mem, size)) {
899 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
900 			goto munmap;
901 		}
902 		break;
903 	default:
904 		break;
905 	}
906 
907 	fn(mem, size);
908 munmap:
909 	munmap(mmap_mem, mmap_size);
910 	if (mremap_mem != MAP_FAILED)
911 		munmap(mremap_mem, mremap_size);
912 }
913 
914 static void run_with_thp(test_fn fn, const char *desc, size_t size)
915 {
916 	ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
917 		desc, size / 1024);
918 	do_run_with_thp(fn, THP_RUN_PMD, size);
919 }
920 
921 static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
922 {
923 	ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
924 		desc, size / 1024);
925 	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
926 }
927 
928 static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
929 {
930 	ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
931 		desc, size / 1024);
932 	do_run_with_thp(fn, THP_RUN_PTE, size);
933 }
934 
935 static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
936 {
937 	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
938 		desc, size / 1024);
939 	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
940 }
941 
942 static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
943 {
944 	ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
945 		desc, size / 1024);
946 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
947 }
948 
949 static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
950 {
951 	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
952 		desc, size / 1024);
953 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
954 }
955 
956 static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
957 {
958 	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
959 		desc, size / 1024);
960 	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
961 }
962 
963 static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
964 {
965 	ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
966 		desc, size / 1024);
967 	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
968 }
969 
970 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
971 {
972 	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
973 	char *mem, *dummy;
974 
975 	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
976 		       hugetlbsize / 1024);
977 
978 	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
979 
980 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
981 	if (mem == MAP_FAILED) {
982 		ksft_test_result_skip("need more free huge pages\n");
983 		return;
984 	}
985 
986 	/* Populate an huge page. */
987 	memset(mem, 0, hugetlbsize);
988 
989 	/*
990 	 * We need a total of two hugetlb pages to handle COW/unsharing
991 	 * properly, otherwise we might get zapped by a SIGBUS.
992 	 */
993 	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
994 	if (dummy == MAP_FAILED) {
995 		ksft_test_result_skip("need more free huge pages\n");
996 		goto munmap;
997 	}
998 	munmap(dummy, hugetlbsize);
999 
1000 	fn(mem, hugetlbsize);
1001 munmap:
1002 	munmap(mem, hugetlbsize);
1003 }
1004 
1005 struct test_case {
1006 	const char *desc;
1007 	test_fn fn;
1008 };
1009 
1010 /*
1011  * Test cases that are specific to anonymous pages: pages in private mappings
1012  * that may get shared via COW during fork().
1013  */
1014 static const struct test_case anon_test_cases[] = {
1015 	/*
1016 	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
1017 	 * either the child can observe modifications by the parent or the
1018 	 * other way around.
1019 	 */
1020 	{
1021 		"Basic COW after fork()",
1022 		test_cow_in_parent,
1023 	},
1024 	/*
1025 	 * Basic test, but do an additional mprotect(PROT_READ)+
1026 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1027 	 */
1028 	{
1029 		"Basic COW after fork() with mprotect() optimization",
1030 		test_cow_in_parent_mprotect,
1031 	},
1032 	/*
1033 	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
1034 	 * we miss to break COW, the child observes modifications by the parent.
1035 	 * This is CVE-2020-29374 reported by Jann Horn.
1036 	 */
1037 	{
1038 		"vmsplice() + unmap in child",
1039 		test_vmsplice_in_child
1040 	},
1041 	/*
1042 	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
1043 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1044 	 */
1045 	{
1046 		"vmsplice() + unmap in child with mprotect() optimization",
1047 		test_vmsplice_in_child_mprotect
1048 	},
1049 	/*
1050 	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
1051 	 * fork(); modify in the child. If we miss to break COW, the parent
1052 	 * observes modifications by the child.
1053 	 */
1054 	{
1055 		"vmsplice() before fork(), unmap in parent after fork()",
1056 		test_vmsplice_before_fork,
1057 	},
1058 	/*
1059 	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
1060 	 * child. If we miss to break COW, the parent observes modifications by
1061 	 * the child.
1062 	 */
1063 	{
1064 		"vmsplice() + unmap in parent after fork()",
1065 		test_vmsplice_after_fork,
1066 	},
1067 #ifdef LOCAL_CONFIG_HAVE_LIBURING
1068 	/*
1069 	 * Take a R/W longterm pin and then map the page R/O into the page
1070 	 * table to trigger a write fault on next access. When modifying the
1071 	 * page, the page content must be visible via the pin.
1072 	 */
1073 	{
1074 		"R/O-mapping a page registered as iouring fixed buffer",
1075 		test_iouring_ro,
1076 	},
1077 	/*
1078 	 * Take a R/W longterm pin and then fork() a child. When modifying the
1079 	 * page, the page content must be visible via the pin. We expect the
1080 	 * pinned page to not get shared with the child.
1081 	 */
1082 	{
1083 		"fork() with an iouring fixed buffer",
1084 		test_iouring_fork,
1085 	},
1086 
1087 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
1088 	/*
1089 	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
1090 	 * When modifying the page via the page table, the page content change
1091 	 * must be visible via the pin.
1092 	 */
1093 	{
1094 		"R/O GUP pin on R/O-mapped shared page",
1095 		test_ro_pin_on_shared,
1096 	},
1097 	/* Same as above, but using GUP-fast. */
1098 	{
1099 		"R/O GUP-fast pin on R/O-mapped shared page",
1100 		test_ro_fast_pin_on_shared,
1101 	},
1102 	/*
1103 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
1104 	 * was previously shared. When modifying the page via the page table,
1105 	 * the page content change must be visible via the pin.
1106 	 */
1107 	{
1108 		"R/O GUP pin on R/O-mapped previously-shared page",
1109 		test_ro_pin_on_ro_previously_shared,
1110 	},
1111 	/* Same as above, but using GUP-fast. */
1112 	{
1113 		"R/O GUP-fast pin on R/O-mapped previously-shared page",
1114 		test_ro_fast_pin_on_ro_previously_shared,
1115 	},
1116 	/*
1117 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
1118 	 * When modifying the page via the page table, the page content change
1119 	 * must be visible via the pin.
1120 	 */
1121 	{
1122 		"R/O GUP pin on R/O-mapped exclusive page",
1123 		test_ro_pin_on_ro_exclusive,
1124 	},
1125 	/* Same as above, but using GUP-fast. */
1126 	{
1127 		"R/O GUP-fast pin on R/O-mapped exclusive page",
1128 		test_ro_fast_pin_on_ro_exclusive,
1129 	},
1130 };
1131 
1132 static void run_anon_test_case(struct test_case const *test_case)
1133 {
1134 	int i;
1135 
1136 	run_with_base_page(test_case->fn, test_case->desc);
1137 	run_with_base_page_swap(test_case->fn, test_case->desc);
1138 	for (i = 0; i < nr_thpsizes; i++) {
1139 		size_t size = thpsizes[i];
1140 		struct thp_settings settings = *thp_current_settings();
1141 
1142 		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
1143 		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
1144 		thp_push_settings(&settings);
1145 
1146 		if (size == pmdsize) {
1147 			run_with_thp(test_case->fn, test_case->desc, size);
1148 			run_with_thp_swap(test_case->fn, test_case->desc, size);
1149 		}
1150 
1151 		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
1152 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
1153 		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
1154 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
1155 		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
1156 		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);
1157 
1158 		thp_pop_settings();
1159 	}
1160 	for (i = 0; i < nr_hugetlbsizes; i++)
1161 		run_with_hugetlb(test_case->fn, test_case->desc,
1162 				 hugetlbsizes[i]);
1163 }
1164 
1165 static void run_anon_test_cases(void)
1166 {
1167 	int i;
1168 
1169 	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1170 
1171 	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1172 		run_anon_test_case(&anon_test_cases[i]);
1173 }
1174 
1175 static int tests_per_anon_test_case(void)
1176 {
1177 	int tests = 2 + nr_hugetlbsizes;
1178 
1179 	tests += 6 * nr_thpsizes;
1180 	if (pmdsize)
1181 		tests += 2;
1182 	return tests;
1183 }
1184 
/* Variants of the anon THP collapse test, see do_test_anon_thp_collapse(). */
enum anon_thp_collapse_test {
	/* Collapse before fork(), i.e. before the page is COW-shared. */
	ANON_THP_COLLAPSE_UNSHARED = 0,
	/* Collapse after fork() with the whole range inherited by the child. */
	ANON_THP_COLLAPSE_FULLY_SHARED,
	/* Collapse after fork() with only the lower half inherited. */
	ANON_THP_COLLAPSE_LOWER_SHARED,
	/* Collapse after fork() with only the upper half inherited. */
	ANON_THP_COLLAPSE_UPPER_SHARED,
};
1191 
/*
 * Common body of the anon THP collapse tests.
 *
 * PTE-maps the range at @mem, fork()s a child that runs child_memcmp_fn() on
 * the part of the range selected by @test, collapses the range with
 * MADV_COLLAPSE (before or after the fork(), depending on @test), overwrites
 * the whole range in the parent and finally reports whether the child exited
 * with status 0.
 *
 * @mem:  start of the memory range under test
 * @size: size of that range in bytes
 * @test: which variant to run (when to collapse, what the child inherits)
 *
 * Each return path below emits one ksft result (pass, fail or skip).
 * NOTE(review): child_memcmp_fn() and the comm_pipes helpers are defined
 * outside this block; their exact protocol is assumed from their use here.
 */
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	/* Pre-fork() preparation: collapse now, or limit what the child inherits. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			/* A failing collapse is reported as skip, not as failure. */
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		/*
		 * Child: only look at the part of the range that was not
		 * marked MADV_DONTFORK above, and exit with the result of
		 * child_memcmp_fn().
		 */
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	/* Parent: block until one byte arrives on the child_ready pipe. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	/* Post-fork() collapse for all variants that did not collapse earlier. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			/* Signal the child via parent_ready and reap it before bailing out. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			/* Signal the child via parent_ready and reap it before bailing out. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	/* Only now signal the child: the modification must precede its check. */
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Reap the child; anything but a normal exit counts as an error. */
	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	/* Pass only if the child exited with status 0. */
	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
1324 
1325 static void test_anon_thp_collapse_unshared(char *mem, size_t size)
1326 {
1327 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
1328 }
1329 
1330 static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
1331 {
1332 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
1333 }
1334 
1335 static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
1336 {
1337 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
1338 }
1339 
1340 static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
1341 {
1342 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
1343 }
1344 
1345 /*
1346  * Test cases that are specific to anonymous THP: pages in private mappings
1347  * that may get shared via COW during fork().
1348  */
1349 static const struct test_case anon_thp_test_cases[] = {
1350 	/*
1351 	 * Basic COW test for fork() without any GUP when collapsing a THP
1352 	 * before fork().
1353 	 *
1354 	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1355 	 * collapse") might easily get COW handling wrong when not collapsing
1356 	 * exclusivity information properly.
1357 	 */
1358 	{
1359 		"Basic COW after fork() when collapsing before fork()",
1360 		test_anon_thp_collapse_unshared,
1361 	},
1362 	/* Basic COW test, but collapse after COW-sharing a full THP. */
1363 	{
1364 		"Basic COW after fork() when collapsing after fork() (fully shared)",
1365 		test_anon_thp_collapse_fully_shared,
1366 	},
1367 	/*
1368 	 * Basic COW test, but collapse after COW-sharing the lower half of a
1369 	 * THP.
1370 	 */
1371 	{
1372 		"Basic COW after fork() when collapsing after fork() (lower shared)",
1373 		test_anon_thp_collapse_lower_shared,
1374 	},
1375 	/*
1376 	 * Basic COW test, but collapse after COW-sharing the upper half of a
1377 	 * THP.
1378 	 */
1379 	{
1380 		"Basic COW after fork() when collapsing after fork() (upper shared)",
1381 		test_anon_thp_collapse_upper_shared,
1382 	},
1383 };
1384 
1385 static void run_anon_thp_test_cases(void)
1386 {
1387 	int i;
1388 
1389 	if (!pmdsize)
1390 		return;
1391 
1392 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1393 
1394 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1395 		struct test_case const *test_case = &anon_thp_test_cases[i];
1396 
1397 		ksft_print_msg("[RUN] %s\n", test_case->desc);
1398 		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
1399 	}
1400 }
1401 
1402 static int tests_per_anon_thp_test_case(void)
1403 {
1404 	return pmdsize ? 1 : 0;
1405 }
1406 
/*
 * Signature of a non-anonymous memory test. The run_with_*() helpers in this
 * file call it with @mem, a writable MAP_PRIVATE mapping, @smem, a read-only
 * second mapping, and @size, the number of bytes each mapping covers.
 */
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
1408 
/*
 * Basic COW check: writing through the private mapping @mem must not change
 * what is read through the other mapping @smem.
 *
 * @mem:  writable mapping that gets overwritten with 0xff
 * @smem: read-only mapping whose content must stay unchanged
 * @size: number of bytes covered by both mappings
 *
 * Emits exactly one ksft result, also when the backup buffer cannot be
 * allocated, so the test plan stays consistent.
 */
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Without the backup buffer there is nothing to compare against. */
	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}
1424 
1425 static void test_ro_pin(char *mem, const char *smem, size_t size)
1426 {
1427 	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
1428 }
1429 
1430 static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
1431 {
1432 	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
1433 }
1434 
1435 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1436 {
1437 	char *mem, *smem, tmp;
1438 
1439 	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
1440 
1441 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1442 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1443 	if (mem == MAP_FAILED) {
1444 		ksft_test_result_fail("mmap() failed\n");
1445 		return;
1446 	}
1447 
1448 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1449 	if (mem == MAP_FAILED) {
1450 		ksft_test_result_fail("mmap() failed\n");
1451 		goto munmap;
1452 	}
1453 
1454 	/* Read from the page to populate the shared zeropage. */
1455 	tmp = *mem + *smem;
1456 	asm volatile("" : "+r" (tmp));
1457 
1458 	fn(mem, smem, pagesize);
1459 munmap:
1460 	munmap(mem, pagesize);
1461 	if (smem != MAP_FAILED)
1462 		munmap(smem, pagesize);
1463 }
1464 
1465 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1466 {
1467 	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
1468 	size_t mmap_size;
1469 	int ret;
1470 
1471 	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
1472 
1473 	if (!has_huge_zeropage) {
1474 		ksft_test_result_skip("Huge zeropage not enabled\n");
1475 		return;
1476 	}
1477 
1478 	/* For alignment purposes, we need twice the thp size. */
1479 	mmap_size = 2 * pmdsize;
1480 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1481 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1482 	if (mmap_mem == MAP_FAILED) {
1483 		ksft_test_result_fail("mmap() failed\n");
1484 		return;
1485 	}
1486 	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1487 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1488 	if (mmap_smem == MAP_FAILED) {
1489 		ksft_test_result_fail("mmap() failed\n");
1490 		goto munmap;
1491 	}
1492 
1493 	/* We need a THP-aligned memory area. */
1494 	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
1495 	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
1496 
1497 	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
1498 	ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
1499 	if (ret) {
1500 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
1501 		goto munmap;
1502 	}
1503 
1504 	/*
1505 	 * Read from the memory to populate the huge shared zeropage. Read from
1506 	 * the first sub-page and test if we get another sub-page populated
1507 	 * automatically.
1508 	 */
1509 	tmp = *mem + *smem;
1510 	asm volatile("" : "+r" (tmp));
1511 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1512 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1513 		ksft_test_result_skip("Did not get THPs populated\n");
1514 		goto munmap;
1515 	}
1516 
1517 	fn(mem, smem, pmdsize);
1518 munmap:
1519 	munmap(mmap_mem, mmap_size);
1520 	if (mmap_smem != MAP_FAILED)
1521 		munmap(mmap_smem, mmap_size);
1522 }
1523 
1524 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1525 {
1526 	char *mem, *smem, tmp;
1527 	int fd;
1528 
1529 	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
1530 
1531 	fd = memfd_create("test", 0);
1532 	if (fd < 0) {
1533 		ksft_test_result_fail("memfd_create() failed\n");
1534 		return;
1535 	}
1536 
1537 	/* File consists of a single page filled with zeroes. */
1538 	if (fallocate(fd, 0, 0, pagesize)) {
1539 		ksft_test_result_fail("fallocate() failed\n");
1540 		goto close;
1541 	}
1542 
1543 	/* Create a private mapping of the memfd. */
1544 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1545 	if (mem == MAP_FAILED) {
1546 		ksft_test_result_fail("mmap() failed\n");
1547 		goto close;
1548 	}
1549 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1550 	if (mem == MAP_FAILED) {
1551 		ksft_test_result_fail("mmap() failed\n");
1552 		goto munmap;
1553 	}
1554 
1555 	/* Fault the page in. */
1556 	tmp = *mem + *smem;
1557 	asm volatile("" : "+r" (tmp));
1558 
1559 	fn(mem, smem, pagesize);
1560 munmap:
1561 	munmap(mem, pagesize);
1562 	if (smem != MAP_FAILED)
1563 		munmap(smem, pagesize);
1564 close:
1565 	close(fd);
1566 }
1567 
1568 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1569 {
1570 	char *mem, *smem, tmp;
1571 	FILE *file;
1572 	int fd;
1573 
1574 	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
1575 
1576 	file = tmpfile();
1577 	if (!file) {
1578 		ksft_test_result_fail("tmpfile() failed\n");
1579 		return;
1580 	}
1581 
1582 	fd = fileno(file);
1583 	if (fd < 0) {
1584 		ksft_test_result_skip("fileno() failed\n");
1585 		return;
1586 	}
1587 
1588 	/* File consists of a single page filled with zeroes. */
1589 	if (fallocate(fd, 0, 0, pagesize)) {
1590 		ksft_test_result_fail("fallocate() failed\n");
1591 		goto close;
1592 	}
1593 
1594 	/* Create a private mapping of the memfd. */
1595 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1596 	if (mem == MAP_FAILED) {
1597 		ksft_test_result_fail("mmap() failed\n");
1598 		goto close;
1599 	}
1600 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1601 	if (mem == MAP_FAILED) {
1602 		ksft_test_result_fail("mmap() failed\n");
1603 		goto munmap;
1604 	}
1605 
1606 	/* Fault the page in. */
1607 	tmp = *mem + *smem;
1608 	asm volatile("" : "+r" (tmp));
1609 
1610 	fn(mem, smem, pagesize);
1611 munmap:
1612 	munmap(mem, pagesize);
1613 	if (smem != MAP_FAILED)
1614 		munmap(smem, pagesize);
1615 close:
1616 	fclose(file);
1617 }
1618 
1619 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1620 				   size_t hugetlbsize)
1621 {
1622 	int flags = MFD_HUGETLB;
1623 	char *mem, *smem, tmp;
1624 	int fd;
1625 
1626 	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
1627 		       hugetlbsize / 1024);
1628 
1629 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1630 
1631 	fd = memfd_create("test", flags);
1632 	if (fd < 0) {
1633 		ksft_test_result_skip("memfd_create() failed\n");
1634 		return;
1635 	}
1636 
1637 	/* File consists of a single page filled with zeroes. */
1638 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1639 		ksft_test_result_skip("need more free huge pages\n");
1640 		goto close;
1641 	}
1642 
1643 	/* Create a private mapping of the memfd. */
1644 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1645 		   0);
1646 	if (mem == MAP_FAILED) {
1647 		ksft_test_result_skip("need more free huge pages\n");
1648 		goto close;
1649 	}
1650 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1651 	if (mem == MAP_FAILED) {
1652 		ksft_test_result_fail("mmap() failed\n");
1653 		goto munmap;
1654 	}
1655 
1656 	/* Fault the page in. */
1657 	tmp = *mem + *smem;
1658 	asm volatile("" : "+r" (tmp));
1659 
1660 	fn(mem, smem, hugetlbsize);
1661 munmap:
1662 	munmap(mem, hugetlbsize);
1663 	if (mem != MAP_FAILED)
1664 		munmap(smem, hugetlbsize);
1665 close:
1666 	close(fd);
1667 }
1668 
/* One entry of the non-anonymous test table: a description plus its test function. */
struct non_anon_test_case {
	/* Description, printed in the "[RUN] ..." line by each run_with_*() helper. */
	const char *desc;
	/* Test body, called by the run_with_*() helpers with the prepared mappings. */
	non_anon_test_fn fn;
};
1673 
1674 /*
1675  * Test cases that target any pages in private mappings that are not anonymous:
1676  * pages that may get shared via COW ndependent of fork(). This includes
1677  * the shared zeropage(s), pagecache pages, ...
1678  */
1679 static const struct non_anon_test_case non_anon_test_cases[] = {
1680 	/*
1681 	 * Basic COW test without any GUP. If we miss to break COW, changes are
1682 	 * visible via other private/shared mappings.
1683 	 */
1684 	{
1685 		"Basic COW",
1686 		test_cow,
1687 	},
1688 	/*
1689 	 * Take a R/O longterm pin. When modifying the page via the page table,
1690 	 * the page content change must be visible via the pin.
1691 	 */
1692 	{
1693 		"R/O longterm GUP pin",
1694 		test_ro_pin,
1695 	},
1696 	/* Same as above, but using GUP-fast. */
1697 	{
1698 		"R/O longterm GUP-fast pin",
1699 		test_ro_fast_pin,
1700 	},
1701 };
1702 
1703 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1704 {
1705 	int i;
1706 
1707 	run_with_zeropage(test_case->fn, test_case->desc);
1708 	run_with_memfd(test_case->fn, test_case->desc);
1709 	run_with_tmpfile(test_case->fn, test_case->desc);
1710 	if (pmdsize)
1711 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1712 	for (i = 0; i < nr_hugetlbsizes; i++)
1713 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1714 				       hugetlbsizes[i]);
1715 }
1716 
1717 static void run_non_anon_test_cases(void)
1718 {
1719 	int i;
1720 
1721 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1722 
1723 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1724 		run_non_anon_test_case(&non_anon_test_cases[i]);
1725 }
1726 
1727 static int tests_per_non_anon_test_case(void)
1728 {
1729 	int tests = 3 + nr_hugetlbsizes;
1730 
1731 	if (pmdsize)
1732 		tests += 1;
1733 	return tests;
1734 }
1735 
1736 int main(int argc, char **argv)
1737 {
1738 	int err;
1739 	struct thp_settings default_settings;
1740 
1741 	ksft_print_header();
1742 
1743 	pagesize = getpagesize();
1744 	pmdsize = read_pmd_pagesize();
1745 	if (pmdsize) {
1746 		/* Only if THP is supported. */
1747 		thp_read_settings(&default_settings);
1748 		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
1749 		thp_save_settings();
1750 		thp_push_settings(&default_settings);
1751 
1752 		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
1753 			       pmdsize / 1024);
1754 		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
1755 	}
1756 	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
1757 						    ARRAY_SIZE(hugetlbsizes));
1758 	detect_huge_zeropage();
1759 
1760 	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
1761 		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
1762 		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
1763 
1764 	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
1765 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
1766 	if (pagemap_fd < 0)
1767 		ksft_exit_fail_msg("opening pagemap failed\n");
1768 
1769 	run_anon_test_cases();
1770 	run_anon_thp_test_cases();
1771 	run_non_anon_test_cases();
1772 
1773 	if (pmdsize) {
1774 		/* Only if THP is supported. */
1775 		thp_restore_settings();
1776 	}
1777 
1778 	err = ksft_get_fail_cnt();
1779 	if (err)
1780 		ksft_exit_fail_msg("%d out of %d tests failed\n",
1781 				   err, ksft_test_num());
1782 	return ksft_exit_pass();
1783 }
1784