xref: /linux/tools/testing/selftests/mm/cow.c (revision 81b1e3f91d77564611ab10d2c61774cf6a46ec78)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <assert.h>
18 #include <sys/mman.h>
19 #include <sys/ioctl.h>
20 #include <sys/wait.h>
21 #include <linux/memfd.h>
22 
23 #include "local_config.h"
24 #ifdef LOCAL_CONFIG_HAVE_LIBURING
25 #include <liburing.h>
26 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
27 
28 #include "../../../../mm/gup_test.h"
29 #include "../kselftest.h"
30 #include "vm_util.h"
31 
/*
 * Fallbacks for madvise() advice values possibly missing from older system
 * headers; the values match include/uapi/asm-generic/mman-common.h.
 */
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif
#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif
38 
static size_t pagesize;		/* Base page size; presumably set from sysconf() during init — TODO confirm. */
static int pagemap_fd;		/* FD used with the pagemap_*() helpers; presumably /proc/self/pagemap. */
static size_t thpsize;		/* THP size in bytes; 0 means THP is unavailable (checked by callers). */
static int nr_hugetlbsizes;	/* Number of valid entries in hugetlbsizes[]. */
static size_t hugetlbsizes[10];	/* Detected hugetlb page sizes, in bytes. */
static int gup_fd;		/* FD of the gup_test debugfs interface; < 0 if unavailable. */
static bool has_huge_zeropage;	/* Set by detect_huge_zeropage(). */
46 
47 static void detect_huge_zeropage(void)
48 {
49 	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
50 		      O_RDONLY);
51 	size_t enabled = 0;
52 	char buf[15];
53 	int ret;
54 
55 	if (fd < 0)
56 		return;
57 
58 	ret = pread(fd, buf, sizeof(buf), 0);
59 	if (ret > 0 && ret < sizeof(buf)) {
60 		buf[ret] = 0;
61 
62 		enabled = strtoul(buf, NULL, 10);
63 		if (enabled == 1) {
64 			has_huge_zeropage = true;
65 			ksft_print_msg("[INFO] huge zeropage is enabled\n");
66 		}
67 	}
68 
69 	close(fd);
70 }
71 
72 static bool range_is_swapped(void *addr, size_t size)
73 {
74 	for (; size; addr += pagesize, size -= pagesize)
75 		if (!pagemap_is_swapped(pagemap_fd, addr))
76 			return false;
77 	return true;
78 }
79 
/*
 * Pipe pair for parent/child handshaking around fork(): the child signals
 * readiness via child_ready, the parent tells the child to proceed via
 * parent_ready.
 */
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};
84 
85 static int setup_comm_pipes(struct comm_pipes *comm_pipes)
86 {
87 	if (pipe(comm_pipes->child_ready) < 0)
88 		return -errno;
89 	if (pipe(comm_pipes->parent_ready) < 0) {
90 		close(comm_pipes->child_ready[0]);
91 		close(comm_pipes->child_ready[1]);
92 		return -errno;
93 	}
94 
95 	return 0;
96 }
97 
98 static void close_comm_pipes(struct comm_pipes *comm_pipes)
99 {
100 	close(comm_pipes->child_ready[0]);
101 	close(comm_pipes->child_ready[1]);
102 	close(comm_pipes->parent_ready[0]);
103 	close(comm_pipes->parent_ready[1]);
104 }
105 
106 static int child_memcmp_fn(char *mem, size_t size,
107 			   struct comm_pipes *comm_pipes)
108 {
109 	char *old = malloc(size);
110 	char buf;
111 
112 	/* Backup the original content. */
113 	memcpy(old, mem, size);
114 
115 	/* Wait until the parent modified the page. */
116 	write(comm_pipes->child_ready[1], "0", 1);
117 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
118 		;
119 
120 	/* See if we still read the old values. */
121 	return memcmp(old, mem, size);
122 }
123 
124 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
125 				    struct comm_pipes *comm_pipes)
126 {
127 	struct iovec iov = {
128 		.iov_base = mem,
129 		.iov_len = size,
130 	};
131 	ssize_t cur, total, transferred;
132 	char *old, *new;
133 	int fds[2];
134 	char buf;
135 
136 	old = malloc(size);
137 	new = malloc(size);
138 
139 	/* Backup the original content. */
140 	memcpy(old, mem, size);
141 
142 	if (pipe(fds) < 0)
143 		return -errno;
144 
145 	/* Trigger a read-only pin. */
146 	transferred = vmsplice(fds[1], &iov, 1, 0);
147 	if (transferred < 0)
148 		return -errno;
149 	if (transferred == 0)
150 		return -EINVAL;
151 
152 	/* Unmap it from our page tables. */
153 	if (munmap(mem, size) < 0)
154 		return -errno;
155 
156 	/* Wait until the parent modified it. */
157 	write(comm_pipes->child_ready[1], "0", 1);
158 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
159 		;
160 
161 	/* See if we still read the old values via the pipe. */
162 	for (total = 0; total < transferred; total += cur) {
163 		cur = read(fds[0], new + total, transferred - total);
164 		if (cur < 0)
165 			return -errno;
166 	}
167 
168 	return memcmp(old, new, transferred);
169 }
170 
171 typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
172 
/*
 * Fork a child running @fn and verify that modifying @mem in the parent
 * does not leak into the child (COW must be broken on the parent's write).
 * With @do_mprotect, first run mprotect(PROT_READ) +
 * mprotect(PROT_READ|PROT_WRITE) in the parent to exercise possible
 * mprotect() write-upgrade optimizations. Reports via ksft_test_result*().
 */
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		/* Child: exit code carries the child's memcmp()/error result. */
		exit(fn(mem, size, &comm_pipes));
	}

	/* Wait until the child backed up the memory content. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			/* Unblock and reap the child before bailing out. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
226 
/* Plain COW test: parent writes after fork(); child must not observe it. */
static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}
231 
/* COW test with the mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE) sequence in the parent. */
static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}
236 
/* vmsplice() R/O pin + unmap in the child; parent write must not leak into the pipe. */
static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}
241 
/* vmsplice()-in-child test with the mprotect() optimization sequence in the parent. */
static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}
246 
247 static void do_test_vmsplice_in_parent(char *mem, size_t size,
248 				       bool before_fork)
249 {
250 	struct iovec iov = {
251 		.iov_base = mem,
252 		.iov_len = size,
253 	};
254 	ssize_t cur, total, transferred;
255 	struct comm_pipes comm_pipes;
256 	char *old, *new;
257 	int ret, fds[2];
258 	char buf;
259 
260 	old = malloc(size);
261 	new = malloc(size);
262 
263 	memcpy(old, mem, size);
264 
265 	ret = setup_comm_pipes(&comm_pipes);
266 	if (ret) {
267 		ksft_test_result_fail("pipe() failed\n");
268 		goto free;
269 	}
270 
271 	if (pipe(fds) < 0) {
272 		ksft_test_result_fail("pipe() failed\n");
273 		goto close_comm_pipes;
274 	}
275 
276 	if (before_fork) {
277 		transferred = vmsplice(fds[1], &iov, 1, 0);
278 		if (transferred <= 0) {
279 			ksft_test_result_fail("vmsplice() failed\n");
280 			goto close_pipe;
281 		}
282 	}
283 
284 	ret = fork();
285 	if (ret < 0) {
286 		ksft_test_result_fail("fork() failed\n");
287 		goto close_pipe;
288 	} else if (!ret) {
289 		write(comm_pipes.child_ready[1], "0", 1);
290 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
291 			;
292 		/* Modify page content in the child. */
293 		memset(mem, 0xff, size);
294 		exit(0);
295 	}
296 
297 	if (!before_fork) {
298 		transferred = vmsplice(fds[1], &iov, 1, 0);
299 		if (transferred <= 0) {
300 			ksft_test_result_fail("vmsplice() failed\n");
301 			wait(&ret);
302 			goto close_pipe;
303 		}
304 	}
305 
306 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
307 		;
308 	if (munmap(mem, size) < 0) {
309 		ksft_test_result_fail("munmap() failed\n");
310 		goto close_pipe;
311 	}
312 	write(comm_pipes.parent_ready[1], "0", 1);
313 
314 	/* Wait until the child is done writing. */
315 	wait(&ret);
316 	if (!WIFEXITED(ret)) {
317 		ksft_test_result_fail("wait() failed\n");
318 		goto close_pipe;
319 	}
320 
321 	/* See if we still read the old values. */
322 	for (total = 0; total < transferred; total += cur) {
323 		cur = read(fds[0], new + total, transferred - total);
324 		if (cur < 0) {
325 			ksft_test_result_fail("read() failed\n");
326 			goto close_pipe;
327 		}
328 	}
329 
330 	ksft_test_result(!memcmp(old, new, transferred),
331 			 "No leak from child into parent\n");
332 close_pipe:
333 	close(fds[0]);
334 	close(fds[1]);
335 close_comm_pipes:
336 	close_comm_pipes(&comm_pipes);
337 free:
338 	free(old);
339 	free(new);
340 }
341 
/* vmsplice() pin taken before fork(); child write must not leak into the pipe. */
static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}
346 
/* vmsplice() pin taken after fork(); child write must not leak into the pipe. */
static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}
351 
352 #ifdef LOCAL_CONFIG_HAVE_LIBURING
/*
 * Register @mem as an io_uring fixed buffer (a FOLL_WRITE | FOLL_PIN |
 * FOLL_LONGTERM pin), then either fork() a child that stays alive for the
 * duration (@use_fork), or re-map the page R/O and back R/W via mprotect().
 * After modifying the memory, write it through the fixed-buffer pin into a
 * temporary file and verify the pin observed the modification, i.e., that
 * the longterm R/W pin is reliable. Skips when io_uring setup/registration
 * fails (missing kernel support or MEMLOCK limits).
 */
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			/*
			 * NOTE(review): buf is a char *, so &buf reads the sync
			 * byte into the pointer variable's storage; works, but
			 * a plain char would be clearer.
			 */
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
503 
/* iouring fixed-buffer pin with the page R/O-mapped via mprotect(). */
static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}
508 
/* iouring fixed-buffer pin with a forked child kept alive during the test. */
static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}
513 
514 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
515 
/* Page state do_test_ro_pin() establishes before taking the R/O pin. */
enum ro_pin_test {
	RO_PIN_TEST,			/* No special preparation. */
	RO_PIN_TEST_SHARED,		/* Pages COW-shared with a live child. */
	RO_PIN_TEST_PREVIOUSLY_SHARED,	/* Pages were shared; child already quit. */
	RO_PIN_TEST_RO_EXCLUSIVE,	/* Exclusive pages mapped R/O via mprotect(). */
};
522 
/*
 * Take a longterm R/O pin on @mem via the gup_test interface (gup_fd),
 * after preparing the pages according to @test: COW-shared with a child,
 * previously shared, or exclusive but R/O-mapped. Then modify the memory
 * via the page table and verify the modification is visible through the
 * pin — i.e., taking the pin must have triggered unsharing. With @fast,
 * the kernel uses GUP-fast. Skips if gup_test is unavailable.
 */
static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		/* EINVAL indicates missing support rather than a failure. */
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		/* The child is still waiting; release and reap it. */
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}
652 
/* R/O longterm pin on pages COW-shared with a live child. */
static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}
657 
/* Same as test_ro_pin_on_shared(), but using GUP-fast. */
static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}
662 
/* R/O longterm pin on exclusive, R/O-mapped pages that were previously shared. */
static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}
667 
/* Same as test_ro_pin_on_ro_previously_shared(), but using GUP-fast. */
static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}
672 
/* R/O longterm pin on exclusive pages mapped R/O via mprotect(). */
static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}
677 
/* Same as test_ro_pin_on_ro_exclusive(), but using GUP-fast. */
static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}
682 
683 typedef void (*test_fn)(char *mem, size_t size);
684 
685 static void do_run_with_base_page(test_fn fn, bool swapout)
686 {
687 	char *mem;
688 	int ret;
689 
690 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
691 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
692 	if (mem == MAP_FAILED) {
693 		ksft_test_result_fail("mmap() failed\n");
694 		return;
695 	}
696 
697 	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
698 	/* Ignore if not around on a kernel. */
699 	if (ret && errno != EINVAL) {
700 		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
701 		goto munmap;
702 	}
703 
704 	/* Populate a base page. */
705 	memset(mem, 0, pagesize);
706 
707 	if (swapout) {
708 		madvise(mem, pagesize, MADV_PAGEOUT);
709 		if (!pagemap_is_swapped(pagemap_fd, mem)) {
710 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
711 			goto munmap;
712 		}
713 	}
714 
715 	fn(mem, pagesize);
716 munmap:
717 	munmap(mem, pagesize);
718 }
719 
/* Run @fn on a resident base page. */
static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}
725 
/* Run @fn on a swapped-out base page. */
static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}
731 
/* How do_run_with_thp() prepares the THP before running the test. */
enum thp_run {
	THP_RUN_PMD,			/* PMD-mapped THP. */
	THP_RUN_PMD_SWAPOUT,		/* PMD-mapped THP, then swapped out. */
	THP_RUN_PTE,			/* PTE-mapped THP. */
	THP_RUN_PTE_SWAPOUT,		/* PTE-mapped THP, then swapped out. */
	THP_RUN_SINGLE_PTE,		/* Single remaining PTE of a THP. */
	THP_RUN_SINGLE_PTE_SWAPOUT,	/* Single remaining PTE, swapped out. */
	THP_RUN_PARTIAL_MREMAP,		/* Half of the THP mremap()'ed elsewhere. */
	THP_RUN_PARTIAL_SHARED,		/* THP only partially shared with a child. */
};
742 
743 static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
744 {
745 	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
746 	size_t size, mmap_size, mremap_size;
747 	int ret;
748 
749 	/* For alignment purposes, we need twice the thp size. */
750 	mmap_size = 2 * thpsize;
751 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
752 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
753 	if (mmap_mem == MAP_FAILED) {
754 		ksft_test_result_fail("mmap() failed\n");
755 		return;
756 	}
757 
758 	/* We need a THP-aligned memory area. */
759 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
760 
761 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
762 	if (ret) {
763 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
764 		goto munmap;
765 	}
766 
767 	/*
768 	 * Try to populate a THP. Touch the first sub-page and test if we get
769 	 * another sub-page populated automatically.
770 	 */
771 	mem[0] = 0;
772 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
773 		ksft_test_result_skip("Did not get a THP populated\n");
774 		goto munmap;
775 	}
776 	memset(mem, 0, thpsize);
777 
778 	size = thpsize;
779 	switch (thp_run) {
780 	case THP_RUN_PMD:
781 	case THP_RUN_PMD_SWAPOUT:
782 		break;
783 	case THP_RUN_PTE:
784 	case THP_RUN_PTE_SWAPOUT:
785 		/*
786 		 * Trigger PTE-mapping the THP by temporarily mapping a single
787 		 * subpage R/O.
788 		 */
789 		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
790 		if (ret) {
791 			ksft_test_result_fail("mprotect() failed\n");
792 			goto munmap;
793 		}
794 		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
795 		if (ret) {
796 			ksft_test_result_fail("mprotect() failed\n");
797 			goto munmap;
798 		}
799 		break;
800 	case THP_RUN_SINGLE_PTE:
801 	case THP_RUN_SINGLE_PTE_SWAPOUT:
802 		/*
803 		 * Discard all but a single subpage of that PTE-mapped THP. What
804 		 * remains is a single PTE mapping a single subpage.
805 		 */
806 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
807 		if (ret) {
808 			ksft_test_result_fail("MADV_DONTNEED failed\n");
809 			goto munmap;
810 		}
811 		size = pagesize;
812 		break;
813 	case THP_RUN_PARTIAL_MREMAP:
814 		/*
815 		 * Remap half of the THP. We need some new memory location
816 		 * for that.
817 		 */
818 		mremap_size = thpsize / 2;
819 		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
820 				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
821 		if (mem == MAP_FAILED) {
822 			ksft_test_result_fail("mmap() failed\n");
823 			goto munmap;
824 		}
825 		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
826 			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
827 		if (tmp != mremap_mem) {
828 			ksft_test_result_fail("mremap() failed\n");
829 			goto munmap;
830 		}
831 		size = mremap_size;
832 		break;
833 	case THP_RUN_PARTIAL_SHARED:
834 		/*
835 		 * Share the first page of the THP with a child and quit the
836 		 * child. This will result in some parts of the THP never
837 		 * have been shared.
838 		 */
839 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
840 		if (ret) {
841 			ksft_test_result_fail("MADV_DONTFORK failed\n");
842 			goto munmap;
843 		}
844 		ret = fork();
845 		if (ret < 0) {
846 			ksft_test_result_fail("fork() failed\n");
847 			goto munmap;
848 		} else if (!ret) {
849 			exit(0);
850 		}
851 		wait(&ret);
852 		/* Allow for sharing all pages again. */
853 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
854 		if (ret) {
855 			ksft_test_result_fail("MADV_DOFORK failed\n");
856 			goto munmap;
857 		}
858 		break;
859 	default:
860 		assert(false);
861 	}
862 
863 	switch (thp_run) {
864 	case THP_RUN_PMD_SWAPOUT:
865 	case THP_RUN_PTE_SWAPOUT:
866 	case THP_RUN_SINGLE_PTE_SWAPOUT:
867 		madvise(mem, size, MADV_PAGEOUT);
868 		if (!range_is_swapped(mem, size)) {
869 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
870 			goto munmap;
871 		}
872 		break;
873 	default:
874 		break;
875 	}
876 
877 	fn(mem, size);
878 munmap:
879 	munmap(mmap_mem, mmap_size);
880 	if (mremap_mem != MAP_FAILED)
881 		munmap(mremap_mem, mremap_size);
882 }
883 
/* Run @fn on a PMD-mapped THP. */
static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}
889 
/* Run @fn on a swapped-out, PMD-mapped THP. */
static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}
895 
/* Run @fn on a PTE-mapped THP. */
static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}
901 
/* Run @fn on a swapped-out, PTE-mapped THP. */
static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}
907 
/* Run @fn on the single remaining PTE of a mostly-discarded THP. */
static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}
913 
/* Run @fn on the single remaining, swapped-out PTE of a THP. */
static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}
919 
/* Run @fn on half of a THP that was mremap()'ed to a new location. */
static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}
925 
/* Run @fn on a THP of which only the first page was ever COW-shared. */
static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}
931 
/*
 * Run @fn on a freshly populated hugetlb page of @hugetlbsize bytes.
 * Skips (rather than fails) when not enough free huge pages are available.
 */
static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	/* Encode log2(hugetlbsize) into the mmap() flags (MAP_HUGE_SHIFT). */
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate an huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}
966 
/* A named COW test, run across all page types by run_anon_test_case(). */
struct test_case {
	const char *desc;	/* Human-readable description printed with [RUN]. */
	test_fn fn;		/* Test body; receives the prepared memory area. */
};
971 
972 /*
973  * Test cases that are specific to anonymous pages: pages in private mappings
974  * that may get shared via COW during fork().
975  */
/* Keep the per-case run count in sync with tests_per_anon_test_case(). */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we miss to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we miss to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we miss to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};
1093 
1094 static void run_anon_test_case(struct test_case const *test_case)
1095 {
1096 	int i;
1097 
1098 	run_with_base_page(test_case->fn, test_case->desc);
1099 	run_with_base_page_swap(test_case->fn, test_case->desc);
1100 	if (thpsize) {
1101 		run_with_thp(test_case->fn, test_case->desc);
1102 		run_with_thp_swap(test_case->fn, test_case->desc);
1103 		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
1104 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
1105 		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
1106 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
1107 		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
1108 		run_with_partial_shared_thp(test_case->fn, test_case->desc);
1109 	}
1110 	for (i = 0; i < nr_hugetlbsizes; i++)
1111 		run_with_hugetlb(test_case->fn, test_case->desc,
1112 				 hugetlbsizes[i]);
1113 }
1114 
/* Run every entry of anon_test_cases[] across all page types. */
static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}
1124 
1125 static int tests_per_anon_test_case(void)
1126 {
1127 	int tests = 2 + nr_hugetlbsizes;
1128 
1129 	if (thpsize)
1130 		tests += 8;
1131 	return tests;
1132 }
1133 
/*
 * Variants of the THP-collapse COW test: which part of the PTE-mapped THP
 * remains COW-shared with the child when MADV_COLLAPSE is attempted.
 */
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,		/* Collapse before fork(). */
	ANON_THP_COLLAPSE_FULLY_SHARED,		/* Whole THP COW-shared. */
	ANON_THP_COLLAPSE_LOWER_SHARED,		/* Only lower half shared. */
	ANON_THP_COLLAPSE_UPPER_SHARED,		/* Only upper half shared. */
};
1140 
/*
 * COW test combining fork() with MADV_COLLAPSE: depending on @test, the
 * PTE-mapped THP at @mem (of @size bytes) is collapsed either before fork()
 * or after COW-sharing all or part of it with a child.  After the parent
 * modifies the memory, the child must still see the original content -- any
 * difference means exclusivity information was lost during the collapse.
 */
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	/* Restore R/W so the parent can later modify the range. */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		/*
		 * Child: verify (only) the still COW-shared part of the range
		 * via child_memcmp_fn() (defined earlier in this file); its
		 * exit status becomes the test result below.
		 */
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	/* Parent: wait until the child signals readiness over the pipe. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Already collapsed before fork(); nothing more to do. */
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	/* Unblock the child; write() return intentionally ignored here. */
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
1273 
/* Collapse the THP before fork(), i.e. before any COW-sharing. */
static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}
1278 
/* Collapse after COW-sharing the complete THP with the child. */
static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}
1283 
/* Collapse after COW-sharing only the lower half of the THP. */
static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}
1288 
/* Collapse after COW-sharing only the upper half of the THP. */
static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}
1293 
1294 /*
1295  * Test cases that are specific to anonymous THP: pages in private mappings
1296  * that may get shared via COW during fork().
1297  */
1298 static const struct test_case anon_thp_test_cases[] = {
1299 	/*
1300 	 * Basic COW test for fork() without any GUP when collapsing a THP
1301 	 * before fork().
1302 	 *
1303 	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1304 	 * collapse") might easily get COW handling wrong when not collapsing
1305 	 * exclusivity information properly.
1306 	 */
1307 	{
1308 		"Basic COW after fork() when collapsing before fork()",
1309 		test_anon_thp_collapse_unshared,
1310 	},
1311 	/* Basic COW test, but collapse after COW-sharing a full THP. */
1312 	{
1313 		"Basic COW after fork() when collapsing after fork() (fully shared)",
1314 		test_anon_thp_collapse_fully_shared,
1315 	},
1316 	/*
1317 	 * Basic COW test, but collapse after COW-sharing the lower half of a
1318 	 * THP.
1319 	 */
1320 	{
1321 		"Basic COW after fork() when collapsing after fork() (lower shared)",
1322 		test_anon_thp_collapse_lower_shared,
1323 	},
1324 	/*
1325 	 * Basic COW test, but collapse after COW-sharing the upper half of a
1326 	 * THP.
1327 	 */
1328 	{
1329 		"Basic COW after fork() when collapsing after fork() (upper shared)",
1330 		test_anon_thp_collapse_upper_shared,
1331 	},
1332 };
1333 
1334 static void run_anon_thp_test_cases(void)
1335 {
1336 	int i;
1337 
1338 	if (!thpsize)
1339 		return;
1340 
1341 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1342 
1343 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1344 		struct test_case const *test_case = &anon_thp_test_cases[i];
1345 
1346 		ksft_print_msg("[RUN] %s\n", test_case->desc);
1347 		do_run_with_thp(test_case->fn, THP_RUN_PMD);
1348 	}
1349 }
1350 
1351 static int tests_per_anon_thp_test_case(void)
1352 {
1353 	return thpsize ? 1 : 0;
1354 }
1355 
/*
 * Non-anonymous test body: @mem is a R/W private mapping and @smem a second,
 * R/O mapping of the same backing page(s); @size is the mapped length.
 */
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
1357 
/*
 * Verify that writing through the private mapping (@mem) does not leak into
 * the other mapping (@smem) of the same backing page(s): COW must be broken
 * by copying, not by modifying the shared page in place.
 */
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Fix: the previous code dereferenced a NULL buffer on OOM. */
	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}
1373 
/* R/O longterm pin on @mem; @smem is unused by this test. */
static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}
1378 
/* Same as test_ro_pin(), but using GUP-fast; @smem is unused. */
static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}
1383 
1384 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1385 {
1386 	char *mem, *smem, tmp;
1387 
1388 	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
1389 
1390 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1391 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1392 	if (mem == MAP_FAILED) {
1393 		ksft_test_result_fail("mmap() failed\n");
1394 		return;
1395 	}
1396 
1397 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1398 	if (mem == MAP_FAILED) {
1399 		ksft_test_result_fail("mmap() failed\n");
1400 		goto munmap;
1401 	}
1402 
1403 	/* Read from the page to populate the shared zeropage. */
1404 	tmp = *mem + *smem;
1405 	asm volatile("" : "+r" (tmp));
1406 
1407 	fn(mem, smem, pagesize);
1408 munmap:
1409 	munmap(mem, pagesize);
1410 	if (smem != MAP_FAILED)
1411 		munmap(smem, pagesize);
1412 }
1413 
/*
 * Run @fn against the huge shared zeropage.  Requires the huge zeropage to
 * be enabled (detect_huge_zeropage()) and a detected THP size; skips
 * otherwise.  The mmap/madvise/fault sequence below is order-sensitive:
 * MADV_HUGEPAGE must precede the populating read.
 */
static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, thpsize);
munmap:
	/* Unmap the full (over-allocated) areas, not the aligned sub-ranges. */
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}
1472 
1473 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1474 {
1475 	char *mem, *smem, tmp;
1476 	int fd;
1477 
1478 	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
1479 
1480 	fd = memfd_create("test", 0);
1481 	if (fd < 0) {
1482 		ksft_test_result_fail("memfd_create() failed\n");
1483 		return;
1484 	}
1485 
1486 	/* File consists of a single page filled with zeroes. */
1487 	if (fallocate(fd, 0, 0, pagesize)) {
1488 		ksft_test_result_fail("fallocate() failed\n");
1489 		goto close;
1490 	}
1491 
1492 	/* Create a private mapping of the memfd. */
1493 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1494 	if (mem == MAP_FAILED) {
1495 		ksft_test_result_fail("mmap() failed\n");
1496 		goto close;
1497 	}
1498 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1499 	if (mem == MAP_FAILED) {
1500 		ksft_test_result_fail("mmap() failed\n");
1501 		goto munmap;
1502 	}
1503 
1504 	/* Fault the page in. */
1505 	tmp = *mem + *smem;
1506 	asm volatile("" : "+r" (tmp));
1507 
1508 	fn(mem, smem, pagesize);
1509 munmap:
1510 	munmap(mem, pagesize);
1511 	if (smem != MAP_FAILED)
1512 		munmap(smem, pagesize);
1513 close:
1514 	close(fd);
1515 }
1516 
1517 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1518 {
1519 	char *mem, *smem, tmp;
1520 	FILE *file;
1521 	int fd;
1522 
1523 	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
1524 
1525 	file = tmpfile();
1526 	if (!file) {
1527 		ksft_test_result_fail("tmpfile() failed\n");
1528 		return;
1529 	}
1530 
1531 	fd = fileno(file);
1532 	if (fd < 0) {
1533 		ksft_test_result_skip("fileno() failed\n");
1534 		return;
1535 	}
1536 
1537 	/* File consists of a single page filled with zeroes. */
1538 	if (fallocate(fd, 0, 0, pagesize)) {
1539 		ksft_test_result_fail("fallocate() failed\n");
1540 		goto close;
1541 	}
1542 
1543 	/* Create a private mapping of the memfd. */
1544 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1545 	if (mem == MAP_FAILED) {
1546 		ksft_test_result_fail("mmap() failed\n");
1547 		goto close;
1548 	}
1549 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1550 	if (mem == MAP_FAILED) {
1551 		ksft_test_result_fail("mmap() failed\n");
1552 		goto munmap;
1553 	}
1554 
1555 	/* Fault the page in. */
1556 	tmp = *mem + *smem;
1557 	asm volatile("" : "+r" (tmp));
1558 
1559 	fn(mem, smem, pagesize);
1560 munmap:
1561 	munmap(mem, pagesize);
1562 	if (smem != MAP_FAILED)
1563 		munmap(smem, pagesize);
1564 close:
1565 	fclose(file);
1566 }
1567 
1568 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1569 				   size_t hugetlbsize)
1570 {
1571 	int flags = MFD_HUGETLB;
1572 	char *mem, *smem, tmp;
1573 	int fd;
1574 
1575 	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
1576 		       hugetlbsize / 1024);
1577 
1578 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1579 
1580 	fd = memfd_create("test", flags);
1581 	if (fd < 0) {
1582 		ksft_test_result_skip("memfd_create() failed\n");
1583 		return;
1584 	}
1585 
1586 	/* File consists of a single page filled with zeroes. */
1587 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1588 		ksft_test_result_skip("need more free huge pages\n");
1589 		goto close;
1590 	}
1591 
1592 	/* Create a private mapping of the memfd. */
1593 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1594 		   0);
1595 	if (mem == MAP_FAILED) {
1596 		ksft_test_result_skip("need more free huge pages\n");
1597 		goto close;
1598 	}
1599 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1600 	if (mem == MAP_FAILED) {
1601 		ksft_test_result_fail("mmap() failed\n");
1602 		goto munmap;
1603 	}
1604 
1605 	/* Fault the page in. */
1606 	tmp = *mem + *smem;
1607 	asm volatile("" : "+r" (tmp));
1608 
1609 	fn(mem, smem, hugetlbsize);
1610 munmap:
1611 	munmap(mem, hugetlbsize);
1612 	if (mem != MAP_FAILED)
1613 		munmap(smem, hugetlbsize);
1614 close:
1615 	close(fd);
1616 }
1617 
/* A non-anonymous-memory test case: label plus the function to run. */
struct non_anon_test_case {
	const char *desc;	/* Human-readable description for log output. */
	non_anon_test_fn fn;	/* Test body, invoked by the run_with_*() helpers. */
};
1622 
1623 /*
1624  * Test cases that target any pages in private mappings that are not anonymous:
1625  * pages that may get shared via COW ndependent of fork(). This includes
1626  * the shared zeropage(s), pagecache pages, ...
1627  */
1628 static const struct non_anon_test_case non_anon_test_cases[] = {
1629 	/*
1630 	 * Basic COW test without any GUP. If we miss to break COW, changes are
1631 	 * visible via other private/shared mappings.
1632 	 */
1633 	{
1634 		"Basic COW",
1635 		test_cow,
1636 	},
1637 	/*
1638 	 * Take a R/O longterm pin. When modifying the page via the page table,
1639 	 * the page content change must be visible via the pin.
1640 	 */
1641 	{
1642 		"R/O longterm GUP pin",
1643 		test_ro_pin,
1644 	},
1645 	/* Same as above, but using GUP-fast. */
1646 	{
1647 		"R/O longterm GUP-fast pin",
1648 		test_ro_fast_pin,
1649 	},
1650 };
1651 
1652 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1653 {
1654 	int i;
1655 
1656 	run_with_zeropage(test_case->fn, test_case->desc);
1657 	run_with_memfd(test_case->fn, test_case->desc);
1658 	run_with_tmpfile(test_case->fn, test_case->desc);
1659 	if (thpsize)
1660 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1661 	for (i = 0; i < nr_hugetlbsizes; i++)
1662 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1663 				       hugetlbsizes[i]);
1664 }
1665 
1666 static void run_non_anon_test_cases(void)
1667 {
1668 	int i;
1669 
1670 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1671 
1672 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1673 		run_non_anon_test_case(&non_anon_test_cases[i]);
1674 }
1675 
1676 static int tests_per_non_anon_test_case(void)
1677 {
1678 	int tests = 3 + nr_hugetlbsizes;
1679 
1680 	if (thpsize)
1681 		tests += 1;
1682 	return tests;
1683 }
1684 
/*
 * Detect the environment (page/THP/hugetlb sizes, huge zeropage), announce
 * the ksft plan, open the needed fds, run all test groups and report.
 */
int main(int argc, char **argv)
{
	int err;

	pagesize = getpagesize();
	thpsize = read_pmd_pagesize();
	if (thpsize)
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
			       thpsize / 1024);
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_print_header();
	/*
	 * The plan must match exactly what the run_*() functions below emit;
	 * the tests_per_*() helpers mirror the per-case run counts.
	 */
	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	/*
	 * NOTE(review): gup_fd is deliberately not checked here; presumably
	 * the GUP-based tests skip when it is < 0 -- confirm against the
	 * do_test_*_pin helpers.  pagemap is mandatory for all tests.
	 */
	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}
1718