xref: /linux/tools/testing/selftests/mm/uffd-unit-tests.c (revision beace86e61e465dba204a268ab3f3377153a4973)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd unit tests.
4  *
5  *  Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
10 #include "../../../../mm/gup_test.h"
11 
12 #ifdef __NR_userfaultfd
13 
/*
 * Size used for the test memory.  The unit tests need neither a large nor
 * a randomized size, so a fixed 32MB is used for now.
 */
#define UFFD_TEST_MEM_SIZE	(32UL << 20)

/* One bit per memory type; the bits are OR-ed together, see MEM_ALL. */
#define MEM_ANON		BIT_ULL(0)
#define MEM_SHMEM		BIT_ULL(1)
#define MEM_SHMEM_PRIVATE	BIT_ULL(2)
#define MEM_HUGETLB		BIT_ULL(3)
#define MEM_HUGETLB_PRIVATE	BIT_ULL(4)

/* Every memory type defined above. */
#define MEM_ALL							\
	(MEM_ANON | MEM_SHMEM | MEM_SHMEM_PRIVATE |		\
	 MEM_HUGETLB | MEM_HUGETLB_PRIVATE)
25 
/*
 * Round x up to a multiple of align_to and cast the result back to the
 * type of x.  The mask arithmetic only rounds correctly when align_to is a
 * power of two.
 */
#define ALIGN_UP(x, align_to)						\
	((__typeof__(x))						\
	 ((((unsigned long)(x)) + ((align_to) - 1)) & ~((align_to) - 1)))

/* Larger of a and b.  Each argument may be evaluated more than once. */
#define MAX(a, b)	((a) > (b) ? (a) : (b))
30 
31 struct mem_type {
32 	const char *name;
33 	unsigned int mem_flag;
34 	uffd_test_ops_t *mem_ops;
35 	bool shared;
36 };
37 typedef struct mem_type mem_type_t;
38 
39 mem_type_t mem_types[] = {
40 	{
41 		.name = "anon",
42 		.mem_flag = MEM_ANON,
43 		.mem_ops = &anon_uffd_test_ops,
44 		.shared = false,
45 	},
46 	{
47 		.name = "shmem",
48 		.mem_flag = MEM_SHMEM,
49 		.mem_ops = &shmem_uffd_test_ops,
50 		.shared = true,
51 	},
52 	{
53 		.name = "shmem-private",
54 		.mem_flag = MEM_SHMEM_PRIVATE,
55 		.mem_ops = &shmem_uffd_test_ops,
56 		.shared = false,
57 	},
58 	{
59 		.name = "hugetlb",
60 		.mem_flag = MEM_HUGETLB,
61 		.mem_ops = &hugetlb_uffd_test_ops,
62 		.shared = true,
63 	},
64 	{
65 		.name = "hugetlb-private",
66 		.mem_flag = MEM_HUGETLB_PRIVATE,
67 		.mem_ops = &hugetlb_uffd_test_ops,
68 		.shared = false,
69 	},
70 };
71 
72 /* Arguments to be passed over to each uffd unit test */
73 struct uffd_test_args {
74 	mem_type_t *mem_type;
75 };
76 typedef struct uffd_test_args uffd_test_args_t;
77 
78 /* Returns: UFFD_TEST_* */
79 typedef void (*uffd_test_fn)(uffd_test_args_t *);
80 
81 typedef struct {
82 	const char *name;
83 	uffd_test_fn uffd_fn;
84 	unsigned int mem_targets;
85 	uint64_t uffd_feature_required;
86 	uffd_test_case_ops_t *test_case_ops;
87 } uffd_test_case_t;
88 
/* Print a one-line summary of the kselftest pass/skip/fail/total counters. */
static void uffd_test_report(void)
{
	const unsigned int passed = ksft_get_pass_cnt();
	const unsigned int skipped = ksft_get_xskip_cnt();
	const unsigned int failed = ksft_get_fail_cnt();
	const unsigned int total = ksft_test_num();

	printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n",
	       passed, skipped, failed, total);
}
97 
/* Finish the current test line with "done" and count the test as passed. */
static void uffd_test_pass(void)
{
	puts("done");
	ksft_inc_pass_cnt();
}
103 
/*
 * Announce a test: prints "Testing ", then the printf-style arguments,
 * then "... " without a newline, and flushes stdout.
 */
#define uffd_test_start(...)				\
	do {						\
		fputs("Testing ", stdout);		\
		printf(__VA_ARGS__);			\
		fputs("... ", stdout);			\
		fflush(stdout);				\
	} while (0)
110 
/*
 * Report a failed test: prints "failed [reason: <printf-style message>]"
 * followed by a newline, and bumps the kselftest failure counter.
 */
#define uffd_test_fail(...)				\
	do {						\
		fputs("failed [reason: ", stdout);	\
		printf(__VA_ARGS__);			\
		fputs("]\n", stdout);			\
		ksft_inc_fail_cnt();			\
	} while (0)
117 
/* Report a skipped test together with its reason and count it as skipped. */
static void uffd_test_skip(const char *message)
{
	fputs("skipped [reason: ", stdout);
	printf("%s]\n", message);
	ksft_inc_xskip_cnt();
}
123 
124 /*
125  * Returns 1 if specific userfaultfd supported, 0 otherwise.  Note, we'll
126  * return 1 even if some test failed as long as uffd supported, because in
127  * that case we still want to proceed with the rest uffd unit tests.
128  */
129 static int test_uffd_api(bool use_dev)
130 {
131 	struct uffdio_api uffdio_api;
132 	int uffd;
133 
134 	uffd_test_start("UFFDIO_API (with %s)",
135 			use_dev ? "/dev/userfaultfd" : "syscall");
136 
137 	if (use_dev)
138 		uffd = uffd_open_dev(UFFD_FLAGS);
139 	else
140 		uffd = uffd_open_sys(UFFD_FLAGS);
141 	if (uffd < 0) {
142 		uffd_test_skip("cannot open userfaultfd handle");
143 		return 0;
144 	}
145 
146 	/* Test wrong UFFD_API */
147 	uffdio_api.api = 0xab;
148 	uffdio_api.features = 0;
149 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
150 		uffd_test_fail("UFFDIO_API should fail with wrong api but didn't");
151 		goto out;
152 	}
153 
154 	/* Test wrong feature bit */
155 	uffdio_api.api = UFFD_API;
156 	uffdio_api.features = BIT_ULL(63);
157 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
158 		uffd_test_fail("UFFDIO_API should fail with wrong feature but didn't");
159 		goto out;
160 	}
161 
162 	/* Test normal UFFDIO_API */
163 	uffdio_api.api = UFFD_API;
164 	uffdio_api.features = 0;
165 	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
166 		uffd_test_fail("UFFDIO_API should succeed but failed");
167 		goto out;
168 	}
169 
170 	/* Test double requests of UFFDIO_API with a random feature set */
171 	uffdio_api.features = BIT_ULL(0);
172 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
173 		uffd_test_fail("UFFDIO_API should reject initialized uffd");
174 		goto out;
175 	}
176 
177 	uffd_test_pass();
178 out:
179 	close(uffd);
180 	/* We have a valid uffd handle */
181 	return 1;
182 }
183 
184 /*
185  * This function initializes the global variables.  TODO: remove global
186  * vars and then remove this.
187  */
188 static int
189 uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test,
190 		       mem_type_t *mem_type, const char **errmsg)
191 {
192 	map_shared = mem_type->shared;
193 	uffd_test_ops = mem_type->mem_ops;
194 	uffd_test_case_ops = test->test_case_ops;
195 
196 	if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB))
197 		page_size = default_huge_page_size();
198 	else
199 		page_size = psize();
200 
201 	/* Ensure we have at least 2 pages */
202 	nr_pages = MAX(UFFD_TEST_MEM_SIZE, page_size * 2) / page_size;
203 	/* TODO: remove this global var.. it's so ugly */
204 	nr_parallel = 1;
205 
206 	/* Initialize test arguments */
207 	args->mem_type = mem_type;
208 
209 	return uffd_test_ctx_init(test->uffd_feature_required, errmsg);
210 }
211 
212 static bool uffd_feature_supported(uffd_test_case_t *test)
213 {
214 	uint64_t features;
215 
216 	if (uffd_get_features(&features))
217 		return false;
218 
219 	return (features & test->uffd_feature_required) ==
220 	    test->uffd_feature_required;
221 }
222 
/*
 * Open /proc/self/pagemap read-only and return the file descriptor; a
 * failure to open is reported through err().
 */
static int pagemap_open(void)
{
	const int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);

	if (pagemap_fd >= 0)
		return pagemap_fd;

	err("open pagemap");
	return pagemap_fd;
}
232 
/*
 * Check that the PM_UFFD_WP bit of pagemap entry @value matches the
 * expected state @wp (true = bit set), and report a mismatch through err().
 *
 * Both arguments are parenthesized in the expansion so that expressions
 * such as "a && b" can be passed safely.  @value may be evaluated twice.
 *
 * This is a macro rather than a function to let __LINE__ work in err().
 */
#define  pagemap_check_wp(value, wp) do {				\
		if (!!((value) & PM_UFFD_WP) != (wp))			\
			err("pagemap uffd-wp bit error: 0x%"PRIx64, (value)); \
	} while (0)
238 
/* State shared between pagemap_test_fork() and fork_event_consumer(). */
typedef struct {
	/* uffd the consumer thread reads its message from */
	int parent_uffd;
	/* uffd taken from the UFFD_EVENT_FORK message */
	int child_uffd;
} fork_event_args;
242 
243 static void *fork_event_consumer(void *data)
244 {
245 	fork_event_args *args = data;
246 	struct uffd_msg msg = { 0 };
247 
248 	ready_for_fork = true;
249 
250 	/* Read until a full msg received */
251 	while (uffd_read_msg(args->parent_uffd, &msg));
252 
253 	if (msg.event != UFFD_EVENT_FORK)
254 		err("wrong message: %u\n", msg.event);
255 
256 	/* Just to be properly freed later */
257 	args->child_uffd = msg.arg.fork.ufd;
258 	return NULL;
259 }
260 
/* Bookkeeping for one pin_pages() / unpin_pages() pair. */
typedef struct {
	/* File descriptor of the opened gup_test debugfs file */
	int gup_fd;
	/* True between a successful pin_pages() and unpin_pages() */
	bool pinned;
} pin_args;
265 
266 /*
267  * Returns 0 if succeed, <0 for errors.  pin_pages() needs to be paired
268  * with unpin_pages().  Currently it needs to be RO longterm pin to satisfy
269  * all needs of the test cases (e.g., trigger unshare, trigger fork() early
270  * CoW, etc.).
271  */
272 static int pin_pages(pin_args *args, void *buffer, size_t size)
273 {
274 	struct pin_longterm_test test = {
275 		.addr = (uintptr_t)buffer,
276 		.size = size,
277 		/* Read-only pins */
278 		.flags = 0,
279 	};
280 
281 	if (args->pinned)
282 		err("already pinned");
283 
284 	args->gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
285 	if (args->gup_fd < 0)
286 		return -errno;
287 
288 	if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_START, &test)) {
289 		/* Even if gup_test existed, can be an old gup_test / kernel */
290 		close(args->gup_fd);
291 		return -errno;
292 	}
293 	args->pinned = true;
294 	return 0;
295 }
296 
297 static void unpin_pages(pin_args *args)
298 {
299 	if (!args->pinned)
300 		err("unpin without pin first");
301 	if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_STOP))
302 		err("PIN_LONGTERM_TEST_STOP");
303 	close(args->gup_fd);
304 	args->pinned = false;
305 }
306 
307 static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
308 {
309 	fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 };
310 	pthread_t thread;
311 	pid_t child;
312 	uint64_t value;
313 	int fd, result;
314 
315 	/* Prepare a thread to resolve EVENT_FORK */
316 	if (with_event) {
317 		ready_for_fork = false;
318 		if (pthread_create(&thread, NULL, fork_event_consumer, &args))
319 			err("pthread_create()");
320 		while (!ready_for_fork)
321 			; /* Wait for the poll_thread to start executing before forking */
322 	}
323 
324 	child = fork();
325 	if (!child) {
326 		/* Open the pagemap fd of the child itself */
327 		pin_args args = {};
328 
329 		fd = pagemap_open();
330 
331 		if (test_pin && pin_pages(&args, area_dst, page_size))
332 			/*
333 			 * Normally when reach here we have pinned in
334 			 * previous tests, so shouldn't fail anymore
335 			 */
336 			err("pin page failed in child");
337 
338 		value = pagemap_get_entry(fd, area_dst);
339 		/*
340 		 * After fork(), we should handle uffd-wp bit differently:
341 		 *
342 		 * (1) when with EVENT_FORK, it should persist
343 		 * (2) when without EVENT_FORK, it should be dropped
344 		 */
345 		pagemap_check_wp(value, with_event);
346 		if (test_pin)
347 			unpin_pages(&args);
348 		/* Succeed */
349 		exit(0);
350 	}
351 	waitpid(child, &result, 0);
352 
353 	if (with_event) {
354 		if (pthread_join(thread, NULL))
355 			err("pthread_join()");
356 		if (args.child_uffd < 0)
357 			err("Didn't receive child uffd");
358 		close(args.child_uffd);
359 	}
360 
361 	return result;
362 }
363 
364 static void uffd_wp_unpopulated_test(uffd_test_args_t *args)
365 {
366 	uint64_t value;
367 	int pagemap_fd;
368 
369 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
370 			  false, true, false))
371 		err("register failed");
372 
373 	pagemap_fd = pagemap_open();
374 
375 	/* Test applying pte marker to anon unpopulated */
376 	wp_range(uffd, (uint64_t)area_dst, page_size, true);
377 	value = pagemap_get_entry(pagemap_fd, area_dst);
378 	pagemap_check_wp(value, true);
379 
380 	/* Test unprotect on anon pte marker */
381 	wp_range(uffd, (uint64_t)area_dst, page_size, false);
382 	value = pagemap_get_entry(pagemap_fd, area_dst);
383 	pagemap_check_wp(value, false);
384 
385 	/* Test zap on anon marker */
386 	wp_range(uffd, (uint64_t)area_dst, page_size, true);
387 	if (madvise(area_dst, page_size, MADV_DONTNEED))
388 		err("madvise(MADV_DONTNEED) failed");
389 	value = pagemap_get_entry(pagemap_fd, area_dst);
390 	pagemap_check_wp(value, false);
391 
392 	/* Test fault in after marker removed */
393 	*area_dst = 1;
394 	value = pagemap_get_entry(pagemap_fd, area_dst);
395 	pagemap_check_wp(value, false);
396 	/* Drop it to make pte none again */
397 	if (madvise(area_dst, page_size, MADV_DONTNEED))
398 		err("madvise(MADV_DONTNEED) failed");
399 
400 	/* Test read-zero-page upon pte marker */
401 	wp_range(uffd, (uint64_t)area_dst, page_size, true);
402 	*(volatile char *)area_dst;
403 	/* Drop it to make pte none again */
404 	if (madvise(area_dst, page_size, MADV_DONTNEED))
405 		err("madvise(MADV_DONTNEED) failed");
406 
407 	uffd_test_pass();
408 }
409 
410 static void uffd_wp_fork_test_common(uffd_test_args_t *args,
411 				     bool with_event)
412 {
413 	int pagemap_fd;
414 	uint64_t value;
415 
416 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
417 			  false, true, false))
418 		err("register failed");
419 
420 	pagemap_fd = pagemap_open();
421 
422 	/* Touch the page */
423 	*area_dst = 1;
424 	wp_range(uffd, (uint64_t)area_dst, page_size, true);
425 	value = pagemap_get_entry(pagemap_fd, area_dst);
426 	pagemap_check_wp(value, true);
427 	if (pagemap_test_fork(uffd, with_event, false)) {
428 		uffd_test_fail("Detected %s uffd-wp bit in child in present pte",
429 			       with_event ? "missing" : "stall");
430 		goto out;
431 	}
432 
433 	/*
434 	 * This is an attempt for zapping the pgtable so as to test the
435 	 * markers.
436 	 *
437 	 * For private mappings, PAGEOUT will only work on exclusive ptes
438 	 * (PM_MMAP_EXCLUSIVE) which we should satisfy.
439 	 *
440 	 * For shared, PAGEOUT may not work.  Use DONTNEED instead which
441 	 * plays a similar role of zapping (rather than freeing the page)
442 	 * to expose pte markers.
443 	 */
444 	if (args->mem_type->shared) {
445 		if (madvise(area_dst, page_size, MADV_DONTNEED))
446 			err("MADV_DONTNEED");
447 	} else {
448 		/*
449 		 * NOTE: ignore retval because private-hugetlb doesn't yet
450 		 * support swapping, so it could fail.
451 		 */
452 		madvise(area_dst, page_size, MADV_PAGEOUT);
453 	}
454 
455 	/* Uffd-wp should persist even swapped out */
456 	value = pagemap_get_entry(pagemap_fd, area_dst);
457 	pagemap_check_wp(value, true);
458 	if (pagemap_test_fork(uffd, with_event, false)) {
459 		uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte",
460 			       with_event ? "missing" : "stall");
461 		goto out;
462 	}
463 
464 	/* Unprotect; this tests swap pte modifications */
465 	wp_range(uffd, (uint64_t)area_dst, page_size, false);
466 	value = pagemap_get_entry(pagemap_fd, area_dst);
467 	pagemap_check_wp(value, false);
468 
469 	/* Fault in the page from disk */
470 	*area_dst = 2;
471 	value = pagemap_get_entry(pagemap_fd, area_dst);
472 	pagemap_check_wp(value, false);
473 	uffd_test_pass();
474 out:
475 	if (uffd_unregister(uffd, area_dst, nr_pages * page_size))
476 		err("unregister failed");
477 	close(pagemap_fd);
478 }
479 
480 static void uffd_wp_fork_test(uffd_test_args_t *args)
481 {
482 	uffd_wp_fork_test_common(args, false);
483 }
484 
485 static void uffd_wp_fork_with_event_test(uffd_test_args_t *args)
486 {
487 	uffd_wp_fork_test_common(args, true);
488 }
489 
490 static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args,
491 					 bool with_event)
492 {
493 	int pagemap_fd;
494 	pin_args pin_args = {};
495 
496 	if (uffd_register(uffd, area_dst, page_size, false, true, false))
497 		err("register failed");
498 
499 	pagemap_fd = pagemap_open();
500 
501 	/* Touch the page */
502 	*area_dst = 1;
503 	wp_range(uffd, (uint64_t)area_dst, page_size, true);
504 
505 	/*
506 	 * 1. First pin, then fork().  This tests fork() special path when
507 	 * doing early CoW if the page is private.
508 	 */
509 	if (pin_pages(&pin_args, area_dst, page_size)) {
510 		uffd_test_skip("Possibly CONFIG_GUP_TEST missing "
511 			       "or unprivileged");
512 		close(pagemap_fd);
513 		uffd_unregister(uffd, area_dst, page_size);
514 		return;
515 	}
516 
517 	if (pagemap_test_fork(uffd, with_event, false)) {
518 		uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()",
519 			       with_event ? "missing" : "stall");
520 		unpin_pages(&pin_args);
521 		goto out;
522 	}
523 
524 	unpin_pages(&pin_args);
525 
526 	/*
527 	 * 2. First fork(), then pin (in the child, where test_pin==true).
528 	 * This tests COR, aka, page unsharing on private memories.
529 	 */
530 	if (pagemap_test_fork(uffd, with_event, true)) {
531 		uffd_test_fail("Detected %s uffd-wp bit when RO pin",
532 			       with_event ? "missing" : "stall");
533 		goto out;
534 	}
535 	uffd_test_pass();
536 out:
537 	if (uffd_unregister(uffd, area_dst, page_size))
538 		err("register failed");
539 	close(pagemap_fd);
540 }
541 
542 static void uffd_wp_fork_pin_test(uffd_test_args_t *args)
543 {
544 	uffd_wp_fork_pin_test_common(args, false);
545 }
546 
547 static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args)
548 {
549 	uffd_wp_fork_pin_test_common(args, true);
550 }
551 
552 static void check_memory_contents(char *p)
553 {
554 	unsigned long i, j;
555 	uint8_t expected_byte;
556 
557 	for (i = 0; i < nr_pages; ++i) {
558 		expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
559 		for (j = 0; j < page_size; j++) {
560 			uint8_t v = *(uint8_t *)(p + (i * page_size) + j);
561 			if (v != expected_byte)
562 				err("unexpected page contents");
563 		}
564 	}
565 }
566 
567 static void uffd_minor_test_common(bool test_collapse, bool test_wp)
568 {
569 	unsigned long p;
570 	pthread_t uffd_mon;
571 	char c;
572 	struct uffd_args args = { 0 };
573 
574 	/*
575 	 * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing
576 	 * both do not make much sense.
577 	 */
578 	assert(!(test_collapse && test_wp));
579 
580 	if (uffd_register(uffd, area_dst_alias, nr_pages * page_size,
581 			  /* NOTE! MADV_COLLAPSE may not work with uffd-wp */
582 			  false, test_wp, true))
583 		err("register failure");
584 
585 	/*
586 	 * After registering with UFFD, populate the non-UFFD-registered side of
587 	 * the shared mapping. This should *not* trigger any UFFD minor faults.
588 	 */
589 	for (p = 0; p < nr_pages; ++p)
590 		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
591 		       page_size);
592 
593 	args.apply_wp = test_wp;
594 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
595 		err("uffd_poll_thread create");
596 
597 	/*
598 	 * Read each of the pages back using the UFFD-registered mapping. We
599 	 * expect that the first time we touch a page, it will result in a minor
600 	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
601 	 * page's contents, and then issuing a CONTINUE ioctl.
602 	 */
603 	check_memory_contents(area_dst_alias);
604 
605 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
606 		err("pipe write");
607 	if (pthread_join(uffd_mon, NULL))
608 		err("join() failed");
609 
610 	if (test_collapse) {
611 		if (madvise(area_dst_alias, nr_pages * page_size,
612 			    MADV_COLLAPSE)) {
613 			/* It's fine to fail for this one... */
614 			uffd_test_skip("MADV_COLLAPSE failed");
615 			return;
616 		}
617 
618 		uffd_test_ops->check_pmd_mapping(area_dst,
619 						 nr_pages * page_size /
620 						 read_pmd_pagesize());
621 		/*
622 		 * This won't cause uffd-fault - it purely just makes sure there
623 		 * was no corruption.
624 		 */
625 		check_memory_contents(area_dst_alias);
626 	}
627 
628 	if (args.missing_faults != 0 || args.minor_faults != nr_pages)
629 		uffd_test_fail("stats check error");
630 	else
631 		uffd_test_pass();
632 }
633 
634 void uffd_minor_test(uffd_test_args_t *args)
635 {
636 	uffd_minor_test_common(false, false);
637 }
638 
639 void uffd_minor_wp_test(uffd_test_args_t *args)
640 {
641 	uffd_minor_test_common(false, true);
642 }
643 
644 void uffd_minor_collapse_test(uffd_test_args_t *args)
645 {
646 	uffd_minor_test_common(true, false);
647 }
648 
649 static sigjmp_buf jbuf, *sigbuf;
650 
651 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
652 {
653 	if (sig == SIGBUS) {
654 		if (sigbuf)
655 			siglongjmp(*sigbuf, 1);
656 		abort();
657 	}
658 }
659 
660 /*
661  * For non-cooperative userfaultfd test we fork() a process that will
662  * generate pagefaults, will mremap the area monitored by the
663  * userfaultfd and at last this process will release the monitored
664  * area.
665  * For the anonymous and shared memory the area is divided into two
666  * parts, the first part is accessed before mremap, and the second
667  * part is accessed after mremap. Since hugetlbfs does not support
668  * mremap, the entire monitored area is accessed in a single pass for
669  * HUGETLB_TEST.
670  * The release of the pages currently generates event for shmem and
671  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
672  * for hugetlb.
673  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
674  * monitored area, generate pagefaults and test that signal is delivered.
675  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
676  * test robustness use case - we release monitored area, fork a process
677  * that will generate pagefaults and verify signal is generated.
678  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
679  * feature. Using monitor thread, verify no userfault events are generated.
680  */
681 static int faulting_process(int signal_test, bool wp)
682 {
683 	unsigned long nr, i;
684 	unsigned long long count;
685 	unsigned long split_nr_pages;
686 	unsigned long lastnr;
687 	struct sigaction act;
688 	volatile unsigned long signalled = 0;
689 
690 	split_nr_pages = (nr_pages + 1) / 2;
691 
692 	if (signal_test) {
693 		sigbuf = &jbuf;
694 		memset(&act, 0, sizeof(act));
695 		act.sa_sigaction = sighndl;
696 		act.sa_flags = SA_SIGINFO;
697 		if (sigaction(SIGBUS, &act, 0))
698 			err("sigaction");
699 		lastnr = (unsigned long)-1;
700 	}
701 
702 	for (nr = 0; nr < split_nr_pages; nr++) {
703 		volatile int steps = 1;
704 		unsigned long offset = nr * page_size;
705 
706 		if (signal_test) {
707 			if (sigsetjmp(*sigbuf, 1) != 0) {
708 				if (steps == 1 && nr == lastnr)
709 					err("Signal repeated");
710 
711 				lastnr = nr;
712 				if (signal_test == 1) {
713 					if (steps == 1) {
714 						/* This is a MISSING request */
715 						steps++;
716 						if (copy_page(uffd, offset, wp))
717 							signalled++;
718 					} else {
719 						/* This is a WP request */
720 						assert(steps == 2);
721 						wp_range(uffd,
722 							 (__u64)area_dst +
723 							 offset,
724 							 page_size, false);
725 					}
726 				} else {
727 					signalled++;
728 					continue;
729 				}
730 			}
731 		}
732 
733 		count = *area_count(area_dst, nr);
734 		if (count != count_verify[nr])
735 			err("nr %lu memory corruption %llu %llu\n",
736 			    nr, count, count_verify[nr]);
737 		/*
738 		 * Trigger write protection if there is by writing
739 		 * the same value back.
740 		 */
741 		*area_count(area_dst, nr) = count;
742 	}
743 
744 	if (signal_test)
745 		return signalled != split_nr_pages;
746 
747 	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
748 			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
749 	if (area_dst == MAP_FAILED)
750 		err("mremap");
751 	/* Reset area_src since we just clobbered it */
752 	area_src = NULL;
753 
754 	for (; nr < nr_pages; nr++) {
755 		count = *area_count(area_dst, nr);
756 		if (count != count_verify[nr]) {
757 			err("nr %lu memory corruption %llu %llu\n",
758 			    nr, count, count_verify[nr]);
759 		}
760 		/*
761 		 * Trigger write protection if there is by writing
762 		 * the same value back.
763 		 */
764 		*area_count(area_dst, nr) = count;
765 	}
766 
767 	uffd_test_ops->release_pages(area_dst);
768 
769 	for (nr = 0; nr < nr_pages; nr++)
770 		for (i = 0; i < page_size; i++)
771 			if (*(area_dst + nr * page_size + i) != 0)
772 				err("page %lu offset %lu is not zero", nr, i);
773 
774 	return 0;
775 }
776 
777 static void uffd_sigbus_test_common(bool wp)
778 {
779 	unsigned long userfaults;
780 	pthread_t uffd_mon;
781 	pid_t pid;
782 	int err;
783 	char c;
784 	struct uffd_args args = { 0 };
785 
786 	ready_for_fork = false;
787 
788 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
789 
790 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
791 			  true, wp, false))
792 		err("register failure");
793 
794 	if (faulting_process(1, wp))
795 		err("faulting process failed");
796 
797 	uffd_test_ops->release_pages(area_dst);
798 
799 	args.apply_wp = wp;
800 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
801 		err("uffd_poll_thread create");
802 
803 	while (!ready_for_fork)
804 		; /* Wait for the poll_thread to start executing before forking */
805 
806 	pid = fork();
807 	if (pid < 0)
808 		err("fork");
809 
810 	if (!pid)
811 		exit(faulting_process(2, wp));
812 
813 	waitpid(pid, &err, 0);
814 	if (err)
815 		err("faulting process failed");
816 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
817 		err("pipe write");
818 	if (pthread_join(uffd_mon, (void **)&userfaults))
819 		err("pthread_join()");
820 
821 	if (userfaults)
822 		uffd_test_fail("Signal test failed, userfaults: %ld", userfaults);
823 	else
824 		uffd_test_pass();
825 }
826 
827 static void uffd_sigbus_test(uffd_test_args_t *args)
828 {
829 	uffd_sigbus_test_common(false);
830 }
831 
832 static void uffd_sigbus_wp_test(uffd_test_args_t *args)
833 {
834 	uffd_sigbus_test_common(true);
835 }
836 
837 static void uffd_events_test_common(bool wp)
838 {
839 	pthread_t uffd_mon;
840 	pid_t pid;
841 	int err;
842 	char c;
843 	struct uffd_args args = { 0 };
844 
845 	ready_for_fork = false;
846 
847 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
848 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
849 			  true, wp, false))
850 		err("register failure");
851 
852 	args.apply_wp = wp;
853 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
854 		err("uffd_poll_thread create");
855 
856 	while (!ready_for_fork)
857 		; /* Wait for the poll_thread to start executing before forking */
858 
859 	pid = fork();
860 	if (pid < 0)
861 		err("fork");
862 
863 	if (!pid)
864 		exit(faulting_process(0, wp));
865 
866 	waitpid(pid, &err, 0);
867 	if (err)
868 		err("faulting process failed");
869 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
870 		err("pipe write");
871 	if (pthread_join(uffd_mon, NULL))
872 		err("pthread_join()");
873 
874 	if (args.missing_faults != nr_pages)
875 		uffd_test_fail("Fault counts wrong");
876 	else
877 		uffd_test_pass();
878 }
879 
880 static void uffd_events_test(uffd_test_args_t *args)
881 {
882 	uffd_events_test_common(false);
883 }
884 
885 static void uffd_events_wp_test(uffd_test_args_t *args)
886 {
887 	uffd_events_test_common(true);
888 }
889 
890 static void retry_uffdio_zeropage(int ufd,
891 				  struct uffdio_zeropage *uffdio_zeropage)
892 {
893 	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
894 				     uffdio_zeropage->range.len,
895 				     0);
896 	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
897 		if (uffdio_zeropage->zeropage != -EEXIST)
898 			err("UFFDIO_ZEROPAGE error: %"PRId64,
899 			    (int64_t)uffdio_zeropage->zeropage);
900 	} else {
901 		err("UFFDIO_ZEROPAGE error: %"PRId64,
902 		    (int64_t)uffdio_zeropage->zeropage);
903 	}
904 }
905 
906 static bool do_uffdio_zeropage(int ufd, bool has_zeropage)
907 {
908 	struct uffdio_zeropage uffdio_zeropage = { 0 };
909 	int ret;
910 	__s64 res;
911 
912 	uffdio_zeropage.range.start = (unsigned long) area_dst;
913 	uffdio_zeropage.range.len = page_size;
914 	uffdio_zeropage.mode = 0;
915 	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
916 	res = uffdio_zeropage.zeropage;
917 	if (ret) {
918 		/* real retval in ufdio_zeropage.zeropage */
919 		if (has_zeropage)
920 			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
921 		else if (res != -EINVAL)
922 			err("UFFDIO_ZEROPAGE not -EINVAL");
923 	} else if (has_zeropage) {
924 		if (res != page_size)
925 			err("UFFDIO_ZEROPAGE unexpected size");
926 		else
927 			retry_uffdio_zeropage(ufd, &uffdio_zeropage);
928 		return true;
929 	} else
930 		err("UFFDIO_ZEROPAGE succeeded");
931 
932 	return false;
933 }
934 
935 /*
936  * Registers a range with MISSING mode only for zeropage test.  Return true
937  * if UFFDIO_ZEROPAGE supported, false otherwise. Can't use uffd_register()
938  * because we want to detect .ioctls along the way.
939  */
940 static bool
941 uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len)
942 {
943 	uint64_t ioctls = 0;
944 
945 	if (uffd_register_with_ioctls(uffd, addr, len, true,
946 				      false, false, &ioctls))
947 		err("zeropage register fail");
948 
949 	return ioctls & (1 << _UFFDIO_ZEROPAGE);
950 }
951 
952 /* exercise UFFDIO_ZEROPAGE */
953 static void uffd_zeropage_test(uffd_test_args_t *args)
954 {
955 	bool has_zeropage;
956 	int i;
957 
958 	has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size);
959 	if (area_dst_alias)
960 		/* Ignore the retval; we already have it */
961 		uffd_register_detect_zeropage(uffd, area_dst_alias, page_size);
962 
963 	if (do_uffdio_zeropage(uffd, has_zeropage))
964 		for (i = 0; i < page_size; i++)
965 			if (area_dst[i] != 0)
966 				err("data non-zero at offset %d\n", i);
967 
968 	if (uffd_unregister(uffd, area_dst, page_size))
969 		err("unregister");
970 
971 	if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size))
972 		err("unregister");
973 
974 	uffd_test_pass();
975 }
976 
977 static void uffd_register_poison(int uffd, void *addr, uint64_t len)
978 {
979 	uint64_t ioctls = 0;
980 	uint64_t expected = (1 << _UFFDIO_COPY) | (1 << _UFFDIO_POISON);
981 
982 	if (uffd_register_with_ioctls(uffd, addr, len, true,
983 				      false, false, &ioctls))
984 		err("poison register fail");
985 
986 	if ((ioctls & expected) != expected)
987 		err("registered area doesn't support COPY and POISON ioctls");
988 }
989 
990 static void do_uffdio_poison(int uffd, unsigned long offset)
991 {
992 	struct uffdio_poison uffdio_poison = { 0 };
993 	int ret;
994 	__s64 res;
995 
996 	uffdio_poison.range.start = (unsigned long) area_dst + offset;
997 	uffdio_poison.range.len = page_size;
998 	uffdio_poison.mode = 0;
999 	ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison);
1000 	res = uffdio_poison.updated;
1001 
1002 	if (ret)
1003 		err("UFFDIO_POISON error: %"PRId64, (int64_t)res);
1004 	else if (res != page_size)
1005 		err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res);
1006 }
1007 
1008 static void uffd_poison_handle_fault(
1009 	struct uffd_msg *msg, struct uffd_args *args)
1010 {
1011 	unsigned long offset;
1012 
1013 	if (msg->event != UFFD_EVENT_PAGEFAULT)
1014 		err("unexpected msg event %u", msg->event);
1015 
1016 	if (msg->arg.pagefault.flags &
1017 	    (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR))
1018 		err("unexpected fault type %llu", msg->arg.pagefault.flags);
1019 
1020 	offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
1021 	offset &= ~(page_size-1);
1022 
1023 	/* Odd pages -> copy zeroed page; even pages -> poison. */
1024 	if (offset & page_size)
1025 		copy_page(uffd, offset, false);
1026 	else
1027 		do_uffdio_poison(uffd, offset);
1028 }
1029 
/*
 * Number of pages exercised by the poison test.
 * Make sure to cover odd/even, and minimum duplications
 */
#define  UFFD_POISON_TEST_NPAGES  4

/*
 * UFFDIO_POISON test.
 *
 * Registers the first UFFD_POISON_TEST_NPAGES pages of area_dst, starts a
 * poll thread that resolves faults with uffd_poison_handle_fault, and then
 * reads every byte of those pages.  A page read either completes (all bytes
 * must be zero) or raises SIGBUS, which is caught and counted.  The test
 * expects exactly half of the pages to raise SIGBUS.
 *
 * The test is skipped when fewer than UFFD_POISON_TEST_NPAGES pages exist.
 */
static void uffd_poison_test(uffd_test_args_t *targs)
{
	pthread_t uffd_mon;
	char c;
	struct uffd_args args = { 0 };
	struct sigaction act = { 0 };
	unsigned long nr_sigbus = 0;
	unsigned long nr, poison_pages = UFFD_POISON_TEST_NPAGES;

	if (nr_pages < poison_pages) {
		uffd_test_skip("Too few pages for POISON test");
		return;
	}

	/* Put the userfaultfd into non-blocking mode for the poll thread */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffd_register_poison(uffd, area_dst, poison_pages * page_size);
	/* Zero the same number of source pages, matching the zero check below */
	memset(area_src, 0, poison_pages * page_size);

	args.handle_fault = uffd_poison_handle_fault;
	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
		err("uffd_poll_thread create");

	/* Route SIGBUS to sighndl, which jumps back through sigbuf */
	sigbuf = &jbuf;
	act.sa_sigaction = sighndl;
	act.sa_flags = SA_SIGINFO;
	if (sigaction(SIGBUS, &act, 0))
		err("sigaction");

	for (nr = 0; nr < poison_pages; ++nr) {
		unsigned long offset = nr * page_size;
		const char *bytes = (const char *) area_dst + offset;
		const char *i;

		if (sigsetjmp(*sigbuf, 1)) {
			/*
			 * Access below triggered a SIGBUS, which was caught by
			 * sighndl, which then jumped here. Count this SIGBUS,
			 * and move on to next page.
			 */
			++nr_sigbus;
			continue;
		}

		/* A page that can be read must contain only zero bytes */
		for (i = bytes; i < bytes + page_size; ++i) {
			if (*i)
				err("nonzero byte in area_dst (%p) at %p: %u",
				    area_dst, i, *i);
		}
	}

	/* Write one byte to the pipe, then wait for the poll thread to exit */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		err("pthread_join()");

	/* Exactly half of the touched pages are expected to raise SIGBUS */
	if (nr_sigbus != poison_pages / 2)
		err("expected to receive %lu SIGBUS, actually received %lu",
		    poison_pages / 2, nr_sigbus);

	uffd_test_pass();
}
1095 
1096 static void
1097 uffd_move_handle_fault_common(struct uffd_msg *msg, struct uffd_args *args,
1098 			      unsigned long len)
1099 {
1100 	unsigned long offset;
1101 
1102 	if (msg->event != UFFD_EVENT_PAGEFAULT)
1103 		err("unexpected msg event %u", msg->event);
1104 
1105 	if (msg->arg.pagefault.flags &
1106 	    (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR | UFFD_PAGEFAULT_FLAG_WRITE))
1107 		err("unexpected fault type %llu", msg->arg.pagefault.flags);
1108 
1109 	offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
1110 	offset &= ~(len-1);
1111 
1112 	if (move_page(uffd, offset, len))
1113 		args->missing_faults++;
1114 }
1115 
1116 static void uffd_move_handle_fault(struct uffd_msg *msg,
1117 				   struct uffd_args *args)
1118 {
1119 	uffd_move_handle_fault_common(msg, args, page_size);
1120 }
1121 
/*
 * Fault handler for the PMD-sized move tests: resolves faults in chunks of
 * the size reported by read_pmd_pagesize().
 */
static void uffd_move_pmd_handle_fault(struct uffd_msg *msg,
				       struct uffd_args *args)
{
	const unsigned long pmd_len = read_pmd_pagesize();

	uffd_move_handle_fault_common(msg, args, pmd_len);
}
1127 
1128 static void
1129 uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size,
1130 		      void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args))
1131 {
1132 	unsigned long nr;
1133 	pthread_t uffd_mon;
1134 	char c;
1135 	unsigned long long count;
1136 	struct uffd_args args = { 0 };
1137 	char *orig_area_src = NULL, *orig_area_dst = NULL;
1138 	unsigned long step_size, step_count;
1139 	unsigned long src_offs = 0;
1140 	unsigned long dst_offs = 0;
1141 
1142 	/* Prevent source pages from being mapped more than once */
1143 	if (madvise(area_src, nr_pages * page_size, MADV_DONTFORK))
1144 		err("madvise(MADV_DONTFORK) failure");
1145 
1146 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
1147 			  true, false, false))
1148 		err("register failure");
1149 
1150 	args.handle_fault = handle_fault;
1151 	if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
1152 		err("uffd_poll_thread create");
1153 
1154 	step_size = chunk_size / page_size;
1155 	step_count = nr_pages / step_size;
1156 
1157 	if (chunk_size > page_size) {
1158 		char *aligned_src = ALIGN_UP(area_src, chunk_size);
1159 		char *aligned_dst = ALIGN_UP(area_dst, chunk_size);
1160 
1161 		if (aligned_src != area_src || aligned_dst != area_dst) {
1162 			src_offs = (aligned_src - area_src) / page_size;
1163 			dst_offs = (aligned_dst - area_dst) / page_size;
1164 			step_count--;
1165 		}
1166 		orig_area_src = area_src;
1167 		orig_area_dst = area_dst;
1168 		area_src = aligned_src;
1169 		area_dst = aligned_dst;
1170 	}
1171 
1172 	/*
1173 	 * Read each of the pages back using the UFFD-registered mapping. We
1174 	 * expect that the first time we touch a page, it will result in a missing
1175 	 * fault. uffd_poll_thread will resolve the fault by moving source
1176 	 * page to destination.
1177 	 */
1178 	for (nr = 0; nr < step_count * step_size; nr += step_size) {
1179 		unsigned long i;
1180 
1181 		/* Check area_src content */
1182 		for (i = 0; i < step_size; i++) {
1183 			count = *area_count(area_src, nr + i);
1184 			if (count != count_verify[src_offs + nr + i])
1185 				err("nr %lu source memory invalid %llu %llu\n",
1186 				    nr + i, count, count_verify[src_offs + nr + i]);
1187 		}
1188 
1189 		/* Faulting into area_dst should move the page or the huge page */
1190 		for (i = 0; i < step_size; i++) {
1191 			count = *area_count(area_dst, nr + i);
1192 			if (count != count_verify[dst_offs + nr + i])
1193 				err("nr %lu memory corruption %llu %llu\n",
1194 				    nr, count, count_verify[dst_offs + nr + i]);
1195 		}
1196 
1197 		/* Re-check area_src content which should be empty */
1198 		for (i = 0; i < step_size; i++) {
1199 			count = *area_count(area_src, nr + i);
1200 			if (count != 0)
1201 				err("nr %lu move failed %llu %llu\n",
1202 				    nr, count, count_verify[src_offs + nr + i]);
1203 		}
1204 	}
1205 	if (chunk_size > page_size) {
1206 		area_src = orig_area_src;
1207 		area_dst = orig_area_dst;
1208 	}
1209 
1210 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1211 		err("pipe write");
1212 	if (pthread_join(uffd_mon, NULL))
1213 		err("join() failed");
1214 
1215 	if (args.missing_faults != step_count || args.minor_faults != 0)
1216 		uffd_test_fail("stats check error");
1217 	else
1218 		uffd_test_pass();
1219 }
1220 
1221 static void uffd_move_test(uffd_test_args_t *targs)
1222 {
1223 	uffd_move_test_common(targs, page_size, uffd_move_handle_fault);
1224 }
1225 
1226 static void uffd_move_pmd_test(uffd_test_args_t *targs)
1227 {
1228 	if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1229 		err("madvise(MADV_HUGEPAGE) failure");
1230 	uffd_move_test_common(targs, read_pmd_pagesize(),
1231 			      uffd_move_pmd_handle_fault);
1232 }
1233 
1234 static void uffd_move_pmd_split_test(uffd_test_args_t *targs)
1235 {
1236 	if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1237 		err("madvise(MADV_NOHUGEPAGE) failure");
1238 	uffd_move_test_common(targs, read_pmd_pagesize(),
1239 			      uffd_move_pmd_handle_fault);
1240 }
1241 
/*
 * Check the outcome of an ioctl that is expected to fail with EAGAIN.
 *
 * @name:   ioctl name used in failure messages.
 * @ret:    ioctl() return value, expected to be -1.
 * @error:  errno captured after the ioctl, expected to be EAGAIN.
 * @result: result field of the ioctl argument (e.g. copy.copy for
 *          UFFDIO_COPY), expected to have been updated to -EAGAIN.
 *
 * Returns true when all three match; otherwise reports the first mismatch
 * through uffd_test_fail() and returns false.
 */
static bool
uffdio_verify_results(const char *name, int ret, int error, long result)
{
	bool ok = false;

	if (ret != -1)
		uffd_test_fail("%s should have returned -1", name);
	else if (error != EAGAIN)
		uffd_test_fail("%s should have errno==EAGAIN", name);
	else if (result != -EAGAIN)
		uffd_test_fail("%s should have been updated for -EAGAIN",
			       name);
	else
		ok = true;

	return ok;
}
1268 
/*
 * This defines a function, uffdio_mmap_changing_test_<name>(fd), to test
 * one ioctl.  "field" names the result member of struct uffdio_<name>; it
 * is initialised to 1 (anything other than -EAGAIN would do).  With that
 * initial value set, we can verify later that the kernel updated it when
 * the ioctl returned -EAGAIN, by checking that it now reads -EAGAIN too.
 *
 * The generated function returns what uffdio_verify_results() returns.
 */
#define DEFINE_MMAP_CHANGING_TEST(name, ioctl_name, field)		\
	static bool uffdio_mmap_changing_test_##name(int fd)		\
	{								\
		int ret;						\
		struct uffdio_##name args = {				\
			.field = 1,					\
		};							\
		ret = ioctl(fd, ioctl_name, &args);			\
		return uffdio_verify_results(#ioctl_name, ret, errno, args.field); \
	}

/* One generated test per ioctl: (struct suffix, ioctl number, result field) */
DEFINE_MMAP_CHANGING_TEST(zeropage, UFFDIO_ZEROPAGE, zeropage)
DEFINE_MMAP_CHANGING_TEST(copy, UFFDIO_COPY, copy)
DEFINE_MMAP_CHANGING_TEST(move, UFFDIO_MOVE, move)
DEFINE_MMAP_CHANGING_TEST(poison, UFFDIO_POISON, updated)
DEFINE_MMAP_CHANGING_TEST(continue, UFFDIO_CONTINUE, mapped)
1291 
/* Coarse thread state as parsed from /proc/<tid>/status by thread_state_get() */
typedef enum {
	/* We actually do not care about any state except UNINTERRUPTIBLE.. */
	THR_STATE_UNKNOWN = 0,
	/* The "State:" line reports 'D' */
	THR_STATE_UNINTERRUPTIBLE,
} thread_state;
1297 
/* Pause briefly (1000 microseconds) between polling attempts. */
static void sleep_short(void)
{
	enum { SHORT_SLEEP_USEC = 1000 };

	usleep(SHORT_SLEEP_USEC);
}
1302 
1303 static thread_state thread_state_get(pid_t tid)
1304 {
1305 	const char *header = "State:\t";
1306 	char tmp[256], *p, c;
1307 	FILE *fp;
1308 
1309 	snprintf(tmp, sizeof(tmp), "/proc/%d/status", tid);
1310 	fp = fopen(tmp, "r");
1311 
1312 	if (!fp)
1313 		return THR_STATE_UNKNOWN;
1314 
1315 	while (fgets(tmp, sizeof(tmp), fp)) {
1316 		p = strstr(tmp, header);
1317 		if (p) {
1318 			/* For example, "State:\tD (disk sleep)" */
1319 			c = *(p + sizeof(header) - 1);
1320 			return c == 'D' ?
1321 			    THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
1322 		}
1323 	}
1324 
1325 	return THR_STATE_UNKNOWN;
1326 }
1327 
1328 static void thread_state_until(pid_t tid, thread_state state)
1329 {
1330 	thread_state s;
1331 
1332 	do {
1333 		s = thread_state_get(tid);
1334 		sleep_short();
1335 	} while (s != state);
1336 }
1337 
1338 static void *uffd_mmap_changing_thread(void *opaque)
1339 {
1340 	volatile pid_t *pid = opaque;
1341 	int ret;
1342 
1343 	/* Unfortunately, it's only fetch-able from the thread itself.. */
1344 	assert(*pid == 0);
1345 	*pid = syscall(SYS_gettid);
1346 
1347 	/* Inject an event, this will hang solid until the event read */
1348 	ret = madvise(area_dst, page_size, MADV_REMOVE);
1349 	if (ret)
1350 		err("madvise(MADV_REMOVE) failed");
1351 
1352 	return NULL;
1353 }
1354 
1355 static void uffd_consume_message(int fd)
1356 {
1357 	struct uffd_msg msg = { 0 };
1358 
1359 	while (uffd_read_msg(fd, &msg));
1360 }
1361 
1362 static void uffd_mmap_changing_test(uffd_test_args_t *targs)
1363 {
1364 	/*
1365 	 * This stores the real PID (which can be different from how tid is
1366 	 * defined..) for the child thread, 0 means not initialized.
1367 	 */
1368 	pid_t pid = 0;
1369 	pthread_t tid;
1370 	int ret;
1371 
1372 	if (uffd_register(uffd, area_dst, nr_pages * page_size,
1373 			  true, false, false))
1374 		err("uffd_register() failed");
1375 
1376 	/* Create a thread to generate the racy event */
1377 	ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &pid);
1378 	if (ret)
1379 		err("pthread_create() failed");
1380 
1381 	/*
1382 	 * Wait until the thread setup the pid.  Use volatile to make sure
1383 	 * it reads from RAM not regs.
1384 	 */
1385 	while (!(volatile pid_t)pid)
1386 		sleep_short();
1387 
1388 	/* Wait until the thread hangs at REMOVE event */
1389 	thread_state_until(pid, THR_STATE_UNINTERRUPTIBLE);
1390 
1391 	if (!uffdio_mmap_changing_test_copy(uffd))
1392 		return;
1393 
1394 	if (!uffdio_mmap_changing_test_zeropage(uffd))
1395 		return;
1396 
1397 	if (!uffdio_mmap_changing_test_move(uffd))
1398 		return;
1399 
1400 	if (!uffdio_mmap_changing_test_poison(uffd))
1401 		return;
1402 
1403 	if (!uffdio_mmap_changing_test_continue(uffd))
1404 		return;
1405 
1406 	/*
1407 	 * All succeeded above!  Recycle everything.  Start by reading the
1408 	 * event so as to kick the thread roll again..
1409 	 */
1410 	uffd_consume_message(uffd);
1411 
1412 	ret = pthread_join(tid, NULL);
1413 	assert(ret == 0);
1414 
1415 	uffd_test_pass();
1416 }
1417 
1418 static int prevent_hugepages(const char **errmsg)
1419 {
1420 	/* This should be done before source area is populated */
1421 	if (madvise(area_src, nr_pages * page_size, MADV_NOHUGEPAGE)) {
1422 		/* Ignore only if CONFIG_TRANSPARENT_HUGEPAGE=n */
1423 		if (errno != EINVAL) {
1424 			if (errmsg)
1425 				*errmsg = "madvise(MADV_NOHUGEPAGE) failed";
1426 			return -errno;
1427 		}
1428 	}
1429 	return 0;
1430 }
1431 
1432 static int request_hugepages(const char **errmsg)
1433 {
1434 	/* This should be done before source area is populated */
1435 	if (madvise(area_src, nr_pages * page_size, MADV_HUGEPAGE)) {
1436 		if (errmsg) {
1437 			*errmsg = (errno == EINVAL) ?
1438 				"CONFIG_TRANSPARENT_HUGEPAGE is not set" :
1439 				"madvise(MADV_HUGEPAGE) failed";
1440 		}
1441 		return -errno;
1442 	}
1443 	return 0;
1444 }
1445 
/*
 * Hooks for the base-page "move" test: advise MADV_NOHUGEPAGE on the source
 * area through the post_alloc hook (presumably run right after the test
 * areas are allocated -- confirm against uffd_setup_environment()).
 */
struct uffd_test_case_ops uffd_move_test_case_ops = {
	.post_alloc = prevent_hugepages,
};
1449 
/*
 * Hooks for the PMD-sized move tests: advise MADV_HUGEPAGE on the source
 * area through the post_alloc hook (presumably run right after the test
 * areas are allocated -- confirm against uffd_setup_environment()).
 */
struct uffd_test_case_ops uffd_move_test_pmd_case_ops = {
	.post_alloc = request_hugepages,
};
1453 
1454 /*
1455  * Test the returned uffdio_register.ioctls with different register modes.
1456  * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test.
1457  */
1458 static void
1459 do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor)
1460 {
1461 	uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE);
1462 	mem_type_t *mem_type = args->mem_type;
1463 	int ret;
1464 
1465 	ret = uffd_register_with_ioctls(uffd, area_dst, page_size,
1466 					miss, wp, minor, &ioctls);
1467 
1468 	/*
1469 	 * Handle special cases of UFFDIO_REGISTER here where it should
1470 	 * just fail with -EINVAL first..
1471 	 *
1472 	 * Case 1: register MINOR on anon
1473 	 * Case 2: register with no mode selected
1474 	 */
1475 	if ((minor && (mem_type->mem_flag == MEM_ANON)) ||
1476 	    (!miss && !wp && !minor)) {
1477 		if (ret != -EINVAL)
1478 			err("register (miss=%d, wp=%d, minor=%d) failed "
1479 			    "with wrong errno=%d", miss, wp, minor, ret);
1480 		return;
1481 	}
1482 
1483 	/* UFFDIO_REGISTER should succeed, then check ioctls returned */
1484 	if (miss)
1485 		expected |= BIT_ULL(_UFFDIO_COPY);
1486 	if (wp)
1487 		expected |= BIT_ULL(_UFFDIO_WRITEPROTECT);
1488 	if (minor)
1489 		expected |= BIT_ULL(_UFFDIO_CONTINUE);
1490 
1491 	if ((ioctls & expected) != expected)
1492 		err("unexpected uffdio_register.ioctls "
1493 		    "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", "
1494 		    "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls);
1495 
1496 	if (uffd_unregister(uffd, area_dst, page_size))
1497 		err("unregister");
1498 }
1499 
1500 static void uffd_register_ioctls_test(uffd_test_args_t *args)
1501 {
1502 	int miss, wp, minor;
1503 
1504 	for (miss = 0; miss <= 1; miss++)
1505 		for (wp = 0; wp <= 1; wp++)
1506 			for (minor = 0; minor <= 1; minor++)
1507 				do_register_ioctls_test(args, miss, wp, minor);
1508 
1509 	uffd_test_pass();
1510 }
1511 
/*
 * Table of unit tests walked by main().  For each entry:
 *   .name                   name printed and matched against the -f filter
 *   .uffd_fn                function that runs the test
 *   .mem_targets            bitmask of MEM_* memory types to run it on
 *   .uffd_feature_required  UFFD_FEATURE_* bits the test depends on; the
 *                           test is skipped when they are not supported
 *   .test_case_ops          optional per-test hooks
 */
uffd_test_case_t uffd_tests[] = {
	{
		/* Test returned uffdio_register.ioctls. */
		.name = "register-ioctls",
		.uffd_fn = uffd_register_ioctls_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_MISSING_HUGETLBFS |
		UFFD_FEATURE_MISSING_SHMEM |
		UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
		UFFD_FEATURE_MINOR_HUGETLBFS |
		UFFD_FEATURE_MINOR_SHMEM,
	},
	{
		.name = "zeropage",
		.uffd_fn = uffd_zeropage_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = 0,
	},
	{
		/* UFFDIO_MOVE with base-page sized chunks */
		.name = "move",
		.uffd_fn = uffd_move_test,
		.mem_targets = MEM_ANON,
		.uffd_feature_required = UFFD_FEATURE_MOVE,
		.test_case_ops = &uffd_move_test_case_ops,
	},
	{
		/* UFFDIO_MOVE with PMD-sized chunks, MADV_HUGEPAGE on dst */
		.name = "move-pmd",
		.uffd_fn = uffd_move_pmd_test,
		.mem_targets = MEM_ANON,
		.uffd_feature_required = UFFD_FEATURE_MOVE,
		.test_case_ops = &uffd_move_test_pmd_case_ops,
	},
	{
		/* UFFDIO_MOVE with PMD-sized chunks, MADV_NOHUGEPAGE on dst */
		.name = "move-pmd-split",
		.uffd_fn = uffd_move_pmd_split_test,
		.mem_targets = MEM_ANON,
		.uffd_feature_required = UFFD_FEATURE_MOVE,
		.test_case_ops = &uffd_move_test_pmd_case_ops,
	},
	{
		.name = "wp-fork",
		.uffd_fn = uffd_wp_fork_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	},
	{
		.name = "wp-fork-with-event",
		.uffd_fn = uffd_wp_fork_with_event_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
		/* when set, child process should inherit uffd-wp bits */
		UFFD_FEATURE_EVENT_FORK,
	},
	{
		.name = "wp-fork-pin",
		.uffd_fn = uffd_wp_fork_pin_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	},
	{
		.name = "wp-fork-pin-with-event",
		.uffd_fn = uffd_wp_fork_pin_with_event_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
		/* when set, child process should inherit uffd-wp bits */
		UFFD_FEATURE_EVENT_FORK,
	},
	{
		.name = "wp-unpopulated",
		.uffd_fn = uffd_wp_unpopulated_test,
		.mem_targets = MEM_ANON,
		.uffd_feature_required =
		UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED,
	},
	{
		.name = "minor",
		.uffd_fn = uffd_minor_test,
		.mem_targets = MEM_SHMEM | MEM_HUGETLB,
		.uffd_feature_required =
		UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM,
	},
	{
		.name = "minor-wp",
		.uffd_fn = uffd_minor_wp_test,
		.mem_targets = MEM_SHMEM | MEM_HUGETLB,
		.uffd_feature_required =
		UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM |
		UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		/*
		 * HACK: here we leveraged WP_UNPOPULATED to detect whether
		 * minor mode supports wr-protect.  There's no feature flag
		 * for it so this is the best we can test against.
		 */
		UFFD_FEATURE_WP_UNPOPULATED,
	},
	{
		.name = "minor-collapse",
		.uffd_fn = uffd_minor_collapse_test,
		/* MADV_COLLAPSE only works with shmem */
		.mem_targets = MEM_SHMEM,
		/* We can't test MADV_COLLAPSE, so try our luck */
		.uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM,
	},
	{
		.name = "sigbus",
		.uffd_fn = uffd_sigbus_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_SIGBUS |
		UFFD_FEATURE_EVENT_FORK,
	},
	{
		.name = "sigbus-wp",
		.uffd_fn = uffd_sigbus_wp_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_SIGBUS |
		UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	},
	{
		.name = "events",
		.uffd_fn = uffd_events_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
		UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE,
	},
	{
		.name = "events-wp",
		.uffd_fn = uffd_events_wp_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
		UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE |
		UFFD_FEATURE_PAGEFAULT_FLAG_WP |
		UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	},
	{
		/* UFFDIO_POISON: poisoned pages raise SIGBUS on access */
		.name = "poison",
		.uffd_fn = uffd_poison_test,
		.mem_targets = MEM_ALL,
		.uffd_feature_required = UFFD_FEATURE_POISON,
	},
	{
		.name = "mmap-changing",
		.uffd_fn = uffd_mmap_changing_test,
		/*
		 * There's no point running this test over all mem types as
		 * they share the same code paths.
		 *
		 * Choose shmem for simplicity, because (1) shmem supports
		 * MINOR mode to cover UFFDIO_CONTINUE, and (2) shmem is
		 * almost always available (unlike hugetlb).  Here we
		 * abused SHMEM for UFFDIO_MOVE, but the test we want to
		 * cover doesn't yet need the correct memory type..
		 */
		.mem_targets = MEM_SHMEM,
		/*
		 * Any UFFD_FEATURE_EVENT_* should work to trigger the
		 * race logically, but choose the simplest (REMOVE).
		 *
		 * Meanwhile, since we'll cover quite a few new ioctl()s
		 * (CONTINUE, POISON, MOVE), skip this test for old kernels
		 * by choosing all of them.
		 */
		.uffd_feature_required = UFFD_FEATURE_EVENT_REMOVE |
		UFFD_FEATURE_MOVE | UFFD_FEATURE_POISON |
		UFFD_FEATURE_MINOR_SHMEM,
	},
};
1684 
1685 static void usage(const char *prog)
1686 {
1687 	printf("usage: %s [-f TESTNAME]\n", prog);
1688 	puts("");
1689 	puts(" -f: test name to filter (e.g., event)");
1690 	puts(" -h: show the help msg");
1691 	puts(" -l: list tests only");
1692 	puts("");
1693 	exit(KSFT_FAIL);
1694 }
1695 
1696 int main(int argc, char *argv[])
1697 {
1698 	int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t);
1699 	int n_mems = sizeof(mem_types) / sizeof(mem_type_t);
1700 	const char *test_filter = NULL;
1701 	bool list_only = false;
1702 	uffd_test_case_t *test;
1703 	mem_type_t *mem_type;
1704 	uffd_test_args_t args;
1705 	const char *errmsg;
1706 	int has_uffd, opt;
1707 	int i, j;
1708 
1709 	while ((opt = getopt(argc, argv, "f:hl")) != -1) {
1710 		switch (opt) {
1711 		case 'f':
1712 			test_filter = optarg;
1713 			break;
1714 		case 'l':
1715 			list_only = true;
1716 			break;
1717 		case 'h':
1718 		default:
1719 			/* Unknown */
1720 			usage(argv[0]);
1721 			break;
1722 		}
1723 	}
1724 
1725 	if (!test_filter && !list_only) {
1726 		has_uffd = test_uffd_api(false);
1727 		has_uffd |= test_uffd_api(true);
1728 
1729 		if (!has_uffd) {
1730 			printf("Userfaultfd not supported or unprivileged, skip all tests\n");
1731 			exit(KSFT_SKIP);
1732 		}
1733 	}
1734 
1735 	for (i = 0; i < n_tests; i++) {
1736 		test = &uffd_tests[i];
1737 		if (test_filter && !strstr(test->name, test_filter))
1738 			continue;
1739 		if (list_only) {
1740 			printf("%s\n", test->name);
1741 			continue;
1742 		}
1743 		for (j = 0; j < n_mems; j++) {
1744 			mem_type = &mem_types[j];
1745 			if (!(test->mem_targets & mem_type->mem_flag))
1746 				continue;
1747 
1748 			uffd_test_start("%s on %s", test->name, mem_type->name);
1749 			if ((mem_type->mem_flag == MEM_HUGETLB ||
1750 			    mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
1751 			    (default_huge_page_size() == 0)) {
1752 				uffd_test_skip("huge page size is 0, feature missing?");
1753 				continue;
1754 			}
1755 			if (!uffd_feature_supported(test)) {
1756 				uffd_test_skip("feature missing");
1757 				continue;
1758 			}
1759 			if (uffd_setup_environment(&args, test, mem_type,
1760 						   &errmsg)) {
1761 				uffd_test_skip(errmsg);
1762 				continue;
1763 			}
1764 			test->uffd_fn(&args);
1765 			uffd_test_ctx_clear();
1766 		}
1767 	}
1768 
1769 	if (!list_only)
1770 		uffd_test_report();
1771 
1772 	return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS;
1773 }
1774 
1775 #else /* __NR_userfaultfd */
1776 
1777 #warning "missing __NR_userfaultfd definition"
1778 
1779 int main(void)
1780 {
1781 	printf("Skipping %s (missing __NR_userfaultfd)\n", __file__);
1782 	return KSFT_SKIP;
1783 }
1784 
1785 #endif /* __NR_userfaultfd */
1786