xref: /linux/tools/testing/selftests/mm/uffd-common.c (revision 7203ca412fc8e8a0588e9adc0f777d3163f8dff3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd tests util functions
4  *
5  * Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
/* Backend ops for the currently-selected memory type (anon/shmem/hugetlb). */
uffd_test_ops_t *uffd_test_ops;
/* Optional per-test-case hooks (pre_alloc/post_alloc); may be NULL. */
uffd_test_case_ops_t *uffd_test_case_ops;
12 
13 
14 /* pthread_mutex_t starts at page offset 0 */
area_mutex(char * area,unsigned long nr,uffd_global_test_opts_t * gopts)15 pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
16 {
17 	return (pthread_mutex_t *) (area + nr * gopts->page_size);
18 }
19 
20 /*
21  * count is placed in the page after pthread_mutex_t naturally aligned
22  * to avoid non alignment faults on non-x86 archs.
23  */
area_count(char * area,unsigned long nr,uffd_global_test_opts_t * gopts)24 volatile unsigned long long *area_count(char *area, unsigned long nr,
25 					uffd_global_test_opts_t *gopts)
26 {
27 	return (volatile unsigned long long *)
28 	       ((unsigned long)(area + nr * gopts->page_size +
29 	       sizeof(pthread_mutex_t) + sizeof(unsigned long long) - 1) &
30 	       ~(unsigned long)(sizeof(unsigned long long) - 1));
31 }
32 
uffd_mem_fd_create(off_t mem_size,bool hugetlb)33 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
34 {
35 	unsigned int memfd_flags = 0;
36 	int mem_fd;
37 
38 	if (hugetlb)
39 		memfd_flags = MFD_HUGETLB;
40 	mem_fd = memfd_create("uffd-test", memfd_flags);
41 	if (mem_fd < 0)
42 		err("memfd_create");
43 	if (ftruncate(mem_fd, mem_size))
44 		err("ftruncate");
45 	if (fallocate(mem_fd,
46 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
47 		      mem_size))
48 		err("fallocate");
49 
50 	return mem_fd;
51 }
52 
/* Return all pages of a private anonymous area to the unpopulated state. */
static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}
58 
/*
 * Allocate a private anonymous test area of nr_pages * page_size bytes.
 * Returns 0 on success, -errno on mmap failure (with *alloc_area NULL).
 * is_src is unused for anonymous memory.
 */
static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	size_t len = gopts->nr_pages * gopts->page_size;
	void *area;

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	*alloc_area = area;
	return 0;
}
69 
/* Anonymous memory has no alias mapping, so this op is a no-op. */
static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
			       size_t len, unsigned long offset)
{
}
74 
/*
 * Release hugetlb pages: shared mappings need MADV_REMOVE to drop the
 * backing file's pages; private mappings only need MADV_DONTNEED.
 */
static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (gopts->map_shared) {
		if (madvise(rel_area, len, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	} else {
		if (madvise(rel_area, len, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	}
}
85 
/*
 * Allocate a hugetlb test area backed by a hugetlb memfd.  The src and
 * dst areas share one double-sized memfd at offsets 0 and size.  For
 * shared mappings an alias mapping of the same file range is stored in
 * gopts->area_{src,dst}_alias.  Returns 0 on success, -errno on failure.
 *
 * Fix vs original: mem_fd was leaked on both mmap-failure paths (the
 * shmem variant closes it), the first mapping was leaked when the alias
 * mmap failed, and errno must be captured before munmap()/close() can
 * clobber it.
 */
static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	off_t size = gopts->nr_pages * gopts->page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	char **alloc_area_alias;
	int ret;
	int mem_fd = uffd_mem_fd_create(size * 2, true);

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		ret = -errno;
		*alloc_area = NULL;
		close(mem_fd);
		return ret;
	}

	if (gopts->map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED) {
			ret = -errno;
			munmap(*alloc_area, size);
			*alloc_area = NULL;
			close(mem_fd);
			return ret;
		}
	}

	alloc_area_alias = is_src ? &gopts->area_src_alias : &gopts->area_dst_alias;
	if (area_alias)
		*alloc_area_alias = area_alias;

	/* The mappings keep the file alive; the fd is no longer needed. */
	close(mem_fd);
	return 0;
}
121 
/* Redirect *start into the dst alias mapping; private hugetlb has none. */
static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				  size_t len, unsigned long offset)
{
	if (gopts->map_shared)
		*start = (unsigned long)gopts->area_dst_alias + offset;
}
130 
/* Punch out all pages of a shmem area so the file range is empty again. */
static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}
136 
/*
 * Allocate a shmem (memfd) test area plus an alias mapping of the same
 * file range.  Both mappings are carved out of a single PROT_NONE
 * reservation, with an hpage_size gap between them to prevent the two
 * VMAs from merging.  The alias is recorded in
 * gopts->area_{src,dst}_alias.  Returns 0 on success, -errno on failure.
 *
 * Fixes vs original: the alias-placement error message wrongly said
 * "anonymous memory" (it is a memfd mapping), and -errno was read after
 * munmap()/close(), which may clobber errno.
 */
static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int ret;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
	size_t region_size = bytes * 2 + hpage_size;

	void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
			-1, 0);
	if (reserve == MAP_FAILED) {
		ret = -errno;
		close(mem_fd);
		return ret;
	}

	p = reserve;
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size;  /* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		ret = -errno;
		*alloc_area = NULL;
		munmap(reserve, region_size);
		close(mem_fd);
		return ret;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		ret = -errno;
		*alloc_area = NULL;
		munmap(reserve, region_size);
		close(mem_fd);
		return ret;
	}
	if (area_alias != p_alias)
		err("mmap of memfd alias failed at %p", p_alias);

	if (is_src)
		gopts->area_src_alias = area_alias;
	else
		gopts->area_dst_alias = area_alias;

	/* The mappings keep the file alive; the fd is no longer needed. */
	close(mem_fd);
	return 0;
}
188 
/* Redirect *start to the same offset within the dst alias mapping. */
static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				size_t len, unsigned long offset)
{
	unsigned long base = (unsigned long)gopts->area_dst_alias;

	*start = base + offset;
}
194 
/*
 * Assert that the dst alias area is backed by exactly expect_nr_hpages
 * PMD-sized shmem huge pages; aborts via err() otherwise.
 * NOTE(review): the 'p' parameter is ignored — the check always runs on
 * gopts->area_dst_alias; confirm callers rely on that.
 */
static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages,
			      read_pmd_pagesize()))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}
202 
/* Backend ops for private anonymous memory. */
struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,	/* anon has no alias mapping */
	.check_pmd_mapping = NULL,
};

/* Backend ops for shmem (memfd) memory. */
struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

/* Backend ops for hugetlb memory. */
struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};
223 
uffd_stats_report(struct uffd_args * args,int n_cpus)224 void uffd_stats_report(struct uffd_args *args, int n_cpus)
225 {
226 	int i;
227 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
228 
229 	for (i = 0; i < n_cpus; i++) {
230 		miss_total += args[i].missing_faults;
231 		wp_total += args[i].wp_faults;
232 		minor_total += args[i].minor_faults;
233 	}
234 
235 	printf("userfaults: ");
236 	if (miss_total) {
237 		printf("%llu missing (", miss_total);
238 		for (i = 0; i < n_cpus; i++)
239 			printf("%lu+", args[i].missing_faults);
240 		printf("\b) ");
241 	}
242 	if (wp_total) {
243 		printf("%llu wp (", wp_total);
244 		for (i = 0; i < n_cpus; i++)
245 			printf("%lu+", args[i].wp_faults);
246 		printf("\b) ");
247 	}
248 	if (minor_total) {
249 		printf("%llu minor (", minor_total);
250 		for (i = 0; i < n_cpus; i++)
251 			printf("%lu+", args[i].minor_faults);
252 		printf("\b)");
253 	}
254 	printf("\n");
255 }
256 
/*
 * Open gopts->uffd and perform the UFFDIO_API handshake, requesting the
 * features in *features.  On success, *features is overwritten with the
 * feature set the kernel actually supports and 0 is returned; -1 is
 * returned when the fd can't be opened or the API handshake fails.
 */
int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features)
{
	struct uffdio_api uffdio_api;

	gopts->uffd = uffd_open(UFFD_FLAGS);
	if (gopts->uffd < 0)
		return -1;
	/* Stash the fd flags so tests can verify/restore them later. */
	gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(gopts->uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_PTRACE? */
		return -1;
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
	return 0;
}
277 
/* Unmap a full-size test area (if mapped) and clear the pointer. */
static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area)
{
	void *p = *area;

	if (p && munmap(p, gopts->nr_pages * gopts->page_size))
		err("munmap");

	*area = NULL;
}
286 
/*
 * Tear down everything uffd_test_ctx_init() set up: control pipes,
 * the count_verify array, the userfaultfd, and all mapped areas.
 * Safe to call on a partially-initialized context.
 */
void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts)
{
	if (gopts->pipefd) {
		size_t i;

		for (i = 0; i < gopts->nr_parallel * 2; ++i)
			if (close(gopts->pipefd[i]))
				err("close pipefd");
		free(gopts->pipefd);
		gopts->pipefd = NULL;
	}

	/* free(NULL) is a no-op, so no guard is needed. */
	free(gopts->count_verify);
	gopts->count_verify = NULL;

	if (gopts->uffd != -1) {
		if (close(gopts->uffd))
			err("close uffd");
		gopts->uffd = -1;
	}

	munmap_area(gopts, (void **)&gopts->area_src);
	munmap_area(gopts, (void **)&gopts->area_src_alias);
	munmap_area(gopts, (void **)&gopts->area_dst);
	munmap_area(gopts, (void **)&gopts->area_dst_alias);
	munmap_area(gopts, (void **)&gopts->area_remap);
}
317 
/*
 * (Re)build the global test context: allocate the src/dst areas via the
 * current memory-type ops, open the userfaultfd, write the per-page
 * mutex+count pattern into area_src, make sure area_dst is fully empty,
 * and create the per-thread control pipes.  Returns 0 on success or a
 * negative error, setting *errmsg (when non-NULL) with a hint.
 */
int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	gopts->area_src_alias = NULL;
	gopts->area_dst_alias = NULL;
	gopts->area_remap = NULL;

	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
		ret = uffd_test_case_ops->pre_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	/* Either call failing (negative return) leaves ret nonzero. */
	ret = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_src, true);
	ret |= uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
		ret = uffd_test_case_ops->post_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = userfaultfd_open(gopts, &features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long));
	if (!gopts->count_verify)
		err("count_verify");

	/* Seed every src page: mutex at offset 0, count (== 1) after it. */
	for (nr = 0; nr < gopts->nr_pages; nr++) {
		*area_mutex(gopts->area_src, nr, gopts) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(gopts->area_src, nr, gopts) + 1) = 1;
	}

	/*
	 * After initialization of area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise we could have
	 * some area_dst pages be erroneously initialized with zero pages,
	 * hence we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled, above allocate_area()
	 * calls could have the two areas merged into a single VMA (as they
	 * will have the same VMA flags so they're mergeable).  When we
	 * initialize the area_src above, it's possible that some part of
	 * area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst.  It could cause some of the
	 * area_dst won't be trapped by missing userfaults.
	 *
	 * This release_pages() will guarantee even if that happened, we'll
	 * proactively split the thp and drop any accidentally initialized
	 * pages within area_dst.
	 */
	uffd_test_ops->release_pages(gopts, gopts->area_dst);

	/* One read/write pipe pair per worker thread. */
	gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2);
	if (!gopts->pipefd)
		err("pipefd");
	for (cpu = 0; cpu < gopts->nr_parallel; cpu++)
		if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}
401 
/*
 * Set (wp == true) or clear (wp == false) userfaultfd write protection
 * on [start, start+len).  Clearing also wakes any waiters.  Aborts via
 * err() on failure.
 *
 * Fix vs original: the error message said "clear WP failed" even when
 * the call was *setting* write protection, and the comment above the
 * mode assignment described only the clear case.
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	/* mode==0 undoes write-protect and wakes up waiters afterwards. */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("%s WP failed: address=0x%"PRIx64,
		    wp ? "set" : "clear", (uint64_t)start);
}
415 
/*
 * Resolve a minor fault on [start, start+len) with UFFDIO_CONTINUE,
 * optionally keeping the range write-protected, then deliberately
 * repeat the ioctl to verify the kernel's -EEXIST error path doesn't
 * misbehave.  Aborts via err() on any unexpected outcome.
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}
442 
/*
 * Read one uffd_msg from gopts->uffd.  Returns 0 on a full message,
 * 1 when the read should simply be retried (EAGAIN/EINTR), and aborts
 * via err() on any other failure or a short read.
 */
int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg)
{
	int ret = read(gopts->uffd, msg, sizeof(*msg));

	if (ret == sizeof(*msg))
		return 0;

	if (ret < 0) {
		if (errno == EAGAIN || errno == EINTR)
			return 1;
		err("blocking read error");
	} else {
		err("short read");
	}

	return 0;
}
459 
/*
 * Service one userfault message and bump the matching counter in *args:
 *  - WP faults: remove write protection from the faulting page;
 *  - minor faults: bit-flip the page via area_dst, then UFFDIO_CONTINUE;
 *  - missing faults: populate the page with UFFDIO_COPY from area_src.
 * Aborts via err() on any unexpected event or flag.
 */
void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg,
			    struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Translate the faulting alias address back into area_dst. */
		area = (uint8_t *)(gopts->area_dst +
		       ((char *)msg->arg.pagefault.address -
		       gopts->area_dst_alias));
		for (b = 0; b < gopts->page_size; ++b)
			area[b] = ~area[b];
		continue_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we relied on an NPTL glibc impl detail to
		 * always read the lock type at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * doing any locking operations to guarantee that.  It's
		 * actually not good to rely on this impl detail because
		 * logically a pthread-compatible lib can implement the
		 * locks without types and we can fail when linking with
		 * them.  However since we used to find bugs with this
		 * strict check we still keep it around.  Hopefully this
		 * could be a good hint when it fails again.  If one day
		 * it'll break on some other impl of glibc we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		/* Round the fault address down to a page-aligned offset. */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst;
		offset &= ~(gopts->page_size-1);

		if (copy_page(gopts, offset, args->apply_wp))
			args->missing_faults++;
	}
}
528 
/*
 * Fault-handling thread body.  Polls the userfaultfd and this cpu's
 * control pipe; dispatches pagefault/fork/remove/remap events until a
 * byte arrives on the pipe, which requests shutdown.  'arg' is a
 * struct uffd_args *; always returns NULL.
 */
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	uffd_global_test_opts_t *gopts = args->gopts;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	/* Default fault handler unless the test installed its own. */
	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = gopts->uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = gopts->pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	/* Signal the spawner that this poller is up (used by fork tests). */
	gopts->ready_for_fork = true;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		/* Any pipe activity means "stop": drain one byte and exit. */
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(gopts, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(gopts, &msg, args);
			break;
		case UFFD_EVENT_FORK:
			/* Switch to the child's uffd delivered in the message. */
			close(gopts->uffd);
			gopts->uffd = msg.arg.fork.ufd;
			pollfd[0].fd = gopts->uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			gopts->area_remap = gopts->area_dst;  /* save for later unmap */
			gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
596 
/*
 * Re-issue an already-successful UFFDIO_COPY through the alias mapping
 * (when the memory type has one) and require it to fail with -EEXIST;
 * any other outcome aborts via err().
 */
static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	/* Redirect dst into the alias mapping, if any. */
	uffd_test_ops->alias_mapping(gopts, &uffdio_copy->dst,
				     uffdio_copy->len, offset);

	if (ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy)) {
		/* Real retval lives in uffdio_copy->copy: must be -EEXIST. */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}
614 
/*
 * Explicitly wake any faulting threads waiting on [addr, addr+len).
 * Needed when a copy/move lost the race (-EEXIST) and thus didn't wake
 * the waiters itself.
 *
 * Fix vs original: use the file's standard err() helper instead of an
 * ad-hoc `fprintf(stderr, ...), exit(1)` comma expression, matching
 * every other error path in this file.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}
626 
/*
 * Resolve a missing fault at 'offset' by copying the page from area_src
 * to area_dst with UFFDIO_COPY ('wp' installs it write-protected).
 *
 * Returns 1 if this call populated the page, 0 if someone else raced us
 * (-EEXIST), in which case waiters are woken explicitly.  When 'retry'
 * and gopts->test_uffdio_copy_eexist are set, one extra copy is issued
 * through the alias mapping to exercise the kernel's -EEXIST path.
 */
int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu\n", offset);
	uffdio_copy.dst = (unsigned long) gopts->area_dst + offset;
	uffdio_copy.src = (unsigned long) gopts->area_src + offset;
	uffdio_copy.len = gopts->page_size;
	if (wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(gopts->uffd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		/* The racing winner won't wake us; do it ourselves. */
		wake_range(gopts->uffd, uffdio_copy.dst, gopts->page_size);
	} else if (uffdio_copy.copy != gopts->page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (gopts->test_uffdio_copy_eexist && retry) {
			/* Only exercise the EEXIST path once per test. */
			gopts->test_uffdio_copy_eexist = false;
			retry_copy_page(gopts, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}
658 
/* Resolve a missing fault at 'offset' via UFFDIO_COPY, without the EEXIST retry. */
int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp)
{
	return __copy_page(gopts, offset, false, wp);
}
663 
/*
 * Move 'len' bytes at 'offset' from area_src to area_dst with
 * UFFDIO_MOVE (holes in the source are allowed).  Returns 1 when the
 * move succeeded, 0 on -EEXIST (after waking any waiters); any other
 * failure aborts via err().
 */
int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len)
{
	struct uffdio_move uffdio_move;

	if (offset + len > gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu and length %lu\n", offset, len);

	uffdio_move.dst = (unsigned long) gopts->area_dst + offset;
	uffdio_move.src = (unsigned long) gopts->area_src + offset;
	uffdio_move.len = len;
	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
	uffdio_move.move = 0;

	if (ioctl(gopts->uffd, UFFDIO_MOVE, &uffdio_move)) {
		/* The real return value is in uffdio_move.move. */
		if (uffdio_move.move != -EEXIST)
			err("UFFDIO_MOVE error: %"PRId64,
			    (int64_t)uffdio_move.move);
		wake_range(gopts->uffd, uffdio_move.dst, len);
		return 0;
	}

	if (uffdio_move.move != len) {
		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
		return 0;
	}

	return 1;
}
687 
uffd_open_dev(unsigned int flags)688 int uffd_open_dev(unsigned int flags)
689 {
690 	int fd, uffd;
691 
692 	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
693 	if (fd < 0)
694 		return fd;
695 	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
696 	close(fd);
697 
698 	return uffd;
699 }
700 
/*
 * Create a userfaultfd via the userfaultfd(2) syscall; returns -1 when
 * the syscall number is unknown at build time.
 */
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}
709 
/* Open a userfaultfd: try the syscall first, fall back to /dev/userfaultfd. */
int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	return uffd < 0 ? uffd_open_dev(flags) : uffd;
}
719 
/*
 * Query the kernel's supported userfaultfd feature bits into *features.
 * Returns 0 on success, a negative errno/fd value on failure.
 *
 * Fix vs original: on UFFDIO_API failure the code did
 * `close(fd); return -errno;` — close() may clobber errno before it is
 * read, so capture the error code first.
 */
int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	int ret;
	/*
	 * This should by default work in most kernels; the feature list
	 * will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);

	if (fd < 0)
		/* Maybe the kernel is older than user-only mode? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
		ret = -errno;	/* Save before close() can clobber errno. */
		close(fd);
		return ret;
	}

	*features = uffdio_api.features;
	close(fd);

	return 0;
}
746