xref: /linux/tools/testing/selftests/mm/uffd-common.c (revision 8804d970fab45726b3c7cd7f240b31122aa94219)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd tests util functions
4  *
5  * Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
/* Memory-type backend ops (anon/shmem/hugetlb) for the running test */
uffd_test_ops_t *uffd_test_ops;
/* Optional per-test-case hooks (pre_alloc/post_alloc), may be NULL */
uffd_test_case_ops_t *uffd_test_case_ops;

/* Fixed base address used by shmem_allocate_area() for its mappings */
#define BASE_PMD_ADDR ((void *)(1UL << 30))
14 
15 /* pthread_mutex_t starts at page offset 0 */
/* The pthread mutex for page @nr lives at offset 0 of that page. */
pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
{
	char *page = area + nr * gopts->page_size;

	return (pthread_mutex_t *)page;
}
20 
21 /*
22  * count is placed in the page after pthread_mutex_t naturally aligned
23  * to avoid non alignment faults on non-x86 archs.
24  */
/*
 * The per-page counter sits right after the pthread_mutex_t, rounded up
 * to the natural alignment of unsigned long long so that non-x86 archs
 * do not take alignment faults when accessing it.
 */
volatile unsigned long long *area_count(char *area, unsigned long nr,
					uffd_global_test_opts_t *gopts)
{
	const unsigned long align = sizeof(unsigned long long);
	unsigned long addr;

	addr = (unsigned long)(area + nr * gopts->page_size) +
	       sizeof(pthread_mutex_t);
	addr = (addr + align - 1) & ~(align - 1);

	return (volatile unsigned long long *)addr;
}
33 
uffd_mem_fd_create(off_t mem_size,bool hugetlb)34 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
35 {
36 	unsigned int memfd_flags = 0;
37 	int mem_fd;
38 
39 	if (hugetlb)
40 		memfd_flags = MFD_HUGETLB;
41 	mem_fd = memfd_create("uffd-test", memfd_flags);
42 	if (mem_fd < 0)
43 		err("memfd_create");
44 	if (ftruncate(mem_fd, mem_size))
45 		err("ftruncate");
46 	if (fallocate(mem_fd,
47 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
48 		      mem_size))
49 		err("fallocate");
50 
51 	return mem_fd;
52 }
53 
/* Drop all pages of @rel_area back to the kernel (anonymous memory). */
static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}
59 
/*
 * Map a private anonymous test area of nr_pages * page_size bytes.
 * Returns 0 on success and stores the mapping in *alloc_area; returns
 * -errno (and NULLs *alloc_area) if mmap fails.  @is_src is unused for
 * anonymous memory since src and dst are plain independent mappings.
 */
static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	size_t len = gopts->nr_pages * gopts->page_size;
	void *area;

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	*alloc_area = area;
	return 0;
}
70 
/*
 * Anonymous memory has no file-backed alias mapping, so there is no
 * address translation to do for retried operations — deliberately empty.
 */
static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
			       size_t len, unsigned long offset)
{
}
75 
/*
 * Release hugetlb pages.  Shared mappings must punch the backing file
 * pages with MADV_REMOVE to actually free them; private mappings can
 * simply be zapped with MADV_DONTNEED.
 */
static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (gopts->map_shared) {
		if (madvise(rel_area, len, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	} else {
		if (madvise(rel_area, len, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	}
}
86 
/*
 * Allocate a hugetlb-backed test area from a double-sized memfd: the
 * src half at offset 0, the dst half at offset @size.  For shared
 * mappings an alias mapping of the same file range is additionally
 * stored in gopts->area_src_alias / area_dst_alias.
 *
 * Returns 0 on success, -errno on mmap failure.  On failure all
 * partially acquired resources (memfd, first mapping) are released —
 * the original code leaked the memfd on the first error path and both
 * the memfd and the first mapping on the alias error path.
 */
static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	off_t size = gopts->nr_pages * gopts->page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	int mem_fd = uffd_mem_fd_create(size * 2, true);
	int ret;

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		/* Save errno: close() below may clobber it */
		ret = -errno;
		*alloc_area = NULL;
		close(mem_fd);
		return ret;
	}

	if (gopts->map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED) {
			ret = -errno;
			/* Unwind the first mapping and the memfd too */
			munmap(*alloc_area, size);
			*alloc_area = NULL;
			close(mem_fd);
			return ret;
		}
		if (is_src)
			gopts->area_src_alias = area_alias;
		else
			gopts->area_dst_alias = area_alias;
	}

	/* The mappings keep the memory alive; the fd is no longer needed */
	close(mem_fd);
	return 0;
}
122 
/*
 * For shared hugetlb, redirect @start into the alias mapping of
 * area_dst; private hugetlb has no alias, so leave @start untouched.
 */
static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				  size_t len, unsigned long offset)
{
	if (gopts->map_shared)
		*start = (unsigned long)gopts->area_dst_alias + offset;
}
131 
/* Punch out the backing pages of @rel_area (shmem is file-backed). */
static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}
137 
/*
 * Allocate a shmem test area plus an alias mapping of the same memfd
 * range at fixed addresses derived from BASE_PMD_ADDR (src and dst get
 * disjoint address ranges; a huge-page-sized gap prevents VMA merging).
 * The alias is stored in gopts->area_src_alias / area_dst_alias.
 *
 * Returns 0 on success, -errno on mmap failure.  Fixes over the
 * original: the memfd is closed on both error-return paths (it was
 * leaked before), and the alias-placement error message no longer
 * claims the mapping was "anonymous memory" — it is a memfd alias.
 */
static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
	int ret;

	/* TODO: clean this up.  Use a static addr is ugly */
	p = BASE_PMD_ADDR;
	if (!is_src)
		/* src map + alias + interleaved hpages */
		p += 2 * (bytes + hpage_size);
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size;  /* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		/* Save errno: close() below may clobber it */
		ret = -errno;
		*alloc_area = NULL;
		close(mem_fd);
		return ret;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		ret = -errno;
		munmap(*alloc_area, bytes);
		*alloc_area = NULL;
		close(mem_fd);
		return ret;
	}
	if (area_alias != p_alias)
		err("mmap of memfd alias failed at %p", p_alias);

	if (is_src)
		gopts->area_src_alias = area_alias;
	else
		gopts->area_dst_alias = area_alias;

	close(mem_fd);
	return 0;
}
182 
/* Redirect @start into the UFFD-registered alias mapping of area_dst. */
static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				size_t len, unsigned long offset)
{
	*start = (unsigned long)(gopts->area_dst_alias + offset);
}
188 
/* Assert area_dst_alias is backed by exactly @expect_nr_hpages THPs. */
static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages)
{
	size_t hpage_size = read_pmd_pagesize();

	if (!check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages, hpage_size))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}
196 
/* Anonymous private memory: no alias mapping, no PMD layout check. */
struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};

/* Shared shmem: always double-mapped; THP layout can be verified. */
struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

/* Hugetlb: alias mapping is only populated for shared mappings. */
struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};
217 
uffd_stats_report(struct uffd_args * args,int n_cpus)218 void uffd_stats_report(struct uffd_args *args, int n_cpus)
219 {
220 	int i;
221 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
222 
223 	for (i = 0; i < n_cpus; i++) {
224 		miss_total += args[i].missing_faults;
225 		wp_total += args[i].wp_faults;
226 		minor_total += args[i].minor_faults;
227 	}
228 
229 	printf("userfaults: ");
230 	if (miss_total) {
231 		printf("%llu missing (", miss_total);
232 		for (i = 0; i < n_cpus; i++)
233 			printf("%lu+", args[i].missing_faults);
234 		printf("\b) ");
235 	}
236 	if (wp_total) {
237 		printf("%llu wp (", wp_total);
238 		for (i = 0; i < n_cpus; i++)
239 			printf("%lu+", args[i].wp_faults);
240 		printf("\b) ");
241 	}
242 	if (minor_total) {
243 		printf("%llu minor (", minor_total);
244 		for (i = 0; i < n_cpus; i++)
245 			printf("%lu+", args[i].minor_faults);
246 		printf("\b)");
247 	}
248 	printf("\n");
249 }
250 
/*
 * Open a userfaultfd and perform the UFFDIO_API handshake requesting
 * *features.  On success the fd is stored in gopts->uffd (ownership
 * stays with gopts — uffd_test_ctx_clear() closes it), *features is
 * overwritten with the feature set the kernel actually granted, and 0
 * is returned.  Returns -1 if the fd cannot be opened or the handshake
 * fails (the fd is left in gopts->uffd for the usual cleanup path).
 * Aborts if the kernel reports an unexpected API version.
 */
int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features)
{
	struct uffdio_api uffdio_api;

	gopts->uffd = uffd_open(UFFD_FLAGS);
	if (gopts->uffd < 0)
		return -1;
	/* Stash the fd flags; used by tests exercising fork/fd inheritance */
	gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(gopts->uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_PTRACE? */
		return -1;
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
	return 0;
}
271 
/* Unmap *area if it is mapped, then always reset the pointer to NULL. */
static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area)
{
	void *p = *area;

	if (p && munmap(p, gopts->nr_pages * gopts->page_size))
		err("munmap");

	*area = NULL;
}
280 
/*
 * Tear down everything uffd_test_ctx_init() set up: the per-worker
 * pipes, the verification array, the userfaultfd itself, and all test
 * mappings.  Safe to call on a partially initialized context — every
 * resource is checked before release and reset afterwards.
 */
void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts)
{
	if (gopts->pipefd) {
		size_t i;

		for (i = 0; i < gopts->nr_parallel * 2; ++i) {
			if (close(gopts->pipefd[i]))
				err("close pipefd");
		}
		free(gopts->pipefd);
		gopts->pipefd = NULL;
	}

	/* free(NULL) is a no-op, so no guard needed */
	free(gopts->count_verify);
	gopts->count_verify = NULL;

	if (gopts->uffd != -1) {
		if (close(gopts->uffd))
			err("close uffd");
		gopts->uffd = -1;
	}

	munmap_area(gopts, (void **)&gopts->area_src);
	munmap_area(gopts, (void **)&gopts->area_src_alias);
	munmap_area(gopts, (void **)&gopts->area_dst);
	munmap_area(gopts, (void **)&gopts->area_dst_alias);
	munmap_area(gopts, (void **)&gopts->area_remap);
}
311 
/*
 * Build a fresh test context: allocate the src/dst areas (plus aliases
 * for file-backed memory types), open the userfaultfd requesting
 * @features, seed every src page with an initialized mutex and counter,
 * make sure area_dst is fully unpopulated, and create the per-worker
 * pipes.  Returns 0 on success; on failure returns non-zero and, if
 * @errmsg is non-NULL, points it at a static description.
 */
int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	gopts->area_src_alias = NULL;
	gopts->area_dst_alias = NULL;
	gopts->area_remap = NULL;

	/* Optional per-test-case hook run before memory allocation */
	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
		ret = uffd_test_case_ops->pre_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_src, true);
	ret |= uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	/* Optional per-test-case hook run after memory allocation */
	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
		ret = uffd_test_case_ops->post_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = userfaultfd_open(gopts, &features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long));
	if (!gopts->count_verify)
		err("count_verify");

	/* Seed every src page: mutex at offset 0, counter starting at 1 */
	for (nr = 0; nr < gopts->nr_pages; nr++) {
		*area_mutex(gopts->area_src, nr, gopts) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(gopts->area_src, nr, gopts) + 1) = 1;
	}

	/*
	 * After initialization of area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise we could have
	 * some area_dst pages be erroneously initialized with zero pages,
	 * hence we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled, above allocate_area()
	 * calls could have the two areas merged into a single VMA (as they
	 * will have the same VMA flags so they're mergeable).  When we
	 * initialize the area_src above, it's possible that some part of
	 * area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst.  It could cause some of the
	 * area_dst won't be trapped by missing userfaults.
	 *
	 * This release_pages() will guarantee even if that happened, we'll
	 * proactively split the thp and drop any accidentally initialized
	 * pages within area_dst.
	 */
	uffd_test_ops->release_pages(gopts, gopts->area_dst);

	/* One pipe (read + write end) per worker; poll threads exit on a byte */
	gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2);
	if (!gopts->pipefd)
		err("pipefd");
	for (cpu = 0; cpu < gopts->nr_parallel; cpu++)
		if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}
395 
/*
 * Set (@wp == true) or clear (@wp == false) userfaultfd write
 * protection on [start, start+len).  Clearing WP also wakes up any
 * waiters blocked on the range.  Aborts on ioctl failure.
 *
 * Fix: the old error message always said "clear WP failed" even when
 * the call was *setting* write protection — report the mode instead.
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	/* When clearing WP, the kernel also performs the wakeup */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("UFFDIO_WRITEPROTECT failed: wp=%d address=0x%"PRIx64,
		    (int)wp, (uint64_t)start);
}
409 
/*
 * Resolve a minor fault on [start, start+len) with UFFDIO_CONTINUE,
 * optionally keeping the range write-protected, then deliberately
 * repeat the ioctl to verify the kernel's -EEXIST error path behaves.
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = wp ? UFFDIO_CONTINUE_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t)req.mapped);
}
436 
/*
 * Read one event from the userfaultfd.  Returns 0 on success, 1 when
 * the read would block or was interrupted (EAGAIN/EINTR); aborts on any
 * other error or on a short read.
 */
int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg)
{
	ssize_t got = read(gopts->uffd, msg, sizeof(*msg));

	if (got == sizeof(*msg))
		return 0;

	if (got < 0) {
		if (errno == EAGAIN || errno == EINTR)
			return 1;
		err("blocking read error");
	}
	err("short read");

	return 0;
}
453 
/*
 * Handle a single UFFD_EVENT_PAGEFAULT message:
 *  - WP faults: drop the write protection (which also wakes the
 *    faulting thread) and bump args->wp_faults.
 *  - MINOR faults: flip every bit of the page through the
 *    non-registered area_dst mapping, then UFFDIO_CONTINUE the
 *    registered alias; bump args->minor_faults.
 *  - MISSING faults: resolve the page-aligned offset with copy_page();
 *    bump args->missing_faults when this call installed the page.
 * Aborts on any non-pagefault event or unexpected fault flags.
 */
void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg,
			    struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Translate the faulting alias address back into area_dst */
		area = (uint8_t *)(gopts->area_dst +
		       ((char *)msg->arg.pagefault.address -
		       gopts->area_dst_alias));
		for (b = 0; b < gopts->page_size; ++b)
			area[b] = ~area[b];
		continue_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we relied on an NPTL glibc impl detail to
		 * always read the lock type at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * doing any locking operations to guarantee that.  It's
		 * actually not good to rely on this impl detail because
		 * logically a pthread-compatible lib can implement the
		 * locks without types and we can fail when linking with
		 * them.  However since we used to find bugs with this
		 * strict check we still keep it around.  Hopefully this
		 * could be a good hint when it fails again.  If one day
		 * it'll break on some other impl of glibc we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst;
		offset &= ~(gopts->page_size-1);

		if (copy_page(gopts, offset, args->apply_wp))
			args->missing_faults++;
	}
}
522 
/*
 * Worker thread main loop: polls the userfaultfd and this worker's
 * shutdown pipe.  Page faults are dispatched to args->handle_fault
 * (defaulting to uffd_handle_page_fault); FORK, REMOVE and REMAP events
 * are handled inline.  Reading a single byte from the pipe terminates
 * the loop.  Always returns NULL.
 */
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	uffd_global_test_opts_t *gopts = args->gopts;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = gopts->uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = gopts->pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	/* Tell the spawner the fds are armed and it may fork/proceed */
	gopts->ready_for_fork = true;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(gopts, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(gopts, &msg, args);
			break;
		case UFFD_EVENT_FORK:
			/* Switch to the child's uffd delivered in the message */
			close(gopts->uffd);
			gopts->uffd = msg.arg.fork.ufd;
			pollfd[0].fd = gopts->uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			gopts->area_remap = gopts->area_dst;  /* save for later unmap */
			gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
590 
/*
 * Re-issue a UFFDIO_COPY that already succeeded, redirected through the
 * alias mapping if the memory type has one.  The retry must fail with
 * -EEXIST; any other outcome aborts the test.
 */
static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(gopts, &uffdio_copy->dst,
				     uffdio_copy->len, offset);

	if (!ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy))
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	/* real retval in ufdio_copy.copy */
	if (uffdio_copy->copy != -EEXIST)
		err("UFFDIO_COPY retry error: %"PRId64,
		    (int64_t)uffdio_copy->copy);
}
608 
wake_range(int ufd,unsigned long addr,unsigned long len)609 static void wake_range(int ufd, unsigned long addr, unsigned long len)
610 {
611 	struct uffdio_range uffdio_wake;
612 
613 	uffdio_wake.start = addr;
614 	uffdio_wake.len = len;
615 
616 	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
617 		fprintf(stderr, "error waking %lu\n",
618 			addr), exit(1);
619 }
620 
/*
 * Resolve one missing fault at page-aligned @offset via UFFDIO_COPY,
 * optionally write-protecting the freshly installed page.  Returns 1
 * when this call installed the page, 0 when someone else beat us to it
 * (-EEXIST, after waking waiters); aborts on any other failure.  When
 * @retry is set and the test requested it, the copy is re-issued once
 * to exercise the kernel's -EEXIST path.
 */
int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu\n", offset);

	uffdio_copy.dst = (unsigned long)gopts->area_dst + offset;
	uffdio_copy.src = (unsigned long)gopts->area_src + offset;
	uffdio_copy.len = gopts->page_size;
	uffdio_copy.mode = wp ? UFFDIO_COPY_MODE_WP : 0;
	uffdio_copy.copy = 0;

	if (ioctl(gopts->uffd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		wake_range(gopts->uffd, uffdio_copy.dst, gopts->page_size);
		return 0;
	}

	if (uffdio_copy.copy != gopts->page_size)
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);

	if (retry && gopts->test_uffdio_copy_eexist) {
		gopts->test_uffdio_copy_eexist = false;
		retry_copy_page(gopts, &uffdio_copy, offset);
	}
	return 1;
}
652 
/* Non-retrying convenience wrapper around __copy_page(). */
int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp)
{
	return __copy_page(gopts, offset, /* retry */ false, wp);
}
657 
/*
 * Move @len bytes at @offset from area_src to area_dst with
 * UFFDIO_MOVE (holes in the source are allowed).  Returns 1 when the
 * whole range was moved, 0 when the destination was already populated
 * (-EEXIST, after waking waiters); aborts on any other failure.
 */
int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len)
{
	struct uffdio_move uffdio_move;

	if (offset + len > gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu and length %lu\n", offset, len);

	uffdio_move.dst = (unsigned long)gopts->area_dst + offset;
	uffdio_move.src = (unsigned long)gopts->area_src + offset;
	uffdio_move.len = len;
	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
	uffdio_move.move = 0;

	if (ioctl(gopts->uffd, UFFDIO_MOVE, &uffdio_move)) {
		/* real retval in uffdio_move.move */
		if (uffdio_move.move != -EEXIST)
			err("UFFDIO_MOVE error: %"PRId64,
			    (int64_t)uffdio_move.move);
		wake_range(gopts->uffd, uffdio_move.dst, len);
		return 0;
	}

	if (uffdio_move.move != len)
		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);

	return 1;
}
681 
uffd_open_dev(unsigned int flags)682 int uffd_open_dev(unsigned int flags)
683 {
684 	int fd, uffd;
685 
686 	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
687 	if (fd < 0)
688 		return fd;
689 	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
690 	close(fd);
691 
692 	return uffd;
693 }
694 
/*
 * Create a userfaultfd via the userfaultfd(2) syscall.  Returns the fd,
 * or -1 when the syscall number is unknown at build time (or on syscall
 * failure, with errno set by the kernel).
 */
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}
703 
/*
 * Open a userfaultfd, preferring the syscall and falling back to the
 * /dev/userfaultfd device node when the syscall is unavailable.
 */
int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	return uffd >= 0 ? uffd : uffd_open_dev(flags);
}
713 
uffd_get_features(uint64_t * features)714 int uffd_get_features(uint64_t *features)
715 {
716 	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
717 	/*
718 	 * This should by default work in most kernels; the feature list
719 	 * will be the same no matter what we pass in here.
720 	 */
721 	int fd = uffd_open(UFFD_USER_MODE_ONLY);
722 
723 	if (fd < 0)
724 		/* Maybe the kernel is older than user-only mode? */
725 		fd = uffd_open(0);
726 
727 	if (fd < 0)
728 		return fd;
729 
730 	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
731 		close(fd);
732 		return -errno;
733 	}
734 
735 	*features = uffdio_api.features;
736 	close(fd);
737 
738 	return 0;
739 }
740