xref: /linux/tools/testing/selftests/mm/uffd-common.c (revision c435bce6af9b2a277662698875a689c389358f17)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd tests util functions
4  *
5  * Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
/* Fixed base address (1GB) for shmem test mappings; see shmem_allocate_area() */
#define BASE_PMD_ADDR ((void *)(1UL << 30))

/* One-shot flag: cleared after __copy_page() has exercised the -EEXIST retry */
volatile bool test_uffdio_copy_eexist = true;
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
/* Copy source, uffd-registered destination, their aliases, and remap target */
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
bool map_shared;	/* MAP_SHARED vs MAP_PRIVATE for file-backed areas */
bool test_uffdio_wp = true;
/* Per-page expected counter values, filled in by uffd_test_ctx_init() */
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
21 
22 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
23 {
24 	unsigned int memfd_flags = 0;
25 	int mem_fd;
26 
27 	if (hugetlb)
28 		memfd_flags = MFD_HUGETLB;
29 	mem_fd = memfd_create("uffd-test", memfd_flags);
30 	if (mem_fd < 0)
31 		err("memfd_create");
32 	if (ftruncate(mem_fd, mem_size))
33 		err("ftruncate");
34 	if (fallocate(mem_fd,
35 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
36 		      mem_size))
37 		err("fallocate");
38 
39 	return mem_fd;
40 }
41 
42 static void anon_release_pages(char *rel_area)
43 {
44 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
45 		err("madvise(MADV_DONTNEED) failed");
46 }
47 
48 static int anon_allocate_area(void **alloc_area, bool is_src)
49 {
50 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
51 			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
52 	if (*alloc_area == MAP_FAILED) {
53 		*alloc_area = NULL;
54 		return -errno;
55 	}
56 	return 0;
57 }
58 
/* Anonymous memory has no alias mapping; intentionally leave *start as-is. */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
62 
63 static void hugetlb_release_pages(char *rel_area)
64 {
65 	if (!map_shared) {
66 		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
67 			err("madvise(MADV_DONTNEED) failed");
68 	} else {
69 		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
70 			err("madvise(MADV_REMOVE) failed");
71 	}
72 }
73 
74 static int hugetlb_allocate_area(void **alloc_area, bool is_src)
75 {
76 	off_t size = nr_pages * page_size;
77 	off_t offset = is_src ? 0 : size;
78 	void *area_alias = NULL;
79 	char **alloc_area_alias;
80 	int mem_fd = uffd_mem_fd_create(size * 2, true);
81 
82 	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
83 			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
84 			   (is_src ? 0 : MAP_NORESERVE),
85 			   mem_fd, offset);
86 	if (*alloc_area == MAP_FAILED) {
87 		*alloc_area = NULL;
88 		return -errno;
89 	}
90 
91 	if (map_shared) {
92 		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
93 				  MAP_SHARED, mem_fd, offset);
94 		if (area_alias == MAP_FAILED)
95 			return -errno;
96 	}
97 
98 	if (is_src) {
99 		alloc_area_alias = &area_src_alias;
100 	} else {
101 		alloc_area_alias = &area_dst_alias;
102 	}
103 	if (area_alias)
104 		*alloc_area_alias = area_alias;
105 
106 	close(mem_fd);
107 	return 0;
108 }
109 
110 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
111 {
112 	if (!map_shared)
113 		return;
114 
115 	*start = (unsigned long) area_dst_alias + offset;
116 }
117 
118 static void shmem_release_pages(char *rel_area)
119 {
120 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
121 		err("madvise(MADV_REMOVE) failed");
122 }
123 
124 static int shmem_allocate_area(void **alloc_area, bool is_src)
125 {
126 	void *area_alias = NULL;
127 	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
128 	unsigned long offset = is_src ? 0 : bytes;
129 	char *p = NULL, *p_alias = NULL;
130 	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
131 
132 	/* TODO: clean this up.  Use a static addr is ugly */
133 	p = BASE_PMD_ADDR;
134 	if (!is_src)
135 		/* src map + alias + interleaved hpages */
136 		p += 2 * (bytes + hpage_size);
137 	p_alias = p;
138 	p_alias += bytes;
139 	p_alias += hpage_size;  /* Prevent src/dst VMA merge */
140 
141 	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
142 			   mem_fd, offset);
143 	if (*alloc_area == MAP_FAILED) {
144 		*alloc_area = NULL;
145 		return -errno;
146 	}
147 	if (*alloc_area != p)
148 		err("mmap of memfd failed at %p", p);
149 
150 	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
151 			  mem_fd, offset);
152 	if (area_alias == MAP_FAILED) {
153 		munmap(*alloc_area, bytes);
154 		*alloc_area = NULL;
155 		return -errno;
156 	}
157 	if (area_alias != p_alias)
158 		err("mmap of anonymous memory failed at %p", p_alias);
159 
160 	if (is_src)
161 		area_src_alias = area_alias;
162 	else
163 		area_dst_alias = area_alias;
164 
165 	close(mem_fd);
166 	return 0;
167 }
168 
/* Translate an address offset in area_dst into its shmem alias mapping. */
static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	*start = (unsigned long)area_dst_alias + offset;
}
173 
/*
 * Assert that area_dst_alias is backed by exactly expect_nr_hpages PMD-sized
 * hugepages, aborting the test otherwise.
 * NOTE(review): the p parameter is unused — the check always runs against
 * area_dst_alias; confirm callers expect that.
 */
static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
			      read_pmd_pagesize()))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}
181 
/* Memory-type ops for anonymous memory: no alias, no PMD check. */
struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};
188 
/* Memory-type ops for shmem: alias mapping plus PMD hugepage verification. */
struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};
195 
/* Memory-type ops for hugetlb: alias only when shared, no PMD check. */
struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};
202 
203 void uffd_stats_report(struct uffd_args *args, int n_cpus)
204 {
205 	int i;
206 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
207 
208 	for (i = 0; i < n_cpus; i++) {
209 		miss_total += args[i].missing_faults;
210 		wp_total += args[i].wp_faults;
211 		minor_total += args[i].minor_faults;
212 	}
213 
214 	printf("userfaults: ");
215 	if (miss_total) {
216 		printf("%llu missing (", miss_total);
217 		for (i = 0; i < n_cpus; i++)
218 			printf("%lu+", args[i].missing_faults);
219 		printf("\b) ");
220 	}
221 	if (wp_total) {
222 		printf("%llu wp (", wp_total);
223 		for (i = 0; i < n_cpus; i++)
224 			printf("%lu+", args[i].wp_faults);
225 		printf("\b) ");
226 	}
227 	if (minor_total) {
228 		printf("%llu minor (", minor_total);
229 		for (i = 0; i < n_cpus; i++)
230 			printf("%lu+", args[i].minor_faults);
231 		printf("\b)");
232 	}
233 	printf("\n");
234 }
235 
236 int userfaultfd_open(uint64_t *features)
237 {
238 	struct uffdio_api uffdio_api;
239 
240 	uffd = uffd_open(UFFD_FLAGS);
241 	if (uffd < 0)
242 		return -1;
243 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
244 
245 	uffdio_api.api = UFFD_API;
246 	uffdio_api.features = *features;
247 	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
248 		/* Probably lack of CAP_PTRACE? */
249 		return -1;
250 	if (uffdio_api.api != UFFD_API)
251 		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
252 
253 	*features = uffdio_api.features;
254 	return 0;
255 }
256 
257 static inline void munmap_area(void **area)
258 {
259 	if (*area)
260 		if (munmap(*area, nr_pages * page_size))
261 			err("munmap");
262 
263 	*area = NULL;
264 }
265 
266 void uffd_test_ctx_clear(void)
267 {
268 	size_t i;
269 
270 	if (pipefd) {
271 		for (i = 0; i < nr_cpus * 2; ++i) {
272 			if (close(pipefd[i]))
273 				err("close pipefd");
274 		}
275 		free(pipefd);
276 		pipefd = NULL;
277 	}
278 
279 	if (count_verify) {
280 		free(count_verify);
281 		count_verify = NULL;
282 	}
283 
284 	if (uffd != -1) {
285 		if (close(uffd))
286 			err("close uffd");
287 		uffd = -1;
288 	}
289 
290 	munmap_area((void **)&area_src);
291 	munmap_area((void **)&area_src_alias);
292 	munmap_area((void **)&area_dst);
293 	munmap_area((void **)&area_dst_alias);
294 	munmap_area((void **)&area_remap);
295 }
296 
297 int uffd_test_ctx_init(uint64_t features, const char **errmsg)
298 {
299 	unsigned long nr, cpu;
300 	int ret;
301 
302 	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
303 		ret = uffd_test_case_ops->pre_alloc(errmsg);
304 		if (ret)
305 			return ret;
306 	}
307 
308 	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
309 	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
310 	if (ret) {
311 		if (errmsg)
312 			*errmsg = "memory allocation failed";
313 		return ret;
314 	}
315 
316 	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
317 		ret = uffd_test_case_ops->post_alloc(errmsg);
318 		if (ret)
319 			return ret;
320 	}
321 
322 	ret = userfaultfd_open(&features);
323 	if (ret) {
324 		if (errmsg)
325 			*errmsg = "possible lack of priviledge";
326 		return ret;
327 	}
328 
329 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
330 	if (!count_verify)
331 		err("count_verify");
332 
333 	for (nr = 0; nr < nr_pages; nr++) {
334 		*area_mutex(area_src, nr) =
335 			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
336 		count_verify[nr] = *area_count(area_src, nr) = 1;
337 		/*
338 		 * In the transition between 255 to 256, powerpc will
339 		 * read out of order in my_bcmp and see both bytes as
340 		 * zero, so leave a placeholder below always non-zero
341 		 * after the count, to avoid my_bcmp to trigger false
342 		 * positives.
343 		 */
344 		*(area_count(area_src, nr) + 1) = 1;
345 	}
346 
347 	/*
348 	 * After initialization of area_src, we must explicitly release pages
349 	 * for area_dst to make sure it's fully empty.  Otherwise we could have
350 	 * some area_dst pages be errornously initialized with zero pages,
351 	 * hence we could hit memory corruption later in the test.
352 	 *
353 	 * One example is when THP is globally enabled, above allocate_area()
354 	 * calls could have the two areas merged into a single VMA (as they
355 	 * will have the same VMA flags so they're mergeable).  When we
356 	 * initialize the area_src above, it's possible that some part of
357 	 * area_dst could have been faulted in via one huge THP that will be
358 	 * shared between area_src and area_dst.  It could cause some of the
359 	 * area_dst won't be trapped by missing userfaults.
360 	 *
361 	 * This release_pages() will guarantee even if that happened, we'll
362 	 * proactively split the thp and drop any accidentally initialized
363 	 * pages within area_dst.
364 	 */
365 	uffd_test_ops->release_pages(area_dst);
366 
367 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
368 	if (!pipefd)
369 		err("pipefd");
370 	for (cpu = 0; cpu < nr_cpus; cpu++)
371 		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
372 			err("pipe");
373 
374 	return 0;
375 }
376 
377 void wp_range(int ufd, __u64 start, __u64 len, bool wp)
378 {
379 	struct uffdio_writeprotect prms;
380 
381 	/* Write protection page faults */
382 	prms.range.start = start;
383 	prms.range.len = len;
384 	/* Undo write-protect, do wakeup after that */
385 	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
386 
387 	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
388 		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
389 }
390 
/*
 * Resolve a minor fault on [start, start + len) with UFFDIO_CONTINUE,
 * optionally keeping the range write-protected.  Afterwards deliberately
 * repeats the request, which must fail with -EEXIST, to exercise the
 * kernel's error path.
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}
417 
418 int uffd_read_msg(int ufd, struct uffd_msg *msg)
419 {
420 	int ret = read(uffd, msg, sizeof(*msg));
421 
422 	if (ret != sizeof(*msg)) {
423 		if (ret < 0) {
424 			if (errno == EAGAIN || errno == EINTR)
425 				return 1;
426 			err("blocking read error");
427 		} else {
428 			err("short read");
429 		}
430 	}
431 
432 	return 0;
433 }
434 
/*
 * Default pagefault handler used by uffd_poll_thread(): dispatch one
 * UFFD_EVENT_PAGEFAULT message, resolving WP faults (un-protect), MINOR
 * faults (bit-flip then UFFDIO_CONTINUE) and MISSING faults
 * (UFFDIO_COPY), and bump the matching counter in *args.
 */
void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Map the faulting alias address back into area_dst */
		area = (uint8_t *)(area_dst +
				   ((char *)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we relied on an NPTL glibc impl detail to
		 * always read the lock type at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * doing any locking operations to guarantee that.  It's
		 * actually not good to rely on this impl detail because
		 * logically a pthread-compatible lib can implement the
		 * locks without types and we can fail when linking with
		 * them.  However since we used to find bugs with this
		 * strict check we still keep it around.  Hopefully this
		 * could be a good hint when it fails again.  If one day
		 * it'll break on some other impl of glibc we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		/* Round the faulting address down to a page boundary */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset, args->apply_wp))
			args->missing_faults++;
	}
}
502 
/*
 * Per-cpu fault-serving thread: poll on the uffd and on this cpu's
 * wakeup pipe, dispatching events until a byte arrives on the pipe.
 * Pagefaults go to args->handle_fault (defaulting to
 * uffd_handle_page_fault); FORK, REMOVE and REMAP events are handled
 * inline.  Always returns NULL.
 */
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			/* Main thread signalled us to quit via the pipe */
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(&msg, args);
			break;
		case UFFD_EVENT_FORK:
			/* Switch over to serving the forked child's uffd */
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			area_remap = area_dst;  /* save for later unmap */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
567 
568 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
569 			    unsigned long offset)
570 {
571 	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
572 				     uffdio_copy->len,
573 				     offset);
574 	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
575 		/* real retval in ufdio_copy.copy */
576 		if (uffdio_copy->copy != -EEXIST)
577 			err("UFFDIO_COPY retry error: %"PRId64,
578 			    (int64_t)uffdio_copy->copy);
579 	} else {
580 		err("UFFDIO_COPY retry unexpected: %"PRId64,
581 		    (int64_t)uffdio_copy->copy);
582 	}
583 }
584 
585 static void wake_range(int ufd, unsigned long addr, unsigned long len)
586 {
587 	struct uffdio_range uffdio_wake;
588 
589 	uffdio_wake.start = addr;
590 	uffdio_wake.len = len;
591 
592 	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
593 		fprintf(stderr, "error waking %lu\n",
594 			addr), exit(1);
595 }
596 
/*
 * Resolve one missing fault at page-aligned offset with UFFDIO_COPY,
 * copying from area_src to area_dst (optionally write-protected).
 * Returns 1 if this call installed the page, 0 if the page already
 * existed (-EEXIST; waiters are woken explicitly in that case).  When
 * retry is set, performs a one-shot deliberate retry through the alias
 * mapping which must fail with -EEXIST.
 */
int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu\n", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		/* Someone else resolved the fault; wake the blocked thread */
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			/* Exercise the kernel's -EEXIST path once per run */
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}
628 
/* Resolve one missing fault at offset with UFFDIO_COPY, without the retry. */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	return __copy_page(ufd, offset, false, wp);
}
633 
/*
 * Move len bytes at offset from area_src to area_dst with UFFDIO_MOVE,
 * allowing holes in the source range.  Returns 1 if the move happened,
 * 0 when the kernel reports -EEXIST (waiters are woken instead); aborts
 * the test on any other error or a partial move.
 */
int move_page(int ufd, unsigned long offset, unsigned long len)
{
	struct uffdio_move uffdio_move;

	if (offset + len > nr_pages * page_size)
		err("unexpected offset %lu and length %lu\n", offset, len);
	uffdio_move.dst = (unsigned long) area_dst + offset;
	uffdio_move.src = (unsigned long) area_src + offset;
	uffdio_move.len = len;
	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
	uffdio_move.move = 0;
	if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) {
		/* real retval in uffdio_move.move */
		if (uffdio_move.move != -EEXIST)
			err("UFFDIO_MOVE error: %"PRId64,
			    (int64_t)uffdio_move.move);
		wake_range(ufd, uffdio_move.dst, len);
	} else if (uffdio_move.move != len) {
		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
	} else
		return 1;
	return 0;
}
657 
658 int uffd_open_dev(unsigned int flags)
659 {
660 	int fd, uffd;
661 
662 	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
663 	if (fd < 0)
664 		return fd;
665 	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
666 	close(fd);
667 
668 	return uffd;
669 }
670 
/*
 * Create a userfaultfd via the userfaultfd(2) syscall; returns -1 at
 * compile time when the syscall number isn't available on this arch.
 */
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}
679 
/* Open a userfaultfd, preferring the syscall, falling back to the device. */
int uffd_open(unsigned int flags)
{
	int fd = uffd_open_sys(flags);

	return fd >= 0 ? fd : uffd_open_dev(flags);
}
689 
690 int uffd_get_features(uint64_t *features)
691 {
692 	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
693 	/*
694 	 * This should by default work in most kernels; the feature list
695 	 * will be the same no matter what we pass in here.
696 	 */
697 	int fd = uffd_open(UFFD_USER_MODE_ONLY);
698 
699 	if (fd < 0)
700 		/* Maybe the kernel is older than user-only mode? */
701 		fd = uffd_open(0);
702 
703 	if (fd < 0)
704 		return fd;
705 
706 	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
707 		close(fd);
708 		return -errno;
709 	}
710 
711 	*features = uffdio_api.features;
712 	close(fd);
713 
714 	return 0;
715 }
716