// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd tests util functions
 *
 * Copyright (C) 2015-2023  Red Hat, Inc.
 */

#include "uffd-common.h"

uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;

/* pthread_mutex_t starts at page offset 0 */
pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
{
	return (pthread_mutex_t *) (area + nr * gopts->page_size);
}

/*
 * The count is placed in the page right after the pthread_mutex_t,
 * naturally aligned to avoid misalignment faults on non-x86 archs.
 */
volatile unsigned long long *area_count(char *area, unsigned long nr,
					uffd_global_test_opts_t *gopts)
{
	return (volatile unsigned long long *)
		((unsigned long)(area + nr * gopts->page_size +
				 sizeof(pthread_mutex_t) + sizeof(unsigned long long) - 1) &
		 ~(unsigned long)(sizeof(unsigned long long) - 1));
}

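/*
 * Create a memfd of the given size, optionally hugetlb-backed. The fd
 * is sized with ftruncate() and any backing pages are punched out, so
 * callers always start from an empty file.
 */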
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
	unsigned int memfd_flags = 0;
	int mem_fd;

	if (hugetlb)
		memfd_flags = MFD_HUGETLB;
	mem_fd = memfd_create("uffd-test", memfd_flags);
	if (mem_fd < 0)
		err("memfd_create");
	if (ftruncate(mem_fd, mem_size))
		err("ftruncate");
	if (fallocate(mem_fd,
		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
		      mem_size))
		err("fallocate");

	return mem_fd;
}

static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}

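/*
 * Allocate one test area as a private anonymous mapping. is_src is
 * ignored: src and dst areas are created identically.
 */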
static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	*alloc_area = mmap(NULL, gopts->nr_pages * gopts->page_size, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}
	return 0;
}

static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
			       size_t len, unsigned long offset)
{
}

static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	if (!gopts->map_shared) {
		if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	} else {
		if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	}
}

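/*
 * Map the src or dst area from a shared hugetlb memfd: src at file
 * offset 0, dst right after it. For shared mappings, a second mapping
 * of the same file range is stashed in gopts->area_{src,dst}_alias so
 * the tests can reach the same pages through an alias address range.
 */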
static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	off_t size = gopts->nr_pages * gopts->page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	char **alloc_area_alias;
	int mem_fd = uffd_mem_fd_create(size * 2, true);

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	if (gopts->map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED)
			return -errno;
	}

	if (is_src) {
		alloc_area_alias = &gopts->area_src_alias;
	} else {
		alloc_area_alias = &gopts->area_dst_alias;
	}
	if (area_alias)
		*alloc_area_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				  size_t len, unsigned long offset)
{
	if (!gopts->map_shared)
		return;

	*start = (unsigned long) gopts->area_dst_alias + offset;
}

static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}

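/*
 * Carve both the shmem test area and its alias mapping out of a single
 * PROT_NONE reservation: the area at the start, the alias one
 * huge-page-sized gap after it, so the two MAP_FIXED mappings can never
 * merge into a single VMA.
 */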
static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
	size_t region_size = bytes * 2 + hpage_size;

	void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
			     -1, 0);
	if (reserve == MAP_FAILED) {
		close(mem_fd);
		return -errno;
	}

	p = reserve;
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size;	/* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		munmap(reserve, region_size);
		close(mem_fd);
		return -errno;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		*alloc_area = NULL;
		munmap(reserve, region_size);
		close(mem_fd);
		return -errno;
	}
	if (area_alias != p_alias)
		err("mmap of memfd alias failed at %p", p_alias);

	if (is_src)
		gopts->area_src_alias = area_alias;
	else
		gopts->area_dst_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				size_t len, unsigned long offset)
{
	*start = (unsigned long)gopts->area_dst_alias + offset;
}

static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages,
			      read_pmd_pagesize()))
		err("Did not find expected number of hugepages (%d)",
		    expect_nr_hpages);
}

struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};

struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};

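/*
 * Summarize the fault counters accumulated by the uffd threads: totals
 * for missing/wp/minor faults, each followed by a per-thread breakdown.
 */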
void uffd_stats_report(struct uffd_args *args, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += args[i].missing_faults;
		wp_total += args[i].wp_faults;
		minor_total += args[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}

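/*
 * Open gopts->uffd and perform the UFFDIO_API handshake. On entry
 * *features holds the features to request; on success it is replaced
 * with the feature set reported by the kernel. Returns 0 on success,
 * -1 if the fd cannot be opened or the handshake fails.
 */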
int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features)
{
	struct uffdio_api uffdio_api;

	gopts->uffd = uffd_open(UFFD_FLAGS);
	if (gopts->uffd < 0)
		return -1;
	gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(gopts->uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_SYS_PTRACE? */
		return -1;
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
	return 0;
}

static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area)
{
	if (*area)
		if (munmap(*area, gopts->nr_pages * gopts->page_size))
			err("munmap");

	*area = NULL;
}

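/*
 * Tear down everything uffd_test_ctx_init() set up: the per-thread
 * quit pipes, the verification array, the userfaultfd itself and all
 * mapped test areas.
 */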
void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts)
{
	size_t i;

	if (gopts->pipefd) {
		for (i = 0; i < gopts->nr_parallel * 2; ++i) {
			if (close(gopts->pipefd[i]))
				err("close pipefd");
		}
		free(gopts->pipefd);
		gopts->pipefd = NULL;
	}

	if (gopts->count_verify) {
		free(gopts->count_verify);
		gopts->count_verify = NULL;
	}

	if (gopts->uffd != -1) {
		if (close(gopts->uffd))
			err("close uffd");
		gopts->uffd = -1;
	}

	munmap_area(gopts, (void **)&gopts->area_src);
	munmap_area(gopts, (void **)&gopts->area_src_alias);
	munmap_area(gopts, (void **)&gopts->area_dst);
	munmap_area(gopts, (void **)&gopts->area_dst_alias);
	munmap_area(gopts, (void **)&gopts->area_remap);
}

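/*
 * Set up a fresh test context: allocate the src/dst areas, open the
 * userfaultfd with the requested features, seed area_src with per-page
 * mutexes and counters, make sure area_dst is fully unpopulated and
 * create one quit pipe per parallel thread. Returns 0 on success; on
 * failure *errmsg (if non-NULL) is pointed at a short reason string.
 */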
int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	gopts->area_src_alias = NULL;
	gopts->area_dst_alias = NULL;
	gopts->area_remap = NULL;

	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
		ret = uffd_test_case_ops->pre_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = uffd_test_ops->allocate_area(gopts, (void **)&gopts->area_src, true);
	ret |= uffd_test_ops->allocate_area(gopts, (void **)&gopts->area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
		ret = uffd_test_case_ops->post_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = userfaultfd_open(gopts, &features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long));
	if (!gopts->count_verify)
		err("count_verify");

	for (nr = 0; nr < gopts->nr_pages; nr++) {
		*area_mutex(gopts->area_src, nr, gopts) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1;
		/*
		 * In the transition from 255 to 256, powerpc will read
		 * out of order in my_bcmp and see both bytes as zero,
		 * so leave a placeholder that is always non-zero right
		 * after the count, to keep my_bcmp from triggering
		 * false positives.
		 */
		*(area_count(gopts->area_src, nr, gopts) + 1) = 1;
	}

	/*
	 * After initializing area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty. Otherwise some
	 * area_dst pages could be erroneously initialized with zero
	 * pages, and we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled: the allocate_area()
	 * calls above could have the two areas merged into a single VMA
	 * (they have the same VMA flags, so they're mergeable). When we
	 * initialize area_src above, it's possible that part of area_dst
	 * gets faulted in via a huge THP shared between area_src and
	 * area_dst, so that part of area_dst would no longer be trapped
	 * by missing userfaults.
	 *
	 * This release_pages() guarantees that even if that happened,
	 * we'll proactively split the THP and drop any accidentally
	 * initialized pages within area_dst.
	 */
	uffd_test_ops->release_pages(gopts, gopts->area_dst);

	gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2);
	if (!gopts->pipefd)
		err("pipefd");
	for (cpu = 0; cpu < gopts->nr_parallel; cpu++)
		if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}

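/*
 * Set (wp=true) or clear (wp=false) userfault write protection on a
 * range; clearing also wakes up any threads blocked on the range.
 */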
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	/* Write protection page faults */
	prms.range.start = start;
	prms.range.len = len;
	/* When clearing write-protect, also wake up waiters afterwards */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("UFFDIO_WRITEPROTECT failed: address=0x%"PRIx64, (uint64_t)start);
}

static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}

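/*
 * Read one event from the userfaultfd. Returns 0 if a full message was
 * read, 1 if the read was interrupted (EAGAIN/EINTR) and should simply
 * be retried; any other failure is fatal.
 */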
int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg)
{
	int ret = read(gopts->uffd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}

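/*
 * Resolve one pagefault event: clear write protection for wp faults,
 * bit-flip the page contents and UFFDIO_CONTINUE for minor faults, or
 * UFFDIO_COPY the page from area_src for missing faults, bumping the
 * matching counter in args each time.
 */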
void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg,
			    struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests to
		 * operate on area_dst (non-UFFD-registered) and
		 * area_dst_alias (UFFD-registered).
		 */

		area = (uint8_t *)(gopts->area_dst +
				   ((char *)msg->arg.pagefault.address -
				    gopts->area_dst_alias));
		for (b = 0; b < gopts->page_size; ++b)
			area[b] = ~area[b];
		continue_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we enforce that none of the missing-mode faults
		 * is a write fault. That's guaranteed because the only
		 * threads that trigger uffd faults are the locking
		 * threads, and their first instruction touching the
		 * missing page is always pthread_mutex_lock(), which
		 * starts with a read.
		 *
		 * Note that we rely on an NPTL glibc implementation
		 * detail for that read: the lock type
		 * (pthread_mutex_t.__data.__type, offset 0x10) is always
		 * read at the entry of the lock op, before any locking
		 * operation. Relying on this detail is not ideal:
		 * logically, a pthread-compatible library could
		 * implement its locks without a type field, and this
		 * check would fail when linking against it. However,
		 * this strict check has found real bugs before, so we
		 * keep it around as a useful hint for when it fails
		 * again. If it ever breaks on some other libc
		 * implementation, we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst;
		offset &= ~(gopts->page_size-1);

		if (copy_page(gopts, offset, args->apply_wp))
			args->missing_faults++;
	}
}

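/*
 * Main loop of a fault-handling thread: poll the userfaultfd alongside
 * the per-thread quit pipe, dispatch pagefault events to the configured
 * handler and service fork/remove/remap events, until one byte arrives
 * on the pipe.
 */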
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	uffd_global_test_opts_t *gopts = args->gopts;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = gopts->uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = gopts->pipefd[cpu * 2];
	pollfd[1].events = POLLIN;

	gopts->ready_for_fork = true;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(gopts, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(gopts, &msg, args);
			break;
		case UFFD_EVENT_FORK:
			close(gopts->uffd);
			gopts->uffd = msg.arg.fork.ufd;
			pollfd[0].fd = gopts->uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			gopts->area_remap = gopts->area_dst;	/* save for later unmap */
			gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}

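/*
 * Re-issue a UFFDIO_COPY that already succeeded, redirected through the
 * alias mapping by alias_mapping(): it must fail with -EEXIST, since
 * the page is already installed.
 */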
static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(gopts,
				     &uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}

static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) {
		fprintf(stderr, "error waking %lu\n", addr);
		exit(1);
	}
}

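/*
 * Resolve a missing fault at @offset with UFFDIO_COPY, optionally
 * write-protecting the new page. Returns 1 if this call installed the
 * page, 0 if it was already present (-EEXIST), in which case any
 * blocked threads still get an explicit wake-up.
 */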
int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu", offset);
	uffdio_copy.dst = (unsigned long) gopts->area_dst + offset;
	uffdio_copy.src = (unsigned long) gopts->area_src + offset;
	uffdio_copy.len = gopts->page_size;
	if (wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(gopts->uffd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		wake_range(gopts->uffd, uffdio_copy.dst, gopts->page_size);
	} else if (uffdio_copy.copy != gopts->page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (gopts->test_uffdio_copy_eexist && retry) {
			gopts->test_uffdio_copy_eexist = false;
			retry_copy_page(gopts, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}

int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp)
{
	return __copy_page(gopts, offset, false, wp);
}

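/*
 * Move @len bytes at @offset from area_src to area_dst with
 * UFFDIO_MOVE. Returns 1 if the pages were moved, 0 if the destination
 * was already mapped (-EEXIST), waking any waiters in that case.
 */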
int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len)
{
	struct uffdio_move uffdio_move;

	if (offset + len > gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu and length %lu", offset, len);
	uffdio_move.dst = (unsigned long) gopts->area_dst + offset;
	uffdio_move.src = (unsigned long) gopts->area_src + offset;
	uffdio_move.len = len;
	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
	uffdio_move.move = 0;
	if (ioctl(gopts->uffd, UFFDIO_MOVE, &uffdio_move)) {
		/* real retval in uffdio_move.move */
		if (uffdio_move.move != -EEXIST)
			err("UFFDIO_MOVE error: %"PRId64,
			    (int64_t)uffdio_move.move);
		wake_range(gopts->uffd, uffdio_move.dst, len);
	} else if (uffdio_move.move != len) {
		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
	} else {
		return 1;
	}
	return 0;
}

int uffd_open_dev(unsigned int flags)
{
	int fd, uffd;

	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;
	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
	close(fd);

	return uffd;
}

int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}

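/*
 * Open a userfaultfd, preferring the userfaultfd(2) syscall and falling
 * back to the /dev/userfaultfd device node if the syscall fails.
 */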
int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	if (uffd < 0)
		uffd = uffd_open_dev(flags);

	return uffd;
}

int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	/*
	 * This should by default work in most kernels; the feature list
	 * will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);

	if (fd < 0)
		/* Maybe the kernel is older than user-only mode? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
		close(fd);
		return -errno;
	}

	*features = uffdio_api.features;
	close(fd);

	return 0;
}