1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Userfaultfd tests util functions
4 *
5 * Copyright (C) 2015-2023 Red Hat, Inc.
6 */
7
8 #include "uffd-common.h"
9
/* Backend ops (anon/shmem/hugetlb) selected for the currently running test. */
uffd_test_ops_t *uffd_test_ops;
/* Optional per-test-case hooks (pre_alloc/post_alloc); may be NULL. */
uffd_test_case_ops_t *uffd_test_case_ops;

/* Fixed base address for the shmem mappings so the src/alias/dst layout is predictable. */
#define BASE_PMD_ADDR ((void *)(1UL << 30))
14
15 /* pthread_mutex_t starts at page offset 0 */
area_mutex(char * area,unsigned long nr,uffd_global_test_opts_t * gopts)16 pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
17 {
18 return (pthread_mutex_t *) (area + nr * gopts->page_size);
19 }
20
21 /*
22 * count is placed in the page after pthread_mutex_t naturally aligned
23 * to avoid non alignment faults on non-x86 archs.
24 */
area_count(char * area,unsigned long nr,uffd_global_test_opts_t * gopts)25 volatile unsigned long long *area_count(char *area, unsigned long nr,
26 uffd_global_test_opts_t *gopts)
27 {
28 return (volatile unsigned long long *)
29 ((unsigned long)(area + nr * gopts->page_size +
30 sizeof(pthread_mutex_t) + sizeof(unsigned long long) - 1) &
31 ~(unsigned long)(sizeof(unsigned long long) - 1));
32 }
33
/*
 * Create a memfd of @mem_size bytes, optionally hugetlb-backed, with all
 * content punched out so the file starts completely empty.  Errors abort
 * the test via err().
 */
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
	unsigned int memfd_flags = hugetlb ? MFD_HUGETLB : 0;
	int mem_fd;

	mem_fd = memfd_create("uffd-test", memfd_flags);
	if (mem_fd < 0)
		err("memfd_create");
	if (ftruncate(mem_fd, mem_size))
		err("ftruncate");
	if (fallocate(mem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
		      mem_size))
		err("fallocate");

	return mem_fd;
}
53
/* Drop every page of the anonymous test area with MADV_DONTNEED. */
static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}
59
/*
 * Allocate one anonymous private test area.  Returns 0 on success,
 * -errno on mmap failure (with *alloc_area set to NULL).
 */
static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	void *area;

	area = mmap(NULL, gopts->nr_pages * gopts->page_size,
		    PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	*alloc_area = area;
	return 0;
}
70
/* Anonymous memory has no alias mapping: deliberately leave @start untouched. */
static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
			       size_t len, unsigned long offset)
{
}
75
/* Release hugetlb test pages; shared (file-backed) areas need MADV_REMOVE. */
static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (gopts->map_shared) {
		if (madvise(rel_area, len, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	} else {
		if (madvise(rel_area, len, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	}
}
86
/*
 * Allocate the src or dst hugetlb test area from a hugetlb memfd.  In
 * map_shared mode an alias of the same file range is also mapped and
 * published in gopts->area_{src,dst}_alias.
 *
 * Returns 0 on success, -errno on mmap failure.  On any failure the memfd
 * and any partially-created mapping are released (mirrors the cleanup done
 * by shmem_allocate_area()).
 */
static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	off_t size = gopts->nr_pages * gopts->page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	int mem_fd = uffd_mem_fd_create(size * 2, true);

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		close(mem_fd);	/* don't leak the memfd on failure */
		return -errno;
	}

	if (gopts->map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED) {
			/* save errno before munmap/close can clobber it */
			int ret = -errno;

			munmap(*alloc_area, size);
			*alloc_area = NULL;
			close(mem_fd);
			return ret;
		}
	}

	if (area_alias) {
		if (is_src)
			gopts->area_src_alias = area_alias;
		else
			gopts->area_dst_alias = area_alias;
	}

	/* The mappings keep the file alive; the fd is no longer needed. */
	close(mem_fd);
	return 0;
}
122
/* Redirect @start into the dst alias mapping; private hugetlb has none. */
static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				  size_t len, unsigned long offset)
{
	if (gopts->map_shared)
		*start = (unsigned long)gopts->area_dst_alias + offset;
}
131
/* Shmem is file-backed: punch the pages out rather than MADV_DONTNEED. */
static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
	size_t len = gopts->nr_pages * gopts->page_size;

	if (madvise(rel_area, len, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}
137
/*
 * Allocate the src or dst shmem test area at a fixed address, plus an
 * alias mapping of the same file range (used to exercise minor faults and
 * UFFDIO_COPY retries).  The alias is separated by one huge page to keep
 * the VMAs from merging.
 *
 * Returns 0 on success, -errno on mmap failure; the memfd and any partial
 * mapping are released on failure.
 */
static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);

	/* TODO: clean this up. Use a static addr is ugly */
	p = BASE_PMD_ADDR;
	if (!is_src)
		/* src map + alias + interleaved hpages */
		p += 2 * (bytes + hpage_size);
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size; /* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		close(mem_fd);	/* don't leak the memfd on failure */
		return -errno;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		/* save mmap's errno: munmap/close below may overwrite it */
		int ret = -errno;

		munmap(*alloc_area, bytes);
		*alloc_area = NULL;
		close(mem_fd);
		return ret;
	}
	if (area_alias != p_alias)
		err("mmap of anonymous memory failed at %p", p_alias);

	if (is_src)
		gopts->area_src_alias = area_alias;
	else
		gopts->area_dst_alias = area_alias;

	/* The mappings keep the file alive; the fd is no longer needed. */
	close(mem_fd);
	return 0;
}
182
/* Redirect @start into the UFFD-registered alias of area_dst. */
static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
				size_t len, unsigned long offset)
{
	*start = (unsigned long)gopts->area_dst_alias + offset;
}
188
/* Verify area_dst_alias is backed by exactly @expect_nr_hpages PMD hugepages. */
static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages)
{
	size_t hpage_size = read_pmd_pagesize();

	if (!check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages, hpage_size))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}
196
/* Anonymous private memory backend; no alias mapping, no PMD check. */
struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};

/* Shmem (memfd) backend; the only one providing a PMD-mapping check. */
struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

/* Hugetlb backend; shared vs private is chosen via gopts->map_shared. */
struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};
217
uffd_stats_report(struct uffd_args * args,int n_cpus)218 void uffd_stats_report(struct uffd_args *args, int n_cpus)
219 {
220 int i;
221 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
222
223 for (i = 0; i < n_cpus; i++) {
224 miss_total += args[i].missing_faults;
225 wp_total += args[i].wp_faults;
226 minor_total += args[i].minor_faults;
227 }
228
229 printf("userfaults: ");
230 if (miss_total) {
231 printf("%llu missing (", miss_total);
232 for (i = 0; i < n_cpus; i++)
233 printf("%lu+", args[i].missing_faults);
234 printf("\b) ");
235 }
236 if (wp_total) {
237 printf("%llu wp (", wp_total);
238 for (i = 0; i < n_cpus; i++)
239 printf("%lu+", args[i].wp_faults);
240 printf("\b) ");
241 }
242 if (minor_total) {
243 printf("%llu minor (", minor_total);
244 for (i = 0; i < n_cpus; i++)
245 printf("%lu+", args[i].minor_faults);
246 printf("\b)");
247 }
248 printf("\n");
249 }
250
/*
 * Open a userfaultfd into gopts->uffd and handshake UFFD_API with the
 * requested *features; on success *features is replaced by the feature
 * set the kernel reports.  Returns 0 on success, -1 when the uffd can't
 * be opened or UFFDIO_API is refused (e.g. insufficient privilege).
 */
int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features)
{
	struct uffdio_api api_req;

	gopts->uffd = uffd_open(UFFD_FLAGS);
	if (gopts->uffd < 0)
		return -1;
	gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL);

	api_req.api = UFFD_API;
	api_req.features = *features;
	if (ioctl(gopts->uffd, UFFDIO_API, &api_req))
		/* Probably lack of CAP_PTRACE? */
		return -1;
	if (api_req.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)api_req.api);

	*features = api_req.features;
	return 0;
}
271
/* Unmap a full-size test area if present, then NULL out the pointer. */
static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area)
{
	if (*area && munmap(*area, gopts->nr_pages * gopts->page_size))
		err("munmap");

	*area = NULL;
}
280
/* Tear down everything uffd_test_ctx_init() created, leaving gopts reusable. */
void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts)
{
	size_t idx;

	if (gopts->pipefd) {
		/* Two fds (read + write end) per parallel worker. */
		for (idx = 0; idx < gopts->nr_parallel * 2; ++idx) {
			if (close(gopts->pipefd[idx]))
				err("close pipefd");
		}
		free(gopts->pipefd);
		gopts->pipefd = NULL;
	}

	/* free(NULL) is a no-op, so no guard needed. */
	free(gopts->count_verify);
	gopts->count_verify = NULL;

	if (gopts->uffd != -1) {
		if (close(gopts->uffd))
			err("close uffd");
		gopts->uffd = -1;
	}

	munmap_area(gopts, (void **)&gopts->area_src);
	munmap_area(gopts, (void **)&gopts->area_src_alias);
	munmap_area(gopts, (void **)&gopts->area_dst);
	munmap_area(gopts, (void **)&gopts->area_dst_alias);
	munmap_area(gopts, (void **)&gopts->area_remap);
}
311
/*
 * (Re-)initialize the global test context: allocate the src/dst areas,
 * open the userfaultfd, seed area_src with a per-page mutex + counter,
 * and create the per-worker wakeup pipes.
 *
 * Returns 0 on success; on failure returns the underlying error and, when
 * @errmsg is non-NULL, points it at a human-readable reason.
 */
int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	gopts->area_src_alias = NULL;
	gopts->area_dst_alias = NULL;
	gopts->area_remap = NULL;

	/* Optional per-test-case hook run before any allocation. */
	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
		ret = uffd_test_case_ops->pre_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_src, true);
	ret |= uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	/* Optional per-test-case hook run once both areas exist. */
	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
		ret = uffd_test_case_ops->post_alloc(gopts, errmsg);
		if (ret)
			return ret;
	}

	ret = userfaultfd_open(gopts, &features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long));
	if (!gopts->count_verify)
		err("count_verify");

	for (nr = 0; nr < gopts->nr_pages; nr++) {
		/* Each src page holds an initialized mutex at offset 0... */
		*area_mutex(gopts->area_src, nr, gopts) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		/* ...followed by a counter starting at 1, mirrored in count_verify. */
		gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(gopts->area_src, nr, gopts) + 1) = 1;
	}

	/*
	 * After initialization of area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty. Otherwise we could have
	 * some area_dst pages be erroneously initialized with zero pages,
	 * hence we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled, above allocate_area()
	 * calls could have the two areas merged into a single VMA (as they
	 * will have the same VMA flags so they're mergeable). When we
	 * initialize the area_src above, it's possible that some part of
	 * area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst. It could cause some of the
	 * area_dst won't be trapped by missing userfaults.
	 *
	 * This release_pages() will guarantee even if that happened, we'll
	 * proactively split the thp and drop any accidentally initialized
	 * pages within area_dst.
	 */
	uffd_test_ops->release_pages(gopts, gopts->area_dst);

	/* One non-blocking pipe pair per parallel worker, used for shutdown wakeups. */
	gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2);
	if (!gopts->pipefd)
		err("pipefd");
	for (cpu = 0; cpu < gopts->nr_parallel; cpu++)
		if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}
395
/*
 * Write-protect (@wp == true) or un-write-protect (@wp == false) the range
 * [start, start+len); clearing WP also wakes up any blocked faulters.
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms = {
		.range = { .start = start, .len = len },
		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}
409
/*
 * Resolve a minor fault over [start, start+len) with UFFDIO_CONTINUE,
 * optionally keeping the range write-protected, then deliberately repeat
 * the ioctl to exercise the kernel's -EEXIST error path.
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	/* The second call must fail; kernel reports the real error in req.mapped. */
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}
436
/*
 * Read one uffd_msg from gopts->uffd.  Returns 0 on success, 1 when the
 * read was interrupted or would block (EINTR/EAGAIN); any other failure
 * or a short read aborts via err().
 */
int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg)
{
	ssize_t got = read(gopts->uffd, msg, sizeof(*msg));

	if (got == (ssize_t)sizeof(*msg))
		return 0;

	if (got < 0) {
		if (errno == EAGAIN || errno == EINTR)
			return 1;
		err("blocking read error");
	} else {
		err("short read");
	}

	return 0;
}
453
/*
 * Handle one page-fault message read from gopts->uffd, dispatching on the
 * fault flags: WP faults get un-write-protected, MINOR faults have their
 * page contents bit-flipped and are resolved via UFFDIO_CONTINUE, MISSING
 * faults are resolved via UFFDIO_COPY.  The matching counter in @args is
 * bumped for each fault handled.
 */
void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg,
			    struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Translate the faulting alias address back into area_dst. */
		area = (uint8_t *)(gopts->area_dst +
				   ((char *)msg->arg.pagefault.address -
				    gopts->area_dst_alias));
		for (b = 0; b < gopts->page_size; ++b)
			area[b] = ~area[b];
		continue_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults. It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we relied on an NPTL glibc impl detail to
		 * always read the lock type at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * doing any locking operations to guarantee that. It's
		 * actually not good to rely on this impl detail because
		 * logically a pthread-compatible lib can implement the
		 * locks without types and we can fail when linking with
		 * them. However since we used to find bugs with this
		 * strict check we still keep it around. Hopefully this
		 * could be a good hint when it fails again. If one day
		 * it'll break on some other impl of glibc we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		/* Page-align the fault address relative to area_dst. */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst;
		offset &= ~(gopts->page_size-1);

		/* copy_page() returns 1 only when this call populated the page. */
		if (copy_page(gopts, offset, args->apply_wp))
			args->missing_faults++;
	}
}
522
/*
 * Worker thread: poll gopts->uffd for fault/event messages and the
 * per-cpu pipe for a shutdown byte.  Page faults are dispatched to
 * args->handle_fault (defaulting to uffd_handle_page_fault); FORK, REMOVE
 * and REMAP events update the shared context accordingly.  Returns NULL
 * after a byte arrives on the pipe.
 */
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	uffd_global_test_opts_t *gopts = args->gopts;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = gopts->uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = gopts->pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	/* Signal the spawner that this thread is now polling. */
	gopts->ready_for_fork = true;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			/* Any byte on the pipe is a shutdown request. */
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(gopts, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(gopts, &msg, args);
			break;
		case UFFD_EVENT_FORK:
			/* Switch to the child's uffd delivered with the event. */
			close(gopts->uffd);
			gopts->uffd = msg.arg.fork.ufd;
			pollfd[0].fd = gopts->uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			gopts->area_remap = gopts->area_dst; /* save for later unmap */
			gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
590
/*
 * Re-issue a UFFDIO_COPY through the backend's alias mapping and expect
 * it to fail with -EEXIST, since the page was copied already; anything
 * else is a test failure.
 */
static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(gopts, &uffdio_copy->dst,
				     uffdio_copy->len, offset);

	if (!ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy))
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	/* real retval in ufdio_copy.copy */
	if (uffdio_copy->copy != -EEXIST)
		err("UFFDIO_COPY retry error: %"PRId64,
		    (int64_t)uffdio_copy->copy);
}
608
wake_range(int ufd,unsigned long addr,unsigned long len)609 static void wake_range(int ufd, unsigned long addr, unsigned long len)
610 {
611 struct uffdio_range uffdio_wake;
612
613 uffdio_wake.start = addr;
614 uffdio_wake.len = len;
615
616 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
617 fprintf(stderr, "error waking %lu\n",
618 addr), exit(1);
619 }
620
/*
 * Resolve a missing fault at @offset with UFFDIO_COPY (optionally
 * write-protected).  Returns 1 when this call populated the page, 0 when
 * the page already existed (-EEXIST).  When @retry is set and the test
 * requested it, a second copy is issued through the alias mapping to
 * exercise the -EEXIST path.
 */
int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu\n", offset);

	uffdio_copy.dst = (unsigned long)gopts->area_dst + offset;
	uffdio_copy.src = (unsigned long)gopts->area_src + offset;
	uffdio_copy.len = gopts->page_size;
	uffdio_copy.mode = wp ? UFFDIO_COPY_MODE_WP : 0;
	uffdio_copy.copy = 0;

	if (ioctl(gopts->uffd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		/* Page already present: wake whoever faulted on it. */
		wake_range(gopts->uffd, uffdio_copy.dst, gopts->page_size);
		return 0;
	}

	if (uffdio_copy.copy != gopts->page_size)
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);

	if (gopts->test_uffdio_copy_eexist && retry) {
		gopts->test_uffdio_copy_eexist = false;
		retry_copy_page(gopts, &uffdio_copy, offset);
	}
	return 1;
}
652
/* Non-retrying convenience wrapper around __copy_page(). */
int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp)
{
	return __copy_page(gopts, offset, false, wp);
}
657
/*
 * Move @len bytes at @offset from area_src into area_dst via UFFDIO_MOVE.
 * Returns 1 when the move succeeded, 0 when the destination already
 * existed (-EEXIST, waiters are woken); other errors abort the test.
 */
int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len)
{
	struct uffdio_move uffdio_move;

	if (offset + len > gopts->nr_pages * gopts->page_size)
		err("unexpected offset %lu and length %lu\n", offset, len);

	uffdio_move.dst = (unsigned long)gopts->area_dst + offset;
	uffdio_move.src = (unsigned long)gopts->area_src + offset;
	uffdio_move.len = len;
	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
	uffdio_move.move = 0;

	if (ioctl(gopts->uffd, UFFDIO_MOVE, &uffdio_move)) {
		/* real retval in uffdio_move.move */
		if (uffdio_move.move != -EEXIST)
			err("UFFDIO_MOVE error: %"PRId64,
			    (int64_t)uffdio_move.move);
		wake_range(gopts->uffd, uffdio_move.dst, len);
		return 0;
	}

	if (uffdio_move.move != len)
		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
	return 1;
}
681
uffd_open_dev(unsigned int flags)682 int uffd_open_dev(unsigned int flags)
683 {
684 int fd, uffd;
685
686 fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
687 if (fd < 0)
688 return fd;
689 uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
690 close(fd);
691
692 return uffd;
693 }
694
/* Obtain a uffd via the userfaultfd() syscall; -1 if unavailable at build time. */
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}
703
/* Try the userfaultfd() syscall first, falling back to /dev/userfaultfd. */
int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	return uffd < 0 ? uffd_open_dev(flags) : uffd;
}
713
/*
 * Query the kernel's supported uffd feature bits into *features.
 * Returns 0 on success, a negative fd/errno value on failure.
 *
 * Fix: the UFFDIO_API failure path previously did close(fd) before
 * reading errno, and close() may overwrite errno; save it first.
 */
int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	/*
	 * This should by default work in most kernels; the feature list
	 * will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);

	if (fd < 0)
		/* Maybe the kernel is older than user-only mode? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
		int ret = -errno;	/* save before close() can clobber errno */

		close(fd);
		return ret;
	}

	*features = uffdio_api.features;
	close(fd);

	return 0;
}
740