1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * memfd GUP test-case 4 * This tests memfd interactions with get_user_pages(). We require the 5 * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This 6 * file-system delays _all_ reads by 1s and forces direct-IO. This means, any 7 * read() on files in that file-system will pin the receive-buffer pages for at 8 * least 1s via get_user_pages(). 9 * 10 * We use this trick to race ADD_SEALS against a write on a memfd object. The 11 * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use 12 * the read() syscall with our memory-mapped memfd object as receive buffer to 13 * force the kernel to write into our memfd object. 14 */ 15 16 #define _GNU_SOURCE 17 #define __EXPORTED_HEADERS__ 18 19 #include <errno.h> 20 #include <inttypes.h> 21 #include <limits.h> 22 #include <linux/falloc.h> 23 #include <fcntl.h> 24 #include <linux/memfd.h> 25 #include <sched.h> 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <signal.h> 29 #include <string.h> 30 #include <sys/mman.h> 31 #include <sys/stat.h> 32 #include <sys/syscall.h> 33 #include <sys/wait.h> 34 #include <unistd.h> 35 36 #include "common.h" 37 38 #define MFD_DEF_SIZE 8192 39 #define STACK_SIZE 65536 40 41 static size_t mfd_def_size = MFD_DEF_SIZE; 42 43 static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) 44 { 45 int r, fd; 46 47 fd = sys_memfd_create(name, flags); 48 if (fd < 0) { 49 printf("memfd_create(\"%s\", %u) failed: %m\n", 50 name, flags); 51 abort(); 52 } 53 54 r = ftruncate(fd, sz); 55 if (r < 0) { 56 printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); 57 abort(); 58 } 59 60 return fd; 61 } 62 63 static __u64 mfd_assert_get_seals(int fd) 64 { 65 long r; 66 67 r = fcntl(fd, F_GET_SEALS); 68 if (r < 0) { 69 printf("GET_SEALS(%d) failed: %m\n", fd); 70 abort(); 71 } 72 73 return r; 74 } 75 76 static void mfd_assert_has_seals(int fd, __u64 seals) 77 { 78 __u64 s; 79 80 s = mfd_assert_get_seals(fd); 81 if (s != seals) { 82 printf("%llu != %llu = GET_SEALS(%d)\n", 83 (unsigned long long)seals, (unsigned long long)s, fd); 84 abort(); 85 } 86 } 87 88 static void mfd_assert_add_seals(int fd, __u64 seals) 89 { 90 long r; 91 __u64 s; 92 93 s = mfd_assert_get_seals(fd); 94 r = fcntl(fd, F_ADD_SEALS, seals); 95 if (r < 0) { 96 printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", 97 fd, (unsigned long long)s, (unsigned long long)seals); 98 abort(); 99 } 100 } 101 102 static int mfd_busy_add_seals(int fd, __u64 seals) 103 { 104 long r; 105 __u64 s; 106 107 r = fcntl(fd, F_GET_SEALS); 108 if (r < 0) 109 s = 0; 110 else 111 s = r; 112 113 r = fcntl(fd, F_ADD_SEALS, seals); 114 if (r < 0 && errno != EBUSY) { 115 printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n", 116 fd, (unsigned long long)s, (unsigned long long)seals); 117 abort(); 118 } 119 120 return r; 121 } 122 123 static void *mfd_assert_mmap_shared(int fd) 124 { 125 void *p; 126 127 p = mmap(NULL, 128 mfd_def_size, 129 PROT_READ | PROT_WRITE, 130 MAP_SHARED, 131 fd, 132 0); 133 if (p == MAP_FAILED) { 134 printf("mmap() failed: %m\n"); 135 abort(); 136 } 137 138 return p; 139 } 140 141 static void *mfd_assert_mmap_private(int fd) 142 { 143 void *p; 144 145 p = mmap(NULL, 146 mfd_def_size, 147 PROT_READ | PROT_WRITE, 148 MAP_PRIVATE, 149 fd, 150 0); 151 if (p == MAP_FAILED) { 152 printf("mmap() failed: %m\n"); 153 abort(); 154 } 155 156 return p; 157 } 158 159 static int global_mfd = -1; 160 static void *global_p = NULL; 161 162 static int sealing_thread_fn(void *arg) 163 { 164 int sig, r; 165 166 /* 167 * This thread first waits 200ms so any pending operation in the parent 168 * is correctly started. After that, it tries to seal @global_mfd as 169 * SEAL_WRITE. This _must_ fail as the parent thread has a read() into 170 * that memory mapped object still ongoing. 171 * We then wait one more second and try sealing again. This time it 172 * must succeed as there shouldn't be anyone else pinning the pages. 173 */ 174 175 /* wait 200ms for FUSE-request to be active */ 176 usleep(200000); 177 178 /* unmount mapping before sealing to avoid i_mmap_writable failures */ 179 munmap(global_p, mfd_def_size); 180 181 /* Try sealing the global file; expect EBUSY or success. Current 182 * kernels will never succeed, but in the future, kernels might 183 * implement page-replacements or other fancy ways to avoid racing 184 * writes. */ 185 r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); 186 if (r >= 0) { 187 printf("HURRAY! This kernel fixed GUP races!\n"); 188 } else { 189 /* wait 1s more so the FUSE-request is done */ 190 sleep(1); 191 192 /* try sealing the global file again */ 193 mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); 194 } 195 196 return 0; 197 } 198 199 static pid_t spawn_sealing_thread(void) 200 { 201 uint8_t *stack; 202 pid_t pid; 203 204 stack = malloc(STACK_SIZE); 205 if (!stack) { 206 printf("malloc(STACK_SIZE) failed: %m\n"); 207 abort(); 208 } 209 210 pid = clone(sealing_thread_fn, 211 stack + STACK_SIZE, 212 SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, 213 NULL); 214 if (pid < 0) { 215 printf("clone() failed: %m\n"); 216 abort(); 217 } 218 219 return pid; 220 } 221 222 static void join_sealing_thread(pid_t pid) 223 { 224 waitpid(pid, NULL, 0); 225 } 226 227 int main(int argc, char **argv) 228 { 229 char *zero; 230 int fd, mfd, r; 231 void *p; 232 int was_sealed; 233 pid_t pid; 234 235 if (argc < 2) { 236 printf("error: please pass path to file in fuse_mnt mount-point\n"); 237 abort(); 238 } 239 240 if (argc >= 3) { 241 if (!strcmp(argv[2], "hugetlbfs")) { 242 unsigned long hpage_size = default_huge_page_size(); 243 244 if (!hpage_size) { 245 printf("Unable to determine huge page size\n"); 246 abort(); 247 } 248 249 hugetlbfs_test = 1; 250 mfd_def_size = hpage_size * 2; 251 } else { 252 printf("Unknown option: %s\n", argv[2]); 253 abort(); 254 } 255 } 256 257 zero = calloc(sizeof(*zero), mfd_def_size); 258 259 /* open FUSE memfd file for GUP testing */ 260 printf("opening: %s\n", argv[1]); 261 fd = open(argv[1], O_RDONLY | O_CLOEXEC); 262 if (fd < 0) { 263 printf("cannot open(\"%s\"): %m\n", argv[1]); 264 abort(); 265 } 266 267 /* create new memfd-object */ 268 mfd = mfd_assert_new("kern_memfd_fuse", 269 mfd_def_size, 270 MFD_CLOEXEC | MFD_ALLOW_SEALING); 271 272 /* mmap memfd-object for writing */ 273 p = mfd_assert_mmap_shared(mfd); 274 275 /* pass mfd+mapping to a separate sealing-thread which tries to seal 276 * the memfd objects with SEAL_WRITE while we write into it */ 277 global_mfd = mfd; 278 global_p = p; 279 pid = spawn_sealing_thread(); 280 281 /* Use read() on the FUSE file to read into our memory-mapped memfd 282 * object. This races the other thread which tries to seal the 283 * memfd-object. 284 * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. 285 * This guarantees that the receive-buffer is pinned for 1s until the 286 * data is written into it. The racing ADD_SEALS should thus fail as 287 * the pages are still pinned. */ 288 r = read(fd, p, mfd_def_size); 289 if (r < 0) { 290 printf("read() failed: %m\n"); 291 abort(); 292 } else if (!r) { 293 printf("unexpected EOF on read()\n"); 294 abort(); 295 } 296 297 was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; 298 299 /* Wait for sealing-thread to finish and verify that it 300 * successfully sealed the file after the second try. */ 301 join_sealing_thread(pid); 302 mfd_assert_has_seals(mfd, F_SEAL_WRITE); 303 304 /* *IF* the memfd-object was sealed at the time our read() returned, 305 * then the kernel did a page-replacement or canceled the read() (or 306 * whatever magic it did..). In that case, the memfd object is still 307 * all zero. 308 * In case the memfd-object was *not* sealed, the read() was successfull 309 * and the memfd object must *not* be all zero. 310 * Note that in real scenarios, there might be a mixture of both, but 311 * in this test-cases, we have explicit 200ms delays which should be 312 * enough to avoid any in-flight writes. */ 313 314 p = mfd_assert_mmap_private(mfd); 315 if (was_sealed && memcmp(p, zero, mfd_def_size)) { 316 printf("memfd sealed during read() but data not discarded\n"); 317 abort(); 318 } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { 319 printf("memfd sealed after read() but data discarded\n"); 320 abort(); 321 } 322 323 close(mfd); 324 close(fd); 325 326 printf("fuse: DONE\n"); 327 free(zero); 328 329 return 0; 330 } 331