/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
#include "swap.h"

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED	PAGECACHE_TAG_TOWRITE
#define LAST_SCAN		4	/* about 150ms max */

static bool memfd_folio_has_extra_refs(struct folio *folio)
{
	return folio_ref_count(folio) != folio_expected_ref_count(folio);
}

static void memfd_tag_pins(struct xa_state *xas)
{
	struct folio *folio;
	int latency = 0;

	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, folio, ULONG_MAX) {
		if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
			xas_set_mark(xas, MEMFD_TAG_PINNED);

		if (++latency < XA_CHECK_SCHED)
			continue;
		latency = 0;

		xas_pause(xas);
		xas_unlock_irq(xas);
		cond_resched();
		xas_lock_irq(xas);
	}
	xas_unlock_irq(xas);
}

/*
 * This is a helper function used by memfd_pin_folios() in GUP (gup.c). It is
 * mainly called to allocate a folio in a memfd when the caller
 * (memfd_pin_folios()) cannot find a folio in the page cache at a given
 * index in the mapping.
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct folio *folio;
	gfp_t gfp_mask;

	if (is_file_hugepages(memfd)) {
		/*
		 * The folio would most likely be accessed by a DMA driver,
		 * therefore, we have zone memory constraints where we can
		 * alloc from. Also, the folio will be pinned for an indefinite
		 * amount of time, so it is not expected to be migrated away.
		 */
		struct inode *inode = file_inode(memfd);
		struct hstate *h = hstate_file(memfd);
		int err = -ENOMEM;
		long nr_resv;

		gfp_mask = htlb_alloc_mask(h);
		gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
		idx >>= huge_page_order(h);

		nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
		if (nr_resv < 0)
			return ERR_PTR(nr_resv);

		folio = alloc_hugetlb_folio_reserve(h,
						    numa_node_id(),
						    NULL,
						    gfp_mask);
		if (folio) {
			u32 hash;

			/*
			 * Zero the folio to prevent information leaks to
			 * userspace. Use folio_zero_user() which is optimized
			 * for huge/gigantic pages. Pass 0 as addr_hint since
			 * this is not a faulting path and we don't have a
			 * user virtual address yet.
			 */
			folio_zero_user(folio, 0);

			/*
			 * Mark the folio uptodate before adding to page cache,
			 * as required by filemap.c and other hugetlb paths.
			 */
			__folio_mark_uptodate(folio);

			/*
			 * Serialize hugepage allocation and instantiation to
			 * prevent races with concurrent allocations, as
			 * required by all other callers of
			 * hugetlb_add_to_page_cache().
			 */
			hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			err = hugetlb_add_to_page_cache(folio,
							memfd->f_mapping,
							idx);

			mutex_unlock(&hugetlb_fault_mutex_table[hash]);

			if (err) {
				folio_put(folio);
				goto err_unresv;
			}

			hugetlb_set_folio_subpool(folio, subpool_inode(inode));
			folio_unlock(folio);
			return folio;
		}
err_unresv:
		if (nr_resv > 0)
			hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
		return ERR_PTR(err);
	}
#endif
	return shmem_read_folio(memfd->f_mapping, idx);
}
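/*
 * Hedged usage sketch (not part of this file): the expected caller is a
 * driver that pins memfd-backed folios for long-term DMA via
 * memfd_pin_folios() in gup.c, which in turn calls memfd_alloc_folio() for
 * indices not yet present in the page cache. Parameter names below are
 * illustrative; see include/linux/memfd.h for the authoritative prototype.
 *
 *	struct folio *folios[16];
 *	pgoff_t offset;
 *	long nr;
 *
 *	nr = memfd_pin_folios(memfd, start, end, folios,
 *			      ARRAY_SIZE(folios), &offset);
 *	if (nr < 0)
 *		return nr;
 *	...map the pinned folios for DMA...
 *	unpin_folios(folios, nr);
 */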
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all folios
 * and see whether they have elevated ref-counts. If so, we tag them and wait
 * for them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	int error, scan;

	memfd_tag_pins(&xas);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		int latency = 0;

		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;

		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool clear = true;

			if (!xa_is_value(folio) &&
			    memfd_folio_has_extra_refs(folio)) {
				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found folios pinned.
				 */
				if (scan == LAST_SCAN)
					error = -EBUSY;
				else
					clear = false;
			}
			if (clear)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

			if (++latency < XA_CHECK_SCHED)
				continue;
			latency = 0;

			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
		xas_unlock_irq(&xas);
	}

	return error;
}

static unsigned int *memfd_file_seals_ptr(struct file *file)
{
	if (shmem_file(file))
		return &SHMEM_I(file_inode(file))->seals;

#ifdef CONFIG_HUGETLBFS
	if (is_file_hugepages(file))
		return &HUGETLBFS_I(file_inode(file))->seals;
#endif

	return NULL;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_EXEC | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE | \
		     F_SEAL_FUTURE_WRITE)

static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *file_seals;
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
	 * but restrict access to a specific subset of file operations. Seals
	 * can only be added, but never removed. This way, mutually untrusted
	 * parties can share common memory regions with a well-defined policy.
	 * A malicious peer can thus never perform unwanted operations on a
	 * shared object.
	 *
	 * Seals are only supported on special tmpfs or hugetlbfs files and
	 * always affect the whole underlying inode. Once a seal is set, it
	 * may prevent some kinds of access to the file. Currently, the
	 * following seals are defined:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "setting seals"-operation can be
	 * sealed itself, which basically prevents any further seal from being
	 * added.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous tmpfs and hugetlbfs files support sealing. More
	 * importantly, seals are never written to disk. Therefore, there's
	 * no plan to support it on other file types.
	 */

	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	file_seals = memfd_file_seals_ptr(file);
	if (!file_seals) {
		error = -EINVAL;
		goto unlock;
	}

	if (*file_seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = memfd_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	/*
	 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
	 */
	if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
		seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

	*file_seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}
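/*
 * Hedged userspace sketch of the sealing flow implemented above (illustrative
 * only, not kernel code): create a sealable memfd, populate it, then lock it
 * down with fcntl(F_ADD_SEALS) and query the result with F_GET_SEALS.
 *
 *	int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *	ftruncate(fd, 4096);
 *	...write the shared data...
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW |
 *			       F_SEAL_WRITE | F_SEAL_SEAL);
 *	int seals = fcntl(fd, F_GET_SEALS);
 *
 * Once F_SEAL_WRITE is set no fd holder can modify the contents, and once
 * F_SEAL_SEAL is set no further seals can be added, matching the policy
 * described in the comment block above.
 */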
static int memfd_get_seals(struct file *file)
{
	unsigned int *seals = memfd_file_seals_ptr(file);

	return seals ? *seals : -EINVAL;
}
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	long error;

	switch (cmd) {
	case F_ADD_SEALS:
		error = memfd_add_seals(file, arg);
		break;
	case F_GET_SEALS:
		error = memfd_get_seals(file);
		break;
	default:
		error = -EINVAL;
		break;
	}

	return error;
}

#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
	struct pid_namespace *ns = task_active_pid_ns(current);
	int sysctl = pidns_memfd_noexec_scope(ns);

	if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
		if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
			*flags |= MFD_NOEXEC_SEAL;
		else
			*flags |= MFD_EXEC;
	}

	if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
		pr_err_ratelimited(
			"%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
			current->comm, task_pid_nr(current), sysctl);
		return -EACCES;
	}
#endif
	return 0;
}

static inline bool is_write_sealed(unsigned int seals)
{
	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
}

static int check_write_seal(vm_flags_t *vm_flags_ptr)
{
	vm_flags_t vm_flags = *vm_flags_ptr;
	vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);

	/* If a private mapping then writability is irrelevant. */
	if (!(mask & VM_SHARED))
		return 0;

	/*
	 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
	 * write seals are active.
	 */
	if (mask & VM_WRITE)
		return -EPERM;

	/*
	 * This is a read-only mapping, disallow mprotect() from making a
	 * write-sealed mapping writable in future.
	 */
	*vm_flags_ptr &= ~VM_MAYWRITE;

	return 0;
}

int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
{
	int err = 0;
	unsigned int *seals_ptr = memfd_file_seals_ptr(file);
	unsigned int seals = seals_ptr ? *seals_ptr : 0;

	if (is_write_sealed(seals))
		err = check_write_seal(vm_flags_ptr);

	return err;
}
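/*
 * Hedged userspace illustration of the write-seal mmap() rules enforced by
 * check_write_seal() above (example only): with F_SEAL_WRITE or
 * F_SEAL_FUTURE_WRITE set,
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
 *		fails with EPERM (new shared writable mappings are denied);
 *	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0)
 *		succeeds, but VM_MAYWRITE is cleared, so a later
 *		mprotect(p, len, PROT_READ | PROT_WRITE) fails;
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)
 *		succeeds, since writability of private mappings is irrelevant.
 */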
static int sanitize_flags(unsigned int *flags_ptr)
{
	unsigned int flags = *flags_ptr;

	if (!(flags & MFD_HUGETLB)) {
		if (flags & ~MFD_ALL_FLAGS)
			return -EINVAL;
	} else {
		/* Allow huge page size encoding in flags. */
		if (flags & ~(MFD_ALL_FLAGS |
				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
			return -EINVAL;
	}

	/* Invalid if both EXEC and NOEXEC_SEAL are set. */
	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
		return -EINVAL;

	return check_sysctl_memfd_noexec(flags_ptr);
}

static char *alloc_name(const char __user *uname)
{
	int error;
	char *name;
	long len;

	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name)
		return ERR_PTR(-ENOMEM);

	memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
	/* returned length does not include terminating zero */
	len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
	if (len < 0) {
		error = -EFAULT;
		goto err_name;
	} else if (len > MFD_NAME_MAX_LEN) {
		error = -EINVAL;
		goto err_name;
	}

	return name;

err_name:
	kfree(name);
	return ERR_PTR(error);
}

static struct file *alloc_file(const char *name, unsigned int flags)
{
	unsigned int *file_seals;
	struct file *file;
	struct inode *inode;
	int err = 0;

	if (flags & MFD_HUGETLB) {
		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
					  HUGETLB_ANONHUGE_INODE,
					  (flags >> MFD_HUGE_SHIFT) &
					  MFD_HUGE_MASK);
	} else {
		file = shmem_file_setup(name, 0, VM_NORESERVE);
	}
	if (IS_ERR(file))
		return file;

	inode = file_inode(file);
	err = security_inode_init_security_anon(inode,
						&QSTR(MEMFD_ANON_NAME), NULL);
	if (err) {
		fput(file);
		file = ERR_PTR(err);
		return file;
	}

	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_LARGEFILE;

	if (flags & MFD_NOEXEC_SEAL) {
		inode->i_mode &= ~0111;
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals) {
			*file_seals &= ~F_SEAL_SEAL;
			*file_seals |= F_SEAL_EXEC;
		}
	} else if (flags & MFD_ALLOW_SEALING) {
		/* MFD_EXEC and MFD_ALLOW_SEALING are set */
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals)
			*file_seals &= ~F_SEAL_SEAL;
	}

	return file;
}

SYSCALL_DEFINE2(memfd_create,
		const char __user *, uname,
		unsigned int, flags)
{
	char *name __free(kfree) = NULL;
	unsigned int fd_flags;
	int error;

	error = sanitize_flags(&flags);
	if (error < 0)
		return error;

	name = alloc_name(uname);
	if (IS_ERR(name))
		return PTR_ERR(name);

	fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
	return FD_ADD(fd_flags, alloc_file(name, flags));
}
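/*
 * End-to-end userspace sketch of the flag handling above (hedged example,
 * names and sizes are illustrative): the name is prefixed with "memfd:",
 * MFD_CLOEXEC becomes O_CLOEXEC on the returned fd, and MFD_NOEXEC_SEAL
 * clears the exec bits and pre-sets F_SEAL_EXEC while leaving the file
 * sealable. On kernels with vm.memfd_noexec >= 1, callers passing neither
 * MFD_EXEC nor MFD_NOEXEC_SEAL get MFD_NOEXEC_SEAL applied for them.
 *
 *	int fd = memfd_create("frame-buf",
 *			      MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_NOEXEC_SEAL);
 *	if (fd < 0)
 *		err(1, "memfd_create");
 *	ftruncate(fd, 2 * 1024 * 1024);
 *	void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 */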