// SPDX-License-Identifier: GPL-2.0
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
#include "swap.h"

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

static bool memfd_folio_has_extra_refs(struct folio *folio)
{
        return folio_ref_count(folio) != folio_expected_ref_count(folio);
}

static void memfd_tag_pins(struct xa_state *xas)
{
        struct folio *folio;
        int latency = 0;

        lru_add_drain();

        xas_lock_irq(xas);
        xas_for_each(xas, folio, ULONG_MAX) {
                if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
                        xas_set_mark(xas, MEMFD_TAG_PINNED);

                if (++latency < XA_CHECK_SCHED)
                        continue;
                latency = 0;

                xas_pause(xas);
                xas_unlock_irq(xas);
                cond_resched();
                xas_lock_irq(xas);
        }
        xas_unlock_irq(xas);
}

/*
 * This is a helper function used by memfd_pin_folios() in GUP (gup.c).
 * It is mainly called to allocate a folio in a memfd when the caller
 * (memfd_pin_folios()) cannot find a folio in the page cache at a given
 * index in the mapping.
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct folio *folio;
        gfp_t gfp_mask;

        if (is_file_hugepages(memfd)) {
                /*
                 * The folio would most likely be accessed by a DMA driver,
                 * therefore, we have zone memory constraints where we can
                 * alloc from. Also, the folio will be pinned for an indefinite
                 * amount of time, so it is not expected to be migrated away.
                 */
                struct inode *inode = file_inode(memfd);
                struct hstate *h = hstate_file(memfd);
                int err = -ENOMEM;
                long nr_resv;

                gfp_mask = htlb_alloc_mask(h);
                gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
                idx >>= huge_page_order(h);

                nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
                if (nr_resv < 0)
                        return ERR_PTR(nr_resv);

                folio = alloc_hugetlb_folio_reserve(h, numa_node_id(),
                                                    NULL, gfp_mask);
                if (folio) {
                        u32 hash;

                        /*
                         * Zero the folio to prevent information leaks to userspace.
                         * Use folio_zero_user() which is optimized for huge/gigantic
                         * pages. Pass 0 as addr_hint since this is not a faulting path
                         * and we don't have a user virtual address yet.
                         */
                        folio_zero_user(folio, 0);

                        /*
                         * Mark the folio uptodate before adding to page cache,
                         * as required by filemap.c and other hugetlb paths.
                         */
                        __folio_mark_uptodate(folio);

                        /*
                         * Serialize hugepage allocation and instantiation to prevent
                         * races with concurrent allocations, as required by all other
                         * callers of hugetlb_add_to_page_cache().
                         */
                        hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);

                        err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx);

                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                        if (err) {
                                folio_put(folio);
                                goto err_unresv;
                        }

                        hugetlb_set_folio_subpool(folio, subpool_inode(inode));
                        folio_unlock(folio);
                        return folio;
                }
err_unresv:
                if (nr_resv > 0)
                        hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
                return ERR_PTR(err);
        }
#endif
        return shmem_read_folio(memfd->f_mapping, idx);
}

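/*
 * Illustrative sketch (not part of the build) of the call chain that
 * reaches memfd_alloc_folio(): a pinning consumer such as the udmabuf
 * driver asks GUP to pin a range of a memfd, and memfd_pin_folios()
 * falls back to this helper whenever the page cache has no folio at the
 * requested index. The memfd_pin_folios()/unpin_folios() calls below are
 * shown as the author understands them from mm/gup.c; check your tree
 * before relying on the exact signatures.
 *
 *      struct folio *folios[16];
 *      pgoff_t start_offset;
 *      long nr;
 *
 *      nr = memfd_pin_folios(memfd_file, start, end,
 *                            folios, ARRAY_SIZE(folios), &start_offset);
 *      if (nr < 0)
 *              return nr;
 *      ... program DMA with the pinned folios ...
 *      unpin_folios(folios, nr);
 */
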
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 * and see whether they have an elevated ref-count. If so, we tag them and wait
 * for them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        int error, scan;

        memfd_tag_pins(&xas);

        error = 0;
        for (scan = 0; scan <= LAST_SCAN; scan++) {
                int latency = 0;

                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
                        break;

                if (!scan)
                        lru_add_drain_all();
                else if (schedule_timeout_killable((HZ << scan) / 200))
                        scan = LAST_SCAN;

                xas_set(&xas, 0);
                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
                        bool clear = true;

                        if (!xa_is_value(folio) &&
                            memfd_folio_has_extra_refs(folio)) {
                                /*
                                 * On the last scan, we clean up all those tags
                                 * we inserted; but make a note that we still
                                 * found folios pinned.
                                 */
                                if (scan == LAST_SCAN)
                                        error = -EBUSY;
                                else
                                        clear = false;
                        }
                        if (clear)
                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

                        if (++latency < XA_CHECK_SCHED)
                                continue;
                        latency = 0;

                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }

        return error;
}

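/*
 * Userspace-visible effect of the pin scan above (illustrative sketch,
 * not built here): while some other context still holds extra references
 * on the memfd's folios -- for example an in-flight O_DIRECT or AIO
 * request targeting a mapping of the memfd -- F_SEAL_WRITE is refused
 * with EBUSY instead of sealing a file that could still be written
 * through those pins.
 *
 *      if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0 && errno == EBUSY) {
 *              wait for outstanding I/O to drain, then retry
 *      }
 */
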
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
        if (shmem_file(file))
                return &SHMEM_I(file_inode(file))->seals;

#ifdef CONFIG_HUGETLBFS
        if (is_file_hugepages(file))
                return &HUGETLBFS_I(file_inode(file))->seals;
#endif

        return NULL;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_EXEC | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)

static int memfd_add_seals(struct file *file, unsigned int seals)
{
        struct inode *inode = file_inode(file);
        unsigned int *file_seals;
        int error;

        /*
         * SEALING
         * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
         * but restrict access to a specific subset of file operations. Seals
         * can only be added, but never removed. This way, mutually untrusted
         * parties can share common memory regions with a well-defined policy.
         * A malicious peer can thus never perform unwanted operations on a
         * shared object.
         *
         * Seals are only supported on special tmpfs or hugetlbfs files and
         * always affect the whole underlying inode. Once a seal is set, it
         * may prevent some kinds of access to the file. Currently, the
         * following seals are defined:
         *   SEAL_SEAL: Prevent further seals from being set on this file
         *   SEAL_SHRINK: Prevent the file from shrinking
         *   SEAL_GROW: Prevent the file from growing
         *   SEAL_WRITE: Prevent write access to the file
         *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
         *
         * As we don't require any trust relationship between two parties, we
         * must prevent seals from being removed. Therefore, sealing a file
         * only adds a given set of seals to the file, it never touches
         * existing seals. Furthermore, the "setting seals"-operation can be
         * sealed itself, which basically prevents any further seal from being
         * added.
         *
         * Semantics of sealing are only defined on volatile files. Only
         * anonymous tmpfs and hugetlbfs files support sealing. More
         * importantly, seals are never written to disk. Therefore, there's
         * no plan to support it on other file types.
         */

        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;

        inode_lock(inode);

        file_seals = memfd_file_seals_ptr(file);
        if (!file_seals) {
                error = -EINVAL;
                goto unlock;
        }

        if (*file_seals & F_SEAL_SEAL) {
                error = -EPERM;
                goto unlock;
        }

        if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
                error = mapping_deny_writable(file->f_mapping);
                if (error)
                        goto unlock;

                error = memfd_wait_for_pins(file->f_mapping);
                if (error) {
                        mapping_allow_writable(file->f_mapping);
                        goto unlock;
                }
        }

        /*
         * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
         */
        if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
                seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

        *file_seals |= seals;
        error = 0;

unlock:
        inode_unlock(inode);
        return error;
}

static int memfd_get_seals(struct file *file)
{
        unsigned int *seals = memfd_file_seals_ptr(file);

        return seals ? *seals : -EINVAL;
}

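/*
 * Illustrative userspace sketch (not part of the kernel build): the
 * typical producer/consumer sealing sequence that lands in
 * memfd_add_seals() and memfd_get_seals() above. The flag and seal names
 * are the uapi ones; error handling is omitted for brevity.
 *
 *      int fd = memfd_create("shared-buf", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *      ftruncate(fd, 4096);
 *      ... fill the buffer, pass the fd to an untrusted peer ...
 *      fcntl(fd, F_ADD_SEALS,
 *            F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL);
 *
 * The peer can then verify the policy before trusting the contents:
 *
 *      int seals = fcntl(fd, F_GET_SEALS);
 *      if (!(seals & F_SEAL_WRITE))
 *              reject the fd;
 */
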
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        long error;

        switch (cmd) {
        case F_ADD_SEALS:
                error = memfd_add_seals(file, arg);
                break;
        case F_GET_SEALS:
                error = memfd_get_seals(file);
                break;
        default:
                error = -EINVAL;
                break;
        }

        return error;
}

#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
        struct pid_namespace *ns = task_active_pid_ns(current);
        int sysctl = pidns_memfd_noexec_scope(ns);

        if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
                if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
                        *flags |= MFD_NOEXEC_SEAL;
                else
                        *flags |= MFD_EXEC;
        }

        if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
                pr_err_ratelimited(
                        "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
                        current->comm, task_pid_nr(current), sysctl);
                return -EACCES;
        }
#endif
        return 0;
}

static inline bool is_write_sealed(unsigned int seals)
{
        return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
}

static int check_write_seal(vm_flags_t *vm_flags_ptr)
{
        vm_flags_t vm_flags = *vm_flags_ptr;
        vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);

        /* If a private mapping then writability is irrelevant. */
        if (!(mask & VM_SHARED))
                return 0;

        /*
         * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
         * write seals are active.
         */
        if (mask & VM_WRITE)
                return -EPERM;

        /*
         * This is a read-only mapping, disallow mprotect() from making a
         * write-sealed mapping writable in future.
         */
        *vm_flags_ptr &= ~VM_MAYWRITE;

        return 0;
}

int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
{
        int err = 0;
        unsigned int *seals_ptr = memfd_file_seals_ptr(file);
        unsigned int seals = seals_ptr ? *seals_ptr : 0;

        if (is_write_sealed(seals))
                err = check_write_seal(vm_flags_ptr);

        return err;
}

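/*
 * Userspace-visible consequence of check_write_seal() (illustrative
 * sketch, not built here): once a memfd is write-sealed, a new writable
 * shared mapping is refused outright, and a read-only shared mapping
 * loses VM_MAYWRITE so it cannot be upgraded later. Private mappings are
 * unaffected, since only VM_SHARED mappings are checked above.
 *
 *      mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *              -> fails with EPERM
 *      p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *              -> succeeds
 *      mprotect(p, len, PROT_READ | PROT_WRITE);
 *              -> fails (typically EACCES), VM_MAYWRITE was cleared at mmap time
 */
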
static int sanitize_flags(unsigned int *flags_ptr)
{
        unsigned int flags = *flags_ptr;

        if (!(flags & MFD_HUGETLB)) {
                if (flags & ~MFD_ALL_FLAGS)
                        return -EINVAL;
        } else {
                /* Allow huge page size encoding in flags. */
                if (flags & ~(MFD_ALL_FLAGS |
                              (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
                        return -EINVAL;
        }

        /* Invalid if both EXEC and NOEXEC_SEAL are set. */
        if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
                return -EINVAL;

        return check_sysctl_memfd_noexec(flags_ptr);
}

static char *alloc_name(const char __user *uname)
{
        int error;
        char *name;
        long len;

        name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
        if (!name)
                return ERR_PTR(-ENOMEM);

        memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
        /* returned length does not include terminating zero */
        len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
        if (len < 0) {
                error = -EFAULT;
                goto err_name;
        } else if (len > MFD_NAME_MAX_LEN) {
                error = -EINVAL;
                goto err_name;
        }

        return name;

err_name:
        kfree(name);
        return ERR_PTR(error);
}

struct file *memfd_alloc_file(const char *name, unsigned int flags)
{
        unsigned int *file_seals;
        struct file *file;
        struct inode *inode;
        int err = 0;

        if (flags & MFD_HUGETLB) {
                file = hugetlb_file_setup(name, 0, VM_NORESERVE,
                                          HUGETLB_ANONHUGE_INODE,
                                          (flags >> MFD_HUGE_SHIFT) &
                                          MFD_HUGE_MASK);
        } else {
                file = shmem_file_setup(name, 0, VM_NORESERVE);
        }
        if (IS_ERR(file))
                return file;

        inode = file_inode(file);
        err = security_inode_init_security_anon(inode,
                                                &QSTR(MEMFD_ANON_NAME), NULL);
        if (err) {
                fput(file);
                file = ERR_PTR(err);
                return file;
        }

        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        file->f_flags |= O_LARGEFILE;

        if (flags & MFD_NOEXEC_SEAL) {
                inode->i_mode &= ~0111;
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals) {
                        *file_seals &= ~F_SEAL_SEAL;
                        *file_seals |= F_SEAL_EXEC;
                }
        } else if (flags & MFD_ALLOW_SEALING) {
                /* MFD_EXEC and MFD_ALLOW_SEALING are set */
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals)
                        *file_seals &= ~F_SEAL_SEAL;
        }

        return file;
}

SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
{
        char *name __free(kfree) = NULL;
        unsigned int fd_flags;
        int error;

        error = sanitize_flags(&flags);
        if (error < 0)
                return error;

        name = alloc_name(uname);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
        return FD_ADD(fd_flags, memfd_alloc_file(name, flags));
}
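
/*
 * Illustrative syscall-level sketch of the flag handling implemented by
 * sanitize_flags() and memfd_alloc_file() above (userspace code, not
 * built here).
 *
 * Hugetlb-backed memfd with an explicit page size; the size encoding
 * occupies the bits checked against MFD_HUGE_MASK above:
 *
 *      int fd = memfd_create("dma-buf",
 *                            MFD_CLOEXEC | MFD_HUGETLB | MFD_HUGE_2MB);
 *
 * Non-executable, exec-sealed memfd; with vm.memfd_noexec=2 in the
 * caller's pid namespace, omitting MFD_NOEXEC_SEAL makes the call fail
 * with EACCES (see check_sysctl_memfd_noexec()):
 *
 *      int fd = memfd_create("secrets", MFD_CLOEXEC | MFD_NOEXEC_SEAL);
 */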