15d752600SMike Kravetz /* 25d752600SMike Kravetz * memfd_create system call and file sealing support 35d752600SMike Kravetz * 45d752600SMike Kravetz * Code was originally included in shmem.c, and broken out to facilitate 55d752600SMike Kravetz * use by hugetlbfs as well as tmpfs. 65d752600SMike Kravetz * 75d752600SMike Kravetz * This file is released under the GPL. 85d752600SMike Kravetz */ 95d752600SMike Kravetz 105d752600SMike Kravetz #include <linux/fs.h> 115d752600SMike Kravetz #include <linux/vfs.h> 125d752600SMike Kravetz #include <linux/pagemap.h> 135d752600SMike Kravetz #include <linux/file.h> 145d752600SMike Kravetz #include <linux/mm.h> 155d752600SMike Kravetz #include <linux/sched/signal.h> 165d752600SMike Kravetz #include <linux/khugepaged.h> 175d752600SMike Kravetz #include <linux/syscalls.h> 185d752600SMike Kravetz #include <linux/hugetlb.h> 195d752600SMike Kravetz #include <linux/shmem_fs.h> 205d752600SMike Kravetz #include <linux/memfd.h> 21105ff533SJeff Xu #include <linux/pid_namespace.h> 225d752600SMike Kravetz #include <uapi/linux/memfd.h> 235d752600SMike Kravetz 245d752600SMike Kravetz /* 252313216fSMatthew Wilcox * We need a tag: a new tag would expand every xa_node by 8 bytes, 265d752600SMike Kravetz * so reuse a tag which we firmly believe is never set or cleared on tmpfs 275d752600SMike Kravetz * or hugetlbfs because they are memory only filesystems. 
285d752600SMike Kravetz */ 295d752600SMike Kravetz #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE 305d752600SMike Kravetz #define LAST_SCAN 4 /* about 150ms max */ 315d752600SMike Kravetz 
/*
 * True if someone beyond the page cache and the page tables still holds a
 * reference to this folio: the cache accounts for folio_nr_pages() refs and
 * each page-table mapping is counted in folio_mapcount(); anything left over
 * (e.g. a GUP pin) is an "extra" reference.
 */
32*b4d02baaSDavid Hildenbrand static bool memfd_folio_has_extra_refs(struct folio *folio) 33*b4d02baaSDavid Hildenbrand { 34*b4d02baaSDavid Hildenbrand return folio_ref_count(folio) - folio_mapcount(folio) != 35*b4d02baaSDavid Hildenbrand folio_nr_pages(folio); 36*b4d02baaSDavid Hildenbrand } 37*b4d02baaSDavid Hildenbrand 
/*
 * Walk the whole mapping and set MEMFD_TAG_PINNED on every folio that still
 * carries extra references (see memfd_folio_has_extra_refs()).  The xa lock
 * (irq-off) is dropped and cond_resched() is called every XA_CHECK_SCHED
 * entries to keep latency bounded on large mappings.
 */
38ef3038a5SMatthew Wilcox static void memfd_tag_pins(struct xa_state *xas) 395d752600SMike Kravetz { 40*b4d02baaSDavid Hildenbrand struct folio *folio; 41f2b277c4SHugh Dickins int latency = 0; 425d752600SMike Kravetz 435d752600SMike Kravetz lru_add_drain(); 445d752600SMike Kravetz 45ef3038a5SMatthew Wilcox xas_lock_irq(xas); 46*b4d02baaSDavid Hildenbrand xas_for_each(xas, folio, ULONG_MAX) { 47*b4d02baaSDavid Hildenbrand if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) 48f2b277c4SHugh Dickins xas_set_mark(xas, MEMFD_TAG_PINNED); 49f2b277c4SHugh Dickins 50*b4d02baaSDavid Hildenbrand if (++latency < XA_CHECK_SCHED) 51ef3038a5SMatthew Wilcox continue; 52f2b277c4SHugh Dickins latency = 0; 53ef3038a5SMatthew Wilcox 54ef3038a5SMatthew Wilcox xas_pause(xas); 55ef3038a5SMatthew Wilcox xas_unlock_irq(xas); 56ef3038a5SMatthew Wilcox cond_resched(); 57ef3038a5SMatthew Wilcox xas_lock_irq(xas); 585d752600SMike Kravetz } 59ef3038a5SMatthew Wilcox xas_unlock_irq(xas); 605d752600SMike Kravetz } 615d752600SMike Kravetz 625d752600SMike Kravetz /* 635d752600SMike Kravetz * Setting SEAL_WRITE requires us to verify there's no pending writer. However, 645d752600SMike Kravetz * via get_user_pages(), drivers might have some pending I/O without any active 65*b4d02baaSDavid Hildenbrand * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios 665d752600SMike Kravetz * and see whether it has an elevated ref-count. If so, we tag them and wait for 675d752600SMike Kravetz * them to be dropped. 685d752600SMike Kravetz * The caller must guarantee that no new user will acquire writable references 69*b4d02baaSDavid Hildenbrand * to those folios to avoid races. 705d752600SMike Kravetz */ 715d752600SMike Kravetz static int memfd_wait_for_pins(struct address_space *mapping) 725d752600SMike Kravetz { 732313216fSMatthew Wilcox XA_STATE(xas, &mapping->i_pages, 0); 74*b4d02baaSDavid Hildenbrand struct folio *folio; 755d752600SMike Kravetz int error, scan; 765d752600SMike Kravetz 77ef3038a5SMatthew Wilcox memfd_tag_pins(&xas); 785d752600SMike Kravetz 795d752600SMike Kravetz error = 0; 805d752600SMike Kravetz for (scan = 0; scan <= LAST_SCAN; scan++) { 81f2b277c4SHugh Dickins int latency = 0; 822313216fSMatthew Wilcox 832313216fSMatthew Wilcox if (!xas_marked(&xas, MEMFD_TAG_PINNED)) 845d752600SMike Kravetz break; 855d752600SMike Kravetz 865d752600SMike Kravetz if (!scan) 875d752600SMike Kravetz lru_add_drain_all(); 
/* Back off exponentially between rescans; a fatal signal skips to the last scan. */
885d752600SMike Kravetz else if (schedule_timeout_killable((HZ << scan) / 200)) 895d752600SMike Kravetz scan = LAST_SCAN; 905d752600SMike Kravetz 912313216fSMatthew Wilcox xas_set(&xas, 0); 922313216fSMatthew Wilcox xas_lock_irq(&xas); 93*b4d02baaSDavid Hildenbrand xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { 942313216fSMatthew Wilcox bool clear = true; 95f2b277c4SHugh Dickins 96*b4d02baaSDavid Hildenbrand if (!xa_is_value(folio) && 97*b4d02baaSDavid Hildenbrand memfd_folio_has_extra_refs(folio)) { 985d752600SMike Kravetz /* 995d752600SMike Kravetz * On the last scan, we clean up all those tags 1005d752600SMike Kravetz * we inserted; but make a note that we still 101*b4d02baaSDavid Hildenbrand * found folios pinned. 1025d752600SMike Kravetz */ 1032313216fSMatthew Wilcox if (scan == LAST_SCAN) 1045d752600SMike Kravetz error = -EBUSY; 1052313216fSMatthew Wilcox else 1062313216fSMatthew Wilcox clear = false; 1075d752600SMike Kravetz } 1082313216fSMatthew Wilcox if (clear) 1092313216fSMatthew Wilcox xas_clear_mark(&xas, MEMFD_TAG_PINNED); 110f2b277c4SHugh Dickins 111*b4d02baaSDavid Hildenbrand if (++latency < XA_CHECK_SCHED) 1122313216fSMatthew Wilcox continue; 113f2b277c4SHugh Dickins latency = 0; 1145d752600SMike Kravetz 1152313216fSMatthew Wilcox xas_pause(&xas); 1162313216fSMatthew Wilcox xas_unlock_irq(&xas); 1172313216fSMatthew Wilcox cond_resched(); 1182313216fSMatthew Wilcox xas_lock_irq(&xas); 1195d752600SMike Kravetz } 1202313216fSMatthew Wilcox xas_unlock_irq(&xas); 1215d752600SMike Kravetz } 1225d752600SMike Kravetz 1235d752600SMike Kravetz return error; 1245d752600SMike Kravetz } 1255d752600SMike Kravetz 
/*
 * Return a pointer to the inode's seal word for the memfd-backing
 * filesystems (tmpfs, and hugetlbfs when configured); NULL for any
 * other file type, meaning sealing is unsupported.
 */
1265d752600SMike Kravetz static unsigned int *memfd_file_seals_ptr(struct file *file) 1275d752600SMike Kravetz { 1285d752600SMike Kravetz if (shmem_file(file)) 1295d752600SMike Kravetz return &SHMEM_I(file_inode(file))->seals; 1305d752600SMike Kravetz 1315d752600SMike Kravetz #ifdef CONFIG_HUGETLBFS 1325d752600SMike Kravetz if (is_file_hugepages(file)) 1335d752600SMike Kravetz return &HUGETLBFS_I(file_inode(file))->seals; 1345d752600SMike Kravetz #endif 1355d752600SMike Kravetz 1365d752600SMike Kravetz return NULL; 1375d752600SMike Kravetz } 1385d752600SMike Kravetz 1395d752600SMike Kravetz #define F_ALL_SEALS (F_SEAL_SEAL | \ 1406fd73538SDaniel Verkamp F_SEAL_EXEC | \ 1415d752600SMike Kravetz F_SEAL_SHRINK | \ 1425d752600SMike Kravetz F_SEAL_GROW | \ 143ab3948f5SJoel Fernandes (Google) F_SEAL_WRITE | \ 144ab3948f5SJoel Fernandes (Google) F_SEAL_FUTURE_WRITE) 1455d752600SMike Kravetz 1465d752600SMike Kravetz static int memfd_add_seals(struct file *file, unsigned int seals) 1475d752600SMike Kravetz { 1485d752600SMike Kravetz struct inode *inode = file_inode(file); 1495d752600SMike Kravetz unsigned int *file_seals; 1505d752600SMike Kravetz int error; 1515d752600SMike Kravetz 1525d752600SMike Kravetz /* 1535d752600SMike Kravetz * SEALING 1545d752600SMike Kravetz * Sealing allows multiple parties to share a tmpfs or hugetlbfs file 1555d752600SMike Kravetz * but restrict access to a specific subset of file operations. Seals 1565d752600SMike Kravetz * can only be added, but never removed. This way, mutually untrusted 1575d752600SMike Kravetz * parties can share common memory regions with a well-defined policy. 1585d752600SMike Kravetz * A malicious peer can thus never perform unwanted operations on a 1595d752600SMike Kravetz * shared object. 1605d752600SMike Kravetz * 1615d752600SMike Kravetz * Seals are only supported on special tmpfs or hugetlbfs files and 1625d752600SMike Kravetz * always affect the whole underlying inode. Once a seal is set, it 1635d752600SMike Kravetz * may prevent some kinds of access to the file. Currently, the 1645d752600SMike Kravetz * following seals are defined: 1655d752600SMike Kravetz * SEAL_SEAL: Prevent further seals from being set on this file 1665d752600SMike Kravetz * SEAL_SHRINK: Prevent the file from shrinking 1675d752600SMike Kravetz * SEAL_GROW: Prevent the file from growing 1685d752600SMike Kravetz * SEAL_WRITE: Prevent write access to the file 1696fd73538SDaniel Verkamp * SEAL_EXEC: Prevent modification of the exec bits in the file mode 1705d752600SMike Kravetz * 1715d752600SMike Kravetz * As we don't require any trust relationship between two parties, we 1725d752600SMike Kravetz * must prevent seals from being removed. Therefore, sealing a file 1735d752600SMike Kravetz * only adds a given set of seals to the file, it never touches 1745d752600SMike Kravetz * existing seals. Furthermore, the "setting seals"-operation can be 1755d752600SMike Kravetz * sealed itself, which basically prevents any further seal from being 1765d752600SMike Kravetz * added. 1775d752600SMike Kravetz * 1785d752600SMike Kravetz * Semantics of sealing are only defined on volatile files. Only 1795d752600SMike Kravetz * anonymous tmpfs and hugetlbfs files support sealing. More 1805d752600SMike Kravetz * importantly, seals are never written to disk. Therefore, there's 1815d752600SMike Kravetz * no plan to support it on other file types. 1825d752600SMike Kravetz */ 1835d752600SMike Kravetz 1845d752600SMike Kravetz if (!(file->f_mode & FMODE_WRITE)) 1855d752600SMike Kravetz return -EPERM; 1865d752600SMike Kravetz if (seals & ~(unsigned int)F_ALL_SEALS) 1875d752600SMike Kravetz return -EINVAL; 1885d752600SMike Kravetz 1895d752600SMike Kravetz inode_lock(inode); 1905d752600SMike Kravetz 1915d752600SMike Kravetz file_seals = memfd_file_seals_ptr(file); 1925d752600SMike Kravetz if (!file_seals) { 1935d752600SMike Kravetz error = -EINVAL; 1945d752600SMike Kravetz goto unlock; 1955d752600SMike Kravetz } 1965d752600SMike Kravetz 1975d752600SMike Kravetz if (*file_seals & F_SEAL_SEAL) { 1985d752600SMike Kravetz error = -EPERM; 1995d752600SMike Kravetz goto unlock; 2005d752600SMike Kravetz } 2015d752600SMike Kravetz 
/*
 * First seal of F_SEAL_WRITE: block new writable mappings, then wait
 * for pending GUP pins to drain (see memfd_wait_for_pins()); roll the
 * mapping back to writable if waiting fails.
 */
2025d752600SMike Kravetz if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { 2035d752600SMike Kravetz error = mapping_deny_writable(file->f_mapping); 2045d752600SMike Kravetz if (error) 2055d752600SMike Kravetz goto unlock; 2065d752600SMike Kravetz 2075d752600SMike Kravetz error = memfd_wait_for_pins(file->f_mapping); 2085d752600SMike Kravetz if (error) { 2095d752600SMike Kravetz mapping_allow_writable(file->f_mapping); 2105d752600SMike Kravetz goto unlock; 2115d752600SMike Kravetz } 2125d752600SMike Kravetz } 2135d752600SMike Kravetz 214c4f75bc8SJeff Xu /* 215c4f75bc8SJeff Xu * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. 216c4f75bc8SJeff Xu */ 217c4f75bc8SJeff Xu if (seals & F_SEAL_EXEC && inode->i_mode & 0111) 218c4f75bc8SJeff Xu seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; 219c4f75bc8SJeff Xu 2205d752600SMike Kravetz *file_seals |= seals; 2215d752600SMike Kravetz error = 0; 2225d752600SMike Kravetz 2235d752600SMike Kravetz unlock: 2245d752600SMike Kravetz inode_unlock(inode); 2255d752600SMike Kravetz return error; 2265d752600SMike Kravetz } 2275d752600SMike Kravetz 
/* Return the file's current seal set, or -EINVAL if it cannot be sealed. */
2285d752600SMike Kravetz static int memfd_get_seals(struct file *file) 2295d752600SMike Kravetz { 2305d752600SMike Kravetz unsigned int *seals = memfd_file_seals_ptr(file); 2315d752600SMike Kravetz 2325d752600SMike Kravetz return seals ? *seals : -EINVAL; 2335d752600SMike Kravetz } 2345d752600SMike Kravetz 
/* fcntl() dispatcher for the sealing commands: F_ADD_SEALS and F_GET_SEALS. */
235f7b8f70bSLuca Vizzarro long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) 2365d752600SMike Kravetz { 2375d752600SMike Kravetz long error; 2385d752600SMike Kravetz 2395d752600SMike Kravetz switch (cmd) { 2405d752600SMike Kravetz case F_ADD_SEALS: 2415d752600SMike Kravetz error = memfd_add_seals(file, arg); 2425d752600SMike Kravetz break; 2435d752600SMike Kravetz case F_GET_SEALS: 2445d752600SMike Kravetz error = memfd_get_seals(file); 2455d752600SMike Kravetz break; 2465d752600SMike Kravetz default: 2475d752600SMike Kravetz error = -EINVAL; 2485d752600SMike Kravetz break; 2495d752600SMike Kravetz } 2505d752600SMike Kravetz 2515d752600SMike Kravetz return error; 2525d752600SMike Kravetz } 2535d752600SMike Kravetz 2545d752600SMike Kravetz #define MFD_NAME_PREFIX "memfd:" 2555d752600SMike Kravetz #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 2565d752600SMike Kravetz #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 2575d752600SMike Kravetz 258105ff533SJeff Xu #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) 2595d752600SMike Kravetz 
/*
 * Apply the pid-namespace vm.memfd_noexec sysctl: pick a default
 * (MFD_NOEXEC_SEAL or MFD_EXEC) for callers that passed neither flag,
 * and return -EACCES when the namespace enforces MFD_NOEXEC_SEAL but
 * the caller did not request it.  No-op without CONFIG_SYSCTL.
 */
26072de2591SJeff Xu static int check_sysctl_memfd_noexec(unsigned int *flags) 26172de2591SJeff Xu { 26272de2591SJeff Xu #ifdef CONFIG_SYSCTL 2639876cfe8SAleksa Sarai struct pid_namespace *ns = task_active_pid_ns(current); 2649876cfe8SAleksa Sarai int sysctl = pidns_memfd_noexec_scope(ns); 26572de2591SJeff Xu 26672de2591SJeff Xu if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { 267202e1422SAleksa Sarai if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) 26872de2591SJeff Xu *flags |= MFD_NOEXEC_SEAL; 26972de2591SJeff Xu else 27072de2591SJeff Xu *flags |= MFD_EXEC; 27172de2591SJeff Xu } 27272de2591SJeff Xu 273202e1422SAleksa Sarai if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { 274202e1422SAleksa Sarai pr_err_ratelimited( 275202e1422SAleksa Sarai "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", 276202e1422SAleksa Sarai current->comm, task_pid_nr(current), sysctl); 27772de2591SJeff Xu return -EACCES; 27872de2591SJeff Xu } 27972de2591SJeff Xu #endif 28072de2591SJeff Xu return 0; 28172de2591SJeff Xu } 28272de2591SJeff Xu 
/*
 * memfd_create(2): create an anonymous tmpfs (or hugetlbfs with
 * MFD_HUGETLB) file and return a new file descriptor for it.  The
 * user-supplied name is advisory only, prefixed with "memfd:".
 */
2835d752600SMike Kravetz SYSCALL_DEFINE2(memfd_create, 2845d752600SMike Kravetz const char __user *, uname, 2855d752600SMike Kravetz unsigned int, flags) 2865d752600SMike Kravetz { 2875d752600SMike Kravetz unsigned int *file_seals; 2885d752600SMike Kravetz struct file *file; 2895d752600SMike Kravetz int fd, error; 2905d752600SMike Kravetz char *name; 2915d752600SMike Kravetz long len; 2925d752600SMike Kravetz 2935d752600SMike Kravetz if (!(flags & MFD_HUGETLB)) { 2945d752600SMike Kravetz if (flags & ~(unsigned int)MFD_ALL_FLAGS) 2955d752600SMike Kravetz return -EINVAL; 2965d752600SMike Kravetz } else { 2975d752600SMike Kravetz /* Allow huge page size encoding in flags. */ 2985d752600SMike Kravetz if (flags & ~(unsigned int)(MFD_ALL_FLAGS | 2995d752600SMike Kravetz (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) 3005d752600SMike Kravetz return -EINVAL; 3015d752600SMike Kravetz } 3025d752600SMike Kravetz 303105ff533SJeff Xu /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ 304105ff533SJeff Xu if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) 305105ff533SJeff Xu return -EINVAL; 306105ff533SJeff Xu 307202e1422SAleksa Sarai error = check_sysctl_memfd_noexec(&flags); 308202e1422SAleksa Sarai if (error < 0) 309202e1422SAleksa Sarai return error; 31072de2591SJeff Xu 3115d752600SMike Kravetz /* length includes terminating zero */ 3125d752600SMike Kravetz len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 3135d752600SMike Kravetz if (len <= 0) 3145d752600SMike Kravetz return -EFAULT; 3155d752600SMike Kravetz if (len > MFD_NAME_MAX_LEN + 1) 3165d752600SMike Kravetz return -EINVAL; 3175d752600SMike Kravetz 3185d752600SMike Kravetz name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); 3195d752600SMike Kravetz if (!name) 3205d752600SMike Kravetz return -ENOMEM; 3215d752600SMike Kravetz 3225d752600SMike Kravetz strcpy(name, MFD_NAME_PREFIX); 3235d752600SMike Kravetz if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 3245d752600SMike Kravetz error = -EFAULT; 3255d752600SMike Kravetz goto err_name; 3265d752600SMike Kravetz } 3275d752600SMike Kravetz 3285d752600SMike Kravetz /* terminating-zero may have changed after strnlen_user() returned */ 3295d752600SMike Kravetz if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 3305d752600SMike Kravetz error = -EFAULT; 3315d752600SMike Kravetz goto err_name; 3325d752600SMike Kravetz } 3335d752600SMike Kravetz 3345d752600SMike Kravetz fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); 3355d752600SMike Kravetz if (fd < 0) { 3365d752600SMike Kravetz error = fd; 3375d752600SMike Kravetz goto err_name; 3385d752600SMike Kravetz } 3395d752600SMike Kravetz 3405d752600SMike Kravetz if (flags & MFD_HUGETLB) { 34183c1fd76Szhangyiru file = hugetlb_file_setup(name, 0, VM_NORESERVE, 3425d752600SMike Kravetz HUGETLB_ANONHUGE_INODE, 3435d752600SMike Kravetz (flags >> MFD_HUGE_SHIFT) & 3445d752600SMike Kravetz MFD_HUGE_MASK); 3455d752600SMike Kravetz } else 3465d752600SMike Kravetz file = shmem_file_setup(name, 0, VM_NORESERVE); 3475d752600SMike Kravetz if (IS_ERR(file)) { 3485d752600SMike Kravetz error = PTR_ERR(file); 3495d752600SMike Kravetz goto err_fd; 3505d752600SMike Kravetz } 3515d752600SMike Kravetz file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 352c9c554f2SAl Viro file->f_flags |= O_LARGEFILE; 3535d752600SMike Kravetz 
/*
 * MFD_NOEXEC_SEAL: clear the exec bits and seal them (F_SEAL_EXEC),
 * while still allowing further sealing (clear F_SEAL_SEAL).
 */
354105ff533SJeff Xu if (flags & MFD_NOEXEC_SEAL) { 355105ff533SJeff Xu struct inode *inode = file_inode(file); 356105ff533SJeff Xu 357105ff533SJeff Xu inode->i_mode &= ~0111; 358105ff533SJeff Xu file_seals = memfd_file_seals_ptr(file); 359935d44acSRoberto Sassu if (file_seals) { 360105ff533SJeff Xu *file_seals &= ~F_SEAL_SEAL; 361105ff533SJeff Xu *file_seals |= F_SEAL_EXEC; 362935d44acSRoberto Sassu } 363105ff533SJeff Xu } else if (flags & MFD_ALLOW_SEALING) { 364105ff533SJeff Xu /* MFD_EXEC and MFD_ALLOW_SEALING are set */ 3655d752600SMike Kravetz file_seals = memfd_file_seals_ptr(file); 366935d44acSRoberto Sassu if (file_seals) 3675d752600SMike Kravetz *file_seals &= ~F_SEAL_SEAL; 3685d752600SMike Kravetz } 3695d752600SMike Kravetz 3705d752600SMike Kravetz fd_install(fd, file); 3715d752600SMike Kravetz kfree(name); 3725d752600SMike Kravetz return fd; 3735d752600SMike Kravetz 3745d752600SMike Kravetz err_fd: 3755d752600SMike Kravetz put_unused_fd(fd); 3765d752600SMike Kravetz err_name: 3775d752600SMike Kravetz kfree(name); 3785d752600SMike Kravetz return error; 3795d752600SMike Kravetz } 380