11507f512SMike Rapoport // SPDX-License-Identifier: GPL-2.0 21507f512SMike Rapoport /* 31507f512SMike Rapoport * Copyright IBM Corporation, 2021 41507f512SMike Rapoport * 51507f512SMike Rapoport * Author: Mike Rapoport <rppt@linux.ibm.com> 61507f512SMike Rapoport */ 71507f512SMike Rapoport 81507f512SMike Rapoport #include <linux/mm.h> 91507f512SMike Rapoport #include <linux/fs.h> 101507f512SMike Rapoport #include <linux/swap.h> 111507f512SMike Rapoport #include <linux/mount.h> 121507f512SMike Rapoport #include <linux/memfd.h> 131507f512SMike Rapoport #include <linux/bitops.h> 141507f512SMike Rapoport #include <linux/printk.h> 151507f512SMike Rapoport #include <linux/pagemap.h> 161507f512SMike Rapoport #include <linux/syscalls.h> 171507f512SMike Rapoport #include <linux/pseudo_fs.h> 181507f512SMike Rapoport #include <linux/secretmem.h> 191507f512SMike Rapoport #include <linux/set_memory.h> 201507f512SMike Rapoport #include <linux/sched/signal.h> 211507f512SMike Rapoport 221507f512SMike Rapoport #include <uapi/linux/magic.h> 231507f512SMike Rapoport 241507f512SMike Rapoport #include <asm/tlbflush.h> 251507f512SMike Rapoport 261507f512SMike Rapoport #include "internal.h" 271507f512SMike Rapoport 281507f512SMike Rapoport #undef pr_fmt 291507f512SMike Rapoport #define pr_fmt(fmt) "secretmem: " fmt 301507f512SMike Rapoport 311507f512SMike Rapoport /* 321507f512SMike Rapoport * Define mode and flag masks to allow validation of the system call 331507f512SMike Rapoport * parameters. 341507f512SMike Rapoport */ 351507f512SMike Rapoport #define SECRETMEM_MODE_MASK (0x0) 361507f512SMike Rapoport #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK 371507f512SMike Rapoport 381507f512SMike Rapoport static bool secretmem_enable __ro_after_init; 391507f512SMike Rapoport module_param_named(enable, secretmem_enable, bool, 0400); 401507f512SMike Rapoport MODULE_PARM_DESC(secretmem_enable, 411507f512SMike Rapoport "Enable secretmem and memfd_secret(2) system call"); 421507f512SMike Rapoport 4387066fddSLinus Torvalds static atomic_t secretmem_users; 449a436f8fSMike Rapoport 459a436f8fSMike Rapoport bool secretmem_active(void) 469a436f8fSMike Rapoport { 4787066fddSLinus Torvalds return !!atomic_read(&secretmem_users); 489a436f8fSMike Rapoport } 499a436f8fSMike Rapoport 501507f512SMike Rapoport static vm_fault_t secretmem_fault(struct vm_fault *vmf) 511507f512SMike Rapoport { 521507f512SMike Rapoport struct address_space *mapping = vmf->vma->vm_file->f_mapping; 531507f512SMike Rapoport struct inode *inode = file_inode(vmf->vma->vm_file); 541507f512SMike Rapoport pgoff_t offset = vmf->pgoff; 551507f512SMike Rapoport gfp_t gfp = vmf->gfp_mask; 561507f512SMike Rapoport unsigned long addr; 571507f512SMike Rapoport struct page *page; 581507f512SMike Rapoport int err; 591507f512SMike Rapoport 601507f512SMike Rapoport if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) 611507f512SMike Rapoport return vmf_error(-EINVAL); 621507f512SMike Rapoport 631507f512SMike Rapoport retry: 641507f512SMike Rapoport page = find_lock_page(mapping, offset); 651507f512SMike Rapoport if (!page) { 661507f512SMike Rapoport page = alloc_page(gfp | __GFP_ZERO); 671507f512SMike Rapoport if (!page) 681507f512SMike Rapoport return VM_FAULT_OOM; 691507f512SMike Rapoport 701507f512SMike Rapoport err = set_direct_map_invalid_noflush(page); 711507f512SMike Rapoport if (err) { 721507f512SMike Rapoport put_page(page); 731507f512SMike Rapoport return vmf_error(err); 741507f512SMike Rapoport } 751507f512SMike Rapoport 761507f512SMike Rapoport __SetPageUptodate(page); 771507f512SMike Rapoport err = add_to_page_cache_lru(page, mapping, offset, gfp); 781507f512SMike Rapoport if (unlikely(err)) { 791507f512SMike Rapoport put_page(page); 801507f512SMike Rapoport /* 811507f512SMike Rapoport * If a split of large page was required, it 821507f512SMike Rapoport * already happened when we marked the page invalid 831507f512SMike Rapoport * which guarantees that this call won't fail 841507f512SMike Rapoport */ 851507f512SMike Rapoport set_direct_map_default_noflush(page); 861507f512SMike Rapoport if (err == -EEXIST) 871507f512SMike Rapoport goto retry; 881507f512SMike Rapoport 891507f512SMike Rapoport return vmf_error(err); 901507f512SMike Rapoport } 911507f512SMike Rapoport 921507f512SMike Rapoport addr = (unsigned long)page_address(page); 931507f512SMike Rapoport flush_tlb_kernel_range(addr, addr + PAGE_SIZE); 941507f512SMike Rapoport } 951507f512SMike Rapoport 961507f512SMike Rapoport vmf->page = page; 971507f512SMike Rapoport return VM_FAULT_LOCKED; 981507f512SMike Rapoport } 991507f512SMike Rapoport 1001507f512SMike Rapoport static const struct vm_operations_struct secretmem_vm_ops = { 1011507f512SMike Rapoport .fault = secretmem_fault, 1021507f512SMike Rapoport }; 1031507f512SMike Rapoport 1049a436f8fSMike Rapoport static int secretmem_release(struct inode *inode, struct file *file) 1059a436f8fSMike Rapoport { 10687066fddSLinus Torvalds atomic_dec(&secretmem_users); 1079a436f8fSMike Rapoport return 0; 1089a436f8fSMike Rapoport } 1099a436f8fSMike Rapoport 1101507f512SMike Rapoport static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) 1111507f512SMike Rapoport { 1121507f512SMike Rapoport unsigned long len = vma->vm_end - vma->vm_start; 1131507f512SMike Rapoport 1141507f512SMike Rapoport if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) 1151507f512SMike Rapoport return -EINVAL; 1161507f512SMike Rapoport 1171507f512SMike Rapoport if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) 1181507f512SMike Rapoport return -EAGAIN; 1191507f512SMike Rapoport 1201507f512SMike Rapoport vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; 1211507f512SMike Rapoport vma->vm_ops = &secretmem_vm_ops; 1221507f512SMike Rapoport 1231507f512SMike Rapoport return 0; 1241507f512SMike Rapoport } 1251507f512SMike Rapoport 1261507f512SMike Rapoport bool vma_is_secretmem(struct vm_area_struct *vma) 1271507f512SMike Rapoport { 1281507f512SMike Rapoport return vma->vm_ops == &secretmem_vm_ops; 1291507f512SMike Rapoport } 1301507f512SMike Rapoport 1311507f512SMike Rapoport static const struct file_operations secretmem_fops = { 1329a436f8fSMike Rapoport .release = secretmem_release, 1331507f512SMike Rapoport .mmap = secretmem_mmap, 1341507f512SMike Rapoport }; 1351507f512SMike Rapoport 1361507f512SMike Rapoport static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) 1371507f512SMike Rapoport { 1381507f512SMike Rapoport return false; 1391507f512SMike Rapoport } 1401507f512SMike Rapoport 1411507f512SMike Rapoport static int secretmem_migratepage(struct address_space *mapping, 1421507f512SMike Rapoport struct page *newpage, struct page *page, 1431507f512SMike Rapoport enum migrate_mode mode) 1441507f512SMike Rapoport { 1451507f512SMike Rapoport return -EBUSY; 1461507f512SMike Rapoport } 1471507f512SMike Rapoport 1486612ed24SMatthew Wilcox (Oracle) static void secretmem_free_folio(struct folio *folio) 1491507f512SMike Rapoport { 1506612ed24SMatthew Wilcox (Oracle) set_direct_map_default_noflush(&folio->page); 1516612ed24SMatthew Wilcox (Oracle) folio_zero_segment(folio, 0, folio_size(folio)); 1521507f512SMike Rapoport } 1531507f512SMike Rapoport 1541507f512SMike Rapoport const struct address_space_operations secretmem_aops = { 15546de8b97SMatthew Wilcox (Oracle) .dirty_folio = noop_dirty_folio, 1566612ed24SMatthew Wilcox (Oracle) .free_folio = secretmem_free_folio, 1571507f512SMike Rapoport .migratepage = secretmem_migratepage, 1581507f512SMike Rapoport .isolate_page = secretmem_isolate_page, 1591507f512SMike Rapoport }; 1601507f512SMike Rapoport 161f9b141f9SAxel Rasmussen static int secretmem_setattr(struct user_namespace *mnt_userns, 162f9b141f9SAxel Rasmussen struct dentry *dentry, struct iattr *iattr) 163f9b141f9SAxel Rasmussen { 164f9b141f9SAxel Rasmussen struct inode *inode = d_inode(dentry); 165f9b141f9SAxel Rasmussen unsigned int ia_valid = iattr->ia_valid; 166f9b141f9SAxel Rasmussen 167f9b141f9SAxel Rasmussen if ((ia_valid & ATTR_SIZE) && inode->i_size) 168f9b141f9SAxel Rasmussen return -EINVAL; 169f9b141f9SAxel Rasmussen 170f9b141f9SAxel Rasmussen return simple_setattr(mnt_userns, dentry, iattr); 171f9b141f9SAxel Rasmussen } 172f9b141f9SAxel Rasmussen 173f9b141f9SAxel Rasmussen static const struct inode_operations secretmem_iops = { 174f9b141f9SAxel Rasmussen .setattr = secretmem_setattr, 175f9b141f9SAxel Rasmussen }; 176f9b141f9SAxel Rasmussen 1771507f512SMike Rapoport static struct vfsmount *secretmem_mnt; 1781507f512SMike Rapoport 1791507f512SMike Rapoport static struct file *secretmem_file_create(unsigned long flags) 1801507f512SMike Rapoport { 1811507f512SMike Rapoport struct file *file = ERR_PTR(-ENOMEM); 1821507f512SMike Rapoport struct inode *inode; 183*2bfe15c5SChristian Göttsche const char *anon_name = "[secretmem]"; 184*2bfe15c5SChristian Göttsche const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); 185*2bfe15c5SChristian Göttsche int err; 1861507f512SMike Rapoport 1871507f512SMike Rapoport inode = alloc_anon_inode(secretmem_mnt->mnt_sb); 1881507f512SMike Rapoport if (IS_ERR(inode)) 1891507f512SMike Rapoport return ERR_CAST(inode); 1901507f512SMike Rapoport 191*2bfe15c5SChristian Göttsche err = security_inode_init_security_anon(inode, &qname, NULL); 192*2bfe15c5SChristian Göttsche if (err) { 193*2bfe15c5SChristian Göttsche file = ERR_PTR(err); 194*2bfe15c5SChristian Göttsche goto err_free_inode; 195*2bfe15c5SChristian Göttsche } 196*2bfe15c5SChristian Göttsche 1971507f512SMike Rapoport file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", 1981507f512SMike Rapoport O_RDWR, &secretmem_fops); 1991507f512SMike Rapoport if (IS_ERR(file)) 2001507f512SMike Rapoport goto err_free_inode; 2011507f512SMike Rapoport 2021507f512SMike Rapoport mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 2031507f512SMike Rapoport mapping_set_unevictable(inode->i_mapping); 2041507f512SMike Rapoport 205f9b141f9SAxel Rasmussen inode->i_op = &secretmem_iops; 2061507f512SMike Rapoport inode->i_mapping->a_ops = &secretmem_aops; 2071507f512SMike Rapoport 2081507f512SMike Rapoport /* pretend we are a normal file with zero size */ 2091507f512SMike Rapoport inode->i_mode |= S_IFREG; 2101507f512SMike Rapoport inode->i_size = 0; 2111507f512SMike Rapoport 2121507f512SMike Rapoport return file; 2131507f512SMike Rapoport 2141507f512SMike Rapoport err_free_inode: 2151507f512SMike Rapoport iput(inode); 2161507f512SMike Rapoport return file; 2171507f512SMike Rapoport } 2181507f512SMike Rapoport 2191507f512SMike Rapoport SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) 2201507f512SMike Rapoport { 2211507f512SMike Rapoport struct file *file; 2221507f512SMike Rapoport int fd, err; 2231507f512SMike Rapoport 2241507f512SMike Rapoport /* make sure local flags do not confict with global fcntl.h */ 2251507f512SMike Rapoport BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); 2261507f512SMike Rapoport 2271507f512SMike Rapoport if (!secretmem_enable) 2281507f512SMike Rapoport return -ENOSYS; 2291507f512SMike Rapoport 2301507f512SMike Rapoport if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) 2311507f512SMike Rapoport return -EINVAL; 232cb685432SMatthew Wilcox (Oracle) if (atomic_read(&secretmem_users) < 0) 233cb685432SMatthew Wilcox (Oracle) return -ENFILE; 2341507f512SMike Rapoport 2351507f512SMike Rapoport fd = get_unused_fd_flags(flags & O_CLOEXEC); 2361507f512SMike Rapoport if (fd < 0) 2371507f512SMike Rapoport return fd; 2381507f512SMike Rapoport 2391507f512SMike Rapoport file = secretmem_file_create(flags); 2401507f512SMike Rapoport if (IS_ERR(file)) { 2411507f512SMike Rapoport err = PTR_ERR(file); 2421507f512SMike Rapoport goto err_put_fd; 2431507f512SMike Rapoport } 2441507f512SMike Rapoport 2451507f512SMike Rapoport file->f_flags |= O_LARGEFILE; 2461507f512SMike Rapoport 24787066fddSLinus Torvalds atomic_inc(&secretmem_users); 248855d4443SKees Cook fd_install(fd, file); 2491507f512SMike Rapoport return fd; 2501507f512SMike Rapoport 2511507f512SMike Rapoport err_put_fd: 2521507f512SMike Rapoport put_unused_fd(fd); 2531507f512SMike Rapoport return err; 2541507f512SMike Rapoport } 2551507f512SMike Rapoport 2561507f512SMike Rapoport static int secretmem_init_fs_context(struct fs_context *fc) 2571507f512SMike Rapoport { 2581507f512SMike Rapoport return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; 2591507f512SMike Rapoport } 2601507f512SMike Rapoport 2611507f512SMike Rapoport static struct file_system_type secretmem_fs = { 2621507f512SMike Rapoport .name = "secretmem", 2631507f512SMike Rapoport .init_fs_context = secretmem_init_fs_context, 2641507f512SMike Rapoport .kill_sb = kill_anon_super, 2651507f512SMike Rapoport }; 2661507f512SMike Rapoport 2671507f512SMike Rapoport static int secretmem_init(void) 2681507f512SMike Rapoport { 2691507f512SMike Rapoport int ret = 0; 2701507f512SMike Rapoport 2711507f512SMike Rapoport if (!secretmem_enable) 2721507f512SMike Rapoport return ret; 2731507f512SMike Rapoport 2741507f512SMike Rapoport secretmem_mnt = kern_mount(&secretmem_fs); 2751507f512SMike Rapoport if (IS_ERR(secretmem_mnt)) 2761507f512SMike Rapoport ret = PTR_ERR(secretmem_mnt); 2771507f512SMike Rapoport 2781507f512SMike Rapoport /* prevent secretmem mappings from ever getting PROT_EXEC */ 2791507f512SMike Rapoport secretmem_mnt->mnt_flags |= MNT_NOEXEC; 2801507f512SMike Rapoport 2811507f512SMike Rapoport return ret; 2821507f512SMike Rapoport } 2831507f512SMike Rapoport fs_initcall(secretmem_init); 284