11507f512SMike Rapoport // SPDX-License-Identifier: GPL-2.0 21507f512SMike Rapoport /* 31507f512SMike Rapoport * Copyright IBM Corporation, 2021 41507f512SMike Rapoport * 51507f512SMike Rapoport * Author: Mike Rapoport <rppt@linux.ibm.com> 61507f512SMike Rapoport */ 71507f512SMike Rapoport 81507f512SMike Rapoport #include <linux/mm.h> 91507f512SMike Rapoport #include <linux/fs.h> 101507f512SMike Rapoport #include <linux/swap.h> 111507f512SMike Rapoport #include <linux/mount.h> 121507f512SMike Rapoport #include <linux/memfd.h> 131507f512SMike Rapoport #include <linux/bitops.h> 141507f512SMike Rapoport #include <linux/printk.h> 151507f512SMike Rapoport #include <linux/pagemap.h> 161507f512SMike Rapoport #include <linux/syscalls.h> 171507f512SMike Rapoport #include <linux/pseudo_fs.h> 181507f512SMike Rapoport #include <linux/secretmem.h> 191507f512SMike Rapoport #include <linux/set_memory.h> 201507f512SMike Rapoport #include <linux/sched/signal.h> 211507f512SMike Rapoport 221507f512SMike Rapoport #include <uapi/linux/magic.h> 231507f512SMike Rapoport 241507f512SMike Rapoport #include <asm/tlbflush.h> 251507f512SMike Rapoport 261507f512SMike Rapoport #include "internal.h" 271507f512SMike Rapoport 281507f512SMike Rapoport #undef pr_fmt 291507f512SMike Rapoport #define pr_fmt(fmt) "secretmem: " fmt 301507f512SMike Rapoport 311507f512SMike Rapoport /* 321507f512SMike Rapoport * Define mode and flag masks to allow validation of the system call 331507f512SMike Rapoport * parameters. 341507f512SMike Rapoport */ 351507f512SMike Rapoport #define SECRETMEM_MODE_MASK (0x0) 361507f512SMike Rapoport #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK 371507f512SMike Rapoport 38*b758fe6dSMike Rapoport (IBM) static bool secretmem_enable __ro_after_init = 1; 391507f512SMike Rapoport module_param_named(enable, secretmem_enable, bool, 0400); 401507f512SMike Rapoport MODULE_PARM_DESC(secretmem_enable, 411507f512SMike Rapoport "Enable secretmem and memfd_secret(2) system call"); 421507f512SMike Rapoport 4387066fddSLinus Torvalds static atomic_t secretmem_users; 449a436f8fSMike Rapoport 459a436f8fSMike Rapoport bool secretmem_active(void) 469a436f8fSMike Rapoport { 4787066fddSLinus Torvalds return !!atomic_read(&secretmem_users); 489a436f8fSMike Rapoport } 499a436f8fSMike Rapoport 501507f512SMike Rapoport static vm_fault_t secretmem_fault(struct vm_fault *vmf) 511507f512SMike Rapoport { 521507f512SMike Rapoport struct address_space *mapping = vmf->vma->vm_file->f_mapping; 531507f512SMike Rapoport struct inode *inode = file_inode(vmf->vma->vm_file); 541507f512SMike Rapoport pgoff_t offset = vmf->pgoff; 551507f512SMike Rapoport gfp_t gfp = vmf->gfp_mask; 561507f512SMike Rapoport unsigned long addr; 571507f512SMike Rapoport struct page *page; 5884ac0130SMike Rapoport vm_fault_t ret; 591507f512SMike Rapoport int err; 601507f512SMike Rapoport 611507f512SMike Rapoport if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) 621507f512SMike Rapoport return vmf_error(-EINVAL); 631507f512SMike Rapoport 6484ac0130SMike Rapoport filemap_invalidate_lock_shared(mapping); 6584ac0130SMike Rapoport 661507f512SMike Rapoport retry: 671507f512SMike Rapoport page = find_lock_page(mapping, offset); 681507f512SMike Rapoport if (!page) { 691507f512SMike Rapoport page = alloc_page(gfp | __GFP_ZERO); 7084ac0130SMike Rapoport if (!page) { 7184ac0130SMike Rapoport ret = VM_FAULT_OOM; 7284ac0130SMike Rapoport goto out; 7384ac0130SMike Rapoport } 741507f512SMike Rapoport 751507f512SMike Rapoport err = set_direct_map_invalid_noflush(page); 761507f512SMike Rapoport if (err) { 771507f512SMike Rapoport put_page(page); 7884ac0130SMike Rapoport ret = vmf_error(err); 7984ac0130SMike Rapoport goto out; 801507f512SMike Rapoport } 811507f512SMike Rapoport 821507f512SMike Rapoport __SetPageUptodate(page); 831507f512SMike Rapoport err = add_to_page_cache_lru(page, mapping, offset, gfp); 841507f512SMike Rapoport if (unlikely(err)) { 851507f512SMike Rapoport put_page(page); 861507f512SMike Rapoport /* 871507f512SMike Rapoport * If a split of large page was required, it 881507f512SMike Rapoport * already happened when we marked the page invalid 891507f512SMike Rapoport * which guarantees that this call won't fail 901507f512SMike Rapoport */ 911507f512SMike Rapoport set_direct_map_default_noflush(page); 921507f512SMike Rapoport if (err == -EEXIST) 931507f512SMike Rapoport goto retry; 941507f512SMike Rapoport 9584ac0130SMike Rapoport ret = vmf_error(err); 9684ac0130SMike Rapoport goto out; 971507f512SMike Rapoport } 981507f512SMike Rapoport 991507f512SMike Rapoport addr = (unsigned long)page_address(page); 1001507f512SMike Rapoport flush_tlb_kernel_range(addr, addr + PAGE_SIZE); 1011507f512SMike Rapoport } 1021507f512SMike Rapoport 1031507f512SMike Rapoport vmf->page = page; 10484ac0130SMike Rapoport ret = VM_FAULT_LOCKED; 10584ac0130SMike Rapoport 10684ac0130SMike Rapoport out: 10784ac0130SMike Rapoport filemap_invalidate_unlock_shared(mapping); 10884ac0130SMike Rapoport return ret; 1091507f512SMike Rapoport } 1101507f512SMike Rapoport 1111507f512SMike Rapoport static const struct vm_operations_struct secretmem_vm_ops = { 1121507f512SMike Rapoport .fault = secretmem_fault, 1131507f512SMike Rapoport }; 1141507f512SMike Rapoport 1159a436f8fSMike Rapoport static int secretmem_release(struct inode *inode, struct file *file) 1169a436f8fSMike Rapoport { 11787066fddSLinus Torvalds atomic_dec(&secretmem_users); 1189a436f8fSMike Rapoport return 0; 1199a436f8fSMike Rapoport } 1209a436f8fSMike Rapoport 1211507f512SMike Rapoport static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) 1221507f512SMike Rapoport { 1231507f512SMike Rapoport unsigned long len = vma->vm_end - vma->vm_start; 1241507f512SMike Rapoport 1251507f512SMike Rapoport if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) 1261507f512SMike Rapoport return -EINVAL; 1271507f512SMike Rapoport 1281507f512SMike Rapoport if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) 1291507f512SMike Rapoport return -EAGAIN; 1301507f512SMike Rapoport 1311c71222eSSuren Baghdasaryan vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); 1321507f512SMike Rapoport vma->vm_ops = &secretmem_vm_ops; 1331507f512SMike Rapoport 1341507f512SMike Rapoport return 0; 1351507f512SMike Rapoport } 1361507f512SMike Rapoport 1371507f512SMike Rapoport bool vma_is_secretmem(struct vm_area_struct *vma) 1381507f512SMike Rapoport { 1391507f512SMike Rapoport return vma->vm_ops == &secretmem_vm_ops; 1401507f512SMike Rapoport } 1411507f512SMike Rapoport 1421507f512SMike Rapoport static const struct file_operations secretmem_fops = { 1439a436f8fSMike Rapoport .release = secretmem_release, 1441507f512SMike Rapoport .mmap = secretmem_mmap, 1451507f512SMike Rapoport }; 1461507f512SMike Rapoport 1475409548dSMatthew Wilcox (Oracle) static int secretmem_migrate_folio(struct address_space *mapping, 1485409548dSMatthew Wilcox (Oracle) struct folio *dst, struct folio *src, enum migrate_mode mode) 1491507f512SMike Rapoport { 1501507f512SMike Rapoport return -EBUSY; 1511507f512SMike Rapoport } 1521507f512SMike Rapoport 1536612ed24SMatthew Wilcox (Oracle) static void secretmem_free_folio(struct folio *folio) 1541507f512SMike Rapoport { 1556612ed24SMatthew Wilcox (Oracle) set_direct_map_default_noflush(&folio->page); 1566612ed24SMatthew Wilcox (Oracle) folio_zero_segment(folio, 0, folio_size(folio)); 1571507f512SMike Rapoport } 1581507f512SMike Rapoport 1591507f512SMike Rapoport const struct address_space_operations secretmem_aops = { 16046de8b97SMatthew Wilcox (Oracle) .dirty_folio = noop_dirty_folio, 1616612ed24SMatthew Wilcox (Oracle) .free_folio = secretmem_free_folio, 1625409548dSMatthew Wilcox (Oracle) .migrate_folio = secretmem_migrate_folio, 1631507f512SMike Rapoport }; 1641507f512SMike Rapoport 165c1632a0fSChristian Brauner static int secretmem_setattr(struct mnt_idmap *idmap, 166f9b141f9SAxel Rasmussen struct dentry *dentry, struct iattr *iattr) 167f9b141f9SAxel Rasmussen { 168f9b141f9SAxel Rasmussen struct inode *inode = d_inode(dentry); 16984ac0130SMike Rapoport struct address_space *mapping = inode->i_mapping; 170f9b141f9SAxel Rasmussen unsigned int ia_valid = iattr->ia_valid; 17184ac0130SMike Rapoport int ret; 17284ac0130SMike Rapoport 17384ac0130SMike Rapoport filemap_invalidate_lock(mapping); 174f9b141f9SAxel Rasmussen 175f9b141f9SAxel Rasmussen if ((ia_valid & ATTR_SIZE) && inode->i_size) 17684ac0130SMike Rapoport ret = -EINVAL; 17784ac0130SMike Rapoport else 178c1632a0fSChristian Brauner ret = simple_setattr(idmap, dentry, iattr); 179f9b141f9SAxel Rasmussen 18084ac0130SMike Rapoport filemap_invalidate_unlock(mapping); 18184ac0130SMike Rapoport 18284ac0130SMike Rapoport return ret; 183f9b141f9SAxel Rasmussen } 184f9b141f9SAxel Rasmussen 185f9b141f9SAxel Rasmussen static const struct inode_operations secretmem_iops = { 186f9b141f9SAxel Rasmussen .setattr = secretmem_setattr, 187f9b141f9SAxel Rasmussen }; 188f9b141f9SAxel Rasmussen 1891507f512SMike Rapoport static struct vfsmount *secretmem_mnt; 1901507f512SMike Rapoport 1911507f512SMike Rapoport static struct file *secretmem_file_create(unsigned long flags) 1921507f512SMike Rapoport { 19398001fd6SColin Ian King struct file *file; 1941507f512SMike Rapoport struct inode *inode; 1952bfe15c5SChristian Göttsche const char *anon_name = "[secretmem]"; 1962bfe15c5SChristian Göttsche const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); 1972bfe15c5SChristian Göttsche int err; 1981507f512SMike Rapoport 1991507f512SMike Rapoport inode = alloc_anon_inode(secretmem_mnt->mnt_sb); 2001507f512SMike Rapoport if (IS_ERR(inode)) 2011507f512SMike Rapoport return ERR_CAST(inode); 2021507f512SMike Rapoport 2032bfe15c5SChristian Göttsche err = security_inode_init_security_anon(inode, &qname, NULL); 2042bfe15c5SChristian Göttsche if (err) { 2052bfe15c5SChristian Göttsche file = ERR_PTR(err); 2062bfe15c5SChristian Göttsche goto err_free_inode; 2072bfe15c5SChristian Göttsche } 2082bfe15c5SChristian Göttsche 2091507f512SMike Rapoport file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", 2101507f512SMike Rapoport O_RDWR, &secretmem_fops); 2111507f512SMike Rapoport if (IS_ERR(file)) 2121507f512SMike Rapoport goto err_free_inode; 2131507f512SMike Rapoport 2141507f512SMike Rapoport mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 2151507f512SMike Rapoport mapping_set_unevictable(inode->i_mapping); 2161507f512SMike Rapoport 217f9b141f9SAxel Rasmussen inode->i_op = &secretmem_iops; 2181507f512SMike Rapoport inode->i_mapping->a_ops = &secretmem_aops; 2191507f512SMike Rapoport 2201507f512SMike Rapoport /* pretend we are a normal file with zero size */ 2211507f512SMike Rapoport inode->i_mode |= S_IFREG; 2221507f512SMike Rapoport inode->i_size = 0; 2231507f512SMike Rapoport 2241507f512SMike Rapoport return file; 2251507f512SMike Rapoport 2261507f512SMike Rapoport err_free_inode: 2271507f512SMike Rapoport iput(inode); 2281507f512SMike Rapoport return file; 2291507f512SMike Rapoport } 2301507f512SMike Rapoport 2311507f512SMike Rapoport SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) 2321507f512SMike Rapoport { 2331507f512SMike Rapoport struct file *file; 2341507f512SMike Rapoport int fd, err; 2351507f512SMike Rapoport 2361507f512SMike Rapoport /* make sure local flags do not confict with global fcntl.h */ 2371507f512SMike Rapoport BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); 2381507f512SMike Rapoport 2391507f512SMike Rapoport if (!secretmem_enable) 2401507f512SMike Rapoport return -ENOSYS; 2411507f512SMike Rapoport 2421507f512SMike Rapoport if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) 2431507f512SMike Rapoport return -EINVAL; 244cb685432SMatthew Wilcox (Oracle) if (atomic_read(&secretmem_users) < 0) 245cb685432SMatthew Wilcox (Oracle) return -ENFILE; 2461507f512SMike Rapoport 2471507f512SMike Rapoport fd = get_unused_fd_flags(flags & O_CLOEXEC); 2481507f512SMike Rapoport if (fd < 0) 2491507f512SMike Rapoport return fd; 2501507f512SMike Rapoport 2511507f512SMike Rapoport file = secretmem_file_create(flags); 2521507f512SMike Rapoport if (IS_ERR(file)) { 2531507f512SMike Rapoport err = PTR_ERR(file); 2541507f512SMike Rapoport goto err_put_fd; 2551507f512SMike Rapoport } 2561507f512SMike Rapoport 2571507f512SMike Rapoport file->f_flags |= O_LARGEFILE; 2581507f512SMike Rapoport 25987066fddSLinus Torvalds atomic_inc(&secretmem_users); 260855d4443SKees Cook fd_install(fd, file); 2611507f512SMike Rapoport return fd; 2621507f512SMike Rapoport 2631507f512SMike Rapoport err_put_fd: 2641507f512SMike Rapoport put_unused_fd(fd); 2651507f512SMike Rapoport return err; 2661507f512SMike Rapoport } 2671507f512SMike Rapoport 2681507f512SMike Rapoport static int secretmem_init_fs_context(struct fs_context *fc) 2691507f512SMike Rapoport { 2701507f512SMike Rapoport return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; 2711507f512SMike Rapoport } 2721507f512SMike Rapoport 2731507f512SMike Rapoport static struct file_system_type secretmem_fs = { 2741507f512SMike Rapoport .name = "secretmem", 2751507f512SMike Rapoport .init_fs_context = secretmem_init_fs_context, 2761507f512SMike Rapoport .kill_sb = kill_anon_super, 2771507f512SMike Rapoport }; 2781507f512SMike Rapoport 2791ea41595SXiu Jianfeng static int __init secretmem_init(void) 2801507f512SMike Rapoport { 2811507f512SMike Rapoport if (!secretmem_enable) 282f7c5b1aaSXiu Jianfeng return 0; 2831507f512SMike Rapoport 2841507f512SMike Rapoport secretmem_mnt = kern_mount(&secretmem_fs); 2851507f512SMike Rapoport if (IS_ERR(secretmem_mnt)) 2864eb5bbdeSBinyi Han return PTR_ERR(secretmem_mnt); 2871507f512SMike Rapoport 2881507f512SMike Rapoport /* prevent secretmem mappings from ever getting PROT_EXEC */ 2891507f512SMike Rapoport secretmem_mnt->mnt_flags |= MNT_NOEXEC; 2901507f512SMike Rapoport 291f7c5b1aaSXiu Jianfeng return 0; 2921507f512SMike Rapoport } 2931507f512SMike Rapoport fs_initcall(secretmem_init); 294