// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, mapping) \
	list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

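/*
 * Translate a gfn within @slot into the corresponding page offset (index)
 * in the guest_memfd file backing that slot.
 */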
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}

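/*
 * Invoke the arch hook to prepare @folio for guest use, e.g. to update
 * architecture-specific tracking structures.  Without
 * CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE this is a no-op.
 */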
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

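/*
 * There is no backing storage to read from, so the folio's up-to-date flag
 * doubles as the "prepared for guest use" flag.
 */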
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
	folio_mark_uptodate(folio);
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data, and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	unsigned long nr_pages, i;
	pgoff_t index;
	int r;

	nr_pages = folio_nr_pages(folio);
	for (i = 0; i < nr_pages; i++)
		clear_highpage(folio_page(folio, i));

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios. The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	index = ALIGN_DOWN(index, folio_nr_pages(folio));
	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
	if (!r)
		kvm_gmem_mark_prepared(folio);

	return r;
}

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if the folio is already present in the mapping to
	 * avoid a policy lookup.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED, 0);
	if (!IS_ERR(folio))
		return folio;

	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	return folio;
}

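/*
 * Invalidations target shared mappings if the guest_memfd was created with
 * GUEST_MEMFD_FLAG_INIT_SHARED, and private mappings otherwise.
 */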
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

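/*
 * Begin invalidation of [start, end) for a single gmem file: unmap the
 * affected GFN ranges of every memslot bound to the range and start KVM's
 * MMU invalidation sequence (paired with __kvm_gmem_invalidate_end()).
 */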
static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

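/*
 * Complete an invalidation started by __kvm_gmem_invalidate_begin() if any
 * binding exists in [start, end) for this file.
 */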
static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_end(f, start, end);
}

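/*
 * Handle FALLOC_FL_PUNCH_HOLE: zap any guest mappings covering the range,
 * then truncate the backing folios out of the page cache.
 */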
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

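/*
 * Handle fallocate() without FALLOC_FL_PUNCH_HOLE: preallocate folios for
 * the requested range, which must lie within i_size.
 */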
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/*
	 * The size of a guest_memfd file is fixed at creation; don't allow
	 * allocating beyond i_size.
	 */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and no new bindings can be
	 * created. Zap all SPTEs pointed at by this file. Do not free the
	 * backing memory, as its lifetime is associated with the inode, not
	 * the file.
	 */
	__kvm_gmem_invalidate_begin(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

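/*
 * Scope-based helper: CLASS(gmem_get_file, file)(slot) grabs a reference to
 * the memslot's gmem file (if it's still live) and automatically fput()s it
 * when the variable goes out of scope.
 */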
DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

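/*
 * Fault handler for userspace mappings of a guest_memfd created with
 * GUEST_MEMFD_FLAG_MMAP; only memory that is shared with the host
 * (GUEST_MEMFD_FLAG_INIT_SHARED) may be faulted in.
 */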
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		kvm_gmem_mark_prepared(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct inode *inode = file_inode(vma->vm_file);

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory. The caller
	 * can then replace NULL with the default memory policy instead of the
	 * current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy = kvm_gmem_get_policy,
	.set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.mmap = kvm_gmem_mmap,
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

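/*
 * guest_memfd folios are unmovable; migration should never be attempted, so
 * warn and fail if this callback is ever invoked.
 */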
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_begin(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
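/*
 * Give the architecture a chance to invalidate/reclaim the PFN range when a
 * folio is freed back to the kernel, e.g. to restore the pages to a
 * host-usable state.
 */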
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
	.setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

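/*
 * Create the anonymous inode and file that back a new guest_memfd instance
 * on the internal guest_memfd mount, and install it as a new fd for
 * userspace.
 */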
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

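/*
 * Bind a memslot to the guest_memfd range starting at @offset. Each range of
 * the file can be bound to at most one memslot of the owning VM; the binding
 * is recorded in the file's xarray and torn down on unbind or file release.
 */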
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
	 * created, so kvm_gmem_bind() must be called on a new memslot.
	 * Because the memslot is not yet visible, kvm_gmem_get_pfn() is
	 * guaranteed to see the file.
	 */
	WRITE_ONCE(slot->gmem.file, file);
	slot->gmem.pgoff = start;
	if (kvm_gmem_supports_mmap(inode))
		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success. The file pins KVM,
	 * not the other way 'round. Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed. Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					bool *is_prepared, int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	*is_prepared = folio_test_uptodate(folio);
	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	bool is_prepared = false;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!is_prepared)
		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
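/*
 * Populate up to @npages pages starting at @start_gfn, optionally copying
 * initial contents from @src, and invoke @post_populate on each chunk so the
 * architecture can process it (e.g. encrypt and/or measure it). Returns the
 * number of pages populated, or a negative error if nothing was populated.
 */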
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	void __user *p;

	int ret = 0, max_order;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	filemap_invalidate_lock(file->f_mapping);

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i += (1 << max_order)) {
		struct folio *folio;
		gfn_t gfn = start_gfn + i;
		pgoff_t index = kvm_gmem_get_index(slot, gfn);
		bool is_prepared = false;
		kvm_pfn_t pfn;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		if (is_prepared) {
			folio_unlock(folio);
			folio_put(folio);
			ret = -EEXIST;
			break;
		}

		folio_unlock(folio);
		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
			(npages - i) < (1 << max_order));

		ret = -EINVAL;
		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
							KVM_MEMORY_ATTRIBUTE_PRIVATE,
							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
			if (!max_order)
				goto put_folio_and_exit;
			max_order--;
		}

		p = src ? src + i * PAGE_SIZE : NULL;
		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
		if (!ret)
			kvm_gmem_mark_prepared(folio);

put_folio_and_exit:
		folio_put(folio);
		if (ret)
			break;
	}

	filemap_invalidate_unlock(file->f_mapping);

	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note! Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective). This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

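/*
 * Inode lifecycle hooks for the guest_memfd pseudo filesystem: gmem_inode
 * objects are allocated from a dedicated slab cache, and the per-inode NUMA
 * shared policy is torn down when the inode is destroyed.
 */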
static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
	.statfs = simple_statfs,
	.alloc_inode = kvm_gmem_alloc_inode,
	.destroy_inode = kvm_gmem_destroy_inode,
	.free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

static struct file_system_type kvm_gmem_fs = {
	.name = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}
	return 0;
}

void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}