// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
	struct kvm *kvm;
	struct xarray bindings;		/* pgoff range => memslot bindings */
	struct list_head entry;		/* node in the inode's i_private_list */
};

struct gmem_inode {
	struct shared_policy policy;
	struct inode vfs_inode;

	u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
	return container_of(inode, struct gmem_inode, vfs_inode);
}
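
/*
 * Illustrative layout (not code): two VMs sharing one guest_memfd inode each
 * hold their own gmem_file, all linked via the mapping's i_private_list:
 *
 *	gmem_inode (flags, NUMA policy, page cache)
 *	  `-> gmem_file A (kvm A, bindings A)
 *	  `-> gmem_file B (kvm B, bindings B)
 */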

#define kvm_gmem_for_each_file(f, mapping) \
	list_for_each_entry(f, &(mapping)->i_private_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
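
/*
 * Example (illustrative): for an order-2 folio (4 pages) starting at pfn
 * 0x1000, index 5 yields 0x1000 + (5 & 3) = 0x1001.
 */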

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn - slot->base_gfn + slot->gmem.pgoff;
}
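
/*
 * Example (illustrative): a memslot with base_gfn 0x100 bound at
 * gmem.pgoff 0x20 maps gfn 0x110 to index 0x110 - 0x100 + 0x20 = 0x30.
 */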

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
	kvm_pfn_t pfn = folio_file_pfn(folio, index);
	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
	if (rc) {
		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
				    index, gfn, pfn, rc);
		return rc;
	}
#endif

	return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return, the guest sees a zero page so as to avoid
 * leaking host data, and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
				  gfn_t gfn, struct folio *folio)
{
	pgoff_t index;

	/*
	 * Preparing huge folios should always be safe, since it should
	 * be possible to split them later if needed.
	 *
	 * Right now the folio order is always going to be zero, but the
	 * code is ready for huge folios. The only assumption is that
	 * the base pgoff of memslots is naturally aligned with the
	 * requested page order, ensuring that huge folios can also use
	 * huge page table entries for GPA->HPA mapping.
	 *
	 * The order will be passed when creating the guest_memfd, and
	 * checked when creating memslots.
	 */
	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
	index = kvm_gmem_get_index(slot, gfn);
	index = ALIGN_DOWN(index, folio_nr_pages(folio));

	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	/* TODO: Support huge pages. */
	struct mempolicy *policy;
	struct folio *folio;

	/*
	 * Fast-path: See if folio is already present in mapping to avoid
	 * policy_lookup.
	 */
	folio = __filemap_get_folio(inode->i_mapping, index,
				    FGP_LOCK | FGP_ACCESSED, 0);
	if (!IS_ERR(folio))
		return folio;

	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					 mapping_gfp_mask(inode->i_mapping), policy);
	mpol_cond_put(policy);

	/*
	 * External interfaces like kvm_gmem_get_pfn() support dealing
	 * with hugepages to a degree, but internally, guest_memfd currently
	 * assumes that all folios are order-0 and handling would need
	 * to be updated for anything otherwise (e.g. page-clearing
	 * operations).
	 */
	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

	return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
		return KVM_FILTER_SHARED;

	return KVM_FILTER_PRIVATE;
}

static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
					pgoff_t end,
					enum kvm_gfn_range_filter attr_filter)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
			.attr_filter = attr_filter,
		};

		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
				      pgoff_t end)
{
	enum kvm_gfn_range_filter attr_filter;
	struct gmem_file *f;

	attr_filter = kvm_gmem_get_invalidate_filter(inode);

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
				      pgoff_t end)
{
	struct kvm *kvm = f->kvm;

	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
				    pgoff_t end)
{
	struct gmem_file *f;

	kvm_gmem_for_each_file(f, inode->i_mapping)
		__kvm_gmem_invalidate_end(f, start, end);
}
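
/*
 * The begin/end helpers above must be called as a balanced pair around
 * whatever operation actually drops the pages, mirroring the mmu_notifier
 * protocol. Sketch of the expected sequence (kvm_gmem_punch_hole() below is
 * the canonical user):
 *
 *	filemap_invalidate_lock(mapping);
 *	kvm_gmem_invalidate_begin(inode, start, end);
 *	...remove the pages...
 *	kvm_gmem_invalidate_end(inode, start, end);
 *	filemap_invalidate_unlock(mapping);
 */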

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	kvm_gmem_invalidate_begin(inode, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	kvm_gmem_invalidate_end(inode, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (IS_ERR(folio)) {
			r = PTR_ERR(folio);
			break;
		}

		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}
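
/*
 * Illustrative userspace usage (fd from KVM_CREATE_GUEST_MEMFD); offset and
 * len must be page-aligned, and FALLOC_FL_KEEP_SIZE is always required:
 *
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, off, len);		// preallocate
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE |
 *		      FALLOC_FL_PUNCH_HOLE, off, len);		// free pages
 */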

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct gmem_file *f = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = f->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 *
	 * Since .release is called only when the reference count is zero,
	 * after which file_ref_get() and get_file_active() fail,
	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
	 * file_ref_put() provides a full barrier, and get_file_active() the
	 * matching acquire barrier.
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&f->bindings, index, slot)
		WRITE_ONCE(slot->gmem.file, NULL);

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file. Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	__kvm_gmem_invalidate_begin(f, 0, -1ul,
				    kvm_gmem_get_invalidate_filter(inode));
	__kvm_gmem_invalidate_end(f, 0, -1ul);

	list_del(&f->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&f->bindings);
	kfree(f);

	kvm_put_kvm(kvm);

	return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file.
	 */
	return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
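
/*
 * The gmem_get_file scope class (see linux/cleanup.h) pairs
 * kvm_gmem_get_file() with an automatic fput() when the variable goes out of
 * scope, e.g.:
 *
 *	CLASS(gmem_get_file, file)(slot);
 *	if (!file)
 *		return -EFAULT;
 *	...use file; fput() runs on every return path...
 */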

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;
	vm_fault_t ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
		return VM_FAULT_SIGBUS;

	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EAGAIN)
			return VM_FAULT_RETRY;

		return vmf_error(PTR_ERR(folio));
	}

	if (WARN_ON_ONCE(folio_test_large(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_folio;
	}

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
	if (ret != VM_FAULT_LOCKED) {
		folio_unlock(folio);
		folio_put(folio);
	}

	return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);

	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
					     unsigned long addr, pgoff_t *pgoff)
{
	struct inode *inode = file_inode(vma->vm_file);

	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

	/*
	 * Return the memory policy for this index, or NULL if none is set.
	 *
	 * Returning NULL, e.g. instead of the current task's memory policy, is
	 * important for the .get_policy kernel ABI: it indicates that no
	 * explicit policy has been set via mbind() for this memory. The caller
	 * can then replace NULL with the default memory policy instead of the
	 * current task's memory policy.
	 */
	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
	.get_policy = kvm_gmem_get_policy,
	.set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!kvm_gmem_supports_mmap(file_inode(file)))
		return -ENODEV;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
	    (VM_SHARED | VM_MAYSHARE)) {
		return -EINVAL;
	}

	vma->vm_ops = &kvm_gmem_vm_ops;

	return 0;
}

static struct file_operations kvm_gmem_fops = {
	.mmap = kvm_gmem_mmap,
	.open = generic_file_open,
	.release = kvm_gmem_release,
	.fallocate = kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	kvm_gmem_invalidate_begin(mapping->host, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	kvm_gmem_invalidate_end(mapping->host, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
	struct page *page = folio_page(folio, 0);
	kvm_pfn_t pfn = page_to_pfn(page);
	int order = folio_order(folio);

	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio = kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
	.free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
	.setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
	return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	static const char *name = "[kvm-gmem]";
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	f = kzalloc_obj(*f);
	if (!f) {
		err = -ENOMEM;
		goto err_fd;
	}

	/* __fput() will take care of fops_put(). */
	if (!fops_get(&kvm_gmem_fops)) {
		err = -ENOENT;
		goto err_gmem;
	}

	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto err_fops;
	}

	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_inaccessible(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	GMEM_I(inode)->flags = flags;

	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_inode;
	}

	file->f_flags |= O_LARGEFILE;
	file->private_data = f;

	kvm_get_kvm(kvm);
	f->kvm = kvm;
	xa_init(&f->bindings);
	list_add(&f->entry, &inode->i_mapping->i_private_list);

	fd_install(fd, file);
	return fd;

err_inode:
	iput(inode);
err_fops:
	fops_put(&kvm_gmem_fops);
err_gmem:
	kfree(f);
err_fd:
	put_unused_fd(fd);
	return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;

	if (flags & ~kvm_gmem_get_supported_flags(kvm))
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}
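
/*
 * Illustrative userspace usage (ioctl on a VM fd; returns the new gmem fd):
 *
 *	struct kvm_create_guest_memfd args = {
 *		.size  = guest_mem_size,	// non-zero, page-aligned
 *		.flags = 0,			// optional GUEST_MEMFD_FLAG_* bits
 *	};
 *	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
 */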

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct gmem_file *f;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	f = file->private_data;
	if (f->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	if (!xa_empty(&f->bindings) &&
	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}
681
682 /*
683 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
684 * kvm_gmem_bind() must occur on a new memslot. Because the memslot
685 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
686 */
687 WRITE_ONCE(slot->gmem.file, file);
688 slot->gmem.pgoff = start;
689 if (kvm_gmem_supports_mmap(inode))
690 slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
691
692 xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
693 filemap_invalidate_unlock(inode->i_mapping);
694
695 /*
696 * Drop the reference to the file, even on success. The file pins KVM,
697 * not the other way 'round. Active bindings are invalidated if the
698 * file is closed before memslots are destroyed.
699 */
700 r = 0;
701 err:
702 fput(file);
703 return r;
704 }
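
/*
 * Note: userspace reaches kvm_gmem_bind() by creating a memslot via
 * KVM_SET_USER_MEMORY_REGION2 with the KVM_MEM_GUEST_MEMFD flag, supplying
 * the fd and offset in struct kvm_userspace_memory_region2's guest_memfd and
 * guest_memfd_offset fields.
 */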

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;

	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

	/*
	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
	 * cannot see this memslot.
	 */
	WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	/*
	 * Nothing to do if the underlying file was _already_ closed, as
	 * kvm_gmem_release() invalidates and nullifies all bindings.
	 */
	if (!slot->gmem.file)
		return;

	CLASS(gmem_get_file, file)(slot);

	/*
	 * However, if the file is _being_ closed, then the bindings need to be
	 * removed as kvm_gmem_release() might not run until after the memslot
	 * is freed. Note, modifying the bindings is safe even though the file
	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
	 * slots_lock, and only puts its reference to KVM after destroying all
	 * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
	 * yet destroyed the bindings or freed the gmem_file, and can't do so
	 * until the caller drops slots_lock.
	 */
	if (!file) {
		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
		return;
	}

	filemap_invalidate_lock(file->f_mapping);
	__kvm_gmem_unbind(slot, file->private_data);
	filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
					struct kvm_memory_slot *slot,
					pgoff_t index, kvm_pfn_t *pfn,
					int *max_order)
{
	struct file *slot_file = READ_ONCE(slot->gmem.file);
	struct gmem_file *f = file->private_data;
	struct folio *folio;

	if (file != slot_file) {
		WARN_ON_ONCE(slot_file);
		return ERR_PTR(-EFAULT);
	}

	if (xa_load(&f->bindings, index) != slot) {
		WARN_ON_ONCE(xa_load(&f->bindings, index));
		return ERR_PTR(-EIO);
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (IS_ERR(folio))
		return folio;

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-EHWPOISON);
	}

	*pfn = folio_file_pfn(folio, index);
	if (max_order)
		*max_order = 0;

	return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
		     int *max_order)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	int r = 0;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!folio_test_uptodate(folio)) {
		clear_highpage(folio_page(folio, 0));
		folio_mark_uptodate(folio);
	}

	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

	folio_unlock(folio);

	if (!r)
		*page = folio_file_page(folio, index);
	else
		folio_put(folio);

	return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
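
/*
 * Sketch of the expected caller pattern (illustrative only; "fault" names an
 * arch-specific fault context, and the release helper is an assumption):
 *
 *	kvm_pfn_t pfn;
 *	struct page *page;
 *	int max_order, r;
 *
 *	r = kvm_gmem_get_pfn(kvm, fault->slot, fault->gfn, &pfn, &page,
 *			     &max_order);
 *	if (r)
 *		return r;
 *	...map pfn into the guest...
 *	kvm_release_page_clean(page);	// drop the reference taken above
 */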

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
				struct file *file, gfn_t gfn, struct page *src_page,
				kvm_gmem_populate_cb post_populate, void *opaque)
{
	pgoff_t index = kvm_gmem_get_index(slot, gfn);
	struct folio *folio;
	kvm_pfn_t pfn;
	int ret;

	filemap_invalidate_lock(file->f_mapping);

	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_unlock;
	}

	folio_unlock(folio);

	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
		ret = -EINVAL;
		goto out_put_folio;
	}

	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
	if (!ret)
		folio_mark_uptodate(folio);

out_put_folio:
	folio_put(folio);
out_unlock:
	filemap_invalidate_unlock(file->f_mapping);
	return ret;
}

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
		       kvm_gmem_populate_cb post_populate, void *opaque)
{
	struct kvm_memory_slot *slot;
	int ret = 0;
	long i;

	lockdep_assert_held(&kvm->slots_lock);

	if (WARN_ON_ONCE(npages <= 0))
		return -EINVAL;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
		return -EINVAL;

	slot = gfn_to_memslot(kvm, start_gfn);
	if (!kvm_slot_has_gmem(slot))
		return -EINVAL;

	CLASS(gmem_get_file, file)(slot);
	if (!file)
		return -EFAULT;

	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
	for (i = 0; i < npages; i++) {
		struct page *src_page = NULL;

		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (src) {
			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;

			ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
			if (ret < 0)
				break;
			if (ret != 1) {
				ret = -ENOMEM;
				break;
			}
		}

		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
					  post_populate, opaque);

		if (src_page)
			put_page(src_page);

		if (ret)
			break;
	}

	return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
	struct gmem_inode *gi = __gi;

	/*
	 * Note! Don't initialize the inode with anything specific to the
	 * guest_memfd instance, or that might be specific to how the inode is
	 * used (from the VFS-layer's perspective). This hook is called only
	 * during the initial slab allocation, i.e. only fields/state that are
	 * idempotent across _all_ use of the inode _object_ can be initialized
	 * at this time!
	 */
	inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
	struct gmem_inode *gi;

	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
	if (!gi)
		return NULL;

	mpol_shared_policy_init(&gi->policy, NULL);

	gi->flags = 0;
	return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
	mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
	.statfs = simple_statfs,
	.alloc_inode = kvm_gmem_alloc_inode,
	.destroy_inode = kvm_gmem_destroy_inode,
	.free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx = fc->fs_private;
	ctx->ops = &kvm_gmem_super_operations;

	return 0;
}

static struct file_system_type kvm_gmem_fs = {
	.name = "guest_memfd",
	.init_fs_context = kvm_gmem_init_fs_context,
	.kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

	if (IS_ERR(kvm_gmem_mnt))
		return PTR_ERR(kvm_gmem_mnt);

	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
	return 0;
}

int kvm_gmem_init(struct module *module)
{
	struct kmem_cache_args args = {
		.align = 0,
		.ctor = kvm_gmem_init_inode_once,
	};
	int ret;

	kvm_gmem_fops.owner = module;
	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
						  sizeof(struct gmem_inode),
						  &args, SLAB_ACCOUNT);
	if (!kvm_gmem_inode_cachep)
		return -ENOMEM;

	ret = kvm_gmem_init_mount();
	if (ret) {
		kmem_cache_destroy(kvm_gmem_inode_cachep);
		return ret;
	}
	return 0;
}

void kvm_gmem_exit(void)
{
	kern_unmount(kvm_gmem_mnt);
	kvm_gmem_mnt = NULL;
	rcu_barrier();
	kmem_cache_destroy(kvm_gmem_inode_cachep);
}