xref: /linux/virt/kvm/guest_memfd.c (revision 51d90a15fedf8366cb96ef68d0ea2d0bf15417d2)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/anon_inodes.h>
3 #include <linux/backing-dev.h>
4 #include <linux/falloc.h>
5 #include <linux/fs.h>
6 #include <linux/kvm_host.h>
7 #include <linux/mempolicy.h>
8 #include <linux/pseudo_fs.h>
9 #include <linux/pagemap.h>
10 
11 #include "kvm_mm.h"
12 
13 static struct vfsmount *kvm_gmem_mnt;
14 
15 /*
16  * A guest_memfd instance can be associated with multiple VMs, each with its own
17  * "view" of the underlying physical memory.
18  *
19  * The gmem's inode is effectively the raw underlying physical storage, and is
20  * used to track properties of the physical memory, while each gmem file is
21  * effectively a single VM's view of that storage, and is used to track assets
22  * specific to its associated VM, e.g. memslots=>gmem bindings.
23  */
24 struct gmem_file {
25 	struct kvm *kvm;
26 	struct xarray bindings;
27 	struct list_head entry;
28 };
29 
30 struct gmem_inode {
31 	struct shared_policy policy;
32 	struct inode vfs_inode;
33 
34 	u64 flags;
35 };
36 
37 static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
38 {
39 	return container_of(inode, struct gmem_inode, vfs_inode);
40 }
41 
42 #define kvm_gmem_for_each_file(f, mapping) \
43 	list_for_each_entry(f, &(mapping)->i_private_list, entry)
44 
45 /**
46  * folio_file_pfn - like folio_file_page, but return a pfn.
47  * @folio: The folio which contains this index.
48  * @index: The index we want to look up.
49  *
50  * Return: The pfn for this index.
51  */
52 static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
53 {
54 	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
55 }
56 
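/*
 * Translate a gfn in @slot to its page offset within the backing guest_memfd
 * file.  For example (illustrative values only), a slot with base_gfn 0x1000
 * and gmem.pgoff 0x200 maps gfn 0x1010 to file index 0x210.
 */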
57 static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
58 {
59 	return gfn - slot->base_gfn + slot->gmem.pgoff;
60 }
61 
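/*
 * Invoke the arch hook that readies the backing page for guest use, e.g. to
 * update RMP entries for SEV-SNP guests.  This is a no-op when
 * CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE is not selected.
 */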
62 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
63 				    pgoff_t index, struct folio *folio)
64 {
65 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
66 	kvm_pfn_t pfn = folio_file_pfn(folio, index);
67 	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
68 	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
69 	if (rc) {
70 		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
71 				    index, gfn, pfn, rc);
72 		return rc;
73 	}
74 #endif
75 
76 	return 0;
77 }
78 
79 static inline void kvm_gmem_mark_prepared(struct folio *folio)
80 {
81 	folio_mark_uptodate(folio);
82 }
83 
84 /*
85  * Process @folio, which contains @gfn, so that the guest can use it.
86  * The folio must be locked and the gfn must be contained in @slot.
87  * On successful return, the folio has been zeroed (so no host data is
88  * leaked to the guest) and the up-to-date flag is set.
89  */
90 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
91 				  gfn_t gfn, struct folio *folio)
92 {
93 	unsigned long nr_pages, i;
94 	pgoff_t index;
95 	int r;
96 
97 	nr_pages = folio_nr_pages(folio);
98 	for (i = 0; i < nr_pages; i++)
99 		clear_highpage(folio_page(folio, i));
100 
101 	/*
102 	 * Preparing huge folios should always be safe, since it should
103 	 * be possible to split them later if needed.
104 	 *
105 	 * Right now the folio order is always going to be zero, but the
106 	 * code is ready for huge folios.  The only assumption is that
107 	 * the base pgoff of memslots is naturally aligned with the
108 	 * requested page order, ensuring that huge folios can also use
109 	 * huge page table entries for GPA->HPA mapping.
110 	 *
111 	 * The order will be passed when creating the guest_memfd, and
112 	 * checked when creating memslots.
113 	 */
114 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
115 	index = kvm_gmem_get_index(slot, gfn);
116 	index = ALIGN_DOWN(index, folio_nr_pages(folio));
117 	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
118 	if (!r)
119 		kvm_gmem_mark_prepared(folio);
120 
121 	return r;
122 }
123 
124 /*
125  * Returns a locked folio on success.  The caller is responsible for
126  * setting the up-to-date flag before the memory is mapped into the guest.
127  * There is no backing storage for the memory, so the folio will remain
128  * up-to-date until it's removed.
129  *
130  * Ignore accessed, referenced, and dirty flags.  The memory is
131  * unevictable and there is no storage to write back to.
132  */
133 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
134 {
135 	/* TODO: Support huge pages. */
136 	struct mempolicy *policy;
137 	struct folio *folio;
138 
139 	/*
140 	 * Fast-path: See if folio is already present in mapping to avoid
141 	 * policy_lookup.
142 	 */
143 	folio = __filemap_get_folio(inode->i_mapping, index,
144 				    FGP_LOCK | FGP_ACCESSED, 0);
145 	if (!IS_ERR(folio))
146 		return folio;
147 
148 	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
149 	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
150 					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
151 					 mapping_gfp_mask(inode->i_mapping), policy);
152 	mpol_cond_put(policy);
153 
154 	return folio;
155 }
156 
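/*
 * Invalidations only need to target the mappings that can exist for this
 * guest_memfd: files created with GUEST_MEMFD_FLAG_INIT_SHARED back shared
 * memory, everything else backs private memory.
 */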
157 static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
158 {
159 	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
160 		return KVM_FILTER_SHARED;
161 
162 	return KVM_FILTER_PRIVATE;
163 }
164 
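/*
 * Zap the SPTEs for every memslot bound to the range [start, end) of this
 * gmem file, flushing remote TLBs if any mapping was actually removed.
 * Callers hold the filemap invalidate lock so that the bindings stay stable.
 */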
165 static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
166 					pgoff_t end,
167 					enum kvm_gfn_range_filter attr_filter)
168 {
169 	bool flush = false, found_memslot = false;
170 	struct kvm_memory_slot *slot;
171 	struct kvm *kvm = f->kvm;
172 	unsigned long index;
173 
174 	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
175 		pgoff_t pgoff = slot->gmem.pgoff;
176 
177 		struct kvm_gfn_range gfn_range = {
178 			.start = slot->base_gfn + max(pgoff, start) - pgoff,
179 			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
180 			.slot = slot,
181 			.may_block = true,
182 			.attr_filter = attr_filter,
183 		};
184 
185 		if (!found_memslot) {
186 			found_memslot = true;
187 
188 			KVM_MMU_LOCK(kvm);
189 			kvm_mmu_invalidate_begin(kvm);
190 		}
191 
192 		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
193 	}
194 
195 	if (flush)
196 		kvm_flush_remote_tlbs(kvm);
197 
198 	if (found_memslot)
199 		KVM_MMU_UNLOCK(kvm);
200 }
201 
202 static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
203 				      pgoff_t end)
204 {
205 	enum kvm_gfn_range_filter attr_filter;
206 	struct gmem_file *f;
207 
208 	attr_filter = kvm_gmem_get_invalidate_filter(inode);
209 
210 	kvm_gmem_for_each_file(f, inode->i_mapping)
211 		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
212 }
213 
214 static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
215 				      pgoff_t end)
216 {
217 	struct kvm *kvm = f->kvm;
218 
219 	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
220 		KVM_MMU_LOCK(kvm);
221 		kvm_mmu_invalidate_end(kvm);
222 		KVM_MMU_UNLOCK(kvm);
223 	}
224 }
225 
226 static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
227 				    pgoff_t end)
228 {
229 	struct gmem_file *f;
230 
231 	kvm_gmem_for_each_file(f, inode->i_mapping)
232 		__kvm_gmem_invalidate_end(f, start, end);
233 }
234 
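/*
 * Handle FALLOC_FL_PUNCH_HOLE: zap any guest mappings covering the range and
 * then drop the backing folios; later allocations start from zeroed pages.
 */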
235 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
236 {
237 	pgoff_t start = offset >> PAGE_SHIFT;
238 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
239 
240 	/*
241 	 * Bindings must be stable across invalidation to ensure the start+end
242 	 * are balanced.
243 	 */
244 	filemap_invalidate_lock(inode->i_mapping);
245 
246 	kvm_gmem_invalidate_begin(inode, start, end);
247 
248 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
249 
250 	kvm_gmem_invalidate_end(inode, start, end);
251 
252 	filemap_invalidate_unlock(inode->i_mapping);
253 
254 	return 0;
255 }
256 
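/*
 * Preallocate folios for the given range (plain fallocate()).  The loop can
 * be interrupted by a pending signal, in which case -EINTR is returned.
 */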
257 static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
258 {
259 	struct address_space *mapping = inode->i_mapping;
260 	pgoff_t start, index, end;
261 	int r;
262 
263 	/* The file size is fixed at creation; allocations cannot extend it. */
264 	if (offset + len > i_size_read(inode))
265 		return -EINVAL;
266 
267 	filemap_invalidate_lock_shared(mapping);
268 
269 	start = offset >> PAGE_SHIFT;
270 	end = (offset + len) >> PAGE_SHIFT;
271 
272 	r = 0;
273 	for (index = start; index < end; ) {
274 		struct folio *folio;
275 
276 		if (signal_pending(current)) {
277 			r = -EINTR;
278 			break;
279 		}
280 
281 		folio = kvm_gmem_get_folio(inode, index);
282 		if (IS_ERR(folio)) {
283 			r = PTR_ERR(folio);
284 			break;
285 		}
286 
287 		index = folio_next_index(folio);
288 
289 		folio_unlock(folio);
290 		folio_put(folio);
291 
292 		/* 64-bit only, wrapping the index should be impossible. */
293 		if (WARN_ON_ONCE(!index))
294 			break;
295 
296 		cond_resched();
297 	}
298 
299 	filemap_invalidate_unlock_shared(mapping);
300 
301 	return r;
302 }
303 
304 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
305 			       loff_t len)
306 {
307 	int ret;
308 
309 	if (!(mode & FALLOC_FL_KEEP_SIZE))
310 		return -EOPNOTSUPP;
311 
312 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
313 		return -EOPNOTSUPP;
314 
315 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
316 		return -EINVAL;
317 
318 	if (mode & FALLOC_FL_PUNCH_HOLE)
319 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
320 	else
321 		ret = kvm_gmem_allocate(file_inode(file), offset, len);
322 
323 	if (!ret)
324 		file_modified(file);
325 	return ret;
326 }
327 
328 static int kvm_gmem_release(struct inode *inode, struct file *file)
329 {
330 	struct gmem_file *f = file->private_data;
331 	struct kvm_memory_slot *slot;
332 	struct kvm *kvm = f->kvm;
333 	unsigned long index;
334 
335 	/*
336 	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
337 	 * reference to the file and thus no new bindings can be created, but
338 	 * dereferencing the slot for existing bindings needs to be protected
339 	 * against memslot updates, specifically so that unbind doesn't race
340 	 * and free the memslot (kvm_gmem_get_file() will return NULL).
341 	 *
342 	 * Since .release is called only when the reference count is zero,
343 	 * after which file_ref_get() and get_file_active() fail,
344 	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
345 	 * file_ref_put() provides a full barrier, and get_file_active() the
346 	 * matching acquire barrier.
347 	 */
348 	mutex_lock(&kvm->slots_lock);
349 
350 	filemap_invalidate_lock(inode->i_mapping);
351 
352 	xa_for_each(&f->bindings, index, slot)
353 		WRITE_ONCE(slot->gmem.file, NULL);
354 
355 	/*
356 	 * All in-flight operations are gone and no new bindings can be created.
357 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
358 	 * memory, as its lifetime is associated with the inode, not the file.
359 	 */
360 	__kvm_gmem_invalidate_begin(f, 0, -1ul,
361 				    kvm_gmem_get_invalidate_filter(inode));
362 	__kvm_gmem_invalidate_end(f, 0, -1ul);
363 
364 	list_del(&f->entry);
365 
366 	filemap_invalidate_unlock(inode->i_mapping);
367 
368 	mutex_unlock(&kvm->slots_lock);
369 
370 	xa_destroy(&f->bindings);
371 	kfree(f);
372 
373 	kvm_put_kvm(kvm);
374 
375 	return 0;
376 }
377 
378 static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
379 {
380 	/*
381 	 * Do not return slot->gmem.file if it has already been closed;
382 	 * there might be some time between the last fput() and when
383 	 * kvm_gmem_release() clears slot->gmem.file.
384 	 */
385 	return get_file_active(&slot->gmem.file);
386 }
387 
388 DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
389 	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
390 
391 static bool kvm_gmem_supports_mmap(struct inode *inode)
392 {
393 	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
394 }
395 
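/*
 * Page fault handler for userspace mappings.  Faulting is allowed only within
 * the file size and only for files created with GUEST_MEMFD_FLAG_INIT_SHARED;
 * newly allocated folios are zeroed and marked up-to-date before being mapped.
 */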
396 static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
397 {
398 	struct inode *inode = file_inode(vmf->vma->vm_file);
399 	struct folio *folio;
400 	vm_fault_t ret = VM_FAULT_LOCKED;
401 
402 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
403 		return VM_FAULT_SIGBUS;
404 
405 	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
406 		return VM_FAULT_SIGBUS;
407 
408 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
409 	if (IS_ERR(folio)) {
410 		if (PTR_ERR(folio) == -EAGAIN)
411 			return VM_FAULT_RETRY;
412 
413 		return vmf_error(PTR_ERR(folio));
414 	}
415 
416 	if (WARN_ON_ONCE(folio_test_large(folio))) {
417 		ret = VM_FAULT_SIGBUS;
418 		goto out_folio;
419 	}
420 
421 	if (!folio_test_uptodate(folio)) {
422 		clear_highpage(folio_page(folio, 0));
423 		kvm_gmem_mark_prepared(folio);
424 	}
425 
426 	vmf->page = folio_file_page(folio, vmf->pgoff);
427 
428 out_folio:
429 	if (ret != VM_FAULT_LOCKED) {
430 		folio_unlock(folio);
431 		folio_put(folio);
432 	}
433 
434 	return ret;
435 }
436 
437 #ifdef CONFIG_NUMA
438 static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
439 {
440 	struct inode *inode = file_inode(vma->vm_file);
441 
442 	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
443 }
444 
445 static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
446 					     unsigned long addr, pgoff_t *pgoff)
447 {
448 	struct inode *inode = file_inode(vma->vm_file);
449 
450 	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
451 
452 	/*
453 	 * Return the memory policy for this index, or NULL if none is set.
454 	 *
455 	 * Returning NULL, e.g. instead of the current task's memory policy, is
456 	 * important for the .get_policy kernel ABI: it indicates that no
457 	 * explicit policy has been set via mbind() for this memory. The caller
458 	 * can then replace NULL with the default memory policy instead of the
459 	 * current task's memory policy.
460 	 */
461 	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
462 }
463 #endif /* CONFIG_NUMA */
464 
465 static const struct vm_operations_struct kvm_gmem_vm_ops = {
466 	.fault		= kvm_gmem_fault_user_mapping,
467 #ifdef CONFIG_NUMA
468 	.get_policy	= kvm_gmem_get_policy,
469 	.set_policy	= kvm_gmem_set_policy,
470 #endif
471 };
472 
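/*
 * mmap() is supported only when the guest_memfd was created with
 * GUEST_MEMFD_FLAG_MMAP, and only for shared (VM_SHARED | VM_MAYSHARE)
 * mappings.
 */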
473 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
474 {
475 	if (!kvm_gmem_supports_mmap(file_inode(file)))
476 		return -ENODEV;
477 
478 	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
479 	    (VM_SHARED | VM_MAYSHARE)) {
480 		return -EINVAL;
481 	}
482 
483 	vma->vm_ops = &kvm_gmem_vm_ops;
484 
485 	return 0;
486 }
487 
488 static struct file_operations kvm_gmem_fops = {
489 	.mmap		= kvm_gmem_mmap,
490 	.open		= generic_file_open,
491 	.release	= kvm_gmem_release,
492 	.fallocate	= kvm_gmem_fallocate,
493 };
494 
495 static int kvm_gmem_migrate_folio(struct address_space *mapping,
496 				  struct folio *dst, struct folio *src,
497 				  enum migrate_mode mode)
498 {
499 	WARN_ON_ONCE(1);
500 	return -EINVAL;
501 }
502 
503 static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
504 {
505 	pgoff_t start, end;
506 
507 	filemap_invalidate_lock_shared(mapping);
508 
509 	start = folio->index;
510 	end = start + folio_nr_pages(folio);
511 
512 	kvm_gmem_invalidate_begin(mapping->host, start, end);
513 
514 	/*
515 	 * Do not truncate the range, what action is taken in response to the
516 	 * error is userspace's decision (assuming the architecture supports
517 	 * gracefully handling memory errors).  If/when the guest attempts to
518 	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
519 	 * at which point KVM can either terminate the VM or propagate the
520 	 * error to userspace.
521 	 */
522 
523 	kvm_gmem_invalidate_end(mapping->host, start, end);
524 
525 	filemap_invalidate_unlock_shared(mapping);
526 
527 	return MF_DELAYED;
528 }
529 
530 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
531 static void kvm_gmem_free_folio(struct folio *folio)
532 {
533 	struct page *page = folio_page(folio, 0);
534 	kvm_pfn_t pfn = page_to_pfn(page);
535 	int order = folio_order(folio);
536 
537 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
538 }
539 #endif
540 
541 static const struct address_space_operations kvm_gmem_aops = {
542 	.dirty_folio = noop_dirty_folio,
543 	.migrate_folio	= kvm_gmem_migrate_folio,
544 	.error_remove_folio = kvm_gmem_error_folio,
545 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
546 	.free_folio = kvm_gmem_free_folio,
547 #endif
548 };
549 
550 static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
551 			    struct iattr *attr)
552 {
553 	return -EINVAL;
554 }
555 static const struct inode_operations kvm_gmem_iops = {
556 	.setattr	= kvm_gmem_setattr,
557 };
558 
559 bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
560 {
561 	return true;
562 }
563 
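/*
 * Allocate the fd, the gmem_file, and a fresh inode on the guest_memfd
 * pseudo-filesystem, then wire them together.  The inode owns the backing
 * memory and its flags; the file represents this VM's view and pins the VM
 * via kvm_get_kvm() until kvm_gmem_release().
 */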
564 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
565 {
566 	static const char *name = "[kvm-gmem]";
567 	struct gmem_file *f;
568 	struct inode *inode;
569 	struct file *file;
570 	int fd, err;
571 
572 	fd = get_unused_fd_flags(0);
573 	if (fd < 0)
574 		return fd;
575 
576 	f = kzalloc(sizeof(*f), GFP_KERNEL);
577 	if (!f) {
578 		err = -ENOMEM;
579 		goto err_fd;
580 	}
581 
582 	/* __fput() will take care of fops_put(). */
583 	if (!fops_get(&kvm_gmem_fops)) {
584 		err = -ENOENT;
585 		goto err_gmem;
586 	}
587 
588 	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
589 	if (IS_ERR(inode)) {
590 		err = PTR_ERR(inode);
591 		goto err_fops;
592 	}
593 
594 	inode->i_op = &kvm_gmem_iops;
595 	inode->i_mapping->a_ops = &kvm_gmem_aops;
596 	inode->i_mode |= S_IFREG;
597 	inode->i_size = size;
598 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
599 	mapping_set_inaccessible(inode->i_mapping);
600 	/* Unmovable mappings are supposed to be marked unevictable as well. */
601 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
602 
603 	GMEM_I(inode)->flags = flags;
604 
605 	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
606 	if (IS_ERR(file)) {
607 		err = PTR_ERR(file);
608 		goto err_inode;
609 	}
610 
611 	file->f_flags |= O_LARGEFILE;
612 	file->private_data = f;
613 
614 	kvm_get_kvm(kvm);
615 	f->kvm = kvm;
616 	xa_init(&f->bindings);
617 	list_add(&f->entry, &inode->i_mapping->i_private_list);
618 
619 	fd_install(fd, file);
620 	return fd;
621 
622 err_inode:
623 	iput(inode);
624 err_fops:
625 	fops_put(&kvm_gmem_fops);
626 err_gmem:
627 	kfree(f);
628 err_fd:
629 	put_unused_fd(fd);
630 	return err;
631 }
632 
633 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
634 {
635 	loff_t size = args->size;
636 	u64 flags = args->flags;
637 
638 	if (flags & ~kvm_gmem_get_supported_flags(kvm))
639 		return -EINVAL;
640 
641 	if (size <= 0 || !PAGE_ALIGNED(size))
642 		return -EINVAL;
643 
644 	return __kvm_gmem_create(kvm, size, flags);
645 }
646 
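/*
 * Bind a memslot to the range of the guest_memfd starting at @offset.  The
 * range must lie within the file and must not overlap an existing binding,
 * and the file must belong to @kvm.  On success the binding is recorded in
 * the file's xarray so that invalidations can find the memslot.
 */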
647 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
648 		  unsigned int fd, loff_t offset)
649 {
650 	loff_t size = slot->npages << PAGE_SHIFT;
651 	unsigned long start, end;
652 	struct gmem_file *f;
653 	struct inode *inode;
654 	struct file *file;
655 	int r = -EINVAL;
656 
657 	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
658 
659 	file = fget(fd);
660 	if (!file)
661 		return -EBADF;
662 
663 	if (file->f_op != &kvm_gmem_fops)
664 		goto err;
665 
666 	f = file->private_data;
667 	if (f->kvm != kvm)
668 		goto err;
669 
670 	inode = file_inode(file);
671 
672 	if (offset < 0 || !PAGE_ALIGNED(offset) ||
673 	    offset + size > i_size_read(inode))
674 		goto err;
675 
676 	filemap_invalidate_lock(inode->i_mapping);
677 
678 	start = offset >> PAGE_SHIFT;
679 	end = start + slot->npages;
680 
681 	if (!xa_empty(&f->bindings) &&
682 	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
683 		filemap_invalidate_unlock(inode->i_mapping);
684 		goto err;
685 	}
686 
687 	/*
688 	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once created,
689 	 * so kvm_gmem_bind() must be called on a new memslot.  Because the memslot
690 	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
691 	 */
692 	WRITE_ONCE(slot->gmem.file, file);
693 	slot->gmem.pgoff = start;
694 	if (kvm_gmem_supports_mmap(inode))
695 		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
696 
697 	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
698 	filemap_invalidate_unlock(inode->i_mapping);
699 
700 	/*
701 	 * Drop the reference to the file, even on success.  The file pins KVM,
702 	 * not the other way 'round.  Active bindings are invalidated if the
703 	 * file is closed before memslots are destroyed.
704 	 */
705 	r = 0;
706 err:
707 	fput(file);
708 	return r;
709 }
710 
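/*
 * Drop a memslot's entries from the file's bindings xarray and clear
 * slot->gmem.file.  Callers provide the necessary serialization against
 * kvm_gmem_get_pfn() and kvm_gmem_release().
 */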
711 static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
712 {
713 	unsigned long start = slot->gmem.pgoff;
714 	unsigned long end = start + slot->npages;
715 
716 	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
717 
718 	/*
719 	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
720 	 * cannot see this memslot.
721 	 */
722 	WRITE_ONCE(slot->gmem.file, NULL);
723 }
724 
725 void kvm_gmem_unbind(struct kvm_memory_slot *slot)
726 {
727 	/*
728 	 * Nothing to do if the underlying file was _already_ closed, as
729 	 * kvm_gmem_release() invalidates and nullifies all bindings.
730 	 */
731 	if (!slot->gmem.file)
732 		return;
733 
734 	CLASS(gmem_get_file, file)(slot);
735 
736 	/*
737 	 * However, if the file is _being_ closed, then the bindings need to be
738 	 * removed as kvm_gmem_release() might not run until after the memslot
739 	 * is freed.  Note, modifying the bindings is safe even though the file
740 	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
741 	 * slots_lock, and only puts its reference to KVM after destroying all
742 	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
743 	 * yet destroyed the bindings or freed the gmem_file, and can't do so
744 	 * until the caller drops slots_lock.
745 	 */
746 	if (!file) {
747 		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
748 		return;
749 	}
750 
751 	filemap_invalidate_lock(file->f_mapping);
752 	__kvm_gmem_unbind(slot, file->private_data);
753 	filemap_invalidate_unlock(file->f_mapping);
754 }
755 
756 /* Returns a locked folio on success.  */
757 static struct folio *__kvm_gmem_get_pfn(struct file *file,
758 					struct kvm_memory_slot *slot,
759 					pgoff_t index, kvm_pfn_t *pfn,
760 					bool *is_prepared, int *max_order)
761 {
762 	struct file *slot_file = READ_ONCE(slot->gmem.file);
763 	struct gmem_file *f = file->private_data;
764 	struct folio *folio;
765 
766 	if (file != slot_file) {
767 		WARN_ON_ONCE(slot_file);
768 		return ERR_PTR(-EFAULT);
769 	}
770 
771 	if (xa_load(&f->bindings, index) != slot) {
772 		WARN_ON_ONCE(xa_load(&f->bindings, index));
773 		return ERR_PTR(-EIO);
774 	}
775 
776 	folio = kvm_gmem_get_folio(file_inode(file), index);
777 	if (IS_ERR(folio))
778 		return folio;
779 
780 	if (folio_test_hwpoison(folio)) {
781 		folio_unlock(folio);
782 		folio_put(folio);
783 		return ERR_PTR(-EHWPOISON);
784 	}
785 
786 	*pfn = folio_file_pfn(folio, index);
787 	if (max_order)
788 		*max_order = 0;
789 
790 	*is_prepared = folio_test_uptodate(folio);
791 	return folio;
792 }
793 
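/*
 * Resolve @gfn in @slot to a pfn/page backed by guest_memfd.  The folio is
 * allocated (and prepared via the arch hook) on first use; on success the
 * caller receives a reference to the page via @page.
 */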
794 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
795 		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
796 		     int *max_order)
797 {
798 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
799 	struct folio *folio;
800 	bool is_prepared = false;
801 	int r = 0;
802 
803 	CLASS(gmem_get_file, file)(slot);
804 	if (!file)
805 		return -EFAULT;
806 
807 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
808 	if (IS_ERR(folio))
809 		return PTR_ERR(folio);
810 
811 	if (!is_prepared)
812 		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
813 
814 	folio_unlock(folio);
815 
816 	if (!r)
817 		*page = folio_file_page(folio, index);
818 	else
819 		folio_put(folio);
820 
821 	return r;
822 }
823 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
824 
825 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
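/*
 * Populate guest memory for [start_gfn, start_gfn + npages) by allocating
 * folios and handing them to @post_populate (e.g. to copy in an initial
 * encrypted-guest payload), marking each folio prepared on success.  Returns
 * the number of pages processed, or a negative errno if none were processed.
 */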
826 long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
827 		       kvm_gmem_populate_cb post_populate, void *opaque)
828 {
829 	struct kvm_memory_slot *slot;
830 	void __user *p;
831 
832 	int ret = 0, max_order;
833 	long i;
834 
835 	lockdep_assert_held(&kvm->slots_lock);
836 
837 	if (WARN_ON_ONCE(npages <= 0))
838 		return -EINVAL;
839 
840 	slot = gfn_to_memslot(kvm, start_gfn);
841 	if (!kvm_slot_has_gmem(slot))
842 		return -EINVAL;
843 
844 	CLASS(gmem_get_file, file)(slot);
845 	if (!file)
846 		return -EFAULT;
847 
848 	filemap_invalidate_lock(file->f_mapping);
849 
850 	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
851 	for (i = 0; i < npages; i += (1 << max_order)) {
852 		struct folio *folio;
853 		gfn_t gfn = start_gfn + i;
854 		pgoff_t index = kvm_gmem_get_index(slot, gfn);
855 		bool is_prepared = false;
856 		kvm_pfn_t pfn;
857 
858 		if (signal_pending(current)) {
859 			ret = -EINTR;
860 			break;
861 		}
862 
863 		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
864 		if (IS_ERR(folio)) {
865 			ret = PTR_ERR(folio);
866 			break;
867 		}
868 
869 		if (is_prepared) {
870 			folio_unlock(folio);
871 			folio_put(folio);
872 			ret = -EEXIST;
873 			break;
874 		}
875 
876 		folio_unlock(folio);
877 		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
878 			(npages - i) < (1 << max_order));
879 
880 		ret = -EINVAL;
881 		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
882 							KVM_MEMORY_ATTRIBUTE_PRIVATE,
883 							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
884 			if (!max_order)
885 				goto put_folio_and_exit;
886 			max_order--;
887 		}
888 
889 		p = src ? src + i * PAGE_SIZE : NULL;
890 		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
891 		if (!ret)
892 			kvm_gmem_mark_prepared(folio);
893 
894 put_folio_and_exit:
895 		folio_put(folio);
896 		if (ret)
897 			break;
898 	}
899 
900 	filemap_invalidate_unlock(file->f_mapping);
901 
902 	return ret && !i ? ret : i;
903 }
904 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
905 #endif
906 
907 static struct kmem_cache *kvm_gmem_inode_cachep;
908 
909 static void kvm_gmem_init_inode_once(void *__gi)
910 {
911 	struct gmem_inode *gi = __gi;
912 
913 	/*
914 	 * Note!  Don't initialize the inode with anything specific to the
915 	 * guest_memfd instance, or that might be specific to how the inode is
916 	 * used (from the VFS-layer's perspective).  This hook is called only
917 	 * during the initial slab allocation, i.e. only fields/state that are
918 	 * idempotent across _all_ use of the inode _object_ can be initialized
919 	 * at this time!
920 	 */
921 	inode_init_once(&gi->vfs_inode);
922 }
923 
924 static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
925 {
926 	struct gmem_inode *gi;
927 
928 	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
929 	if (!gi)
930 		return NULL;
931 
932 	mpol_shared_policy_init(&gi->policy, NULL);
933 
934 	gi->flags = 0;
935 	return &gi->vfs_inode;
936 }
937 
938 static void kvm_gmem_destroy_inode(struct inode *inode)
939 {
940 	mpol_free_shared_policy(&GMEM_I(inode)->policy);
941 }
942 
943 static void kvm_gmem_free_inode(struct inode *inode)
944 {
945 	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
946 }
947 
948 static const struct super_operations kvm_gmem_super_operations = {
949 	.statfs		= simple_statfs,
950 	.alloc_inode	= kvm_gmem_alloc_inode,
951 	.destroy_inode	= kvm_gmem_destroy_inode,
952 	.free_inode	= kvm_gmem_free_inode,
953 };
954 
955 static int kvm_gmem_init_fs_context(struct fs_context *fc)
956 {
957 	struct pseudo_fs_context *ctx;
958 
959 	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
960 		return -ENOMEM;
961 
962 	fc->s_iflags |= SB_I_NOEXEC;
963 	fc->s_iflags |= SB_I_NODEV;
964 	ctx = fc->fs_private;
965 	ctx->ops = &kvm_gmem_super_operations;
966 
967 	return 0;
968 }
969 
970 static struct file_system_type kvm_gmem_fs = {
971 	.name		 = "guest_memfd",
972 	.init_fs_context = kvm_gmem_init_fs_context,
973 	.kill_sb	 = kill_anon_super,
974 };
975 
976 static int kvm_gmem_init_mount(void)
977 {
978 	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
979 
980 	if (IS_ERR(kvm_gmem_mnt))
981 		return PTR_ERR(kvm_gmem_mnt);
982 
983 	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
984 	return 0;
985 }
986 
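/*
 * Module init: create the gmem_inode slab cache and mount the guest_memfd
 * pseudo-filesystem.  kvm_gmem_exit() tears both down again, with an
 * rcu_barrier() so in-flight inode frees finish before the cache is destroyed.
 */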
987 int kvm_gmem_init(struct module *module)
988 {
989 	struct kmem_cache_args args = {
990 		.align = 0,
991 		.ctor = kvm_gmem_init_inode_once,
992 	};
993 	int ret;
994 
995 	kvm_gmem_fops.owner = module;
996 	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
997 						  sizeof(struct gmem_inode),
998 						  &args, SLAB_ACCOUNT);
999 	if (!kvm_gmem_inode_cachep)
1000 		return -ENOMEM;
1001 
1002 	ret = kvm_gmem_init_mount();
1003 	if (ret) {
1004 		kmem_cache_destroy(kvm_gmem_inode_cachep);
1005 		return ret;
1006 	}
1007 	return 0;
1008 }
1009 
1010 void kvm_gmem_exit(void)
1011 {
1012 	kern_unmount(kvm_gmem_mnt);
1013 	kvm_gmem_mnt = NULL;
1014 	rcu_barrier();
1015 	kmem_cache_destroy(kvm_gmem_inode_cachep);
1016 }
1017